1 #include <unordered_set>
6
7 // if this macro is defined then we use the driver API, else use the
8 // runtime API. Typically the driver API has 10-20% less overhead
10
11 // if this macro is defined then we profile the CUDA API calls
12 //#define API_PROFILE
13
14 #ifdef API_PROFILE
15 #define PROFILE(f, idx) \
16 apiTimer.TPSTART(idx); \
17 f; \
18 apiTimer.TPSTOP(idx);
19 #else
20 #define PROFILE(f, idx) f;
21 #endif
22
24 {
25
26 // No need to abstract these across the library so keep these definitions local to CUDA target
27
34 void qudaFuncSetAttribute_(
const void *kernel, cudaFuncAttribute attr,
int value,
const char *func,
const char *file,
35 const char *line);
36
42 void qudaFuncGetAttributes_(cudaFuncAttributes &attr,
const void *kernel,
const char *func,
const char *file,
43 const char *line);
44
45 #define qudaFuncSetAttribute(kernel, attr, value) \
46 ::quda::qudaFuncSetAttribute_(kernel, attr, value, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
47
48 #define qudaFuncGetAttributes(attr, kernel) \
49 ::quda::qudaFuncGetAttributes_(attr, kernel, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
50
51 #ifdef USE_DRIVER_API
52 static TimeProfile apiTimer("CUDA API calls (driver)");
53 #else
54 static TimeProfile apiTimer("CUDA API calls (runtime)");
55 #endif
56
58 {
60 static std::unordered_set<const void *> cache;
61 auto search = cache.find(func);
62 if (search == cache.end()) {
63 cache.insert(func);
64 qudaFuncSetAttribute(func, cudaFuncAttributePreferredSharedMemoryCarveout, (
int)cudaSharedmemCarveoutMaxShared);
65 cudaFuncAttributes attributes;
69 }
70 }
71
72 // no driver API variant here since we have C++ functions
77 }
78
80 {
81 void *dst;
82 const void *src;
83 const size_t count;
84 const int value;
85 const bool copy;
86 const cudaMemcpyKind kind;
87 const bool async;
88 const char *name;
89 const bool active_tuning;
90
91 unsigned int sharedBytesPerThread() const { return 0; }
92 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
93
94 public:
95 inline QudaMem(
void *dst,
const void *src,
size_t count, cudaMemcpyKind kind,
const cudaStream_t &
stream,
96 bool async, const char *func, const char *file, const char *line) :
97 dst(dst),
98 src(src),
99 count(count),
100 value(0),
102 kind(kind),
103 async(async),
105 {
106 if (!async) {
107 switch (kind) {
108 case cudaMemcpyDeviceToHost: name = "cudaMemcpyDeviceToHost"; break;
109 case cudaMemcpyHostToDevice: name = "cudaMemcpyHostToDevice"; break;
110 case cudaMemcpyHostToHost: name = "cudaMemcpyHostToHost"; break;
111 case cudaMemcpyDeviceToDevice: name = "cudaMemcpyDeviceToDevice"; break;
112 case cudaMemcpyDefault: name = "cudaMemcpyDefault"; break;
113 default:
errorQuda(
"Unsupported cudaMemcpyKind %d", kind);
114 }
115 } else {
116 switch (kind) {
117 case cudaMemcpyDeviceToHost: name = "cudaMemcpyAsyncDeviceToHost"; break;
118 case cudaMemcpyHostToDevice: name = "cudaMemcpyAsyncHostToDevice"; break;
119 case cudaMemcpyHostToHost: name = "cudaMemcpyAsyncHostToHost"; break;
120 case cudaMemcpyDeviceToDevice: name = "cudaMemcpyAsyncDeviceToDevice"; break;
121 case cudaMemcpyDefault: name = "cudaMemcpyAsyncDefault"; break;
122 default:
errorQuda(
"Unsupported cudaMemcpyKind %d", kind);
123 }
124 }
130
132 }
133
134 inline QudaMem(
void *dst,
int value,
size_t count,
const cudaStream_t &
stream,
bool async,
const char *func,
135 const char *file, const char *line) :
136 dst(dst),
137 src(nullptr),
138 count(count),
139 value(value),
141 kind(cudaMemcpyDefault),
142 async(async),
144 {
145 name = !async ? "cudaMemset" : "cudaMemsetAsync";
151
153 }
154
156 {
158
160 if (async) {
161 #ifdef USE_DRIVER_API
162 switch (kind) {
163 case cudaMemcpyDeviceToHost:
165 break;
166 case cudaMemcpyHostToDevice:
168 break;
169 case cudaMemcpyDeviceToDevice:
171 break;
172 case cudaMemcpyDefault:
174 break;
175 default:
errorQuda(
"Unsupported cuMemcpyTypeAsync %d", kind);
176 }
177 #else
179 switch (kind) {
184 default:
errorQuda(
"Unsupported cudaMemcpyTypeAsync %d", kind);
185 }
186
187 PROFILE(cudaMemcpyAsync(dst, src, count, kind,
stream), type);
188 #endif
189 } else {
190 #ifdef USE_DRIVER_API
191 switch (kind) {
192 case cudaMemcpyDeviceToHost: cuMemcpyDtoH(dst, (CUdeviceptr)src, count); break;
193 case cudaMemcpyHostToDevice: cuMemcpyHtoD((CUdeviceptr)dst, src, count); break;
194 case cudaMemcpyHostToHost: memcpy(dst, src, count); break;
195 case cudaMemcpyDeviceToDevice: cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, count); break;
196 case cudaMemcpyDefault: cuMemcpy((CUdeviceptr)dst, (CUdeviceptr)src, count); break;
197 default:
errorQuda(
"Unsupported cudaMemcpyType %d", kind);
198 }
199 #else
200 cudaMemcpy(dst, src, count, kind);
201 #endif
202 }
203 } else {
204 #ifdef USE_DRIVER_API
205 if (async)
206 cuMemsetD32Async((CUdeviceptr)dst, value, count / 4,
stream);
207 else
208 cuMemsetD32((CUdeviceptr)dst, value, count / 4);
209 #else
210 if (async)
211 cudaMemsetAsync(dst, value, count,
stream);
212 else
213 cudaMemset(dst, value, count);
214 #endif
215 }
216 }
217
219
221 {
222 char vol[128];
223 strcpy(vol, "bytes=");
224 u64toa(vol + 6, (uint64_t)count);
226 }
227
228 long long flops()
const {
return 0; }
229 long long bytes()
const {
return kind == cudaMemcpyDeviceToDevice ? 2 * count : count; }
230 };
231
232 void qudaMemcpy_(
void *dst,
const void *src,
size_t count, cudaMemcpyKind kind,
const char *func,
const char *file,
233 const char *line)
234 {
235 if (count == 0) return;
236 QudaMem copy(dst, src, count, kind, 0,
false, func, file, line);
237 cudaError_t error = cudaGetLastError();
238 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
239 }
240
242 const char *func, const char *file, const char *line)
243 {
244 if (count == 0) return;
245
246 if (kind == cudaMemcpyDeviceToDevice) {
248 } else {
249 #ifdef USE_DRIVER_API
250 switch (kind) {
251 case cudaMemcpyDeviceToHost:
253 break;
254 case cudaMemcpyHostToDevice:
256 break;
257 case cudaMemcpyDeviceToDevice:
259 break;
260 case cudaMemcpyDefault:
262 break;
263 default:
errorQuda(
"Unsupported cuMemcpyTypeAsync %d", kind);
264 }
265 #else
268 #endif
269 }
270 }
271
272 void qudaMemcpy2D_(
void *dst,
size_t dpitch,
const void *src,
size_t spitch,
size_t width,
size_t height,
273 cudaMemcpyKind kind, const char *func, const char *file, const char *line)
274 {
275 #ifdef USE_DRIVER_API
277 param.srcPitch = spitch;
279 param.srcXInBytes = 0;
280 param.dstPitch = dpitch;
282 param.dstXInBytes = 0;
283 param.WidthInBytes = width;
284 param.Height = height;
285
286 switch (kind) {
287 case cudaMemcpyDeviceToHost:
288 param.srcDevice = (CUdeviceptr)src;
289 param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
291 param.dstMemoryType = CU_MEMORYTYPE_HOST;
292 break;
293 default:
errorQuda(
"Unsupported cuMemcpyType2DAsync %d", kind);
294 }
296 #else
298 #endif
299 }
300
301 void qudaMemcpy2DAsync_(
void *dst,
size_t dpitch,
const void *src,
size_t spitch,
size_t width,
size_t height,
303 const char *line)
304 {
305 #ifdef USE_DRIVER_API
307 param.srcPitch = spitch;
309 param.srcXInBytes = 0;
310 param.dstPitch = dpitch;
312 param.dstXInBytes = 0;
313 param.WidthInBytes = width;
314 param.Height = height;
315
316 switch (kind) {
317 case cudaMemcpyDeviceToHost:
318 param.srcDevice = (CUdeviceptr)src;
319 param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
321 param.dstMemoryType = CU_MEMORYTYPE_HOST;
322 break;
323 default:
errorQuda(
"Unsupported cuMemcpyType2DAsync %d", kind);
324 }
326 #else
328 #endif
329 }
330
331 void qudaMemset_(
void *ptr,
int value,
size_t count,
const char *func,
const char *file,
const char *line)
332 {
333 if (count == 0) return;
334 QudaMem set(ptr, value, count, 0,
false, func, file, line);
335 cudaError_t error = cudaGetLastError();
337 errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
338 }
339
341 const char *file, const char *line)
342 {
343 if (count == 0) return;
345 cudaError_t error = cudaGetLastError();
346 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
347 }
348
349 void qudaMemset2D_(
void *ptr,
size_t pitch,
int value,
size_t width,
size_t height,
const char *func,
350 const char *file, const char *line)
351 {
352 cudaError_t error = cudaMemset2D(ptr, pitch, value, width, height);
353 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
354 }
355
357 const char *func, const char *file, const char *line)
358 {
359 cudaError_t error = cudaMemset2DAsync(ptr, pitch, value, width, height,
stream);
360 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
361 }
362
364 const char *func, const char *file, const char *line)
365 {
366 int dev_id = 0;
370 dev_id = cudaCpuDeviceId;
371 else
373
374 cudaError_t error = cudaMemPrefetchAsync(ptr, count, dev_id,
stream);
375 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
376 }
377
378 bool qudaEventQuery_(cudaEvent_t &event,
const char *func,
const char *file,
const char *line)
379 {
380 #ifdef USE_DRIVER_API
382 switch (error) {
383 case CUDA_SUCCESS: return true;
384 case CUDA_ERROR_NOT_READY: return false;
385 default: {
386 const char *str;
387 cuGetErrorName(error, &str);
388 errorQuda(
"cuEventQuery returned error %s\n (%s:%s in %s())", str, file, line, func);
389 }
390 }
391 #else
393 switch (error) {
394 case cudaSuccess: return true;
395 case cudaErrorNotReady: return false;
396 default:
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
397 }
398 #endif
399 return false;
400 }
401
403 {
404 #ifdef USE_DRIVER_API
406 if (error != CUDA_SUCCESS) {
407 const char *str;
408 cuGetErrorName(error, &str);
409 errorQuda(
"cuEventRecord returned error %s\n (%s:%s in %s())", str, file, line, func);
410 }
411 #else
413 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
414 #endif
415 }
416
418 const char *file, const char *line)
419 {
420 #ifdef USE_DRIVER_API
422 if (error != CUDA_SUCCESS) {
423 const char *str;
424 cuGetErrorName(error, &str);
425 errorQuda(
"cuStreamWaitEvent returned error %s\n (%s:%s in %s())", str, file, line, func);
426 }
427 #else
429 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
430 #endif
431 }
432
434 {
435 #ifdef USE_DRIVER_API
437 if (error != CUDA_SUCCESS) {
438 const char *str;
439 cuGetErrorName(error, &str);
440 errorQuda(
"cuEventSynchronize returned error %s\n (%s:%s in %s())", str, file, line, func);
441 }
442 #else
444 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
445 #endif
446 }
447
449 {
450 #ifdef USE_DRIVER_API
452 if (error != CUDA_SUCCESS) {
453 const char *str;
454 cuGetErrorName(error, &str);
455 errorQuda(
"(CUDA) cuStreamSynchronize returned error %s\n (%s:%s in %s())\n", str, file, line, func);
456 }
457 #else
460 errorQuda(
"(CUDA) %s\n (%s:%s in %s())", cudaGetErrorString(error), file, line, func);
461 #endif
462 }
463
465 {
466 #ifdef USE_DRIVER_API
468 if (error != CUDA_SUCCESS) {
469 const char *str;
470 cuGetErrorName(error, &str);
471 errorQuda(
"cuCtxSynchronize returned error %s (%s:%s in %s())\n", str, file, line, func);
472 }
473 #else
475 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
476 #endif
477 }
478
479 void qudaFuncSetAttribute_(
const void *kernel, cudaFuncAttribute attr,
int value,
const char *func,
const char *file,
480 const char *line)
481 {
482 // no driver API variant here since we have C++ functions
484 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
485 }
486
488 const char *line)
489 {
490 // no driver API variant here since we have C++ functions
492 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
493 }
494
496 {
497 #ifdef API_PROFILE
499 #endif
500 }
501
502 } // namespace quda
QudaMem(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const cudaStream_t &stream, bool async, const char *func, const char *file, const char *line)
void apply(const qudaStream_t &stream)
QudaMem(void *dst, int value, size_t count, const cudaStream_t &stream, bool async, const char *func, const char *file, const char *line)
bool advanceTuneParam(TuneParam ¶m) const
bool set_max_shared_bytes
@ QUDA_CUDA_FIELD_LOCATION
@ QUDA_CPU_FIELD_LOCATION
enum QudaFieldLocation_s QudaFieldLocation
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
size_t max_dynamic_shared_memory()
Returns the maximum dynamic shared memory per block.
TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemsetAsync or driver API equivalent.
bool qudaEventQuery_(cudaEvent_t &event, const char *func, const char *file, const char *line)
Wrapper around cudaEventQuery or cuEventQuery with built-in error checking.
void qudaMemcpy2DAsync_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy2DAsync or driver API equivalent.
void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func, const char *file, const char *line)
Wrapper around cudaMemset2D or driver API equivalent.
void printAPIProfile()
Print out the timer profile for CUDA API calls.
void qudaDeviceSynchronize_(const char *func, const char *file, const char *line)
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize with built-in error checking.
void qudaFuncSetAttribute_(const void *kernel, cudaFuncAttribute attr, int value, const char *func, const char *file, const char *line)
Wrapper around cudaFuncSetAttribute with built-in error checking.
void qudaStreamSynchronize_(qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize with built-in error checking.
@ QUDA_PROFILE_MEMCPY_H2D_ASYNC
@ QUDA_PROFILE_EVENT_SYNCHRONIZE
@ QUDA_PROFILE_FUNC_SET_ATTRIBUTE
@ QUDA_PROFILE_MEMCPY_D2D_ASYNC
@ QUDA_PROFILE_DEVICE_SYNCHRONIZE
@ QUDA_PROFILE_STREAM_SYNCHRONIZE
@ QUDA_PROFILE_EVENT_QUERY
@ QUDA_PROFILE_STREAM_WAIT_EVENT
@ QUDA_PROFILE_MEMCPY_DEFAULT_ASYNC
@ QUDA_PROFILE_MEMCPY_D2H_ASYNC
@ QUDA_PROFILE_LAUNCH_KERNEL
@ QUDA_PROFILE_EVENT_RECORD
@ QUDA_PROFILE_MEMCPY2D_D2H_ASYNC
bool activeTuning()
query if tuning is in progress
void qudaEventSynchronize_(cudaEvent_t &event, const char *func, const char *file, const char *line)
Wrapper around cudaEventSynchronize or cuEventSynchronize with built-in error checking.
void qudaMemcpy2D_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy2DAsync or driver API equivalent.
qudaError_t qudaLaunchKernel(const void *func, const TuneParam &tp, void **args, qudaStream_t stream)
Wrapper around cudaLaunchKernel.
void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemsetAsync or driver API equivalent.
void qudaMemPrefetchAsync_(void *ptr, size_t count, QudaFieldLocation mem_space, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemPrefetchAsync or driver API equivalent.
__host__ __device__ std::enable_if<!isFixed< T1 >::value &&!isFixed< T2 >::value, void >::type copy(T1 &a, const T2 &b)
Copy function which is trival between floating point types. When converting to an integer type,...
void u64toa(char *buffer, uint64_t value)
void qudaMemcpy_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy or driver API equivalent.
void qudaEventRecord_(cudaEvent_t &event, qudaStream_t stream, const char *func, const char *file, const char *line)
Wrapper around cudaEventRecord or cuEventRecord with built-in error checking.
void qudaStreamWaitEvent_(qudaStream_t stream, cudaEvent_t event, unsigned int flags, const char *func, const char *file, const char *line)
Wrapper around cudaStreamWaitEvent or cuStreamWaitEvent with built-in error checking.
void qudaMemset_(void *ptr, int value, size_t count, const char *func, const char *file, const char *line)
Wrapper around cudaMemset or driver API equivalent.
void qudaFuncGetAttributes_(cudaFuncAttributes &attr, const void *kernel, const char *func, const char *file, const char *line)
Wrapper around cudaFuncGetAttributes with built-in error checking.
void qudaMemcpyAsync_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpyAsync or driver API equivalent.
#define qudaFuncGetAttributes(attr, kernel)
#define qudaFuncSetAttribute(kernel, attr, value)
cudaStream_t qudaStream_t
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaVerbosity getVerbosity()