3 #include <quda.h> // for QUDA_VERSION_STRING
4 #include <sys/stat.h> // for stat()
5 #include <fcntl.h>
6 #include <cfloat> // for FLT_MAX
7 #include <ctime>
8 #include <fstream>
9 #include <typeinfo>
10 #include <map>
11 #include <list>
12 #include <unistd.h>
14
15 #include <deque>
16 #include <queue>
17 #include <functional>
18
20
21 //#define LAUNCH_TIMER
23
25 {
26 static TuneKey last_key;
27 }
28
29 // intentionally leave this outside of the namespace for now
31
33 {
34 typedef std::map<TuneKey, TuneParam>
map;
35
37
40
45
47
55 {
56 }
57
65 {
66 }
67
69 {
70 if (&trace != this) {
77 }
78 return *this;
79 }
80 };
81
82 // linked list that is augmented each time we call a kernel
83 static std::list<TraceKey> trace_list;
84 static int enable_trace = 0;
85
87 {
88 static bool init =
false;
89
91 char *enable_trace_env = getenv("QUDA_ENABLE_TRACE");
92 if (enable_trace_env) {
93 if (strcmp(enable_trace_env, "1") == 0) {
94 // only explicitly posted trace events are included
95 enable_trace = 1;
96 } else if (strcmp(enable_trace_env, "2") == 0) {
97 // enable full kernel trace and posted trace events
98 enable_trace = 2;
99 }
100 }
102 }
103 return enable_trace;
104 }
105
106 void postTrace_(
const char *func,
const char *file,
int line)
107 {
110 strcpy(aux, file);
111 strcat(aux, ":");
117 trace_list.push_back(trace_entry);
118 }
119 }
120
121 static const std::string quda_hash = QUDA_HASH;
// defined in lib/Makefile
123 static map tunecache;
124 static map::iterator it;
125 static size_t initial_cache_size = 0;
126
128 #define STR(x) STR_(x)
131 #undef STR
132 #undef STR_
133
135 static bool tuning = false;
136
138
139 static bool profile_count = true;
140
143
145
// Deserialize tunecache entries (tab-separated, one per line) from `in` into
// the global `tunecache` map, keyed by TuneKey.
// NOTE(review): this listing is extraction-garbled — the original declarations
// of `line`, `v`, `n`, `a` and `param` (orig. lines 151, 155-159) and the
// stream reads that populate them (orig. line 168) are missing, so the text
// below does not compile as shown. Annotations describe the visible logic only.
149 static void deserializeTuneCache(std::istream &in)
150 {
152 std::stringstream ls;
153
154 TuneKey key;
156
160
161 int check;
162
// Process the stream line by line until EOF/error.
163 while (in.good()) {
164 getline(in, line);
165 if (!line.length()) continue; // skip blank lines (e.g., at end of file)
166 ls.clear();
167 ls.str(line);
// Copy the parsed volume/name/aux tokens into TuneKey's fixed-size char
// buffers, erroring out on truncation or encoding failure.
169 check = snprintf(key.volume, key.volume_n, "%s", v.c_str());
170 if (check < 0 || check >= key.volume_n)
errorQuda(
"Error writing volume string (check = %d)", check);
171 check = snprintf(key.name, key.name_n, "%s", n.c_str());
172 if (check < 0 || check >= key.name_n)
errorQuda(
"Error writing name string (check=%d)", check);
173 check = snprintf(key.aux, key.aux_n, "%s", a.c_str());
174 if (check < 0 || check >= key.aux_n)
errorQuda(
"Error writing aux string (check=%d)", check);
177 ls.ignore(1); // throw away tab before comment
178 getline(ls,
param.comment);
// assume anything remaining on the line is a comment
179 param.comment +=
"\n";
// our convention is to include the newline, since ctime() likes to do this
// Insert (or overwrite) the cache entry for this key.
180 tunecache[key] =
param;
181 }
182 }
183
187 static void serializeTuneCache(std::ostream &out)
188 {
189 map::iterator entry;
190
191 for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
192 TuneKey key = entry->first;
193 TuneParam
param = entry->second;
194
195 out << std::setw(16) << key.volume << "\t" << key.name << "\t" << key.aux << "\t";
196 out <<
param.block.x <<
"\t" <<
param.block.y <<
"\t" <<
param.block.z <<
"\t";
197 out <<
param.grid.x <<
"\t" <<
param.grid.y <<
"\t" <<
param.grid.z <<
"\t";
198 out <<
param.shared_bytes <<
"\t" <<
param.aux.x <<
"\t" <<
param.aux.y <<
"\t" <<
param.aux.z <<
"\t"
199 <<
param.aux.w <<
"\t";
200 out <<
param.time <<
"\t" <<
param.comment;
// param.comment ends with a newline
201 }
202 }
203
206 {
207 return lhs.second.time * lhs.second.n_calls < rhs.second.time * rhs.second.n_calls;
208 }
209 };
210
214 static void serializeProfile(std::ostream &out, std::ostream &async_out)
215 {
216 map::iterator entry;
217 double total_time = 0.0;
218 double async_total_time = 0.0;
219
220 // first let's sort the entries in decreasing order of significance
221 typedef std::pair<TuneKey, TuneParam> profile_t;
222 typedef std::priority_queue<profile_t, std::deque<profile_t>, less_significant<profile_t>> queue_t;
223 queue_t q(tunecache.begin(), tunecache.end());
224
225 // now compute total time spent in kernels so we can give each kernel a significance
226 for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
227 TuneKey key = entry->first;
228 TuneParam
param = entry->second;
229
232 bool is_policy_kernel = strncmp(
tmp,
"policy_kernel", 13) == 0 ? true :
false;
233 bool is_policy = (strncmp(
tmp,
"policy", 6) == 0 && !is_policy_kernel) ?
true :
false;
234 if (
param.n_calls > 0 && !is_policy) total_time +=
param.n_calls *
param.time;
235 if (
param.n_calls > 0 && is_policy) async_total_time +=
param.n_calls *
param.time;
236 }
237
238 while (!q.empty()) {
239 TuneKey key = q.top().first;
240 TuneParam
param = q.top().second;
241
244 bool is_policy_kernel = strncmp(
tmp,
"policy_kernel", 13) == 0 ? true :
false;
245 bool is_policy = (strncmp(
tmp,
"policy", 6) == 0 && !is_policy_kernel) ?
true :
false;
246 bool is_nested_policy = (strncmp(
tmp,
"nested_policy", 6) == 0) ?
true :
false;
// nested policies not included
247
248 // synchronous profile
249 if (
param.n_calls > 0 && !is_policy && !is_nested_policy) {
251
252 out << std::setw(12) <<
param.n_calls *
param.time <<
"\t" << std::setw(12) << (time / total_time) * 100 <<
"\t";
253 out << std::setw(12) <<
param.n_calls <<
"\t" << std::setw(12) <<
param.time <<
"\t" << std::setw(16)
254 << key.volume << "\t";
255 out << key.name <<
"\t" << key.aux <<
"\t" <<
param.comment;
// param.comment ends with a newline
256 }
257
258 // async policy profile
259 if (
param.n_calls > 0 && is_policy) {
261
262 async_out << std::setw(12) <<
param.n_calls *
param.time <<
"\t" << std::setw(12)
263 << (time / async_total_time) * 100 << "\t";
264 async_out << std::setw(12) <<
param.n_calls <<
"\t" << std::setw(12) <<
param.time <<
"\t" << std::setw(16)
265 << key.volume << "\t";
266 async_out << key.name <<
"\t" << key.aux <<
"\t" <<
param.comment;
// param.comment ends with a newline
267 }
268
269 q.pop();
270 }
271
272 out << std::endl << "# Total time spent in kernels = " << total_time << " seconds" << std::endl;
273 async_out << std::endl
274 << "# Total time spent in asynchronous execution = " << async_total_time << " seconds" << std::endl;
275 }
276
// Write the accumulated kernel/event trace (`trace_list`) to `out` as
// tab-separated columns: time, device/pinned/mapped/host allocation figures,
// volume, name, aux.
// NOTE(review): the declaration of `tmp` (orig. lines 287-288, presumably a
// truncated copy of key.aux used for the policy-kernel prefix test) is missing
// from this listing — confirm against the original source.
280 static void serializeTrace(std::ostream &out)
281 {
282 for (auto it = trace_list.begin(); it != trace_list.end(); it++) {
283
284 TuneKey &key = it->key;
285
286 // special case kernel members of a policy
289 bool is_policy_kernel = strcmp(
tmp,
"policy_kernel") == 0 ? true :
false;
290
291 out << std::setw(12) << it->time << "\t";
292 out << std::setw(12) << it->device_bytes << "\t";
293 out << std::setw(12) << it->pinned_bytes << "\t";
294 out << std::setw(12) << it->mapped_bytes << "\t";
295 out << std::setw(12) << it->host_bytes << "\t";
296 out << std::setw(16) << key.volume << "\t";
// Shift policy-kernel names one column right so they nest visually under the
// policy that launched them.
297 if (is_policy_kernel) out << "\t";
298 out << key.name << "\t";
299 if (!is_policy_kernel) out << "\t";
300 out << key.aux << std::endl;
301 }
302 }
303
// Share rank 0's tunecache with all other ranks so every process uses the same
// tuned parameters (multi-GPU builds only; no-op otherwise).
// NOTE(review): this listing is extraction-garbled — the rank test and the
// broadcast calls (orig. lines 313, 317, 320-321, 324) are missing, which is
// why the braces below appear unbalanced.
307 static void broadcastTuneCache()
308 {
309 #ifdef MULTI_GPU
310 std::stringstream serialized;
311 size_t size;
312
314 serializeTuneCache(serialized);
315 size = serialized.str().length();
316 }
318
319 if (size > 0) {
322 } else {
// Receiving ranks copy the broadcast bytes into a null-terminated buffer and
// rebuild their local tunecache from it.
323 char *serstr = new char[size + 1];
// NOTE(review): '0円' below is mojibake for the null-character literal '\0' —
// restore '\0' when repairing this file.
325 serstr[size] = '0円'; // null-terminate
326 serialized.str(serstr);
327 deserializeTuneCache(serialized);
328 delete[] serstr;
329 }
330 }
331 #endif
332 }
333
334 /*
335 * Read tunecache from disk.
336 */
338 {
341 return;
342 }
343
344 char *path;
345 struct stat pstat;
347 std::ifstream cache_file;
348 std::stringstream ls;
349
350 path = getenv("QUDA_RESOURCE_PATH");
351
352 if (!path) {
353 warningQuda(
"Environment variable QUDA_RESOURCE_PATH is not set.");
354 warningQuda(
"Caching of tuned parameters will be disabled.");
355 return;
356 } else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
357 warningQuda(
"The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path);
358 warningQuda(
"Caching of tuned parameters will be disabled.");
359 return;
360 } else {
361 resource_path = path;
362 }
363
364 bool version_check = true;
365 char *override_version_env = getenv("QUDA_TUNE_VERSION_CHECK");
366 if (override_version_env && strcmp(override_version_env, "0") == 0) {
367 version_check = false;
368 warningQuda(
"Disabling QUDA tunecache version check");
369 }
370
371 #ifdef MULTI_GPU
373 #endif
374
375 cache_path = resource_path;
376 cache_path += "/tunecache.tsv";
377 cache_file.open(cache_path.c_str());
378
379 if (cache_file) {
380
381 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
382 getline(cache_file, line);
383 ls.str(line);
384 ls >> token;
385 if (token.compare(
"tunecache"))
errorQuda(
"Bad format in %s", cache_path.c_str());
386 ls >> token;
387 if (version_check && token.compare(quda_version))
388 errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
389 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
390 cache_path.c_str());
391 ls >> token;
392 #ifdef GITVERSION
393 if (version_check && token.compare(
gitversion))
394 errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
395 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
396 cache_path.c_str());
397 #else
398 if (version_check && token.compare(quda_version))
399 errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
400 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
401 cache_path.c_str());
402 #endif
403 ls >> token;
404 if (version_check && token.compare(quda_hash))
405 errorQuda(
"Cache file %s does not match current QUDA build. \nPlease delete this file or set the "
406 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
407 cache_path.c_str());
408
409 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
410 getline(cache_file, line); // eat the blank line
411
412 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
413 getline(cache_file, line); // eat the description line
414
415 deserializeTuneCache(cache_file);
416
417 cache_file.close();
418 initial_cache_size = tunecache.size();
419
421 printfQuda(
"Loaded %d sets of cached parameters from %s\n",
static_cast<int>(initial_cache_size),
422 cache_path.c_str());
423 }
424
425 } else {
426 warningQuda(
"Cache file not found. All kernels will be re-tuned (if tuning is enabled).");
427 }
428
429 #ifdef MULTI_GPU
430 }
431 #endif
432
433 broadcastTuneCache();
434 }
435
440 {
441 time_t now;
442 int lock_handle;
444 std::ofstream cache_file;
445
446 if (resource_path.empty()) return;
447
448 // FIXME: We should really check to see if any nodes have tuned a kernel that was not also tuned on node 0, since as things
449 // stand, the corresponding launch parameters would never get cached to disk in this situation. This will come up if we
450 // ever support different subvolumes per GPU (as might be convenient for lattice volumes that don't divide evenly).
451
452 #ifdef MULTI_GPU
454 #endif
455
456 if (tunecache.size() == initial_cache_size && !error) return;
457
458 // Acquire lock. Note that this is only robust if the filesystem supports flock() semantics, which is true for
459 // NFS on recent versions of linux but not Lustre by default (unless the filesystem was mounted with "-o flock").
460 lock_path = resource_path + "/tunecache.lock";
461 lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
462 if (lock_handle == -1) {
463 warningQuda(
"Unable to lock cache file. Tuned launch parameters will not be cached to disk. "
464 "If you are certain that no other instances of QUDA are accessing this filesystem, "
465 "please manually remove %s",
466 lock_path.c_str());
467 return;
468 }
469 char msg[] = "If no instances of applications using QUDA are running,\n"
470 "this lock file shouldn't be here and is safe to delete.";
471 int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
472 if (stat == -1)
warningQuda(
"Unable to write to lock file for some bizarre reason");
473
474 cache_path = resource_path + (error ? "/tunecache_error.tsv" : "/tunecache.tsv");
475 cache_file.open(cache_path.c_str());
476
478 printfQuda(
"Saving %d sets of cached parameters to %s\n",
static_cast<int>(tunecache.size()), cache_path.c_str());
479 }
480
481 time(&now);
482 cache_file << "tunecache\t" << quda_version;
483 #ifdef GITVERSION
485 #else
486 cache_file << "\t" << quda_version;
487 #endif
488 cache_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
489 cache_file << std::setw(16) << "volume"
490 << "\tname\taux\tblock.x\tblock.y\tblock.z\tgrid.x\tgrid.y\tgrid.z\tshared_bytes\taux.x\taux.y\taux."
491 "z\taux.w\ttime\tcomment"
492 << std::endl;
493 serializeTuneCache(cache_file);
494 cache_file.close();
495
496 // Release lock.
497 close(lock_handle);
498 remove(lock_path.c_str());
499
500 initial_cache_size = tunecache.size();
501
502 #ifdef MULTI_GPU
503 } else {
504 // give process 0 time to write out its tunecache if needed, but
505 // doesn't cause a hang if error is not triggered on process 0
506 if (error) sleep(10);
507 }
508 #endif
509 }
510
511 static bool policy_tuning = false;
513
515
516 static bool uber_tuning = false;
518
520
521 // flush profile, setting counts to zero
523 {
524 for (map::iterator entry = tunecache.begin(); entry != tunecache.end(); entry++) {
525 // set all n_calls = 0
528 }
529 }
530
531 // save profile
533 {
534 time_t now;
535 int lock_handle;
536 std::string lock_path, profile_path, async_profile_path, trace_path;
537 std::ofstream profile_file, async_profile_file, trace_file;
538
539 if (resource_path.empty()) return;
540
541 #ifdef MULTI_GPU
542 if (
comm_rank_global() == 0) {
// Make sure only one rank is writing to disk
543 #endif
544
545 // Acquire lock. Note that this is only robust if the filesystem supports flock() semantics, which is true for
546 // NFS on recent versions of linux but not Lustre by default (unless the filesystem was mounted with "-o flock").
547 lock_path = resource_path + "/profile.lock";
548 lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
549 if (lock_handle == -1) {
550 warningQuda(
"Unable to lock profile file. Profile will not be saved to disk. "
551 "If you are certain that no other instances of QUDA are accessing this filesystem, "
552 "please manually remove %s",
553 lock_path.c_str());
554 return;
555 }
556 char msg[] = "If no instances of applications using QUDA are running,\n"
557 "this lock file shouldn't be here and is safe to delete.";
558 int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
559 if (stat == -1)
warningQuda(
"Unable to write to lock file for some bizarre reason");
560
561 // profile counter for writing out unique profiles
562 static int count = 0;
563
564 char *profile_fname = getenv("QUDA_PROFILE_OUTPUT_BASE");
565
566 if (!profile_fname) {
568 "Environment variable QUDA_PROFILE_OUTPUT_BASE not set; writing to profile.tsv and profile_async.tsv");
569 profile_path = resource_path + "/profile_" + std::to_string(count) + ".tsv";
570 async_profile_path = resource_path + "/profile_async_" + std::to_string(count) + ".tsv";
571 if (
traceEnabled()) trace_path = resource_path +
"/trace_" + std::to_string(count) +
".tsv";
572 } else {
573 profile_path = resource_path + "/" + profile_fname + "_" + std::to_string(count) + ".tsv";
574 async_profile_path = resource_path + "/" + profile_fname + "_" + std::to_string(count) + "_async.tsv";
576 trace_path = resource_path + "/" + profile_fname + "_trace_" + std::to_string(count) + ".tsv";
577 }
578
579 count++;
580
581 profile_file.open(profile_path.c_str());
582 async_profile_file.open(async_profile_path.c_str());
583 if (
traceEnabled()) trace_file.open(trace_path.c_str());
584
586 // compute number of non-zero entries that will be output in the profile
587 int n_entry = 0;
588 int n_policy = 0;
589 for (map::iterator entry = tunecache.begin(); entry != tunecache.end(); entry++) {
590 // if a policy entry, then we can ignore
594 bool is_policy = strcmp(
tmp,
"policy") == 0 ? true :
false;
595 if (
param.n_calls > 0 && !is_policy) n_entry++;
596 if (
param.n_calls > 0 && is_policy) n_policy++;
597 }
598
599 printfQuda(
"Saving %d sets of cached parameters to %s\n", n_entry, profile_path.c_str());
600 printfQuda(
"Saving %d sets of cached profiles to %s\n", n_policy, async_profile_path.c_str());
602 printfQuda(
"Saving trace list with %lu entries to %s\n", trace_list.size(), trace_path.c_str());
603 }
604
605 time(&now);
606
607 std::string Label = label.empty() ?
"profile" : label;
608
609 profile_file << Label << "\t" << quda_version;
610 #ifdef GITVERSION
612 #else
613 profile_file << "\t" << quda_version;
614 #endif
615 profile_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
616 profile_file << std::setw(12) << "total time"
617 << "\t" << std::setw(12) << "percentage"
618 << "\t" << std::setw(12) << "calls"
619 << "\t" << std::setw(12) << "time / call"
620 << "\t" << std::setw(16) << "volume"
621 << "\tname\taux\tcomment" << std::endl;
622
623 async_profile_file << Label << "\t" << quda_version;
624 #ifdef GITVERSION
626 #else
627 async_profile_file << "\t" << quda_version;
628 #endif
629 async_profile_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
630 async_profile_file << std::setw(12) << "total time"
631 << "\t" << std::setw(12) << "percentage"
632 << "\t" << std::setw(12) << "calls"
633 << "\t" << std::setw(12) << "time / call"
634 << "\t" << std::setw(16) << "volume"
635 << "\tname\taux\tcomment" << std::endl;
636
637 serializeProfile(profile_file, async_profile_file);
638
639 profile_file.close();
640 async_profile_file.close();
641
643 trace_file << "trace"
644 << "\t" << quda_version;
645 #ifdef GITVERSION
647 #else
648 trace_file << "\t" << quda_version;
649 #endif
650 trace_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
651
652 trace_file << std::setw(12) << "time\t" << std::setw(12) << "device-mem\t" << std::setw(12) << "pinned-mem\t";
653 trace_file << std::setw(12) << "mapped-mem\t" << std::setw(12) << "host-mem\t";
654 trace_file << std::setw(16) << "volume"
655 << "\tname\taux" << std::endl;
656
657 serializeTrace(trace_file);
658
659 trace_file.close();
660 }
661
662 // Release lock.
663 close(lock_handle);
664 remove(lock_path.c_str());
665
666 #ifdef MULTI_GPU
667 }
668 #endif
669 }
670
671 static TimeProfile launchTimer("tuneLaunch");
672
678 {
679 #ifdef LAUNCH_TIMER
682 #endif
683
686 last_key = key;
687
688 #ifdef LAUNCH_TIMER
691 #endif
692
693 static const Tunable *active_tunable;
// for error checking
694 it = tunecache.find(key);
695
696 // first check if we have the tuned value and return if we have it
698
699 #ifdef LAUNCH_TIMER
702 #endif
703
705
709 }
710
711 #ifdef LAUNCH_TIMER
714 #endif
715
717
718 // we could be tuning outside of the current scope
719 if (!tuning && profile_count) param_tuned.
n_calls++;
720
721 #ifdef LAUNCH_TIMER
724 #endif
725
728 trace_list.push_back(trace_entry);
729 }
730
731 return param_tuned;
732 }
733
734 #ifdef LAUNCH_TIMER
737 #endif
738
740
748 }
749
750 return param_default;
751 } else if (!tuning) {
752
753 /* As long as global reductions are not disabled, only do the
754 tuning on node 0, else do the tuning on all nodes since we
755 can't guarantee that all nodes are partaking */
758 cudaError_t error = cudaSuccess;
760 float elapsed_time, best_time;
761 time_t now;
762
763 tuning = true;
764 active_tunable = &tunable;
765 best_time = FLT_MAX;
766
769
770 cudaEventCreate(&
start);
771 cudaEventCreate(&
end);
772
775 }
776
778 tune_timer.
Start(__func__, __FILE__, __LINE__);
779
781 while (tuning) {
782 cudaDeviceSynchronize();
783 cudaGetLastError(); // clear error counter
786 printfQuda(
"About to call tunable.apply block=(%d,%d,%d) grid=(%d,%d,%d) shared_bytes=%d aux=(%d,%d,%d)\n",
789 }
790 tunable.
apply(0);
// do initial call in case we need to jit compile for these parameters or if policy tuning
791
792 cudaEventRecord(
start, 0);
793 for (
int i = 0; i < tunable.
tuningIter(); i++) {
794 tunable.
apply(0);
// calls tuneLaunch() again, which simply returns the currently active param
795 }
796 cudaEventRecord(
end, 0);
797 cudaEventSynchronize(
end);
798 cudaEventElapsedTime(&elapsed_time,
start,
end);
799 cudaDeviceSynchronize();
800 error = cudaGetLastError();
801
802 { // check that error state is cleared
803 cudaDeviceSynchronize();
804 cudaError_t error = cudaGetLastError();
805 if (error != cudaSuccess)
errorQuda(
"Failed to clear error state %s\n", cudaGetErrorString(error));
806 }
807
809 if ((elapsed_time < best_time) && (error == cudaSuccess) && (tunable.
jitifyError() == CUDA_SUCCESS)) {
810 best_time = elapsed_time;
812 }
814 if (error == cudaSuccess && tunable.
jitifyError() == CUDA_SUCCESS) {
817 } else {
819 // if not jitify error must be regular error
821 } else {
822 // else is a jitify error
823 const char *str;
826 }
827 }
828 }
831 }
832
833 tune_timer.
Stop(__func__, __FILE__, __LINE__);
834
835 if (best_time == FLT_MAX) {
837 }
841 }
842 time(&now);
844 best_param.
comment +=
", tuning took " + std::to_string(tune_timer.
Last()) +
" seconds at ";
845 best_param.
comment += ctime(&now);
// includes a newline
846 best_param.
time = best_time;
847
848 cudaEventDestroy(
start);
849 cudaEventDestroy(
end);
850
852 tuning = true;
854 tuning = false;
856 tunecache[key] = best_param;
857 }
859
860 // check this process is getting the key that is expected
861 if (tunecache.find(key) == tunecache.end()) {
863 }
864 param = tunecache[key];
// read this now for all processes
865
868 trace_list.push_back(trace_entry);
869 }
870
871 } else if (&tunable != active_tunable) {
872 errorQuda(
"Unexpected call to tuneLaunch() in %s::apply()",
typeid(tunable).name());
873 }
874
875 param.n_calls = profile_count ? 1 : 0;
876
878 }
879
881 {
882 #ifdef LAUNCH_TIMER
884 #endif
885 }
886 } // namespace quda
virtual std::string perfString(float time) const
virtual std::string paramString(const TuneParam &param) const
CUresult jitifyError() const
virtual bool advanceTuneParam(TuneParam &param) const
virtual void initTuneParam(TuneParam &param) const
virtual TuneKey tuneKey() const =0
virtual void apply(const qudaStream_t &stream)=0
void checkLaunchParam(TuneParam &param)
virtual void defaultTuneParam(TuneParam &param) const
virtual int tuningIter() const
bool commGlobalReduction()
int comm_rank_global(void)
void comm_broadcast_global(void *data, size_t nbytes)
These routine broadcast the data according to the default communicator.
cudaColorSpinorField * tmp
enum QudaVerbosity_s QudaVerbosity
void init()
Create the BLAS context.
void start()
Start profiling.
void disableProfileCount()
Disable the profile kernel counting.
TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
const std::map< TuneKey, TuneParam > & getTuneCache()
Returns a reference to the tunecache map.
void saveTuneCache(bool error=false)
bool policyTuning()
Query whether we are currently tuning a policy.
void setUberTuning(bool)
Enable / disable whether we are tuning an uber kernel.
std::map< TuneKey, TuneParam > map
size_t host_allocated_peak()
void setPolicyTuning(bool)
Enable / disable whether are tuning a policy.
bool activeTuning()
query if tuning is in progress
void postTrace_(const char *func, const char *file, int line)
Post an event in the trace, recording where it was posted.
void flushProfile()
Flush profile contents, setting all counts to zero.
size_t mapped_allocated_peak()
void i32toa(char *buffer, int32_t value)
bool use_managed_memory()
size_t pinned_allocated_peak()
void enableProfileCount()
Enable the profile kernel counting.
size_t device_allocated_peak()
void saveProfile(const std::string label="")
Save profile to disk.
bool uberTuning()
Query whether we are tuning an uber kernel.
Main header file for the QUDA library.
#define QUDA_VERSION_SUBMINOR
#define QUDA_VERSION_MAJOR
#define QUDA_VERSION_MINOR
void Stop(const char *func, const char *file, int line)
void Start(const char *func, const char *file, int line)
TraceKey(const TraceKey &trace)
TraceKey & operator=(const TraceKey &trace)
TraceKey(const TuneKey &key, float time)
bool operator()(const T &lhs, const T &rhs)
quda::TuneKey getLastTuneKey()
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaVerbosity getVerbosity()