3 #include <quda.h> // for QUDA_VERSION_STRING
4 #include <sys/stat.h> // for stat()
5 #include <fcntl.h>
6 #include <cfloat> // for FLT_MAX
7 #include <ctime>
8 #include <fstream>
9 #include <typeinfo>
10 #include <map>
11 #include <list>
12 #include <unistd.h>
14
15 #include <deque>
16 #include <queue>
17 #include <functional>
18
20
21 //#define LAUNCH_TIMER
23
25 {
26 static TuneKey last_key;
27 }
28
29 // intentionally leave this outside of the namespace for now
31
33 {
34 typedef std::map<TuneKey, TuneParam>
map;
35
37
40
45
47
55 {
56 }
57
65 {
66 }
67
69 {
70 if (&trace != this) {
77 }
78 return *this;
79 }
80 };
81
82 // linked list that is augmented each time we call a kernel
83 static std::list<TraceKey> trace_list;
84 static int enable_trace = 0;
85
87 {
88 static bool init =
false;
89
91 char *enable_trace_env = getenv("QUDA_ENABLE_TRACE");
92 if (enable_trace_env) {
93 if (strcmp(enable_trace_env, "1") == 0) {
94 // only explicitly posted trace events are included
95 enable_trace = 1;
96 } else if (strcmp(enable_trace_env, "2") == 0) {
97 // enable full kernel trace and posted trace events
98 enable_trace = 2;
99 }
100 }
102 }
103 return enable_trace;
104 }
105
106 void postTrace_(
const char *func,
const char *file,
int line)
107 {
110 strcpy(aux, file);
111 strcat(aux, ":");
117 trace_list.push_back(trace_entry);
118 }
119 }
120
121 static const std::string quda_hash = QUDA_HASH;
// defined in lib/Makefile
123 static map tunecache;
124 static map::iterator it;
125 static size_t initial_cache_size = 0;
126
128 #define STR(x) STR_(x)
131 #undef STR
132 #undef STR_
133
135 static bool tuning = false;
136
138
139 static bool profile_count = true;
140
143
145
// Deserialize tunecache entries (tab-separated, one per line) from `in` into
// the global `tunecache` map, keyed by TuneKey.
// NOTE(review): this listing is extraction-garbled — the original declarations
// of `line`, `v`, `n`, `a` and `param` (orig. lines 151, 155-159) and the
// stream reads that populate them (orig. line 168) are missing, so the text
// below does not compile as shown. Annotations describe the visible logic only.
149 static void deserializeTuneCache(std::istream &in)
150 {
152 std::stringstream ls;
153
154 TuneKey key;
156
160
161 int check;
162
// Process the stream line by line until EOF/error.
163 while (in.good()) {
164 getline(in, line);
165 if (!line.length()) continue; // skip blank lines (e.g., at end of file)
166 ls.clear();
167 ls.str(line);
// Copy the parsed volume/name/aux tokens into TuneKey's fixed-size char
// buffers, erroring out on truncation or encoding failure.
169 check = snprintf(key.volume, key.volume_n, "%s", v.c_str());
170 if (check < 0 || check >= key.volume_n)
errorQuda(
"Error writing volume string (check = %d)", check);
171 check = snprintf(key.name, key.name_n, "%s", n.c_str());
172 if (check < 0 || check >= key.name_n)
errorQuda(
"Error writing name string (check=%d)", check);
173 check = snprintf(key.aux, key.aux_n, "%s", a.c_str());
174 if (check < 0 || check >= key.aux_n)
errorQuda(
"Error writing aux string (check=%d)", check);
177 ls.ignore(1); // throw away tab before comment
178 getline(ls,
param.comment);
// assume anything remaining on the line is a comment
179 param.comment +=
"\n";
// our convention is to include the newline, since ctime() likes to do this
// Insert (or overwrite) the cache entry for this key.
180 tunecache[key] =
param;
181 }
182 }
183
187 static void serializeTuneCache(std::ostream &out)
188 {
189 map::iterator entry;
190
191 for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
192 TuneKey key = entry->first;
193 TuneParam
param = entry->second;
194
195 out << std::setw(16) << key.volume << "\t" << key.name << "\t" << key.aux << "\t";
196 out <<
param.block.x <<
"\t" <<
param.block.y <<
"\t" <<
param.block.z <<
"\t";
197 out <<
param.grid.x <<
"\t" <<
param.grid.y <<
"\t" <<
param.grid.z <<
"\t";
198 out <<
param.shared_bytes <<
"\t" <<
param.aux.x <<
"\t" <<
param.aux.y <<
"\t" <<
param.aux.z <<
"\t"
199 <<
param.aux.w <<
"\t";
200 out <<
param.time <<
"\t" <<
param.comment;
// param.comment ends with a newline
201 }
202 }
203
206 {
207 return lhs.second.time * lhs.second.n_calls < rhs.second.time * rhs.second.n_calls;
208 }
209 };
210
214 static void serializeProfile(std::ostream &out, std::ostream &async_out)
215 {
216 map::iterator entry;
217 double total_time = 0.0;
218 double async_total_time = 0.0;
219
220 // first let's sort the entries in decreasing order of significance
221 typedef std::pair<TuneKey, TuneParam> profile_t;
222 typedef std::priority_queue<profile_t, std::deque<profile_t>, less_significant<profile_t>> queue_t;
223 queue_t q(tunecache.begin(), tunecache.end());
224
225 // now compute total time spent in kernels so we can give each kernel a significance
226 for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
227 TuneKey key = entry->first;
228 TuneParam
param = entry->second;
229
232 bool is_policy_kernel = strncmp(
tmp,
"policy_kernel", 13) == 0 ? true :
false;
233 bool is_policy = (strncmp(
tmp,
"policy", 6) == 0 && !is_policy_kernel) ?
true :
false;
234 if (
param.n_calls > 0 && !is_policy) total_time +=
param.n_calls *
param.time;
235 if (
param.n_calls > 0 && is_policy) async_total_time +=
param.n_calls *
param.time;
236 }
237
238 while (!q.empty()) {
239 TuneKey key = q.top().first;
240 TuneParam
param = q.top().second;
241
244 bool is_policy_kernel = strncmp(
tmp,
"policy_kernel", 13) == 0 ? true :
false;
245 bool is_policy = (strncmp(
tmp,
"policy", 6) == 0 && !is_policy_kernel) ?
true :
false;
246 bool is_nested_policy = (strncmp(
tmp,
"nested_policy", 6) == 0) ?
true :
false;
// nested policies not included
247
248 // synchronous profile
249 if (
param.n_calls > 0 && !is_policy && !is_nested_policy) {
251
252 out << std::setw(12) <<
param.n_calls *
param.time <<
"\t" << std::setw(12) << (time / total_time) * 100 <<
"\t";
253 out << std::setw(12) <<
param.n_calls <<
"\t" << std::setw(12) <<
param.time <<
"\t" << std::setw(16)
254 << key.volume << "\t";
255 out << key.name <<
"\t" << key.aux <<
"\t" <<
param.comment;
// param.comment ends with a newline
256 }
257
258 // async policy profile
259 if (
param.n_calls > 0 && is_policy) {
261
262 async_out << std::setw(12) <<
param.n_calls *
param.time <<
"\t" << std::setw(12)
263 << (time / async_total_time) * 100 << "\t";
264 async_out << std::setw(12) <<
param.n_calls <<
"\t" << std::setw(12) <<
param.time <<
"\t" << std::setw(16)
265 << key.volume << "\t";
266 async_out << key.name <<
"\t" << key.aux <<
"\t" <<
param.comment;
// param.comment ends with a newline
267 }
268
269 q.pop();
270 }
271
272 out << std::endl << "# Total time spent in kernels = " << total_time << " seconds" << std::endl;
273 async_out << std::endl
274 << "# Total time spent in asynchronous execution = " << async_total_time << " seconds" << std::endl;
275 }
276
// Write the accumulated kernel/event trace (`trace_list`) to `out` as
// tab-separated columns: time, device/pinned/mapped/host allocation figures,
// volume, name, aux.
// NOTE(review): the declaration of `tmp` (orig. lines 287-288, presumably a
// truncated copy of key.aux used for the policy-kernel prefix test) is missing
// from this listing — confirm against the original source.
280 static void serializeTrace(std::ostream &out)
281 {
282 for (auto it = trace_list.begin(); it != trace_list.end(); it++) {
283
284 TuneKey &key = it->key;
285
286 // special case kernel members of a policy
289 bool is_policy_kernel = strcmp(
tmp,
"policy_kernel") == 0 ? true :
false;
290
291 out << std::setw(12) << it->time << "\t";
292 out << std::setw(12) << it->device_bytes << "\t";
293 out << std::setw(12) << it->pinned_bytes << "\t";
294 out << std::setw(12) << it->mapped_bytes << "\t";
295 out << std::setw(12) << it->host_bytes << "\t";
296 out << std::setw(16) << key.volume << "\t";
// Shift policy-kernel names one column right so they nest visually under the
// policy that launched them.
297 if (is_policy_kernel) out << "\t";
298 out << key.name << "\t";
299 if (!is_policy_kernel) out << "\t";
300 out << key.aux << std::endl;
301 }
302 }
303
// Share rank 0's tunecache with all other ranks so every process uses the same
// tuned parameters (multi-GPU builds only; no-op otherwise).
// NOTE(review): this listing is extraction-garbled — the rank test and the
// broadcast calls (orig. lines 313, 317, 320-321, 324) are missing, which is
// why the braces below appear unbalanced.
307 static void broadcastTuneCache()
308 {
309 #ifdef MULTI_GPU
310 std::stringstream serialized;
311 size_t size;
312
314 serializeTuneCache(serialized);
315 size = serialized.str().length();
316 }
318
319 if (size > 0) {
322 } else {
// Receiving ranks copy the broadcast bytes into a null-terminated buffer and
// rebuild their local tunecache from it.
323 char *serstr = new char[size + 1];
// NOTE(review): '0円' below is mojibake for the null-character literal '\0' —
// restore '\0' when repairing this file.
325 serstr[size] = '0円'; // null-terminate
326 serialized.str(serstr);
327 deserializeTuneCache(serialized);
328 delete[] serstr;
329 }
330 }
331 #endif
332 }
333
334 /*
335 * Read tunecache from disk.
336 */
338 {
341 return;
342 }
343
344 char *path;
345 struct stat pstat;
347 std::ifstream cache_file;
348 std::stringstream ls;
349
350 path = getenv("QUDA_RESOURCE_PATH");
351
352 if (!path) {
353 warningQuda(
"Environment variable QUDA_RESOURCE_PATH is not set.");
354 warningQuda(
"Caching of tuned parameters will be disabled.");
355 return;
356 } else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
357 warningQuda(
"The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path);
358 warningQuda(
"Caching of tuned parameters will be disabled.");
359 return;
360 } else {
361 resource_path = path;
362 }
363
364 bool version_check = true;
365 char *override_version_env = getenv("QUDA_TUNE_VERSION_CHECK");
366 if (override_version_env && strcmp(override_version_env, "0") == 0) {
367 version_check = false;
368 warningQuda(
"Disabling QUDA tunecache version check");
369 }
370
371 #ifdef MULTI_GPU
373 #endif
374
375 cache_path = resource_path;
376 cache_path += "/tunecache.tsv";
377 cache_file.open(cache_path.c_str());
378
379 if (cache_file) {
380
381 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
382 getline(cache_file, line);
383 ls.str(line);
384 ls >> token;
385 if (token.compare(
"tunecache"))
errorQuda(
"Bad format in %s", cache_path.c_str());
386 ls >> token;
387 if (version_check && token.compare(quda_version))
388 errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
389 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
390 cache_path.c_str());
391 ls >> token;
392 #ifdef GITVERSION
393 if (version_check && token.compare(
gitversion))
394 errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
395 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
396 cache_path.c_str());
397 #else
398 if (version_check && token.compare(quda_version))
399 errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
400 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
401 cache_path.c_str());
402 #endif
403 ls >> token;
404 if (version_check && token.compare(quda_hash))
405 errorQuda(
"Cache file %s does not match current QUDA build. \nPlease delete this file or set the "
406 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
407 cache_path.c_str());
408
409 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
410 getline(cache_file, line); // eat the blank line
411
412 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
413 getline(cache_file, line); // eat the description line
414
415 deserializeTuneCache(cache_file);
416
417 cache_file.close();
418 initial_cache_size = tunecache.size();
419
421 printfQuda(
"Loaded %d sets of cached parameters from %s\n",
static_cast<int>(initial_cache_size),
422 cache_path.c_str());
423 }
424
425 } else {
426 warningQuda(
"Cache file not found. All kernels will be re-tuned (if tuning is enabled).");
427 }
428
429 #ifdef MULTI_GPU
430 }
431 #endif
432
433 broadcastTuneCache();
434 }
435
440 {
441 time_t now;
442 int lock_handle;
444 std::ofstream cache_file;
445
446 if (resource_path.empty()) return;
447
448 // FIXME: We should really check to see if any nodes have tuned a kernel that was not also tuned on node 0, since as things
449 // stand, the corresponding launch parameters would never get cached to disk in this situation. This will come up if we
450 // ever support different subvolumes per GPU (as might be convenient for lattice volumes that don't divide evenly).
451
452 #ifdef MULTI_GPU
454 #endif
455
456 if (tunecache.size() == initial_cache_size && !error) return;
457
458 // Acquire lock. Note that this is only robust if the filesystem supports flock() semantics, which is true for
459 // NFS on recent versions of linux but not Lustre by default (unless the filesystem was mounted with "-o flock").
460 lock_path = resource_path + "/tunecache.lock";
461 lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
462 if (lock_handle == -1) {
463 warningQuda(
"Unable to lock cache file. Tuned launch parameters will not be cached to disk. "
464 "If you are certain that no other instances of QUDA are accessing this filesystem, "
465 "please manually remove %s",
466 lock_path.c_str());
467 return;
468 }
469 char msg[] = "If no instances of applications using QUDA are running,\n"
470 "this lock file shouldn't be here and is safe to delete.";
471 int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
472 if (stat == -1)
warningQuda(
"Unable to write to lock file for some bizarre reason");
473
474 cache_path = resource_path + (error ? "/tunecache_error.tsv" : "/tunecache.tsv");
475 cache_file.open(cache_path.c_str());
476
478 printfQuda(
"Saving %d sets of cached parameters to %s\n",
static_cast<int>(tunecache.size()), cache_path.c_str());
479 }
480
481 time(&now);
482 cache_file << "tunecache\t" << quda_version;
483 #ifdef GITVERSION
485 #else
486 cache_file << "\t" << quda_version;
487 #endif
488 cache_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
489 cache_file << std::setw(16) << "volume"
490 << "\tname\taux\tblock.x\tblock.y\tblock.z\tgrid.x\tgrid.y\tgrid.z\tshared_bytes\taux.x\taux.y\taux."
491 "z\taux.w\ttime\tcomment"
492 << std::endl;
493 serializeTuneCache(cache_file);
494 cache_file.close();
495
496 // Release lock.
497 close(lock_handle);
498 remove(lock_path.c_str());
499
500 initial_cache_size = tunecache.size();
501
502 #ifdef MULTI_GPU
503 } else {
504 // give process 0 time to write out its tunecache if needed, but
505 // doesn't cause a hang if error is not triggered on process 0
506 if (error) sleep(10);
507 }
508 #endif
509 }
510
511 static bool policy_tuning = false;
513
515
516 static bool uber_tuning = false;
518
520
521 // flush profile, setting counts to zero
523 {
524 for (map::iterator entry = tunecache.begin(); entry != tunecache.end(); entry++) {
525 // set all n_calls = 0
528 }
529 }
530
531 // save profile
533 {
534 time_t now;
535 int lock_handle;
536 std::string lock_path, profile_path, async_profile_path, trace_path;
537 std::ofstream profile_file, async_profile_file, trace_file;
538
539 if (resource_path.empty()) return;
540
541 #ifdef MULTI_GPU
542 if (
comm_rank_global() == 0) {
// Make sure only one rank is writing to disk
543 #endif
544
545 // Acquire lock. Note that this is only robust if the filesystem supports flock() semantics, which is true for
546 // NFS on recent versions of linux but not Lustre by default (unless the filesystem was mounted with "-o flock").
547 lock_path = resource_path + "/profile.lock";
548 lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
549 if (lock_handle == -1) {
550 warningQuda(
"Unable to lock profile file. Profile will not be saved to disk. "
551 "If you are certain that no other instances of QUDA are accessing this filesystem, "
552 "please manually remove %s",
553 lock_path.c_str());
554 return;
555 }
556 char msg[] = "If no instances of applications using QUDA are running,\n"
557 "this lock file shouldn't be here and is safe to delete.";
558 int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
559 if (stat == -1)
warningQuda(
"Unable to write to lock file for some bizarre reason");
560
561 // profile counter for writing out unique profiles
562 static int count = 0;
563
564 char *profile_fname = getenv("QUDA_PROFILE_OUTPUT_BASE");
565
566 if (!profile_fname) {
568 "Environment variable QUDA_PROFILE_OUTPUT_BASE not set; writing to profile.tsv and profile_async.tsv");
569 profile_path = resource_path + "/profile_" + std::to_string(count) + ".tsv";
570 async_profile_path = resource_path + "/profile_async_" + std::to_string(count) + ".tsv";
571 if (
traceEnabled()) trace_path = resource_path +
"/trace_" + std::to_string(count) +
".tsv";
572 } else {
573 profile_path = resource_path + "/" + profile_fname + "_" + std::to_string(count) + ".tsv";
574 async_profile_path = resource_path + "/" + profile_fname + "_" + std::to_string(count) + "_async.tsv";
576 trace_path = resource_path + "/" + profile_fname + "_trace_" + std::to_string(count) + ".tsv";
577 }
578
579 count++;
580
581 profile_file.open(profile_path.c_str());
582 async_profile_file.open(async_profile_path.c_str());
583 if (
traceEnabled()) trace_file.open(trace_path.c_str());
584
586 // compute number of non-zero entries that will be output in the profile
587 int n_entry = 0;
588 int n_policy = 0;
589 for (map::iterator entry = tunecache.begin(); entry != tunecache.end(); entry++) {
590 // if a policy entry, then we can ignore
594 bool is_policy = strcmp(
tmp,
"policy") == 0 ? true :
false;
595 if (
param.n_calls > 0 && !is_policy) n_entry++;
596 if (
param.n_calls > 0 && is_policy) n_policy++;
597 }
598
599 printfQuda(
"Saving %d sets of cached parameters to %s\n", n_entry, profile_path.c_str());
600 printfQuda(
"Saving %d sets of cached profiles to %s\n", n_policy, async_profile_path.c_str());
602 printfQuda(
"Saving trace list with %lu entries to %s\n", trace_list.size(), trace_path.c_str());
603 }
604
605 time(&now);
606
607 std::string Label = label.empty() ?
"profile" : label;
608
609 profile_file << Label << "\t" << quda_version;
610 #ifdef GITVERSION
612 #else
613 profile_file << "\t" << quda_version;
614 #endif
615 profile_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
616 profile_file << std::setw(12) << "total time"
617 << "\t" << std::setw(12) << "percentage"
618 << "\t" << std::setw(12) << "calls"
619 << "\t" << std::setw(12) << "time / call"
620 << "\t" << std::setw(16) << "volume"
621 << "\tname\taux\tcomment" << std::endl;
622
623 async_profile_file << Label << "\t" << quda_version;
624 #ifdef GITVERSION
626 #else
627 async_profile_file << "\t" << quda_version;
628 #endif
629 async_profile_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
630 async_profile_file << std::setw(12) << "total time"
631 << "\t" << std::setw(12) << "percentage"
632 << "\t" << std::setw(12) << "calls"
633 << "\t" << std::setw(12) << "time / call"
634 << "\t" << std::setw(16) << "volume"
635 << "\tname\taux\tcomment" << std::endl;
636
637 serializeProfile(profile_file, async_profile_file);
638
639 profile_file.close();
640 async_profile_file.close();
641
643 trace_file << "trace"
644 << "\t" << quda_version;
645 #ifdef GITVERSION
647 #else
648 trace_file << "\t" << quda_version;
649 #endif
650 trace_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
651
652 trace_file << std::setw(12) << "time\t" << std::setw(12) << "device-mem\t" << std::setw(12) << "pinned-mem\t";
653 trace_file << std::setw(12) << "mapped-mem\t" << std::setw(12) << "host-mem\t";
654 trace_file << std::setw(16) << "volume"
655 << "\tname\taux" << std::endl;
656
657 serializeTrace(trace_file);
658
659 trace_file.close();
660 }
661
662 // Release lock.
663 close(lock_handle);
664 remove(lock_path.c_str());
665
666 #ifdef MULTI_GPU
667 }
668 #endif
669 }
670
671 static TimeProfile launchTimer("tuneLaunch");
672
678 {
679 #ifdef LAUNCH_TIMER
682 #endif
683
686 last_key = key;
687
688 #ifdef LAUNCH_TIMER
691 #endif
692
693 static const Tunable *active_tunable;
// for error checking
694 it = tunecache.find(key);
695
696 // first check if we have the tuned value and return if we have it
698
699 #ifdef LAUNCH_TIMER
702 #endif
703
705
709 }
710
711 #ifdef LAUNCH_TIMER
714 #endif
715
717
718 // we could be tuning outside of the current scope
719 if (!tuning && profile_count) param_tuned.
n_calls++;
720
721 #ifdef LAUNCH_TIMER
724 #endif
725
728 trace_list.push_back(trace_entry);
729 }
730
731 return param_tuned;
732 }
733
734 #ifdef LAUNCH_TIMER
737 #endif
738
740
748 }
749
750 return param_default;
751 } else if (!tuning) {
752
753 /* As long as global reductions are not disabled, only do the
754 tuning on node 0, else do the tuning on all nodes since we
755 can't guarantee that all nodes are partaking */
758 cudaError_t error = cudaSuccess;
760 float elapsed_time, best_time;
761 time_t now;
762
763 tuning = true;
764 active_tunable = &tunable;
765 best_time = FLT_MAX;
766
769
770 cudaEventCreate(&
start);
771 cudaEventCreate(&
end);
772
775 }
776
778 tune_timer.
Start(__func__, __FILE__, __LINE__);
779
781 while (tuning) {
782 cudaDeviceSynchronize();
783 cudaGetLastError(); // clear error counter
786 printfQuda(
"About to call tunable.apply block=(%d,%d,%d) grid=(%d,%d,%d) shared_bytes=%d aux=(%d,%d,%d)\n",
789 }
790 tunable.
apply(0);
// do initial call in case we need to jit compile for these parameters or if policy tuning
791
792 cudaEventRecord(
start, 0);
793 for (
int i = 0; i < tunable.
tuningIter(); i++) {
794 tunable.
apply(0);
// calls tuneLaunch() again, which simply returns the currently active param
795 }
796 cudaEventRecord(
end, 0);
797 cudaEventSynchronize(
end);
798 cudaEventElapsedTime(&elapsed_time,
start,
end);
799 cudaDeviceSynchronize();
800 error = cudaGetLastError();
801
802 { // check that error state is cleared
803 cudaDeviceSynchronize();
804 cudaError_t error = cudaGetLastError();
805 if (error != cudaSuccess)
errorQuda(
"Failed to clear error state %s\n", cudaGetErrorString(error));
806 }
807
809 if ((elapsed_time < best_time) && (error == cudaSuccess) && (tunable.
jitifyError() == CUDA_SUCCESS)) {
810 best_time = elapsed_time;
812 }
814 if (error == cudaSuccess && tunable.
jitifyError() == CUDA_SUCCESS) {
817 } else {
819 // if not jitify error must be regular error
821 } else {
822 // else is a jitify error
823 const char *str;
826 }
827 }
828 }
831 }
832
833 tune_timer.
Stop(__func__, __FILE__, __LINE__);
834
835 if (best_time == FLT_MAX) {
837 }
841 }
842 time(&now);
844 best_param.
comment +=
", tuning took " + std::to_string(tune_timer.
Last()) +
" seconds at ";
845 best_param.
comment += ctime(&now);
// includes a newline
846 best_param.
time = best_time;
847
848 cudaEventDestroy(
start);
849 cudaEventDestroy(
end);
850
852 tuning = true;
854 tuning = false;
856 tunecache[key] = best_param;
857 }
859
860 // check this process is getting the key that is expected
861 if (tunecache.find(key) == tunecache.end()) {
863 }
864 param = tunecache[key];
// read this now for all processes
865
868 trace_list.push_back(trace_entry);
869 }
870
871 } else if (&tunable != active_tunable) {
872 errorQuda(
"Unexpected call to tuneLaunch() in %s::apply()",
typeid(tunable).name());
873 }
874
875 param.n_calls = profile_count ? 1 : 0;
876
878 }
879
881 {
882 #ifdef LAUNCH_TIMER
884 #endif
885 }
886 } // namespace quda
virtual std::string perfString(float time) const
virtual std::string paramString(const TuneParam &param) const
CUresult jitifyError() const
virtual bool advanceTuneParam(TuneParam &param) const
virtual void initTuneParam(TuneParam &param) const
virtual TuneKey tuneKey() const =0
virtual void apply(const qudaStream_t &stream)=0
void checkLaunchParam(TuneParam &param)
virtual void defaultTuneParam(TuneParam &param) const
virtual int tuningIter() const
bool commGlobalReduction()
int comm_rank_global(void)
void comm_broadcast_global(void *data, size_t nbytes)
These routine broadcast the data according to the default communicator.
cudaColorSpinorField * tmp
enum QudaVerbosity_s QudaVerbosity
void init()
Create the BLAS context.
void start()
Start profiling.
void disableProfileCount()
Disable the profile kernel counting.
TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
const std::map< TuneKey, TuneParam > & getTuneCache()
Returns a reference to the tunecache map.
void saveTuneCache(bool error=false)
bool policyTuning()
Query whether we are currently tuning a policy.
void setUberTuning(bool)
Enable / disable whether we are tuning an uber kernel.
std::map< TuneKey, TuneParam > map
size_t host_allocated_peak()
void setPolicyTuning(bool)
Enable / disable whether are tuning a policy.
bool activeTuning()
query if tuning is in progress
void postTrace_(const char *func, const char *file, int line)
Post an event in the trace, recording where it was posted.
void flushProfile()
Flush profile contents, setting all counts to zero.
size_t mapped_allocated_peak()
void i32toa(char *buffer, int32_t value)
bool use_managed_memory()
size_t pinned_allocated_peak()
void enableProfileCount()
Enable the profile kernel counting.
size_t device_allocated_peak()
void saveProfile(const std::string label="")
Save profile to disk.
bool uberTuning()
Query whether we are tuning an uber kernel.
Main header file for the QUDA library.
#define QUDA_VERSION_SUBMINOR
#define QUDA_VERSION_MAJOR
#define QUDA_VERSION_MINOR
void Stop(const char *func, const char *file, int line)
void Start(const char *func, const char *file, int line)
TraceKey(const TraceKey &trace)
TraceKey & operator=(const TraceKey &trace)
TraceKey(const TuneKey &key, float time)
bool operator()(const T &lhs, const T &rhs)
quda::TuneKey getLastTuneKey()
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaVerbosity getVerbosity()