Main Page Namespace List Class Hierarchy Alphabetical List Compound List File List Namespace Members Compound Members File Members Related Pages
CUDAAccel.C

Go to the documentation of this file.
00001 /***************************************************************************
00002 *cr 
00003 *cr (C) Copyright 1995-2019 The Board of Trustees of the 
00004 *cr University of Illinois 
00005 *cr All Rights Reserved 
00006 *cr 
00007 ***************************************************************************/
00008 /***************************************************************************
00009 * RCS INFORMATION:
00010 *
00011 * $RCSfile: CUDAAccel.C,v $
00012 * $Author: johns $ $Locker: $ $State: Exp $
00013 * $Revision: 1.70 $ $Date: 2022年02月13日 05:34:21 $
00014 *
00015 ***************************************************************************/
00032 #include <stdio.h>
00033 #include <stdlib.h>
00034 #include "config.h" // rebuild on config changes
00035 #include "Inform.h"
00036 #include "ResizeArray.h"
00037 #include "CUDAAccel.h"
00038 #include "CUDAKernels.h"
00039 #include "WKFThreads.h"
00040 #include "ProfileHooks.h"
00041 
00042 CUDAAccel::CUDAAccel(void) {
00043 cudaavail = 0;
00044 numdevices = 0;
00045 numphysdevices = 0;
00046 
00047 nvmlh=NULL;
00048 cudapool=NULL;
00049 
00050 if (getenv("VMDNOCUDA") != NULL) {
00051 msgInfo << "VMDNOCUDA environment variable is set, CUDA support disabled."
00052 << sendmsg;
00053 return; 
00054 }
00055 
00056 #if defined(VMDCUDA)
00057 PROFILE_PUSH_RANGE("CUDAAccel::CUDAAccel()", 0);
00058 
00059 unsigned int gpumask = 0xffffffff;
00060 const char *gpumaskstr = getenv("VMDCUDADEVICEMASK");
00061 if (gpumaskstr != NULL) {
00062 unsigned int tmp;
00063 if (sscanf(gpumaskstr, "%x", &tmp) == 1) {
00064 gpumask = tmp;
00065 msgInfo << "Using GPU device mask '"
00066 << gpumaskstr << "'" << sendmsg;
00067 } else {
00068 msgInfo << "Failed to parse CUDA GPU device mask string '" 
00069 << gpumaskstr << "'" << sendmsg;
00070 }
00071 }
00072 
00073 // This is the very first CUDA API call during VMD startup.
00074 // There's a >= 2.0 second startup lag associated with it on the DGX-2, 
00075 // likely due to CUDA runtime library internal initialization overheads
00076 // across the 16 GPUs. The first internal call checks the CUDA runtime
00077 // and driver version compatibility.
00078 int usabledevices = 0;
00079 int rc = 0;
00080 if ((rc=vmd_cuda_num_devices(&numphysdevices)) != VMDCUDA_ERR_NONE) {
00081 numdevices = 0;
00082 numphysdevices = 0;
00083 
00084 // Only emit error messages when there are CUDA GPUs on the machine
00085 // but that they can't be used for some reason
00086 // XXX turning this off for the time being, as some people have 
00087 // NVIDIA drivers installed on machines with no NVIDIA GPU, as can
00088 // happen with some distros that package the drivers by default.
00089 switch (rc) {
00090 case VMDCUDA_ERR_NODEVICES:
00091 case VMDCUDA_ERR_SOMEDEVICES:
00092 // msgInfo << "No CUDA accelerator devices available." << sendmsg;
00093 break;
00094 
00095 #if 0
00096 case VMDCUDA_ERR_SOMEDEVICES:
00097 msgWarn << "One or more CUDA accelerators may exist but are not usable." << sendmsg; 
00098 msgWarn << "Check to make sure that GPU drivers are up to date." << sendmsg;
00099 break;
00100 #endif
00101 
00102 case VMDCUDA_ERR_DRVMISMATCH:
00103 msgWarn << "Detected a mismatch between CUDA runtime and GPU driver" << sendmsg; 
00104 msgWarn << "Check to make sure that GPU drivers are up to date." << sendmsg;
00105 // msgInfo << "No CUDA accelerator devices available." << sendmsg;
00106 break;
00107 }
00108 
00109 PROFILE_POP_RANGE();
00110 return;
00111 }
00112 
00113 // 
00114 // Runtime load of NVML shared library (packaged with CUDA driver) to
00115 // manually obtain function points for query of low-level host platform
00116 // and GPU hardware details such as the best CPU affinity mask associated 
00117 // with each GPU, taking into account the NUMA node, PCIe topology, and 
00118 // NVLink topology that exist on the system.
00119 nvmlh = wrap_nvml_create();
00120 
00121 
00122 // The following loop queries the individual GPU hardware and API 
00123 // compatibility properties and records their results for subsequent use.
00124 // This phase of startup costs about 0.05 seconds on a DGX-2 with 16 GPUs.
00125 if (numphysdevices > 0) {
00126 cudaavail = 1;
00127 
00128 int i;
00129 for (i=0; i<numphysdevices; i++) {
00130 cudadevprops dp;
00131 memset(&dp, 0, sizeof(dp));
00132 if (!vmd_cuda_device_props(i, dp.name, sizeof(dp.name),
00133 &dp.major, &dp.minor,
00134 &dp.membytes, &dp.clockratekhz, 
00135 &dp.smcount, &dp.integratedgpu,
00136 &dp.asyncenginecount, 
00137 &dp.kernelexectimeoutenabled,
00138 &dp.canmaphostmem, &dp.computemode,
00139 &dp.spdpfpperfratio, 
00140 &dp.pageablememaccess,
00141 &dp.pageablememaccessuseshostpagetables)) {
00142 dp.deviceid=i; // save the device index
00143 
00144 // Check that each GPU device has not been excluded by virtue of 
00145 // being used for display, by a GPU device mask, or by the CUDA
00146 // device mode being set to a "prohibited" status.
00147 if (!(dp.kernelexectimeoutenabled && getenv("VMDCUDANODISPLAYGPUS")) &&
00148 (gpumask & (1 << i)) && 
00149 (dp.computemode != computeModeProhibited)) {
00150 devprops.append(dp);
00151 usabledevices++;
00152 }
00153 } else {
00154 msgWarn << " Failed to retrieve properties for CUDA accelerator " << i << sendmsg; 
00155 }
00156 }
00157 }
00158 
00159 // assign the final usable device count as the number of available
00160 // CUDA devices (physical device count is maintained separately)
00161 numdevices=usabledevices;
00162 
00163 // This code creates a pool of CPU worker threads (one per GPU) that
00164 // are hereafter responsible for managing each device. To ensure that
00165 // the GPUs are all actually usable, each worker thread allocates a 
00166 // few bytes of memory and executes a trivial kernel on it.
00167 // On a DGX-2, this phase of startup costs about 7.63 seconds on 16 GPUs.
00168 devpool_init();
00169 
00170 PROFILE_POP_RANGE();
00171 #endif
00172 }
00173 
00174 
00175 // destructor
00176 CUDAAccel::~CUDAAccel(void) {
00177 devpool_fini(); 
00178 
00179 #if defined(VMDCUDA)
00180 // destroy the live connection to NVML library
00181 if (nvmlh != NULL) {
00182 wrap_nvml_destroy(nvmlh);
00183 }
00184 #endif
00185 }
00186 
00187 
00188 void CUDAAccel::devpool_init(void) {
00189 cudapool=NULL;
00190 
00191 #if defined(VMDCUDA)
00192 PROFILE_PUSH_RANGE("CUDAAccel::devpool_init()", 0);
00193 
00194 // don't proceed any further if there are no devices or CUDA usage
00195 // has been disabled by the user
00196 if (!cudaavail || numdevices == 0 || getenv("VMDNOCUDA") != NULL)
00197 return;
00198 
00199 // only use as many GPUs as CPU cores we're allowed to use
00200 int workercount=numdevices;
00201 if (workercount > wkf_thread_numprocessors())
00202 workercount=wkf_thread_numprocessors();
00203 
00204 int *devlist = new int[workercount];
00205 int i;
00206 for (i=0; i<workercount; i++) {
00207 devlist[i]=device_index(i);
00208 }
00209 
00210 msgInfo << "Creating CUDA device pool and initializing hardware..." << sendmsg;
00211 cudapool=wkf_threadpool_create(workercount, devlist);
00212 delete [] devlist;
00213 
00214 // associate each worker thread with a specific GPU
00215 if (getenv("VMDCUDAVERBOSE") != NULL)
00216 wkf_threadpool_launch(cudapool, vmd_cuda_devpool_setdevice, (void*)"VMD CUDA Dev Init", 1);
00217 else
00218 wkf_threadpool_launch(cudapool, vmd_cuda_devpool_setdevice, NULL, 1);
00219 
00220 // clear all available device memory on each of the GPUs
00221 wkf_threadpool_launch(cudapool, vmd_cuda_devpool_clear_device_mem, NULL, 1);
00222 
00223 // XXX enable fully-connected NVLink peer-to-peer GPU memory access
00224 // when requested (not fully generalized yet). This is done only
00225 // once per VMD process, per GPU, and never again.
00226 if (getenv("VMDCUDAP2PENABLE") != NULL) {
00227 msgInfo << "Enabling DGX-2 fully-connected NVLink GPU P2P..." << sendmsg;
00228 wkf_threadpool_launch(cudapool, vmd_cuda_devpool_enable_P2P, NULL, 1);
00229 }
00230 
00231 PROFILE_POP_RANGE();
00232 #endif
00233 }
00234 
00235 void CUDAAccel::devpool_fini(void) {
00236 if (!cudapool)
00237 return;
00238 
00239 #if defined(VMDCUDA)
00240 devpool_wait();
00241 wkf_threadpool_destroy(cudapool);
00242 #endif
00243 cudapool=NULL;
00244 }
00245 
00246 int CUDAAccel::devpool_launch(void *fctn(void *), void *parms, int blocking) {
00247 if (!cudapool)
00248 return -1;
00249 
00250 return wkf_threadpool_launch(cudapool, fctn, parms, blocking); 
00251 }
00252 
00253 int CUDAAccel::devpool_wait(void) {
00254 if (!cudapool)
00255 return -1;
00256 
00257 return wkf_threadpool_wait(cudapool);
00258 }
00259 
00260 void CUDAAccel::print_cuda_devices(void) {
00261 if (getenv("VMDCUDANODISPLAYGPUS")) {
00262 msgInfo << "Ignoring CUDA-capable GPUs used for display" << sendmsg;
00263 }
00264 
00265 if (!cudaavail || numdevices == 0) {
00266 msgInfo << "No CUDA accelerator devices available." << sendmsg;
00267 return;
00268 }
00269 
00270 if (nvmlh == NULL) {
00271 msgInfo << "Unable to load NVML library, GPU-CPU affinity unavailable." << sendmsg;
00272 }
00273 
00274 // XXX GPU P2P hardware features need to be abstracted by CUDAAccel in the
00275 // same way that usable CUDA devices are, so that VMDCUDADEVICEMASK 
00276 // affects the record keeping and reporting of P2P connectivity etc.
00277 // If the user selects a subset of GPUs, we should disinclude consideration
00278 // of the P2P topology that connects GPUs that were masked out. 
00279 // We should take into account the mask's impact on the number of P2P islands,
00280 // and not count or report links to GPUs that were masked out. 
00281 // Since the low-level peer matrix helper function doesn't know anything about
00282 // GPU device masks or other control environment variables, CUDAAccel 
00283 // should filter the output by copying only the P2P connectivity matrix elements 
00284 // that correspond to links between GPUs that are enabled. The final filtered
00285 // and abstracted P2P matrix can then be used by the rest of VMD with appropriate
00286 // accessor functions that take the potentially sparse GPU mapping into account.
00287 int p2plinkcount=0, p2pislands=0;
00288 #if defined(VMDCUDA)
00289 int numdev=0;
00290 int *p2pmat=NULL;
00291 int *p2psupp=NULL;
00292 int *p2patomics=NULL;
00293 int *p2parrays=NULL;
00294 int *perfmat=NULL;
00295 
00296 if (vmd_cuda_peer_matrix(&numdev, &p2pmat, &p2psupp, &p2patomics, &p2parrays,
00297 &perfmat, &p2plinkcount, &p2pislands) != VMDCUDA_ERR_NONE) {
00298 msgWarn << "Unable to ascertain GPU peer-to-peer connectivity" << sendmsg;
00299 }
00300 
00301 if (p2pmat)
00302 free(p2pmat);
00303 if (p2psupp)
00304 free(p2psupp);
00305 if (p2patomics)
00306 free(p2patomics);
00307 if (p2parrays)
00308 free(p2parrays);
00309 if (perfmat)
00310 free(perfmat);
00311 #endif
00312 
00313 // Report detected GPU hardware and PCIe/NVLink P2P topology
00314 msgInfo << "Detected " << numdevices << " available CUDA " 
00315 << ((numdevices > 1) ? "accelerators" : "accelerator:");
00316 
00317 // XXX update to account for device masks...
00318 if (p2plinkcount > 0) {
00319 msgInfo << ", " 
00320 << p2plinkcount << ((p2plinkcount > 1) ? " P2P links, " : " P2P link, ")
00321 << p2pislands << ((p2pislands > 1) ? " islands" : " island");
00322 }
00323 
00324 msgInfo << ":" << sendmsg;
00325 
00326 
00327 char oldstr[1024], outstr[1024], gpustr[1024], idxprefix[1024];
00328 int idxrangecount=0,firstidx=-1, lastidx=-1;
00329 const char *idxfmtstring10gpus = "[%d]";
00330 const char *idxfmtspaces10gpus = " ";
00331 const char *idxfmtstring100gpus = "[%2d]";
00332 const char *idxfmtspaces100gpus = " ";
00333 const char *gpuidxfmtstring, *gpuidxfmtspaces;
00334 
00335 #if 0
00336 int outputlineperdevice = 1;
00337 #else
00338 int outputlineperdevice = (getenv("VMDCUDAOUTPUTLINEPERDEVICE") != NULL);
00339 #endif
00340 
00341 // when enumerating large DGX-2 class hardware, we ensure columns line up
00342 // by choosing format strings to fit range of device IDs we got
00343 if (device_index(numdevices-1) > 10) {
00344 gpuidxfmtstring = idxfmtstring100gpus;
00345 gpuidxfmtspaces = idxfmtspaces100gpus;
00346 } else {
00347 gpuidxfmtstring = idxfmtstring10gpus;
00348 gpuidxfmtspaces = idxfmtspaces10gpus;
00349 }
00350 
00351 memset(oldstr, 0, sizeof(oldstr));
00352 memset(gpustr, 0, sizeof(gpustr));
00353 memset(idxprefix, 0, sizeof(idxprefix));
00354 
00355 int i;
00356 int shiftgpuidx=0;
00357 for (i=0; i<numdevices; i++) {
00358 memset(outstr, 0, sizeof(outstr));
00359 
00360 // list primary GPU device attributes
00361 const char *devname = device_name(i);
00362 sprintf(gpustr, " %-20s %2d SM_%d.%d %.1f GHz", 
00363 (devname) ? devname : "NULL Device Name!",
00364 (device_sm_count(i) > 0) ? device_sm_count(i) : 0,
00365 device_version_major(i), device_version_minor(i),
00366 device_clock_ghz(i));
00367 strcpy(outstr, gpustr);
00368 
00369 // list memory capacity 
00370 int gpumemmb = (device_membytes(i) / (1024 * 1024));
00371 if (gpumemmb < 1000) {
00372 sprintf(gpustr, ", %4dMB RAM", gpumemmb);
00373 } else if (gpumemmb < 10240) {
00374 sprintf(gpustr, ", %.1fGB RAM", gpumemmb / 1024.0);
00375 } else {
00376 // round up to nearest GB
00377 sprintf(gpustr, ", %dGB RAM", (gpumemmb + 512) / 1024);
00378 }
00379 strcat(outstr, gpustr);
00380 
00381 // list optional hardware features and configuration attributes here...
00382 if (device_computemode(i) == computeModeProhibited) {
00383 strcat(outstr, ", Compute Mode: Prohibited");
00384 } else {
00385 int sfpr = device_spdpfpperfratio(i);
00386 if (sfpr > 2) {
00387 sprintf(gpustr, " SP%d", sfpr);
00388 strcat(outstr, gpustr);
00389 }
00390 
00392 if (device_integratedgpu(i)) {
00393 strcat(outstr, " IGPU");
00394 }
00395 
00398 if (device_kerneltimeoutenabled(i)) {
00399 strcat(outstr, " KT");
00400 }
00401 
00403 if (device_asyncenginecount(i)) {
00404 sprintf(gpustr, " AE%d", device_asyncenginecount(i));
00405 strcat(outstr, gpustr);
00406 }
00407 
00409 if (device_canmaphostmem(i))
00410 strcat(outstr, " ZC");
00411 
00414 if (device_pageablememaccess(i)) {
00417 if (device_pageablememaccessuseshostpagetables(i))
00418 strcat(outstr, " PMT");
00419 else 
00420 strcat(outstr, " PM");
00421 }
00422 }
00423 
00424 if (outputlineperdevice) {
00425 // emit a status line per-device despite any redundancy
00426 sprintf(idxprefix, gpuidxfmtstring, device_index(i));
00427 msgInfo << idxprefix << outstr << sendmsg; 
00428 } else {
00429 // if the current GPU description is the same as the last one,
00430 // we don't bother duplicating its listing, and instead we 
00431 // list the GPU index range(s) with matching descriptive strings
00432 int newidx = device_index(i);
00433 if (!strcmp(oldstr, outstr)) {
00434 // if we have a gap in GPU IDs, we emit the partial index range
00435 // and continue from the first index after the gap
00436 if ((newidx - lastidx) > 1) { 
00437 if (lastidx > firstidx) {
00438 sprintf(idxprefix, "%d-%d", firstidx, lastidx);
00439 shiftgpuidx=1;
00440 } else {
00441 sprintf(idxprefix, "%s%d", (shiftgpuidx) ? " " : "", firstidx); 
00442 }
00443 
00444 msgInfo << ((idxrangecount == 0) ? "[" : ",") << idxprefix;
00445 idxrangecount++;
00446 firstidx = newidx;
00447 lastidx = newidx;
00448 }
00449 lastidx=newidx;
00450 } else {
00451 if (firstidx < 0) {
00452 firstidx = newidx;
00453 lastidx = newidx;
00454 strcpy(oldstr, outstr);
00455 continue; 
00456 }
00457 
00458 if (lastidx > firstidx) {
00459 sprintf(idxprefix, "%d-%d", firstidx, lastidx); 
00460 shiftgpuidx=1;
00461 } else {
00462 sprintf(idxprefix, "%s%d", (shiftgpuidx) ? " " : "", firstidx); 
00463 }
00464 msgInfo << ((idxrangecount == 0) ? "[" : ",") << idxprefix;
00465 msgInfo << "]" << oldstr << sendmsg; 
00466 
00467 idxrangecount = 0;
00468 firstidx = newidx;
00469 lastidx = newidx;
00470 strcpy(oldstr, outstr);
00471 memset(outstr, 0, sizeof(outstr));
00472 }
00473 } 
00474 } // end of loop over devices 
00475 
00476 if (!outputlineperdevice) {
00477 if (lastidx > firstidx) {
00478 sprintf(idxprefix, "%d-%d", firstidx, lastidx); 
00479 } else {
00480 sprintf(idxprefix, "%s%d", (shiftgpuidx) ? " " : "", firstidx); 
00481 }
00482 msgInfo << ((idxrangecount == 0) ? "[" : ",") << idxprefix;
00483 msgInfo << "]";
00484 if (idxrangecount > 2) {
00485 msgInfo << ":" << sendmsg;
00486 msgInfo << gpuidxfmtspaces; // shift to right to line up with column
00487 }
00488 msgInfo << oldstr << sendmsg; 
00489 }
00490 }
00491 
00492 int CUDAAccel::num_devices(void) {
00493 return numdevices;
00494 }
00495 
00496 int CUDAAccel::device_index(int dev) {
00497 return devprops[dev].deviceid;
00498 }
00499 
00500 const char *CUDAAccel::device_name(int dev) {
00501 if (!cudaavail || dev < 0 || dev >= numdevices)
00502 return NULL;
00503 return devprops[dev].name; 
00504 }
00505 
00506 int CUDAAccel::device_version_major(int dev) {
00507 if (!cudaavail || dev < 0 || dev >= numdevices)
00508 return 0; 
00509 return devprops[dev].major;
00510 }
00511 
00512 int CUDAAccel::device_version_minor(int dev) {
00513 if (!cudaavail || dev < 0 || dev >= numdevices)
00514 return 0; 
00515 return devprops[dev].minor;
00516 }
00517 
00518 unsigned long CUDAAccel::device_membytes(int dev) {
00519 if (!cudaavail || dev < 0 || dev >= numdevices)
00520 return 0; 
00521 return devprops[dev].membytes;
00522 }
00523 
00524 float CUDAAccel::device_clock_ghz(int dev) {
00525 if (!cudaavail || dev < 0 || dev >= numdevices)
00526 return 0; 
00527 return (float) (devprops[dev].clockratekhz / 1000000.0);
00528 }
00529 
00530 int CUDAAccel::device_sm_count(int dev) {
00531 if (!cudaavail || dev < 0 || dev >= numdevices)
00532 return -1; 
00533 return devprops[dev].smcount;
00534 }
00535 
00536 int CUDAAccel::device_integratedgpu(int dev) {
00537 if (!cudaavail || dev < 0 || dev >= numdevices)
00538 return -1; 
00539 return devprops[dev].integratedgpu;
00540 }
00541 
00542 int CUDAAccel::device_asyncenginecount(int dev) {
00543 if (!cudaavail || dev < 0 || dev >= numdevices)
00544 return -1; 
00545 return devprops[dev].asyncenginecount;
00546 }
00547 
00548 int CUDAAccel::device_kerneltimeoutenabled(int dev) {
00549 if (!cudaavail || dev < 0 || dev >= numdevices)
00550 return -1; 
00551 return devprops[dev].kernelexectimeoutenabled;
00552 }
00553 
00554 int CUDAAccel::device_canmaphostmem(int dev) {
00555 if (!cudaavail || dev < 0 || dev >= numdevices)
00556 return -1; 
00557 return devprops[dev].canmaphostmem;
00558 }
00559 
00560 int CUDAAccel::device_computemode(int dev) {
00561 if (!cudaavail || dev < 0 || dev >= numdevices)
00562 return -1; 
00563 return devprops[dev].computemode;
00564 }
00565 
00566 int CUDAAccel::device_spdpfpperfratio(int dev) {
00567 if (!cudaavail || dev < 0 || dev >= numdevices)
00568 return -1; 
00569 return devprops[dev].spdpfpperfratio;
00570 }
00571 
00572 int CUDAAccel::device_pageablememaccess(int dev) {
00573 if (!cudaavail || dev < 0 || dev >= numdevices)
00574 return -1; 
00575 return devprops[dev].pageablememaccess;
00576 }
00577 
00578 int CUDAAccel::device_pageablememaccessuseshostpagetables(int dev) {
00579 if (!cudaavail || dev < 0 || dev >= numdevices)
00580 return -1; 
00581 return devprops[dev].pageablememaccessuseshostpagetables;
00582 }
00583