00001 /*************************************************************************** 00002 *cr 00003 *cr (C) Copyright 1995-2019 The Board of Trustees of the 00004 *cr University of Illinois 00005 *cr All Rights Reserved 00006 *cr 00007 ***************************************************************************/ 00008 /*************************************************************************** 00009 * RCS INFORMATION: 00010 * 00011 * $RCSfile: CUDAAccel.C,v $ 00012 * $Author: johns $ $Locker: $ $State: Exp $ 00013 * $Revision: 1.70 $ $Date: 2022年02月13日 05:34:21 $ 00014 * 00015 ***************************************************************************/ 00032 #include <stdio.h> 00033 #include <stdlib.h> 00034 #include "config.h" // rebuild on config changes 00035 #include "Inform.h" 00036 #include "ResizeArray.h" 00037 #include "CUDAAccel.h" 00038 #include "CUDAKernels.h" 00039 #include "WKFThreads.h" 00040 #include "ProfileHooks.h" 00041 00042 CUDAAccel::CUDAAccel(void) { 00043 cudaavail = 0; 00044 numdevices = 0; 00045 numphysdevices = 0; 00046 00047 nvmlh=NULL; 00048 cudapool=NULL; 00049 00050 if (getenv("VMDNOCUDA") != NULL) { 00051 msgInfo << "VMDNOCUDA environment variable is set, CUDA support disabled." 00052 << sendmsg; 00053 return; 00054 } 00055 00056 #if defined(VMDCUDA) 00057 PROFILE_PUSH_RANGE("CUDAAccel::CUDAAccel()", 0); 00058 00059 unsigned int gpumask = 0xffffffff; 00060 const char *gpumaskstr = getenv("VMDCUDADEVICEMASK"); 00061 if (gpumaskstr != NULL) { 00062 unsigned int tmp; 00063 if (sscanf(gpumaskstr, "%x", &tmp) == 1) { 00064 gpumask = tmp; 00065 msgInfo << "Using GPU device mask '" 00066 << gpumaskstr << "'" << sendmsg; 00067 } else { 00068 msgInfo << "Failed to parse CUDA GPU device mask string '" 00069 << gpumaskstr << "'" << sendmsg; 00070 } 00071 } 00072 00073 // This is the very first CUDA API call during VMD startup. 00074 // There's a >= 2.0 second startup lag associated with it on the DGX-2, 00075 // likely due to CUDA runtime library internal initialization overheads 00076 // across the 16 GPUs. The first internal call checks the CUDA runtime 00077 // and driver version compatibility. 00078 int usabledevices = 0; 00079 int rc = 0; 00080 if ((rc=vmd_cuda_num_devices(&numphysdevices)) != VMDCUDA_ERR_NONE) { 00081 numdevices = 0; 00082 numphysdevices = 0; 00083 00084 // Only emit error messages when there are CUDA GPUs on the machine 00085 // but that they can't be used for some reason 00086 // XXX turning this off for the time being, as some people have 00087 // NVIDIA drivers installed on machines with no NVIDIA GPU, as can 00088 // happen with some distros that package the drivers by default. 00089 switch (rc) { 00090 case VMDCUDA_ERR_NODEVICES: 00091 case VMDCUDA_ERR_SOMEDEVICES: 00092 // msgInfo << "No CUDA accelerator devices available." << sendmsg; 00093 break; 00094 00095 #if 0 00096 case VMDCUDA_ERR_SOMEDEVICES: 00097 msgWarn << "One or more CUDA accelerators may exist but are not usable." << sendmsg; 00098 msgWarn << "Check to make sure that GPU drivers are up to date." << sendmsg; 00099 break; 00100 #endif 00101 00102 case VMDCUDA_ERR_DRVMISMATCH: 00103 msgWarn << "Detected a mismatch between CUDA runtime and GPU driver" << sendmsg; 00104 msgWarn << "Check to make sure that GPU drivers are up to date." << sendmsg; 00105 // msgInfo << "No CUDA accelerator devices available." << sendmsg; 00106 break; 00107 } 00108 00109 PROFILE_POP_RANGE(); 00110 return; 00111 } 00112 00113 // 00114 // Runtime load of NVML shared library (packaged with CUDA driver) to 00115 // manually obtain function points for query of low-level host platform 00116 // and GPU hardware details such as the best CPU affinity mask associated 00117 // with each GPU, taking into account the NUMA node, PCIe topology, and 00118 // NVLink topology that exist on the system. 00119 nvmlh = wrap_nvml_create(); 00120 00121 00122 // The following loop queries the individual GPU hardware and API 00123 // compatibility properties and records their results for subsequent use. 00124 // This phase of startup costs about 0.05 seconds on a DGX-2 with 16 GPUs. 00125 if (numphysdevices > 0) { 00126 cudaavail = 1; 00127 00128 int i; 00129 for (i=0; i<numphysdevices; i++) { 00130 cudadevprops dp; 00131 memset(&dp, 0, sizeof(dp)); 00132 if (!vmd_cuda_device_props(i, dp.name, sizeof(dp.name), 00133 &dp.major, &dp.minor, 00134 &dp.membytes, &dp.clockratekhz, 00135 &dp.smcount, &dp.integratedgpu, 00136 &dp.asyncenginecount, 00137 &dp.kernelexectimeoutenabled, 00138 &dp.canmaphostmem, &dp.computemode, 00139 &dp.spdpfpperfratio, 00140 &dp.pageablememaccess, 00141 &dp.pageablememaccessuseshostpagetables)) { 00142 dp.deviceid=i; // save the device index 00143 00144 // Check that each GPU device has not been excluded by virtue of 00145 // being used for display, by a GPU device mask, or by the CUDA 00146 // device mode being set to a "prohibited" status. 00147 if (!(dp.kernelexectimeoutenabled && getenv("VMDCUDANODISPLAYGPUS")) && 00148 (gpumask & (1 << i)) && 00149 (dp.computemode != computeModeProhibited)) { 00150 devprops.append(dp); 00151 usabledevices++; 00152 } 00153 } else { 00154 msgWarn << " Failed to retrieve properties for CUDA accelerator " << i << sendmsg; 00155 } 00156 } 00157 } 00158 00159 // assign the final usable device count as the number of available 00160 // CUDA devices (physical device count is maintained separately) 00161 numdevices=usabledevices; 00162 00163 // This code creates a pool of CPU worker threads (one per GPU) that 00164 // are hereafter responsible for managing each device. To ensure that 00165 // the GPUs are all actually usable, each worker thread allocates a 00166 // few bytes of memory and executes a trivial kernel on it. 00167 // On a DGX-2, this phase of startup costs about 7.63 seconds on 16 GPUs. 00168 devpool_init(); 00169 00170 PROFILE_POP_RANGE(); 00171 #endif 00172 } 00173 00174 00175 // destructor 00176 CUDAAccel::~CUDAAccel(void) { 00177 devpool_fini(); 00178 00179 #if defined(VMDCUDA) 00180 // destroy the live connection to NVML library 00181 if (nvmlh != NULL) { 00182 wrap_nvml_destroy(nvmlh); 00183 } 00184 #endif 00185 } 00186 00187 00188 void CUDAAccel::devpool_init(void) { 00189 cudapool=NULL; 00190 00191 #if defined(VMDCUDA) 00192 PROFILE_PUSH_RANGE("CUDAAccel::devpool_init()", 0); 00193 00194 // don't proceed any further if there are no devices or CUDA usage 00195 // has been disabled by the user 00196 if (!cudaavail || numdevices == 0 || getenv("VMDNOCUDA") != NULL) 00197 return; 00198 00199 // only use as many GPUs as CPU cores we're allowed to use 00200 int workercount=numdevices; 00201 if (workercount > wkf_thread_numprocessors()) 00202 workercount=wkf_thread_numprocessors(); 00203 00204 int *devlist = new int[workercount]; 00205 int i; 00206 for (i=0; i<workercount; i++) { 00207 devlist[i]=device_index(i); 00208 } 00209 00210 msgInfo << "Creating CUDA device pool and initializing hardware..." << sendmsg; 00211 cudapool=wkf_threadpool_create(workercount, devlist); 00212 delete [] devlist; 00213 00214 // associate each worker thread with a specific GPU 00215 if (getenv("VMDCUDAVERBOSE") != NULL) 00216 wkf_threadpool_launch(cudapool, vmd_cuda_devpool_setdevice, (void*)"VMD CUDA Dev Init", 1); 00217 else 00218 wkf_threadpool_launch(cudapool, vmd_cuda_devpool_setdevice, NULL, 1); 00219 00220 // clear all available device memory on each of the GPUs 00221 wkf_threadpool_launch(cudapool, vmd_cuda_devpool_clear_device_mem, NULL, 1); 00222 00223 // XXX enable fully-connected NVLink peer-to-peer GPU memory access 00224 // when requested (not fully generalized yet). This is done only 00225 // once per VMD process, per GPU, and never again. 00226 if (getenv("VMDCUDAP2PENABLE") != NULL) { 00227 msgInfo << "Enabling DGX-2 fully-connected NVLink GPU P2P..." << sendmsg; 00228 wkf_threadpool_launch(cudapool, vmd_cuda_devpool_enable_P2P, NULL, 1); 00229 } 00230 00231 PROFILE_POP_RANGE(); 00232 #endif 00233 } 00234 00235 void CUDAAccel::devpool_fini(void) { 00236 if (!cudapool) 00237 return; 00238 00239 #if defined(VMDCUDA) 00240 devpool_wait(); 00241 wkf_threadpool_destroy(cudapool); 00242 #endif 00243 cudapool=NULL; 00244 } 00245 00246 int CUDAAccel::devpool_launch(void *fctn(void *), void *parms, int blocking) { 00247 if (!cudapool) 00248 return -1; 00249 00250 return wkf_threadpool_launch(cudapool, fctn, parms, blocking); 00251 } 00252 00253 int CUDAAccel::devpool_wait(void) { 00254 if (!cudapool) 00255 return -1; 00256 00257 return wkf_threadpool_wait(cudapool); 00258 } 00259 00260 void CUDAAccel::print_cuda_devices(void) { 00261 if (getenv("VMDCUDANODISPLAYGPUS")) { 00262 msgInfo << "Ignoring CUDA-capable GPUs used for display" << sendmsg; 00263 } 00264 00265 if (!cudaavail || numdevices == 0) { 00266 msgInfo << "No CUDA accelerator devices available." << sendmsg; 00267 return; 00268 } 00269 00270 if (nvmlh == NULL) { 00271 msgInfo << "Unable to load NVML library, GPU-CPU affinity unavailable." << sendmsg; 00272 } 00273 00274 // XXX GPU P2P hardware features need to be abstracted by CUDAAccel in the 00275 // same way that usable CUDA devices are, so that VMDCUDADEVICEMASK 00276 // affects the record keeping and reporting of P2P connectivity etc. 00277 // If the user selects a subset of GPUs, we should disinclude consideration 00278 // of the P2P topology that connects GPUs that were masked out. 00279 // We should take into account the mask's impact on the number of P2P islands, 00280 // and not count or report links to GPUs that were masked out. 00281 // Since the low-level peer matrix helper function doesn't know anything about 00282 // GPU device masks or other control environment variables, CUDAAccel 00283 // should filter the output by copying only the P2P connectivity matrix elements 00284 // that correspond to links between GPUs that are enabled. The final filtered 00285 // and abstracted P2P matrix can then be used by the rest of VMD with appropriate 00286 // accessor functions that take the potentially sparse GPU mapping into account. 00287 int p2plinkcount=0, p2pislands=0; 00288 #if defined(VMDCUDA) 00289 int numdev=0; 00290 int *p2pmat=NULL; 00291 int *p2psupp=NULL; 00292 int *p2patomics=NULL; 00293 int *p2parrays=NULL; 00294 int *perfmat=NULL; 00295 00296 if (vmd_cuda_peer_matrix(&numdev, &p2pmat, &p2psupp, &p2patomics, &p2parrays, 00297 &perfmat, &p2plinkcount, &p2pislands) != VMDCUDA_ERR_NONE) { 00298 msgWarn << "Unable to ascertain GPU peer-to-peer connectivity" << sendmsg; 00299 } 00300 00301 if (p2pmat) 00302 free(p2pmat); 00303 if (p2psupp) 00304 free(p2psupp); 00305 if (p2patomics) 00306 free(p2patomics); 00307 if (p2parrays) 00308 free(p2parrays); 00309 if (perfmat) 00310 free(perfmat); 00311 #endif 00312 00313 // Report detected GPU hardware and PCIe/NVLink P2P topology 00314 msgInfo << "Detected " << numdevices << " available CUDA " 00315 << ((numdevices > 1) ? "accelerators" : "accelerator:"); 00316 00317 // XXX update to account for device masks... 00318 if (p2plinkcount > 0) { 00319 msgInfo << ", " 00320 << p2plinkcount << ((p2plinkcount > 1) ? " P2P links, " : " P2P link, ") 00321 << p2pislands << ((p2pislands > 1) ? " islands" : " island"); 00322 } 00323 00324 msgInfo << ":" << sendmsg; 00325 00326 00327 char oldstr[1024], outstr[1024], gpustr[1024], idxprefix[1024]; 00328 int idxrangecount=0,firstidx=-1, lastidx=-1; 00329 const char *idxfmtstring10gpus = "[%d]"; 00330 const char *idxfmtspaces10gpus = " "; 00331 const char *idxfmtstring100gpus = "[%2d]"; 00332 const char *idxfmtspaces100gpus = " "; 00333 const char *gpuidxfmtstring, *gpuidxfmtspaces; 00334 00335 #if 0 00336 int outputlineperdevice = 1; 00337 #else 00338 int outputlineperdevice = (getenv("VMDCUDAOUTPUTLINEPERDEVICE") != NULL); 00339 #endif 00340 00341 // when enumerating large DGX-2 class hardware, we ensure columns line up 00342 // by choosing format strings to fit range of device IDs we got 00343 if (device_index(numdevices-1) > 10) { 00344 gpuidxfmtstring = idxfmtstring100gpus; 00345 gpuidxfmtspaces = idxfmtspaces100gpus; 00346 } else { 00347 gpuidxfmtstring = idxfmtstring10gpus; 00348 gpuidxfmtspaces = idxfmtspaces10gpus; 00349 } 00350 00351 memset(oldstr, 0, sizeof(oldstr)); 00352 memset(gpustr, 0, sizeof(gpustr)); 00353 memset(idxprefix, 0, sizeof(idxprefix)); 00354 00355 int i; 00356 int shiftgpuidx=0; 00357 for (i=0; i<numdevices; i++) { 00358 memset(outstr, 0, sizeof(outstr)); 00359 00360 // list primary GPU device attributes 00361 const char *devname = device_name(i); 00362 sprintf(gpustr, " %-20s %2d SM_%d.%d %.1f GHz", 00363 (devname) ? devname : "NULL Device Name!", 00364 (device_sm_count(i) > 0) ? device_sm_count(i) : 0, 00365 device_version_major(i), device_version_minor(i), 00366 device_clock_ghz(i)); 00367 strcpy(outstr, gpustr); 00368 00369 // list memory capacity 00370 int gpumemmb = (device_membytes(i) / (1024 * 1024)); 00371 if (gpumemmb < 1000) { 00372 sprintf(gpustr, ", %4dMB RAM", gpumemmb); 00373 } else if (gpumemmb < 10240) { 00374 sprintf(gpustr, ", %.1fGB RAM", gpumemmb / 1024.0); 00375 } else { 00376 // round up to nearest GB 00377 sprintf(gpustr, ", %dGB RAM", (gpumemmb + 512) / 1024); 00378 } 00379 strcat(outstr, gpustr); 00380 00381 // list optional hardware features and configuration attributes here... 00382 if (device_computemode(i) == computeModeProhibited) { 00383 strcat(outstr, ", Compute Mode: Prohibited"); 00384 } else { 00385 int sfpr = device_spdpfpperfratio(i); 00386 if (sfpr > 2) { 00387 sprintf(gpustr, " SP%d", sfpr); 00388 strcat(outstr, gpustr); 00389 } 00390 00392 if (device_integratedgpu(i)) { 00393 strcat(outstr, " IGPU"); 00394 } 00395 00398 if (device_kerneltimeoutenabled(i)) { 00399 strcat(outstr, " KT"); 00400 } 00401 00403 if (device_asyncenginecount(i)) { 00404 sprintf(gpustr, " AE%d", device_asyncenginecount(i)); 00405 strcat(outstr, gpustr); 00406 } 00407 00409 if (device_canmaphostmem(i)) 00410 strcat(outstr, " ZC"); 00411 00414 if (device_pageablememaccess(i)) { 00417 if (device_pageablememaccessuseshostpagetables(i)) 00418 strcat(outstr, " PMT"); 00419 else 00420 strcat(outstr, " PM"); 00421 } 00422 } 00423 00424 if (outputlineperdevice) { 00425 // emit a status line per-device despite any redundancy 00426 sprintf(idxprefix, gpuidxfmtstring, device_index(i)); 00427 msgInfo << idxprefix << outstr << sendmsg; 00428 } else { 00429 // if the current GPU description is the same as the last one, 00430 // we don't bother duplicating its listing, and instead we 00431 // list the GPU index range(s) with matching descriptive strings 00432 int newidx = device_index(i); 00433 if (!strcmp(oldstr, outstr)) { 00434 // if we have a gap in GPU IDs, we emit the partial index range 00435 // and continue from the first index after the gap 00436 if ((newidx - lastidx) > 1) { 00437 if (lastidx > firstidx) { 00438 sprintf(idxprefix, "%d-%d", firstidx, lastidx); 00439 shiftgpuidx=1; 00440 } else { 00441 sprintf(idxprefix, "%s%d", (shiftgpuidx) ? " " : "", firstidx); 00442 } 00443 00444 msgInfo << ((idxrangecount == 0) ? "[" : ",") << idxprefix; 00445 idxrangecount++; 00446 firstidx = newidx; 00447 lastidx = newidx; 00448 } 00449 lastidx=newidx; 00450 } else { 00451 if (firstidx < 0) { 00452 firstidx = newidx; 00453 lastidx = newidx; 00454 strcpy(oldstr, outstr); 00455 continue; 00456 } 00457 00458 if (lastidx > firstidx) { 00459 sprintf(idxprefix, "%d-%d", firstidx, lastidx); 00460 shiftgpuidx=1; 00461 } else { 00462 sprintf(idxprefix, "%s%d", (shiftgpuidx) ? " " : "", firstidx); 00463 } 00464 msgInfo << ((idxrangecount == 0) ? "[" : ",") << idxprefix; 00465 msgInfo << "]" << oldstr << sendmsg; 00466 00467 idxrangecount = 0; 00468 firstidx = newidx; 00469 lastidx = newidx; 00470 strcpy(oldstr, outstr); 00471 memset(outstr, 0, sizeof(outstr)); 00472 } 00473 } 00474 } // end of loop over devices 00475 00476 if (!outputlineperdevice) { 00477 if (lastidx > firstidx) { 00478 sprintf(idxprefix, "%d-%d", firstidx, lastidx); 00479 } else { 00480 sprintf(idxprefix, "%s%d", (shiftgpuidx) ? " " : "", firstidx); 00481 } 00482 msgInfo << ((idxrangecount == 0) ? "[" : ",") << idxprefix; 00483 msgInfo << "]"; 00484 if (idxrangecount > 2) { 00485 msgInfo << ":" << sendmsg; 00486 msgInfo << gpuidxfmtspaces; // shift to right to line up with column 00487 } 00488 msgInfo << oldstr << sendmsg; 00489 } 00490 } 00491 00492 int CUDAAccel::num_devices(void) { 00493 return numdevices; 00494 } 00495 00496 int CUDAAccel::device_index(int dev) { 00497 return devprops[dev].deviceid; 00498 } 00499 00500 const char *CUDAAccel::device_name(int dev) { 00501 if (!cudaavail || dev < 0 || dev >= numdevices) 00502 return NULL; 00503 return devprops[dev].name; 00504 } 00505 00506 int CUDAAccel::device_version_major(int dev) { 00507 if (!cudaavail || dev < 0 || dev >= numdevices) 00508 return 0; 00509 return devprops[dev].major; 00510 } 00511 00512 int CUDAAccel::device_version_minor(int dev) { 00513 if (!cudaavail || dev < 0 || dev >= numdevices) 00514 return 0; 00515 return devprops[dev].minor; 00516 } 00517 00518 unsigned long CUDAAccel::device_membytes(int dev) { 00519 if (!cudaavail || dev < 0 || dev >= numdevices) 00520 return 0; 00521 return devprops[dev].membytes; 00522 } 00523 00524 float CUDAAccel::device_clock_ghz(int dev) { 00525 if (!cudaavail || dev < 0 || dev >= numdevices) 00526 return 0; 00527 return (float) (devprops[dev].clockratekhz / 1000000.0); 00528 } 00529 00530 int CUDAAccel::device_sm_count(int dev) { 00531 if (!cudaavail || dev < 0 || dev >= numdevices) 00532 return -1; 00533 return devprops[dev].smcount; 00534 } 00535 00536 int CUDAAccel::device_integratedgpu(int dev) { 00537 if (!cudaavail || dev < 0 || dev >= numdevices) 00538 return -1; 00539 return devprops[dev].integratedgpu; 00540 } 00541 00542 int CUDAAccel::device_asyncenginecount(int dev) { 00543 if (!cudaavail || dev < 0 || dev >= numdevices) 00544 return -1; 00545 return devprops[dev].asyncenginecount; 00546 } 00547 00548 int CUDAAccel::device_kerneltimeoutenabled(int dev) { 00549 if (!cudaavail || dev < 0 || dev >= numdevices) 00550 return -1; 00551 return devprops[dev].kernelexectimeoutenabled; 00552 } 00553 00554 int CUDAAccel::device_canmaphostmem(int dev) { 00555 if (!cudaavail || dev < 0 || dev >= numdevices) 00556 return -1; 00557 return devprops[dev].canmaphostmem; 00558 } 00559 00560 int CUDAAccel::device_computemode(int dev) { 00561 if (!cudaavail || dev < 0 || dev >= numdevices) 00562 return -1; 00563 return devprops[dev].computemode; 00564 } 00565 00566 int CUDAAccel::device_spdpfpperfratio(int dev) { 00567 if (!cudaavail || dev < 0 || dev >= numdevices) 00568 return -1; 00569 return devprops[dev].spdpfpperfratio; 00570 } 00571 00572 int CUDAAccel::device_pageablememaccess(int dev) { 00573 if (!cudaavail || dev < 0 || dev >= numdevices) 00574 return -1; 00575 return devprops[dev].pageablememaccess; 00576 } 00577 00578 int CUDAAccel::device_pageablememaccessuseshostpagetables(int dev) { 00579 if (!cudaavail || dev < 0 || dev >= numdevices) 00580 return -1; 00581 return devprops[dev].pageablememaccessuseshostpagetables; 00582 } 00583