/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: CUDABench.cu,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.42 $       $Date: 2022/02/09 04:03:19 $
 *
 ***************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

#include "Inform.h"
#include "WKFThreads.h"
#include "WKFUtils.h"
#include "CUDAKernels.h"
#include "Measure.h"


//
// Restrict macro to make it easy to do perf tuning tests
//
#if 1
#define RESTRICT __restrict__
#else
#define RESTRICT
#endif


//#define VMDUSECUDAGDS 1
#if defined(VMDUSECUDAGDS)
#include </usr/local/gds-beta-0.7.1/lib/cufile.h>      // GPU-Direct Storage

// direct calls to JS plugin for devel/testing until the plugin manager
// and headers incorporate the new out-of-core GPU-direct I/O hooks.
#define VMDJSPLUGININCLUDESRC 1
#include "/home/johns/plugins/molfile_plugin/src/jsplugin.c"
#endif


#define CUERR { cudaError_t err; \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
  printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
  return -1; }}
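
// NOTE: CUERR both reports and clears any pending CUDA error state, and
// expands to a "return -1" on failure, so it is only usable inside
// functions that return an int status code. A typical (illustrative)
// use site looks like:
//   cudaMalloc((void**)&buf, sz);  // "buf"/"sz" are placeholder names
//   CUERR // returns -1 from the enclosing function on error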


//
// Benchmark peak Multiply-Add instruction performance, in GFLOPS
//

// FMADD16 macro contains a sequence of operations that the compiler
// won't optimize out, and will translate into a densely packed block
// of multiply-add instructions with no intervening register copies/moves
// or other instructions.
#define FMADD16 \
    tmp0  = tmp0*tmp4+tmp7;     \
    tmp1  = tmp1*tmp5+tmp0;     \
    tmp2  = tmp2*tmp6+tmp1;     \
    tmp3  = tmp3*tmp7+tmp2;     \
    tmp4  = tmp4*tmp0+tmp3;     \
    tmp5  = tmp5*tmp1+tmp4;     \
    tmp6  = tmp6*tmp2+tmp5;     \
    tmp7  = tmp7*tmp3+tmp6;     \
    tmp8  = tmp8*tmp12+tmp15;   \
    tmp9  = tmp9*tmp13+tmp8;    \
    tmp10 = tmp10*tmp14+tmp9;   \
    tmp11 = tmp11*tmp15+tmp10;  \
    tmp12 = tmp12*tmp8+tmp11;   \
    tmp13 = tmp13*tmp9+tmp12;   \
    tmp14 = tmp14*tmp10+tmp13;  \
    tmp15 = tmp15*tmp11+tmp14;

// CUDA grid, thread block, loop, and MADD operation counts
#define GRIDSIZEX       6144  // number of 1-D thread blocks
#define BLOCKSIZEX        64  // number of threads per 1-D block
#define GLOOPS          2000  // iteration count (all threads)
#define FMADD16COUNT      32  // 32 reps
#define FLOPSPERFMADD16   32  // 16 MULs and 16 ADDs

// FLOP counting
#define FLOPSPERLOOP (FMADD16COUNT * FLOPSPERFMADD16)

__global__ static void madd_kernel(float *doutput) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  float tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7;
  float tmp8,tmp9,tmp10,tmp11,tmp12,tmp13,tmp14,tmp15;
  tmp0=tmp1=tmp2=tmp3=tmp4=tmp5=tmp6=tmp7=0.0f;
  tmp8=tmp9=tmp10=tmp11=tmp12=tmp13=tmp14=tmp15=0.0f;

  tmp15=tmp7 = blockIdx.x * 0.001f; // prevent compiler from optimizing out
  tmp1 = blockIdx.y * 0.001f;       // the body of the loop...

  int loop;
  for (loop=0; loop<GLOOPS; loop++) {
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
  }

  doutput[tid] = tmp0+tmp1+tmp2+tmp3+tmp4+tmp5+tmp6+tmp7
                 +tmp8+tmp9+tmp10+tmp11+tmp12+tmp13+tmp14+tmp15;
}


static int cudamaddgflops(int cudadev, double *gflops, int testloops) {
  float *doutput = NULL;
  dim3 Bsz, Gsz;
  wkf_timerhandle timer;
  int i;

  cudaError_t rc;
  rc = cudaSetDevice(cudadev);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError(); // query last error and reset error state
    if (rc != cudaErrorSetOnActiveProcess)
      return -1; // abort and return an error
#else
    cudaGetLastError(); // just ignore and reset error state, since older CUDA
                        // revs don't have a cudaErrorSetOnActiveProcess enum
#endif
  }

  // setup CUDA grid and block sizes
  Bsz.x = BLOCKSIZEX;
  Bsz.y = 1;
  Bsz.z = 1;
  Gsz.x = GRIDSIZEX;
  Gsz.y = 1;
  Gsz.z = 1;

  // allocate output array
  cudaMalloc((void**)&doutput, BLOCKSIZEX * GRIDSIZEX * sizeof(float));
  CUERR // check and clear any existing errors

  // warmup run
  madd_kernel<<<Gsz, Bsz>>>(doutput);
  cudaDeviceSynchronize(); // wait for kernel to finish

  // benchmark run
  timer=wkf_timer_create();
  wkf_timer_start(timer);
  for (i=0; i<testloops; i++) {
    madd_kernel<<<Gsz, Bsz>>>(doutput);
  }
  cudaDeviceSynchronize(); // wait for kernel to finish
  CUERR // check and clear any existing errors
  wkf_timer_stop(timer);
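
  // For reference, the FLOP accounting below (illustrative arithmetic):
  // each launch executes GLOOPS (2000) iterations of 32 FMADD16 macros,
  // i.e. 2000 x 32 x 32 = 2,048,000 FLOPs per thread, across
  // BLOCKSIZEX x GRIDSIZEX = 64 x 6144 = 393,216 threads, or roughly
  // 805.3 GFLOPs of work per kernel launch, multiplied by testloops.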
  double runtime = wkf_timer_time(timer);
  double gflop = ((double) GLOOPS) * ((double) FLOPSPERLOOP) *
                 ((double) BLOCKSIZEX) * ((double) GRIDSIZEX) * (1.0e-9) * testloops;

  *gflops = gflop / runtime;

  cudaFree(doutput);
  CUERR // check and clear any existing errors

  wkf_timer_destroy(timer);

  return 0;
}

typedef struct {
  int deviceid;
  int testloops;
  double gflops;
} maddthrparms;

static void * cudamaddthread(void *voidparms) {
  maddthrparms *parms = (maddthrparms *) voidparms;
  cudamaddgflops(parms->deviceid, &parms->gflops, parms->testloops);
  return NULL;
}

int vmd_cuda_madd_gflops(int numdevs, int *devlist, double *gflops,
                         int testloops) {
  maddthrparms *parms;
  wkf_thread_t * threads;
  int i;

  /* allocate array of threads */
  threads = (wkf_thread_t *) calloc(numdevs, sizeof(wkf_thread_t));

  /* allocate and initialize array of thread parameters */
  parms = (maddthrparms *) malloc(numdevs * sizeof(maddthrparms));
  for (i=0; i<numdevs; i++) {
    if (devlist != NULL)
      parms[i].deviceid = devlist[i];
    else
      parms[i].deviceid = i;

    parms[i].testloops = testloops;
    parms[i].gflops = 0.0;
  }

#if defined(VMDTHREADS)
  /* spawn child threads to do the work */
  /* thread 0 must also be processed this way otherwise */
  /* we'll permanently bind the main thread to some device */
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudamaddthread, &parms[i]);
  }

  /* join the threads after work is done */
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  /* single thread does all of the work */
  cudamaddthread((void *) &parms[0]);
#endif

  for (i=0; i<numdevs; i++) {
    gflops[i] = parms[i].gflops;
  }

  /* free thread parms */
  free(parms);
  free(threads);

  return 0;
}


//
// Host-GPU memcpy I/O bandwidth benchmark
//

#define BWITER      500
#define LATENCYITER 50000

static int cudabusbw(int cudadev,
                     double *hdmbsec, double *hdlatusec,
                     double *phdmbsec, double *phdlatusec,
                     double *dhmbsec, double *dhlatusec,
                     double *pdhmbsec, double *pdhlatusec) {
  float *hdata = NULL;   // non-pinned DMA buffer
  float *phdata = NULL;  // pinned DMA buffer
  float *ddata = NULL;
  int i;
  double runtime;
  wkf_timerhandle timer;
  int memsz = 1024 * 1024 * sizeof(float);

  *hdmbsec = 0.0;
  *hdlatusec = 0.0;
  *dhmbsec = 0.0;
  *dhlatusec = 0.0;
  *phdmbsec = 0.0;
  *phdlatusec = 0.0;
  *pdhmbsec = 0.0;
  *pdhlatusec = 0.0;

  // attach to the selected device
  cudaError_t rc;
  rc = cudaSetDevice(cudadev);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError(); // query last error and reset error state
    if (rc != cudaErrorSetOnActiveProcess)
      return -1; // abort and return an error
#else
    cudaGetLastError(); // just ignore and reset error state, since older CUDA
                        // revs don't have a cudaErrorSetOnActiveProcess enum
#endif
  }

  // allocate non-pinned host buffer
  hdata = (float *) malloc(memsz);
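
  // Pageable (malloc) host buffers generally force the CUDA driver to
  // stage copies through an internal pinned staging buffer, while
  // page-locked buffers from cudaMallocHost() can be DMA'd directly;
  // benchmarking both shows the cost of that staging step.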
  // allocate pinned host buffer
  cudaMallocHost((void**) &phdata, memsz);
  CUERR // check and clear any existing errors

  // allocate device memory
  cudaMalloc((void**) &ddata, memsz);
  CUERR // check and clear any existing errors

  // create timer
  timer=wkf_timer_create();

  //
  // Host to device timings
  //

  // non-pinned bandwidth
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(ddata, hdata, memsz, cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *hdmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // non-pinned latency
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(ddata, hdata, 1, cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *hdlatusec = runtime * 1.0e6 / ((double) LATENCYITER);


  // pinned bandwidth
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(ddata, phdata, memsz, cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *phdmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // pinned latency
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(ddata, phdata, 1, cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *phdlatusec = runtime * 1.0e6 / ((double) LATENCYITER);


  //
  // Device to host timings
  //

  // non-pinned bandwidth
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(hdata, ddata, memsz, cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *dhmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // non-pinned latency
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(hdata, ddata, 1, cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *dhlatusec = runtime * 1.0e6 / ((double) LATENCYITER);


  // pinned bandwidth
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(phdata, ddata, memsz, cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *pdhmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // pinned latency
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(phdata, ddata, 1, cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *pdhlatusec = runtime * 1.0e6 / ((double) LATENCYITER);


  cudaFree(ddata);
  CUERR // check and clear any existing errors
  cudaFreeHost(phdata);
  CUERR // check and clear any existing errors
  free(hdata);

  wkf_timer_destroy(timer);

  return 0;
}
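
#if 0
// Illustrative only (not compiled): a minimal sketch of how the outputs
// of cudabusbw() might be gathered and reported for a single device.
// The function and variable names here are placeholders.
static void example_print_busbw(void) {
  double hd, hdl, phd, phdl, dh, dhl, pdh, pdhl;
  if (cudabusbw(0, &hd, &hdl, &phd, &phdl, &dh, &dhl, &pdh, &pdhl) == 0) {
    printf("H->D %.1f MB/s, %.2f us latency (pinned: %.1f MB/s, %.2f us)\n",
           hd, hdl, phd, phdl);
    printf("D->H %.1f MB/s, %.2f us latency (pinned: %.1f MB/s, %.2f us)\n",
           dh, dhl, pdh, pdhl);
  }
}
#endif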

typedef struct {
  int deviceid;
  double hdmbsec;
  double hdlatusec;
  double phdmbsec;
  double phdlatusec;
  double dhmbsec;
  double dhlatusec;
  double pdhmbsec;
  double pdhlatusec;
} busbwthrparms;

static void * cudabusbwthread(void *voidparms) {
  busbwthrparms *parms = (busbwthrparms *) voidparms;
  cudabusbw(parms->deviceid,
            &parms->hdmbsec, &parms->hdlatusec,
            &parms->phdmbsec, &parms->phdlatusec,
            &parms->dhmbsec, &parms->dhlatusec,
            &parms->pdhmbsec, &parms->pdhlatusec);
  return NULL;
}

int vmd_cuda_bus_bw(int numdevs, int *devlist,
                    double *hdmbsec, double *hdlatusec,
                    double *phdmbsec, double *phdlatusec,
                    double *dhmbsec, double *dhlatusec,
                    double *pdhmbsec, double *pdhlatusec) {
  busbwthrparms *parms;
  wkf_thread_t * threads;
  int i;

  /* allocate array of threads */
  threads = (wkf_thread_t *) calloc(numdevs, sizeof(wkf_thread_t));

  /* allocate and initialize array of thread parameters */
  parms = (busbwthrparms *) malloc(numdevs * sizeof(busbwthrparms));
  for (i=0; i<numdevs; i++) {
    if (devlist != NULL)
      parms[i].deviceid = devlist[i];
    else
      parms[i].deviceid = i;
    parms[i].hdmbsec = 0.0;
    parms[i].hdlatusec = 0.0;
    parms[i].phdmbsec = 0.0;
    parms[i].phdlatusec = 0.0;
    parms[i].dhmbsec = 0.0;
    parms[i].dhlatusec = 0.0;
    parms[i].pdhmbsec = 0.0;
    parms[i].pdhlatusec = 0.0;
  }

#if defined(VMDTHREADS)
  /* spawn child threads to do the work */
  /* thread 0 must also be processed this way otherwise */
  /* we'll permanently bind the main thread to some device */
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudabusbwthread, &parms[i]);
  }

  /* join the threads after work is done */
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  /* single thread does all of the work */
  cudabusbwthread((void *) &parms[0]);
#endif

  for (i=0; i<numdevs; i++) {
    hdmbsec[i] = parms[i].hdmbsec;
    hdlatusec[i] = parms[i].hdlatusec;
    phdmbsec[i] = parms[i].phdmbsec;
    phdlatusec[i] = parms[i].phdlatusec;
    dhmbsec[i] = parms[i].dhmbsec;
    dhlatusec[i] = parms[i].dhlatusec;
    pdhmbsec[i] = parms[i].pdhmbsec;
    pdhlatusec[i] = parms[i].pdhlatusec;
  }

  /* free thread parms */
  free(parms);
  free(threads);

  return 0;
}


//
// GPU device global memory bandwidth benchmark
//
template <class T>
__global__ void gpuglobmemcpybw(T *dest, const T *src) {
  const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
  dest[idx] = src[idx];
}

template <class T>
__global__ void gpuglobmemsetbw(T *dest, const T val) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  dest[idx] = val;
}
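
// A note on the element type chosen below: benchmarking with a 16-byte
// vector type (float4) lets each thread issue one wide load/store, which
// typically saturates global memory bandwidth with fewer threads than
// scalar float traffic would need.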
typedef float4 datatype;

static int cudaglobmembw(int cudadev, double *gpumemsetgbsec, double *gpumemcpygbsec) {
  int i;
  int len = 1 << 22; // one thread per data element
  int loops = 500;
  datatype *src, *dest;
  datatype val=make_float4(1.0f, 1.0f, 1.0f, 1.0f);

  // initialize to zero for starters
  float memsettime = 0.0f;
  float memcpytime = 0.0f;
  *gpumemsetgbsec = 0.0;
  *gpumemcpygbsec = 0.0;

  // attach to the selected device
  cudaError_t rc;
  rc = cudaSetDevice(cudadev);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError(); // query last error and reset error state
    if (rc != cudaErrorSetOnActiveProcess)
      return -1; // abort and return an error
#else
    cudaGetLastError(); // just ignore and reset error state, since older CUDA
                        // revs don't have a cudaErrorSetOnActiveProcess enum
#endif
  }

  cudaMalloc((void **) &src, sizeof(datatype)*len);
  CUERR
  cudaMalloc((void **) &dest, sizeof(datatype)*len);
  CUERR

  dim3 BSz(256, 1, 1);
  dim3 GSz(len / (BSz.x * BSz.y * BSz.z), 1, 1);

  // do a warm-up pass
  gpuglobmemsetbw<datatype><<< GSz, BSz >>>(src, val);
  CUERR
  gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
  CUERR
  gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
  CUERR

  cudaEvent_t start, end;
  cudaEventCreate(&start);
  cudaEventCreate(&end);

  // execute the memset kernel
  cudaEventRecord(start, 0);
  for (i=0; i<loops; i++) {
    gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
  }
  CUERR
  cudaEventRecord(end, 0);
  CUERR
  cudaEventSynchronize(start);
  CUERR
  cudaEventSynchronize(end);
  CUERR
  cudaEventElapsedTime(&memsettime, start, end);
  CUERR

  // execute the memcpy kernel
  cudaEventRecord(start, 0);
  for (i=0; i<loops; i++) {
    gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
  }
  cudaEventRecord(end, 0);
  CUERR
  cudaEventSynchronize(start);
  CUERR
  cudaEventSynchronize(end);
  CUERR
  cudaEventElapsedTime(&memcpytime, start, end);
  CUERR

  cudaEventDestroy(start);
  CUERR
  cudaEventDestroy(end);
  CUERR
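
  // For reference (illustrative arithmetic): len = 2^22 float4 elements,
  // so each memset pass writes 4M x 16 bytes = 64 MiB, and each memcpy
  // pass moves twice that (a read plus a write). cudaEventElapsedTime()
  // reports milliseconds, so the MiB/ms ratios below are numerically
  // close to GB/sec.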
  *gpumemsetgbsec = (len * sizeof(datatype) / (1024.0 * 1024.0)) / (memsettime / loops);
  *gpumemcpygbsec = (2 * len * sizeof(datatype) / (1024.0 * 1024.0)) / (memcpytime / loops);
  cudaFree(dest);
  cudaFree(src);
  CUERR

  return 0;
}

typedef struct {
  int deviceid;
  double memsetgbsec;
  double memcpygbsec;
} globmembwthrparms;

static void * cudaglobmembwthread(void *voidparms) {
  globmembwthrparms *parms = (globmembwthrparms *) voidparms;
  cudaglobmembw(parms->deviceid, &parms->memsetgbsec, &parms->memcpygbsec);
  return NULL;
}

int vmd_cuda_globmem_bw(int numdevs, int *devlist,
                        double *memsetgbsec, double *memcpygbsec) {
  globmembwthrparms *parms;
  wkf_thread_t * threads;
  int i;

  /* allocate array of threads */
  threads = (wkf_thread_t *) calloc(numdevs, sizeof(wkf_thread_t));

  /* allocate and initialize array of thread parameters */
  parms = (globmembwthrparms *) malloc(numdevs * sizeof(globmembwthrparms));
  for (i=0; i<numdevs; i++) {
    if (devlist != NULL)
      parms[i].deviceid = devlist[i];
    else
      parms[i].deviceid = i;
    parms[i].memsetgbsec = 0.0;
    parms[i].memcpygbsec = 0.0;
  }

#if defined(VMDTHREADS)
  /* spawn child threads to do the work */
  /* thread 0 must also be processed this way otherwise */
  /* we'll permanently bind the main thread to some device */
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudaglobmembwthread, &parms[i]);
  }

  /* join the threads after work is done */
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  /* single thread does all of the work */
  cudaglobmembwthread((void *) &parms[0]);
#endif

  for (i=0; i<numdevs; i++) {
    memsetgbsec[i] = parms[i].memsetgbsec;
    memcpygbsec[i] = parms[i].memcpygbsec;
  }

  /* free thread parms */
  free(parms);
  free(threads);

  return 0;
}


//
// Benchmark latency for complete threadpool barrier wakeup/run/sleep cycle
//
static void * vmddevpoollatencythread(void *voidparms) {
  return NULL;
}

static void * vmddevpooltilelatencythread(void *voidparms) {
  int threadid=-1;
  int tilesize=1;
  void *parms=NULL;
  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);

  // grind through task tiles until none are left
  wkf_tasktile_t tile;
  while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
    // do nothing but eat work units...
  }

  return NULL;
}


// no-op kernel for timing kernel launches
__global__ static void nopkernel(float * ddata) {
  unsigned int xindex = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int yindex = blockIdx.y * blockDim.y + threadIdx.y;
  unsigned int outaddr = gridDim.x * blockDim.x * yindex + xindex;

  if (ddata != NULL)
    ddata[outaddr] = outaddr;
}

// empty kernel for timing kernel launches
__global__ static void voidkernel(void) {
  return;
}

static void * vmddevpoolcudatilelatencythread(void *voidparms) {
  int threadid=-1;
  int tilesize=1;
  float *parms=NULL;
  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);

  // XXX Note that we expect parms to be set to NULL or a valid CUDA
  // global memory pointer for correct operation of the NOP kernel below
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);

#if 0
  // scale tile size by device performance
  tilesize=4; // GTX 280, Tesla C1060 starting point tile size
  wkf_threadpool_worker_devscaletile(voidparms, &tilesize);
#endif

  // grind through task tiles until none are left
  wkf_tasktile_t tile;
  dim3 Gsz(1,1,1);
  dim3 Bsz(8,8,1);
  while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
    // launch a no-op CUDA kernel
    nopkernel<<<Gsz, Bsz, 0>>>(parms);
  }

  // wait for all GPU kernels to complete
  cudaDeviceSynchronize();

  return NULL;
}
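
// vmd_cuda_devpool_latency() below reports five averaged latency metrics:
//   kernlaunchlatency -- a bare CUDA kernel launch (voidkernel)
//   barlatency        -- a raw thread pool barrier sync
//   cyclelatency      -- a full pool wakeup/run/sleep cycle (no-op worker)
//   tilelatency       -- the same cycle while workers consume sched tiles
//   kernellatency     -- the same cycle with a NOP CUDA kernel per tile
// (summary derived from the code that follows)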
int vmd_cuda_devpool_latency(wkf_threadpool_t *devpool, int tilesize,
                             double *kernlaunchlatency,
                             double *barlatency,
                             double *cyclelatency,
                             double *tilelatency,
                             double *kernellatency) {
  int i;
  wkf_tasktile_t tile;
  wkf_timerhandle timer;
  int loopcount;

  timer=wkf_timer_create();

  // execute just a CUDA kernel launch and measure latency on whatever
  // GPU we get.
  loopcount = 15000;
  dim3 VGsz(1,1,1);
  dim3 VBsz(8,8,1);
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    voidkernel<<<VGsz, VBsz, 0>>>();
  }
  // wait for GPU kernels to complete
  cudaDeviceSynchronize();
  wkf_timer_stop(timer);
  *kernlaunchlatency = wkf_timer_time(timer) / ((double) loopcount);

  // execute just a raw barrier sync and measure latency
  loopcount = 15000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    wkf_threadpool_wait(devpool);
  }
  wkf_timer_stop(timer);
  *barlatency = wkf_timer_time(timer) / ((double) loopcount);

  // time wake-up, launch, and sleep/join of device pool doing a no-op
  loopcount = 5000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=0;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpoollatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *cyclelatency = wkf_timer_time(timer) / ((double) loopcount);

  // time wake-up, launch, and sleep/join of device pool eating tiles
  loopcount = 5000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=tilesize;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpooltilelatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *tilelatency = wkf_timer_time(timer) / ((double) loopcount);

  // time wake-up, launch, and sleep/join of device pool eating tiles,
  // with a no-op CUDA kernel launched per tile
  loopcount = 2000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=tilesize;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpoolcudatilelatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *kernellatency = wkf_timer_time(timer) / ((double) loopcount);

  wkf_timer_destroy(timer);

#if 1
  vmd_cuda_measure_latencies(devpool);
#endif

  return 0;
}


//
// Benchmark CUDA kernel launch and memory copy latencies in isolation
//
typedef struct {
  int deviceid;
  int testloops;
  double kernlatency;
  double bcopylatency;
  double kbseqlatency;
} latthrparms;

static void * vmddevpoolcudalatencythread(void *voidparms) {
  int threadid=-1;
  latthrparms *parms=NULL;

  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
  if (parms->deviceid == threadid) {
    wkf_timerhandle timer;
    timer=wkf_timer_create();
    printf("Thread/device %d running...\n", threadid);
    cudaStream_t devstream;
    cudaStreamCreate(&devstream);

    char *hostbuf = (char *) calloc(1, 65536 * sizeof(char));
    char *gpubuf = NULL;
    cudaMalloc((void**)&gpubuf, 65536 * sizeof(char));

    dim3 Gsz(1,1,1);
    dim3 Bsz(8,8,1);

    // measure back-to-back NULL kernel launches
    wkf_timer_start(timer);
    int i;
    for (i=0; i<parms->testloops; i++) {
      // launch a no-op CUDA kernel
      nopkernel<<<Gsz, Bsz, 0, devstream>>>(NULL);
    }
    // wait for all GPU kernels to complete
    cudaStreamSynchronize(devstream);
    wkf_timer_stop(timer);
    parms->kernlatency = 1000000 * wkf_timer_time(timer) / ((double) parms->testloops);

    // measure back-to-back round-trip 1-byte memcpy latencies
    wkf_timer_start(timer);
    for (i=0; i<parms->testloops; i++) {
      cudaMemcpyAsync(gpubuf, hostbuf, 1, cudaMemcpyHostToDevice, devstream);
      cudaMemcpyAsync(hostbuf, gpubuf, 1, cudaMemcpyDeviceToHost, devstream);
    }
    // wait for all outstanding copies to complete
    cudaStreamSynchronize(devstream);
    wkf_timer_stop(timer);
    // store the copy latency separately so it doesn't clobber the
    // kernel launch latency measured above
    parms->bcopylatency = 1000000 * wkf_timer_time(timer) / ((double) parms->testloops);

    printf("NULL kernel launch latency (usec): %.2f\n", parms->kernlatency);
    printf("1-byte round-trip copy latency (usec): %.2f\n", parms->bcopylatency);

    cudaStreamDestroy(devstream);
    cudaFree(gpubuf);
    free(hostbuf);
    wkf_timer_destroy(timer);
  }

  return NULL;
}


int vmd_cuda_measure_latencies(wkf_threadpool_t *devpool) {
  latthrparms thrparms;
  int workers = wkf_threadpool_get_workercount(devpool);
  int i;
  printf("vmd_cuda_measure_latencies()...\n");
  for (i=0; i<workers; i++) {
    memset(&thrparms, 0, sizeof(thrparms));
    thrparms.deviceid = i;
    thrparms.testloops = 2500;
    wkf_threadpool_launch(devpool, vmddevpoolcudalatencythread, &thrparms, 1);
  }

  return 0;
}


#if defined(VMDUSECUDAGDS)
typedef struct {
  int nfiles;
  const char **trjfileset;
  jshandle **jshandles;
  CUfileHandle_t *cfh;
  int devcount;
  int natoms;
  const AtomSel *sel;
  int first;
  int last;
  int step;
} gpuoocbenchthreadparms;

#define VMDGDSMAXFRAMEBUF 8
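
// Environment variables honored by gpu_ooc_bench_thread(), summarized
// from the code below:
//   VMDGDSUSEHOST        -- do host fio reads + cudaMemcpy instead of cuFile
//   VMDGDSHOSTNOPIN      -- use unpinned (malloc'd) host I/O buffers
//   VMDGDSHOSTBUFFEREDIO -- use buffered host I/O rather than direct I/O
//   VMDGDSMULTIFRAME     -- frames per read, capped at VMDGDSMAXFRAMEBUF
//   VMDGDSFILESTRATEGY   -- frame-to-file assignment scheme
//   VMDGDSVERBOSE        -- enable per-read diagnostic output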
static void * gpu_ooc_bench_thread(void *voidparms) {
  int threadid, numthreads;
  gpuoocbenchthreadparms *parms = NULL;
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
  wkf_threadpool_worker_getid(voidparms, &threadid, &numthreads);

  //
  // copy in per-thread parameters
  //
  int nfiles = parms->nfiles;
  int natoms = parms->natoms;
  const AtomSel *sel = parms->sel;
  int first = parms->first;
  int last = parms->last;
  int step = parms->step;

  int usecufile = 1;
  fio_fd *hostfds = NULL;
  int pinhostiobuffer = 1;
  if (getenv("VMDGDSHOSTNOPIN")) {
    pinhostiobuffer=0;
  }

  if (getenv("VMDGDSUSEHOST")) {
    usecufile=0;
    hostfds = (fio_fd *) calloc(1, nfiles * sizeof(fio_fd));

    int hostusedirectio = 1;
    if (getenv("VMDGDSHOSTBUFFEREDIO") != NULL)
      hostusedirectio = 0;

    int openmode = FIO_READ;
    if (hostusedirectio)
      openmode |= FIO_DIRECT;

    int i;
    for (i=0; i<nfiles; i++) {
      if (fio_open(parms->trjfileset[i], openmode, &hostfds[i]) < 0) {
        if (hostusedirectio) {
          printf("Thr[%d] direct I/O unavailable or can't open file '%s'\n",
                 threadid, parms->trjfileset[i]);
        } else {
          printf("Thr[%d] can't open file '%s'\n",
                 threadid, parms->trjfileset[i]);
        }
        return NULL;
      }
    }
  }

  if (hostfds && usecufile) {
    printf("Inconsistent cufile/hostfds state, aborting!\n");
    return NULL;
  }

  /* ensure we have a large enough allocation so we can align */
  /* the starting pointer to a blocksz page boundary */
  long blocksz = MOLFILE_DIRECTIO_MIN_BLOCK_SIZE;
  long sz = 3L*sizeof(float)*natoms + blocksz;

  /* pad the allocation to an even multiple of the block size */
  size_t blockpadsz = (sz + (blocksz - 1)) & (~(blocksz - 1));
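
  /* For reference (illustrative): with natoms = 1000 and a 4096-byte
   * block size, sz = 3*4*1000 + 4096 = 16096, and the mask arithmetic
   * above rounds that up to blockpadsz = 16384, the next 4 KiB multiple. */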

  int framecount = (last - first + 1) / step;
  int framesperfile = framecount / nfiles;

  if (threadid == 0) {
    printf("Thr[%2d] %d frames total, natoms: %d  selected: %d\n",
           threadid, framecount, natoms, sel->selected);
    printf("Thr[%2d] %d frames/file\n", threadid, framesperfile);
  }

  cudaError_t crc;
  cudaStream_t oocstream;
  float *devptr=NULL;
  float *hostptr=NULL;
  float *hostptr_unaligned=NULL;

  float *crdx1=NULL, *crdy1=NULL, *crdz1=NULL;
  float *crdx2=NULL, *crdy2=NULL, *crdz2=NULL;
  int multiframeio = 0;
  if (getenv("VMDGDSMULTIFRAME"))
    multiframeio = atoi(getenv("VMDGDSMULTIFRAME"));
  if (multiframeio > VMDGDSMAXFRAMEBUF)
    multiframeio = VMDGDSMAXFRAMEBUF;

  // set block sizes and counts for I/O bench calcs
  dim3 IOBsz = dim3(256, 1, 1);
  dim3 IOGsz = dim3((natoms + IOBsz.x - 1) / IOBsz.x, 1, 1);

  if (parms->devcount > 0) {
    long gpuallocsz = (VMDGDSMAXFRAMEBUF+1) * blockpadsz;

    if (threadid == 0) {
      printf("Thr[%2d] Allocating GPU timestep I/O buf: %ld\n",
             threadid, gpuallocsz);
    }
    crc = cudaMalloc((void**) &devptr, gpuallocsz);

    if (hostfds != NULL) {
      if (pinhostiobuffer) {
        crc = cudaMallocHost((void**) &hostptr, gpuallocsz);
      } else {
        hostptr = (float *) alloc_aligned_ptr(gpuallocsz, 4096,
                                              (void**) &hostptr_unaligned);
        if (!hostptr) {
          printf("Thr[%d]: Failed allocation!\n", threadid);
          return NULL;
        }
      }
    }

    long crdsz = sel->selected * sizeof(float);

    // atomic coord buffers
    crc = cudaMalloc((void**) &crdx1, crdsz);
    crc = cudaMalloc((void**) &crdy1, crdsz);
    crc = cudaMalloc((void**) &crdz1, crdsz);
    crc = cudaMalloc((void**) &crdx2, crdsz);
    crc = cudaMalloc((void**) &crdy2, crdsz);
    crc = cudaMalloc((void**) &crdz2, crdsz);
    if (crc != cudaSuccess) {
      printf("Thr[%2d], Failed to allocate GPU buffer!\n", threadid);
      return NULL; // XXX error handling needs to be done here
    }

    cudaStreamCreate(&oocstream);

#if defined(VMDUSECUDAGDS)
    cuFileBufRegister(devptr, gpuallocsz, 0);
#endif
  }

  int verbose = (getenv("VMDGDSVERBOSE") != NULL) ? 1 : 0;

  int filestrategy = 0;
  if (getenv("VMDGDSFILESTRATEGY")) {
    filestrategy = atoi(getenv("VMDGDSFILESTRATEGY"));
  }
  if (threadid == 0) {
    printf("Thr[%2d] file strategy set to: %d\n", threadid, filestrategy);
  }
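
  // A rough summary of the VMDGDSFILESTRATEGY modes handled by the switch
  // in the loop below: 0 (default) statically maps each thread group to a
  // file, 1 rotates across files per multi-frame group, 2 rotates per
  // multi-frame group scaled by threads-per-GPU, and 3 maps files by
  // threadid/4.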

  wkf_tasktile_t tile;
  while (wkf_threadlaunch_next_tile(voidparms, VMDGDSMAXFRAMEBUF * 1, &tile) != WKF_SCHED_DONE) {
    //
    // simple I/O + compute benchmarking...
    //
    int idx;
    int threadspergpu;
    if (parms->devcount > 0)
      threadspergpu = numthreads / parms->devcount;
    else
      threadspergpu = 1;

    for (idx=tile.start; idx<tile.end; idx++) {
      int myfileidx, fileframeidx;

      switch (filestrategy) {
        case 1:
          myfileidx = (idx / multiframeio) % nfiles;
          fileframeidx = idx % framesperfile;
          break;

        case 2:
          myfileidx = (idx / (multiframeio * threadspergpu)) % nfiles;
          fileframeidx = idx % framesperfile;
          break;

        case 3:
          myfileidx = (threadid / 4) % nfiles;
          fileframeidx = idx % framesperfile;
          break;

        case 0:
        default:
          myfileidx = (threadid / threadspergpu) % nfiles;
          fileframeidx = idx % framesperfile;
          break;
      }

      //
      // compute multi-frame or single-frame I/O offsets and sizes
      //
      long startoffset, foffset, readlen;
      read_js_timestep_index_offsets(parms->jshandles[myfileidx],
                                     natoms, fileframeidx, 0, natoms, NULL,
                                     &startoffset, &foffset, &readlen);
      if (multiframeio) {
        // multi-frame reads use the same starting offset, but the
        // read length is computed from the first and last frames
        // in the group.
        long multistartoffset, multifoffset, multireadlen;
        read_js_timestep_index_offsets(parms->jshandles[myfileidx], natoms,
                                       fileframeidx+multiframeio-1,
                                       0, natoms, NULL,
                                       &multistartoffset, &multifoffset,
                                       &multireadlen);

        multireadlen = (multifoffset + multireadlen) - foffset;

        //printf("** readlen: %ld  multireadlen: %ld\n", readlen, multireadlen);
        readlen = multireadlen;
        idx+=multiframeio-1; // add in the required increment...
      }
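
      // The two I/O paths below move data differently: cuFileRead() DMAs
      // file contents directly into the GPU buffer (GPUDirect Storage),
      // while the host fallback stages data into hostptr with fio_fread()
      // and then copies it to the device with cudaMemcpy().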
      //
      // perform the required I/O via GDS or by host kernel I/O
      //
      long ret=0;
      if (usecufile) {
        ret = cuFileRead(parms->cfh[myfileidx], (char *) devptr, readlen, foffset, 0);
      } else if (hostfds) {
        foffset=0;
        ret=fio_fseek(hostfds[myfileidx], foffset, FIO_SEEK_SET);
        if (ret<0) { printf("fio_fseek() error!\n"); return NULL; }
        ret=fio_fread(hostptr, readlen, 1, hostfds[myfileidx]);
        if (ret<0) { printf("fio_fread() error!\n"); return NULL; }
        cudaMemcpy(devptr, hostptr, readlen, cudaMemcpyHostToDevice);
      } else {
        printf("Inconsistent cufile/hostfds state, aborting!\n");
        return NULL;
      }

      // handle errors if they have occurred
      if (ret < 0) {
        printf("Thr[%2d] Error: cuFileRead(): %ld\n", threadid, ret);
        return NULL; // XXX error handling needs to be done here
      }

      if (verbose) {
        printf("Thr[%2d]F[%d][tile: %d to %d] frame: %d  cuFile len: %ld  off: %ld\n",
               threadid, myfileidx, tile.start, tile.end, idx,
               readlen, foffset);
      }
    }
  }

  cudaFree(crdx1);
  cudaFree(crdy1);
  cudaFree(crdz1);
  cudaFree(crdx2);
  cudaFree(crdy2);
  cudaFree(crdz2);

#if defined(VMDUSECUDAGDS)
  if (usecufile) {
    cuFileBufDeregister(devptr);
  }
#endif

  if (hostfds != NULL) {
    int i;
    for (i=0; i<nfiles; i++) {
      fio_fclose(hostfds[i]);
    }
    free(hostfds);
  }

  if (hostptr != NULL) {
    if (pinhostiobuffer) {
      cudaFreeHost(hostptr);
    } else {
      free(hostptr_unaligned);
    }
  }

  return NULL;
}

#endif


int gpu_ooc_bench(wkf_threadpool_t *devpool, // VMD GPU worker thread pool
                  int nfiles, const char **trjfileset, const AtomSel *sel,
                  int first, int last, int step) {
  printf("gpu_ooc_bench()\n");
  wkf_threadpool_t *bigpool = NULL;

#if defined(VMDUSECUDAGDS)
  int devcount;
  cudaError_t crc = cudaGetDeviceCount(&devcount);
  printf("gpu_ooc_bench) GPU device count: %d\n", devcount);
  if (devcount==0)
    printf("gpu_ooc_bench) No GPU devices, continuing with host only...\n");

  CUfileHandle_t * cfh = (CUfileHandle_t *) calloc(1, nfiles * sizeof(CUfileHandle_t));
  CUfileDescr_t * cfhdesc = (CUfileDescr_t *) calloc(1, nfiles * sizeof(CUfileDescr_t));
  memset(&cfh[0], 0, nfiles * sizeof(CUfileHandle_t));
  memset(&cfhdesc[0], 0, nfiles * sizeof(CUfileDescr_t));

  int natomschk = 0;
  jshandle **jshandles = (jshandle **) calloc(1, nfiles * sizeof(jshandle *));
  fio_fd *directio_fds = (fio_fd *) calloc(1, nfiles * sizeof(fio_fd));

  int i;
  for (i=0; i<nfiles; i++) {
    const char *filename = trjfileset[i];
    printf("gpu_ooc_bench) File[%d] GDS setup, opening '%s'\n", i, filename);
    jshandles[i] = (jshandle *) open_js_read(filename, "js", &natomschk);
    if (!jshandles[i]) {
      printf("gpu_ooc_bench) File[%d] open_js_read failed for file %s\n", i, filename);
      return -1; // deal with error handling later
    }

#if vmdplugin_ABIVERSION > 17
    long blocksz = MOLFILE_DIRECTIO_MIN_BLOCK_SIZE;
    int filepgalignsz = 1;
    read_js_timestep_pagealign_size(jshandles[i], &filepgalignsz);
    if (filepgalignsz != blocksz) {
      printf("gpu_ooc_bench) File[%d] Plugin-returned page alignment size mismatch!\n", i);
    } else {
      printf("gpu_ooc_bench) File[%d] Page alignment size: %d\n", i, filepgalignsz);
    }
#endif
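
    // Direct I/O (and GDS) generally requires file offsets and transfer
    // sizes aligned to the filesystem block size, which is why a plugin
    // page-alignment mismatch is flagged above.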
    read_js_timestep_index_offsets(jshandles[i], natomschk, 0, 0, 0,
                                   &directio_fds[i], NULL, NULL, NULL);

    cfhdesc[i].handle.fd = directio_fds[i]; // typedef of Unix FD
    cfhdesc[i].type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    CUfileError_t cferr = cuFileHandleRegister(&cfh[i], &cfhdesc[i]);

    if (cferr.err != CU_FILE_SUCCESS) {
      printf("gpu_ooc_bench) File[%d] cuFileHandleRegister on fd %d failed!\n",
             i, cfhdesc[i].handle.fd);
      return -1; // XXX error handling needs to be done here
    }
  }


  //
  // copy in per-thread parameters
  //
  gpuoocbenchthreadparms parms;
  memset(&parms, 0, sizeof(parms));
  parms.devcount = devcount;
  parms.nfiles = nfiles;
  parms.trjfileset = trjfileset;
  parms.jshandles = jshandles;
  parms.cfh = cfh;
  parms.natoms = sel->num_atoms;
  parms.sel = sel;
  parms.first = first;
  parms.last = last;
  parms.step = step;

  int framecount = nfiles * (last / step);

  // create timers
  wkf_timerhandle timer;
  timer=wkf_timer_create();

  // spawn child threads to do the work
  wkf_tasktile_t tile;
  tile.start=0;
  tile.end=framecount - 1; // schedule the whole frame range as one set of tiles

  printf("gpu_ooc_bench) tile start: %d  end: %d\n", tile.start, tile.end);

  int gdsthreadspergpu = 1;
  if (getenv("VMDGDSTHREADSPERGPU") != NULL)
    gdsthreadspergpu = atoi(getenv("VMDGDSTHREADSPERGPU"));

  printf("gpu_ooc_bench) gdsthreadspergpu: %d\n", gdsthreadspergpu);
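
  // Running more than one worker thread per GPU keeps multiple reads in
  // flight per device, which can help hide I/O latency at the cost of
  // added contention; that is the purpose of the oversized pool below.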
  if (gdsthreadspergpu > 1) {
    // XXX extra-large GPU device thread pool
    int workercount = devcount * gdsthreadspergpu;

    int *devlist = new int[workercount];
    int k;
    for (k=0; k<workercount; k++) {
      devlist[k] = k / gdsthreadspergpu; // XXX ignores VMD CUDA device masks
    }

    msgInfo << "Creating Multi-worker ("
            << gdsthreadspergpu << " per-GPU) CUDA device pool..." << sendmsg;
    bigpool=wkf_threadpool_create(workercount, devlist);
    delete [] devlist;

    // associate each worker thread with a specific GPU
    if (getenv("VMDCUDAVERBOSE") != NULL)
      wkf_threadpool_launch(bigpool, vmd_cuda_devpool_setdeviceonly, (void*)"VMD CUDA Dev Init", 1);
    else
      wkf_threadpool_launch(bigpool, vmd_cuda_devpool_setdeviceonly, NULL, 1);

    // clear all available device memory on each of the GPUs
    wkf_threadpool_launch(bigpool, vmd_cuda_devpool_clear_device_mem, NULL, 1);

    // XXX override which GPU device pool we're going to use
    devpool = bigpool;
  }

  // XXX affinitize GPU worker threads for best perf
  wkf_threadpool_launch(devpool, vmd_cuda_affinitize_threads, NULL, 1);

  wkf_threadpool_sched_dynamic(devpool, &tile);
  wkf_timer_start(timer);
  wkf_threadpool_launch(devpool, gpu_ooc_bench_thread, &parms, 1);
  wkf_timer_stop(timer);

  double runtime = wkf_timer_time(timer);
  double gbytes = sel->num_atoms * 12L * (tile.end+1) / (1024.0 * 1024.0 * 1024.0);

  printf("gpu_ooc_bench) natoms: %d, fsz: %ld, tsz: %ld\n",
         sel->num_atoms, sel->num_atoms * 12L,
         sel->num_atoms * 12L * (tile.end+1));

  int pinhostiobuffer = 1;
  if (getenv("VMDGDSHOSTNOPIN"))
    pinhostiobuffer=0;

  int hostusedirectio = 1;
  if (getenv("VMDGDSHOSTBUFFEREDIO") != NULL)
    hostusedirectio = 0;

  int usecufile=1;
  if (getenv("VMDGDSUSEHOST"))
    usecufile=0;

  if (usecufile) {
    printf("OOC I/O via GDS + cuFile\n");
  } else {
    printf("OOC I/O via host, %s APIs, %s memory buffers\n",
           (hostusedirectio) ? "Direct I/O" : "Buffered I/O",
           (pinhostiobuffer) ? "pinned" : "unpinned");
  }

  int multiframeio = 0;
  if (getenv("VMDGDSMULTIFRAME"))
    multiframeio = atoi(getenv("VMDGDSMULTIFRAME"));
  if (multiframeio > VMDGDSMAXFRAMEBUF)
    multiframeio = VMDGDSMAXFRAMEBUF;
  if (multiframeio) {
    printf("GDS multi-frame read opt: %d frames per call, %ld bytes\n",
           multiframeio,
           multiframeio * sel->num_atoms * 12L);
  }

  printf("OOC runtime: %.1f, %.2fGB/sec\n", runtime, gbytes/runtime);

  for (i=0; i<nfiles; i++) {
#if defined(VMDUSECUDAGDS)
    cuFileHandleDeregister(cfh[i]);
#endif
    close_js_read(jshandles[i]);
  }
#endif

#if defined(VMDUSECUDAGDS)
  if (cfh != NULL)
    free(cfh);

  if (cfhdesc != NULL)
    free(cfhdesc);

  if (jshandles != NULL)
    free(jshandles);

  if (directio_fds != NULL)
    free(directio_fds);
#endif

  // if we created an extra large-thread-count-per-GPU thread pool, we
  // need to destroy it here...
  if (bigpool != NULL)
    wkf_threadpool_destroy(bigpool);

  return 0;
}