/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: CUDABench.cu,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.42 $       $Date: 2022/02/09 04:03:19 $
 *
 ***************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

#include "Inform.h"
#include "WKFThreads.h"
#include "WKFUtils.h"
#include "CUDAKernels.h"
#include "Measure.h"


//
// Restrict macro to make it easy to do perf tuning tests
//
#if 1
#define RESTRICT __restrict__
#else
#define RESTRICT
#endif


//#define VMDUSECUDAGDS 1
#if defined(VMDUSECUDAGDS)
#include </usr/local/gds-beta-0.7.1/lib/cufile.h>      // GPU-Direct Storage

// direct calls to JS plugin for devel/testing until the plugin manager
// and headers incorporate the new out-of-core GPU-direct I/O hooks.
#define VMDJSPLUGININCLUDESRC 1
#include "/home/johns/plugins/molfile_plugin/src/jsplugin.c"
#endif


#define CUERR { cudaError_t err; \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
  printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
  return -1; }}
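
// NOTE: CUERR both reports and clears any pending CUDA error state, and
// expands to a "return -1" on failure, so it is only usable inside
// functions that return an int status code. A typical (illustrative)
// use site looks like:
//   cudaMalloc((void**)&buf, sz);  // "buf"/"sz" are placeholder names
//   CUERR // returns -1 from the enclosing function on error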


//
// Benchmark peak Multiply-Add instruction performance, in GFLOPS
//

// FMADD16 macro contains a sequence of operations that the compiler
// won't optimize out, and will translate into a densely packed block
// of multiply-add instructions with no intervening register copies/moves
// or other instructions.
#define FMADD16 \
    tmp0  = tmp0*tmp4+tmp7;     \
    tmp1  = tmp1*tmp5+tmp0;     \
    tmp2  = tmp2*tmp6+tmp1;     \
    tmp3  = tmp3*tmp7+tmp2;     \
    tmp4  = tmp4*tmp0+tmp3;     \
    tmp5  = tmp5*tmp1+tmp4;     \
    tmp6  = tmp6*tmp2+tmp5;     \
    tmp7  = tmp7*tmp3+tmp6;     \
    tmp8  = tmp8*tmp12+tmp15;   \
    tmp9  = tmp9*tmp13+tmp8;    \
    tmp10 = tmp10*tmp14+tmp9;   \
    tmp11 = tmp11*tmp15+tmp10;  \
    tmp12 = tmp12*tmp8+tmp11;   \
    tmp13 = tmp13*tmp9+tmp12;   \
    tmp14 = tmp14*tmp10+tmp13;  \
    tmp15 = tmp15*tmp11+tmp14;

// CUDA grid, thread block, loop, and MADD operation counts
#define GRIDSIZEX       6144  // number of 1-D thread blocks
#define BLOCKSIZEX        64  // number of threads per 1-D block
#define GLOOPS          2000  // iteration count (all threads)
#define FMADD16COUNT      32  // 32 reps
#define FLOPSPERFMADD16   32  // 16 MULs and 16 ADDs

// FLOP counting
#define FLOPSPERLOOP (FMADD16COUNT * FLOPSPERFMADD16)

__global__ static void madd_kernel(float *doutput) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  float tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7;
  float tmp8,tmp9,tmp10,tmp11,tmp12,tmp13,tmp14,tmp15;
  tmp0=tmp1=tmp2=tmp3=tmp4=tmp5=tmp6=tmp7=0.0f;
  tmp8=tmp9=tmp10=tmp11=tmp12=tmp13=tmp14=tmp15=0.0f;

  tmp15=tmp7 = blockIdx.x * 0.001f; // prevent compiler from optimizing out
  tmp1 = blockIdx.y * 0.001f;       // the body of the loop...

  int loop;
  for (loop=0; loop<GLOOPS; loop++) {
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
  }

  doutput[tid] = tmp0+tmp1+tmp2+tmp3+tmp4+tmp5+tmp6+tmp7
                 +tmp8+tmp9+tmp10+tmp11+tmp12+tmp13+tmp14+tmp15;
}


static int cudamaddgflops(int cudadev, double *gflops, int testloops) {
  float *doutput = NULL;
  dim3 Bsz, Gsz;
  wkf_timerhandle timer;
  int i;

  cudaError_t rc;
  rc = cudaSetDevice(cudadev);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError(); // query last error and reset error state
    if (rc != cudaErrorSetOnActiveProcess)
      return -1; // abort and return an error
#else
    cudaGetLastError(); // just ignore and reset error state, since older CUDA
                        // revs don't have a cudaErrorSetOnActiveProcess enum
#endif
  }

  // setup CUDA grid and block sizes
  Bsz.x = BLOCKSIZEX;
  Bsz.y = 1;
  Bsz.z = 1;
  Gsz.x = GRIDSIZEX;
  Gsz.y = 1;
  Gsz.z = 1;

  // allocate output array
  cudaMalloc((void**)&doutput, BLOCKSIZEX * GRIDSIZEX * sizeof(float));
  CUERR // check and clear any existing errors

  // warmup run
  madd_kernel<<<Gsz, Bsz>>>(doutput);
  cudaDeviceSynchronize(); // wait for kernel to finish

  // benchmark run
  timer=wkf_timer_create();
  wkf_timer_start(timer);
  for (i=0; i<testloops; i++) {
    madd_kernel<<<Gsz, Bsz>>>(doutput);
  }
  cudaDeviceSynchronize(); // wait for kernel to finish
  CUERR // check and clear any existing errors
  wkf_timer_stop(timer);
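
  // For reference, the FLOP accounting below (illustrative arithmetic):
  // each launch executes GLOOPS (2000) iterations of 32 FMADD16 macros,
  // i.e. 2000 x 32 x 32 = 2,048,000 FLOPs per thread, across
  // BLOCKSIZEX x GRIDSIZEX = 64 x 6144 = 393,216 threads, or roughly
  // 805.3 GFLOPs of work per kernel launch, multiplied by testloops.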
  double runtime = wkf_timer_time(timer);
  double gflop = ((double) GLOOPS) * ((double) FLOPSPERLOOP) *
                 ((double) BLOCKSIZEX) * ((double) GRIDSIZEX) * (1.0e-9) * testloops;

  *gflops = gflop / runtime;

  cudaFree(doutput);
  CUERR // check and clear any existing errors

  wkf_timer_destroy(timer);

  return 0;
}

typedef struct {
  int deviceid;
  int testloops;
  double gflops;
} maddthrparms;

static void * cudamaddthread(void *voidparms) {
  maddthrparms *parms = (maddthrparms *) voidparms;
  cudamaddgflops(parms->deviceid, &parms->gflops, parms->testloops);
  return NULL;
}

int vmd_cuda_madd_gflops(int numdevs, int *devlist, double *gflops,
                         int testloops) {
  maddthrparms *parms;
  wkf_thread_t * threads;
  int i;

  /* allocate array of threads */
  threads = (wkf_thread_t *) calloc(numdevs, sizeof(wkf_thread_t));

  /* allocate and initialize array of thread parameters */
  parms = (maddthrparms *) malloc(numdevs * sizeof(maddthrparms));
  for (i=0; i<numdevs; i++) {
    if (devlist != NULL)
      parms[i].deviceid = devlist[i];
    else
      parms[i].deviceid = i;

    parms[i].testloops = testloops;
    parms[i].gflops = 0.0;
  }

#if defined(VMDTHREADS)
  /* spawn child threads to do the work */
  /* thread 0 must also be processed this way otherwise */
  /* we'll permanently bind the main thread to some device */
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudamaddthread, &parms[i]);
  }

  /* join the threads after work is done */
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  /* single thread does all of the work */
  cudamaddthread((void *) &parms[0]);
#endif

  for (i=0; i<numdevs; i++) {
    gflops[i] = parms[i].gflops;
  }

  /* free thread parms */
  free(parms);
  free(threads);

  return 0;
}


//
// Host-GPU memcpy I/O bandwidth benchmark
//

#define BWITER      500
#define LATENCYITER 50000

static int cudabusbw(int cudadev,
                     double *hdmbsec, double *hdlatusec,
                     double *phdmbsec, double *phdlatusec,
                     double *dhmbsec, double *dhlatusec,
                     double *pdhmbsec, double *pdhlatusec) {
  float *hdata = NULL;   // non-pinned DMA buffer
  float *phdata = NULL;  // pinned DMA buffer
  float *ddata = NULL;
  int i;
  double runtime;
  wkf_timerhandle timer;
  int memsz = 1024 * 1024 * sizeof(float);

  *hdmbsec = 0.0;
  *hdlatusec = 0.0;
  *dhmbsec = 0.0;
  *dhlatusec = 0.0;
  *phdmbsec = 0.0;
  *phdlatusec = 0.0;
  *pdhmbsec = 0.0;
  *pdhlatusec = 0.0;

  // attach to the selected device
  cudaError_t rc;
  rc = cudaSetDevice(cudadev);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError(); // query last error and reset error state
    if (rc != cudaErrorSetOnActiveProcess)
      return -1; // abort and return an error
#else
    cudaGetLastError(); // just ignore and reset error state, since older CUDA
                        // revs don't have a cudaErrorSetOnActiveProcess enum
#endif
  }

  // allocate non-pinned host buffer
  hdata = (float *) malloc(memsz);
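
  // Pageable (malloc) host buffers generally force the CUDA driver to
  // stage copies through an internal pinned staging buffer, while
  // page-locked buffers from cudaMallocHost() can be DMA'd directly;
  // benchmarking both shows the cost of that staging step.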
  // allocate pinned host buffer
  cudaMallocHost((void**) &phdata, memsz);
  CUERR // check and clear any existing errors

  // allocate device memory
  cudaMalloc((void**) &ddata, memsz);
  CUERR // check and clear any existing errors

  // create timer
  timer=wkf_timer_create();

  //
  // Host to device timings
  //

  // non-pinned bandwidth
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(ddata, hdata, memsz, cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *hdmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // non-pinned latency
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(ddata, hdata, 1, cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *hdlatusec = runtime * 1.0e6 / ((double) LATENCYITER);


  // pinned bandwidth
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(ddata, phdata, memsz, cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *phdmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // pinned latency
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(ddata, phdata, 1, cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *phdlatusec = runtime * 1.0e6 / ((double) LATENCYITER);


  //
  // Device to host timings
  //

  // non-pinned bandwidth
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(hdata, ddata, memsz, cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *dhmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // non-pinned latency
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(hdata, ddata, 1, cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *dhlatusec = runtime * 1.0e6 / ((double) LATENCYITER);


  // pinned bandwidth
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(phdata, ddata, memsz, cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *pdhmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // pinned latency
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(phdata, ddata, 1, cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR // check and clear any existing errors
  runtime = wkf_timer_time(timer);
  *pdhlatusec = runtime * 1.0e6 / ((double) LATENCYITER);


  cudaFree(ddata);
  CUERR // check and clear any existing errors
  cudaFreeHost(phdata);
  CUERR // check and clear any existing errors
  free(hdata);

  wkf_timer_destroy(timer);

  return 0;
}
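
#if 0
// Illustrative only (not compiled): a minimal sketch of how the outputs
// of cudabusbw() might be gathered and reported for a single device.
// The function and variable names here are placeholders.
static void example_print_busbw(void) {
  double hd, hdl, phd, phdl, dh, dhl, pdh, pdhl;
  if (cudabusbw(0, &hd, &hdl, &phd, &phdl, &dh, &dhl, &pdh, &pdhl) == 0) {
    printf("H->D %.1f MB/s, %.2f us latency (pinned: %.1f MB/s, %.2f us)\n",
           hd, hdl, phd, phdl);
    printf("D->H %.1f MB/s, %.2f us latency (pinned: %.1f MB/s, %.2f us)\n",
           dh, dhl, pdh, pdhl);
  }
}
#endif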

typedef struct {
  int deviceid;
  double hdmbsec;
  double hdlatusec;
  double phdmbsec;
  double phdlatusec;
  double dhmbsec;
  double dhlatusec;
  double pdhmbsec;
  double pdhlatusec;
} busbwthrparms;

static void * cudabusbwthread(void *voidparms) {
  busbwthrparms *parms = (busbwthrparms *) voidparms;
  cudabusbw(parms->deviceid,
            &parms->hdmbsec, &parms->hdlatusec,
            &parms->phdmbsec, &parms->phdlatusec,
            &parms->dhmbsec, &parms->dhlatusec,
            &parms->pdhmbsec, &parms->pdhlatusec);
  return NULL;
}

int vmd_cuda_bus_bw(int numdevs, int *devlist,
                    double *hdmbsec, double *hdlatusec,
                    double *phdmbsec, double *phdlatusec,
                    double *dhmbsec, double *dhlatusec,
                    double *pdhmbsec, double *pdhlatusec) {
  busbwthrparms *parms;
  wkf_thread_t * threads;
  int i;

  /* allocate array of threads */
  threads = (wkf_thread_t *) calloc(numdevs, sizeof(wkf_thread_t));

  /* allocate and initialize array of thread parameters */
  parms = (busbwthrparms *) malloc(numdevs * sizeof(busbwthrparms));
  for (i=0; i<numdevs; i++) {
    if (devlist != NULL)
      parms[i].deviceid = devlist[i];
    else
      parms[i].deviceid = i;
    parms[i].hdmbsec = 0.0;
    parms[i].hdlatusec = 0.0;
    parms[i].phdmbsec = 0.0;
    parms[i].phdlatusec = 0.0;
    parms[i].dhmbsec = 0.0;
    parms[i].dhlatusec = 0.0;
    parms[i].pdhmbsec = 0.0;
    parms[i].pdhlatusec = 0.0;
  }

#if defined(VMDTHREADS)
  /* spawn child threads to do the work */
  /* thread 0 must also be processed this way otherwise */
  /* we'll permanently bind the main thread to some device */
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudabusbwthread, &parms[i]);
  }

  /* join the threads after work is done */
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  /* single thread does all of the work */
  cudabusbwthread((void *) &parms[0]);
#endif

  for (i=0; i<numdevs; i++) {
    hdmbsec[i] = parms[i].hdmbsec;
    hdlatusec[i] = parms[i].hdlatusec;
    phdmbsec[i] = parms[i].phdmbsec;
    phdlatusec[i] = parms[i].phdlatusec;
    dhmbsec[i] = parms[i].dhmbsec;
    dhlatusec[i] = parms[i].dhlatusec;
    pdhmbsec[i] = parms[i].pdhmbsec;
    pdhlatusec[i] = parms[i].pdhlatusec;
  }

  /* free thread parms */
  free(parms);
  free(threads);

  return 0;
}


//
// GPU device global memory bandwidth benchmark
//
template <class T>
__global__ void gpuglobmemcpybw(T *dest, const T *src) {
  const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
  dest[idx] = src[idx];
}

template <class T>
__global__ void gpuglobmemsetbw(T *dest, const T val) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  dest[idx] = val;
}
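
// A note on the element type chosen below: benchmarking with a 16-byte
// vector type (float4) lets each thread issue one wide load/store, which
// typically saturates global memory bandwidth with fewer threads than
// scalar float traffic would need.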
typedef float4 datatype;

static int cudaglobmembw(int cudadev, double *gpumemsetgbsec, double *gpumemcpygbsec) {
  int i;
  int len = 1 << 22; // one thread per data element
  int loops = 500;
  datatype *src, *dest;
  datatype val=make_float4(1.0f, 1.0f, 1.0f, 1.0f);

  // initialize to zero for starters
  float memsettime = 0.0f;
  float memcpytime = 0.0f;
  *gpumemsetgbsec = 0.0;
  *gpumemcpygbsec = 0.0;

  // attach to the selected device
  cudaError_t rc;
  rc = cudaSetDevice(cudadev);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError(); // query last error and reset error state
    if (rc != cudaErrorSetOnActiveProcess)
      return -1; // abort and return an error
#else
    cudaGetLastError(); // just ignore and reset error state, since older CUDA
                        // revs don't have a cudaErrorSetOnActiveProcess enum
#endif
  }

  cudaMalloc((void **) &src, sizeof(datatype)*len);
  CUERR
  cudaMalloc((void **) &dest, sizeof(datatype)*len);
  CUERR

  dim3 BSz(256, 1, 1);
  dim3 GSz(len / (BSz.x * BSz.y * BSz.z), 1, 1);

  // do a warm-up pass
  gpuglobmemsetbw<datatype><<< GSz, BSz >>>(src, val);
  CUERR
  gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
  CUERR
  gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
  CUERR

  cudaEvent_t start, end;
  cudaEventCreate(&start);
  cudaEventCreate(&end);

  // execute the memset kernel
  cudaEventRecord(start, 0);
  for (i=0; i<loops; i++) {
    gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
  }
  CUERR
  cudaEventRecord(end, 0);
  CUERR
  cudaEventSynchronize(start);
  CUERR
  cudaEventSynchronize(end);
  CUERR
  cudaEventElapsedTime(&memsettime, start, end);
  CUERR

  // execute the memcpy kernel
  cudaEventRecord(start, 0);
  for (i=0; i<loops; i++) {
    gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
  }
  cudaEventRecord(end, 0);
  CUERR
  cudaEventSynchronize(start);
  CUERR
  cudaEventSynchronize(end);
  CUERR
  cudaEventElapsedTime(&memcpytime, start, end);
  CUERR

  cudaEventDestroy(start);
  CUERR
  cudaEventDestroy(end);
  CUERR
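
  // For reference (illustrative arithmetic): len = 2^22 float4 elements,
  // so each memset pass writes 4M x 16 bytes = 64 MiB, and each memcpy
  // pass moves twice that (a read plus a write). cudaEventElapsedTime()
  // reports milliseconds, so the MiB/ms ratios below are numerically
  // close to GB/sec.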
  *gpumemsetgbsec = (len * sizeof(datatype) / (1024.0 * 1024.0)) / (memsettime / loops);
  *gpumemcpygbsec = (2 * len * sizeof(datatype) / (1024.0 * 1024.0)) / (memcpytime / loops);
  cudaFree(dest);
  cudaFree(src);
  CUERR

  return 0;
}

typedef struct {
  int deviceid;
  double memsetgbsec;
  double memcpygbsec;
} globmembwthrparms;

static void * cudaglobmembwthread(void *voidparms) {
  globmembwthrparms *parms = (globmembwthrparms *) voidparms;
  cudaglobmembw(parms->deviceid, &parms->memsetgbsec, &parms->memcpygbsec);
  return NULL;
}

int vmd_cuda_globmem_bw(int numdevs, int *devlist,
                        double *memsetgbsec, double *memcpygbsec) {
  globmembwthrparms *parms;
  wkf_thread_t * threads;
  int i;

  /* allocate array of threads */
  threads = (wkf_thread_t *) calloc(numdevs, sizeof(wkf_thread_t));

  /* allocate and initialize array of thread parameters */
  parms = (globmembwthrparms *) malloc(numdevs * sizeof(globmembwthrparms));
  for (i=0; i<numdevs; i++) {
    if (devlist != NULL)
      parms[i].deviceid = devlist[i];
    else
      parms[i].deviceid = i;
    parms[i].memsetgbsec = 0.0;
    parms[i].memcpygbsec = 0.0;
  }

#if defined(VMDTHREADS)
  /* spawn child threads to do the work */
  /* thread 0 must also be processed this way otherwise */
  /* we'll permanently bind the main thread to some device */
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudaglobmembwthread, &parms[i]);
  }

  /* join the threads after work is done */
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  /* single thread does all of the work */
  cudaglobmembwthread((void *) &parms[0]);
#endif

  for (i=0; i<numdevs; i++) {
    memsetgbsec[i] = parms[i].memsetgbsec;
    memcpygbsec[i] = parms[i].memcpygbsec;
  }

  /* free thread parms */
  free(parms);
  free(threads);

  return 0;
}


//
// Benchmark latency for complete threadpool barrier wakeup/run/sleep cycle
//
static void * vmddevpoollatencythread(void *voidparms) {
  return NULL;
}

static void * vmddevpooltilelatencythread(void *voidparms) {
  int threadid=-1;
  int tilesize=1;
  void *parms=NULL;
  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);

  // grind through task tiles until none are left
  wkf_tasktile_t tile;
  while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
    // do nothing but eat work units...
  }

  return NULL;
}


// no-op kernel for timing kernel launches
__global__ static void nopkernel(float * ddata) {
  unsigned int xindex = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int yindex = blockIdx.y * blockDim.y + threadIdx.y;
  unsigned int outaddr = gridDim.x * blockDim.x * yindex + xindex;

  if (ddata != NULL)
    ddata[outaddr] = outaddr;
}

// empty kernel for timing kernel launches
__global__ static void voidkernel(void) {
  return;
}

static void * vmddevpoolcudatilelatencythread(void *voidparms) {
  int threadid=-1;
  int tilesize=1;
  float *parms=NULL;
  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);

  // XXX Note that we expect parms to be set to NULL or a valid CUDA
  // global memory pointer for correct operation of the NOP kernel below
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);

#if 0
  // scale tile size by device performance
  tilesize=4; // GTX 280, Tesla C1060 starting point tile size
  wkf_threadpool_worker_devscaletile(voidparms, &tilesize);
#endif

  // grind through task tiles until none are left
  wkf_tasktile_t tile;
  dim3 Gsz(1,1,1);
  dim3 Bsz(8,8,1);
  while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
    // launch a no-op CUDA kernel
    nopkernel<<<Gsz, Bsz, 0>>>(parms);
  }

  // wait for all GPU kernels to complete
  cudaDeviceSynchronize();

  return NULL;
}
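
// vmd_cuda_devpool_latency() below reports five averaged latency metrics:
//   kernlaunchlatency -- a bare CUDA kernel launch (voidkernel)
//   barlatency        -- a raw thread pool barrier sync
//   cyclelatency      -- a full pool wakeup/run/sleep cycle (no-op worker)
//   tilelatency       -- the same cycle while workers consume sched tiles
//   kernellatency     -- the same cycle with a NOP CUDA kernel per tile
// (summary derived from the code that follows)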
int vmd_cuda_devpool_latency(wkf_threadpool_t *devpool, int tilesize,
                             double *kernlaunchlatency,
                             double *barlatency,
                             double *cyclelatency,
                             double *tilelatency,
                             double *kernellatency) {
  int i;
  wkf_tasktile_t tile;
  wkf_timerhandle timer;
  int loopcount;

  timer=wkf_timer_create();

  // execute just a CUDA kernel launch and measure latency on whatever
  // GPU we get.
  loopcount = 15000;
  dim3 VGsz(1,1,1);
  dim3 VBsz(8,8,1);
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    voidkernel<<<VGsz, VBsz, 0>>>();
  }
  // wait for GPU kernels to complete
  cudaDeviceSynchronize();
  wkf_timer_stop(timer);
  *kernlaunchlatency = wkf_timer_time(timer) / ((double) loopcount);

  // execute just a raw barrier sync and measure latency
  loopcount = 15000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    wkf_threadpool_wait(devpool);
  }
  wkf_timer_stop(timer);
  *barlatency = wkf_timer_time(timer) / ((double) loopcount);

  // time wake-up, launch, and sleep/join of device pool doing a no-op
  loopcount = 5000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=0;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpoollatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *cyclelatency = wkf_timer_time(timer) / ((double) loopcount);

  // time wake-up, launch, and sleep/join of device pool eating tiles
  loopcount = 5000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=tilesize;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpooltilelatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *tilelatency = wkf_timer_time(timer) / ((double) loopcount);

  // time wake-up, launch, and sleep/join of device pool eating tiles,
  // with a no-op CUDA kernel launched per tile
  loopcount = 2000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=tilesize;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpoolcudatilelatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *kernellatency = wkf_timer_time(timer) / ((double) loopcount);

  wkf_timer_destroy(timer);

#if 1
  vmd_cuda_measure_latencies(devpool);
#endif

  return 0;
}


//
// Benchmark CUDA kernel launch and memory copy latencies in isolation
//
typedef struct {
  int deviceid;
  int testloops;
  double kernlatency;
  double bcopylatency;
  double kbseqlatency;
} latthrparms;

static void * vmddevpoolcudalatencythread(void *voidparms) {
  int threadid=-1;
  latthrparms *parms=NULL;

  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
  if (parms->deviceid == threadid) {
    wkf_timerhandle timer;
    timer=wkf_timer_create();
    printf("Thread/device %d running...\n", threadid);
    cudaStream_t devstream;
    cudaStreamCreate(&devstream);

    char *hostbuf = (char *) calloc(1, 65536 * sizeof(char));
    char *gpubuf = NULL;
    cudaMalloc((void**)&gpubuf, 65536 * sizeof(char));

    dim3 Gsz(1,1,1);
    dim3 Bsz(8,8,1);

    // measure back-to-back NULL kernel launches
    wkf_timer_start(timer);
    int i;
    for (i=0; i<parms->testloops; i++) {
      // launch a no-op CUDA kernel
      nopkernel<<<Gsz, Bsz, 0, devstream>>>(NULL);
    }
    // wait for all GPU kernels to complete
    cudaStreamSynchronize(devstream);
    wkf_timer_stop(timer);
    parms->kernlatency = 1000000 * wkf_timer_time(timer) / ((double) parms->testloops);

    // measure back-to-back round-trip 1-byte memcpy latencies
    wkf_timer_start(timer);
    for (i=0; i<parms->testloops; i++) {
      cudaMemcpyAsync(gpubuf, hostbuf, 1, cudaMemcpyHostToDevice, devstream);
      cudaMemcpyAsync(hostbuf, gpubuf, 1, cudaMemcpyDeviceToHost, devstream);
    }
    // wait for all outstanding copies to complete
    cudaStreamSynchronize(devstream);
    wkf_timer_stop(timer);
    // store the copy latency separately so it doesn't clobber the
    // kernel launch latency measured above
    parms->bcopylatency = 1000000 * wkf_timer_time(timer) / ((double) parms->testloops);

    printf("NULL kernel launch latency (usec): %.2f\n", parms->kernlatency);
    printf("1-byte round-trip copy latency (usec): %.2f\n", parms->bcopylatency);

    cudaStreamDestroy(devstream);
    cudaFree(gpubuf);
    free(hostbuf);
    wkf_timer_destroy(timer);
  }

  return NULL;
}


int vmd_cuda_measure_latencies(wkf_threadpool_t *devpool) {
  latthrparms thrparms;
  int workers = wkf_threadpool_get_workercount(devpool);
  int i;
  printf("vmd_cuda_measure_latencies()...\n");
  for (i=0; i<workers; i++) {
    memset(&thrparms, 0, sizeof(thrparms));
    thrparms.deviceid = i;
    thrparms.testloops = 2500;
    wkf_threadpool_launch(devpool, vmddevpoolcudalatencythread, &thrparms, 1);
  }

  return 0;
}


#if defined(VMDUSECUDAGDS)
typedef struct {
  int nfiles;
  const char **trjfileset;
  jshandle **jshandles;
  CUfileHandle_t *cfh;
  int devcount;
  int natoms;
  const AtomSel *sel;
  int first;
  int last;
  int step;
} gpuoocbenchthreadparms;

#define VMDGDSMAXFRAMEBUF 8
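
// Environment variables honored by gpu_ooc_bench_thread(), summarized
// from the code below:
//   VMDGDSUSEHOST        -- do host fio reads + cudaMemcpy instead of cuFile
//   VMDGDSHOSTNOPIN      -- use unpinned (malloc'd) host I/O buffers
//   VMDGDSHOSTBUFFEREDIO -- use buffered host I/O rather than direct I/O
//   VMDGDSMULTIFRAME     -- frames per read, capped at VMDGDSMAXFRAMEBUF
//   VMDGDSFILESTRATEGY   -- frame-to-file assignment scheme
//   VMDGDSVERBOSE        -- enable per-read diagnostic output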
static void * gpu_ooc_bench_thread(void *voidparms) {
  int threadid, numthreads;
  gpuoocbenchthreadparms *parms = NULL;
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
  wkf_threadpool_worker_getid(voidparms, &threadid, &numthreads);

  //
  // copy in per-thread parameters
  //
  int nfiles = parms->nfiles;
  int natoms = parms->natoms;
  const AtomSel *sel = parms->sel;
  int first = parms->first;
  int last = parms->last;
  int step = parms->step;

  int usecufile = 1;
  fio_fd *hostfds = NULL;
  int pinhostiobuffer = 1;
  if (getenv("VMDGDSHOSTNOPIN")) {
    pinhostiobuffer=0;
  }

  if (getenv("VMDGDSUSEHOST")) {
    usecufile=0;
    hostfds = (fio_fd *) calloc(1, nfiles * sizeof(fio_fd));

    int hostusedirectio = 1;
    if (getenv("VMDGDSHOSTBUFFEREDIO") != NULL)
      hostusedirectio = 0;

    int openmode = FIO_READ;
    if (hostusedirectio)
      openmode |= FIO_DIRECT;

    int i;
    for (i=0; i<nfiles; i++) {
      if (fio_open(parms->trjfileset[i], openmode, &hostfds[i]) < 0) {
        if (hostusedirectio) {
          printf("Thr[%d] direct I/O unavailable or can't open file '%s'\n",
                 threadid, parms->trjfileset[i]);
        } else {
          printf("Thr[%d] can't open file '%s'\n",
                 threadid, parms->trjfileset[i]);
        }
        return NULL;
      }
    }
  }

  if (hostfds && usecufile) {
    printf("Inconsistent cufile/hostfds state, aborting!\n");
    return NULL;
  }

  /* ensure we have a large enough allocation so we can align */
  /* the starting pointer to a blocksz page boundary */
  long blocksz = MOLFILE_DIRECTIO_MIN_BLOCK_SIZE;
  long sz = 3L*sizeof(float)*natoms + blocksz;

  /* pad the allocation to an even multiple of the block size */
  size_t blockpadsz = (sz + (blocksz - 1)) & (~(blocksz - 1));
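
  /* For reference (illustrative): with natoms = 1000 and a 4096-byte
   * block size, sz = 3*4*1000 + 4096 = 16096, and the mask arithmetic
   * above rounds that up to blockpadsz = 16384, the next 4 KiB multiple. */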

  int framecount = (last - first + 1) / step;
  int framesperfile = framecount / nfiles;

  if (threadid == 0) {
    printf("Thr[%2d] %d frames total, natoms: %d  selected: %d\n",
           threadid, framecount, natoms, sel->selected);
    printf("Thr[%2d] %d frames/file\n", threadid, framesperfile);
  }

  cudaError_t crc;
  cudaStream_t oocstream;
  float *devptr=NULL;
  float *hostptr=NULL;
  float *hostptr_unaligned=NULL;

  float *crdx1=NULL, *crdy1=NULL, *crdz1=NULL;
  float *crdx2=NULL, *crdy2=NULL, *crdz2=NULL;
  int multiframeio = 0;
  if (getenv("VMDGDSMULTIFRAME"))
    multiframeio = atoi(getenv("VMDGDSMULTIFRAME"));
  if (multiframeio > VMDGDSMAXFRAMEBUF)
    multiframeio = VMDGDSMAXFRAMEBUF;

  // set block sizes and counts for I/O bench calcs
  dim3 IOBsz = dim3(256, 1, 1);
  dim3 IOGsz = dim3((natoms + IOBsz.x - 1) / IOBsz.x, 1, 1);

  if (parms->devcount > 0) {
    long gpuallocsz = (VMDGDSMAXFRAMEBUF+1) * blockpadsz;

    if (threadid == 0) {
      printf("Thr[%2d] Allocating GPU timestep I/O buf: %ld\n",
             threadid, gpuallocsz);
    }
    crc = cudaMalloc((void**) &devptr, gpuallocsz);

    if (hostfds != NULL) {
      if (pinhostiobuffer) {
        crc = cudaMallocHost((void**) &hostptr, gpuallocsz);
      } else {
        hostptr = (float *) alloc_aligned_ptr(gpuallocsz, 4096,
                                              (void**) &hostptr_unaligned);
        if (!hostptr) {
          printf("Thr[%d]: Failed allocation!\n", threadid);
          return NULL;
        }
      }
    }

    long crdsz = sel->selected * sizeof(float);

    // atomic coord buffers
    crc = cudaMalloc((void**) &crdx1, crdsz);
    crc = cudaMalloc((void**) &crdy1, crdsz);
    crc = cudaMalloc((void**) &crdz1, crdsz);
    crc = cudaMalloc((void**) &crdx2, crdsz);
    crc = cudaMalloc((void**) &crdy2, crdsz);
    crc = cudaMalloc((void**) &crdz2, crdsz);
    if (crc != cudaSuccess) {
      printf("Thr[%2d], Failed to allocate GPU buffer!\n", threadid);
      return NULL; // XXX error handling needs to be done here
    }

    cudaStreamCreate(&oocstream);

#if defined(VMDUSECUDAGDS)
    cuFileBufRegister(devptr, gpuallocsz, 0);
#endif
  }

  int verbose = (getenv("VMDGDSVERBOSE") != NULL) ? 1 : 0;

  int filestrategy = 0;
  if (getenv("VMDGDSFILESTRATEGY")) {
    filestrategy = atoi(getenv("VMDGDSFILESTRATEGY"));
  }
  if (threadid == 0) {
    printf("Thr[%2d] file strategy set to: %d\n", threadid, filestrategy);
  }
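
  // A rough summary of the VMDGDSFILESTRATEGY modes handled by the switch
  // in the loop below: 0 (default) statically maps each thread group to a
  // file, 1 rotates across files per multi-frame group, 2 rotates per
  // multi-frame group scaled by threads-per-GPU, and 3 maps files by
  // threadid/4.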

  wkf_tasktile_t tile;
  while (wkf_threadlaunch_next_tile(voidparms, VMDGDSMAXFRAMEBUF * 1, &tile) != WKF_SCHED_DONE) {
    //
    // simple I/O + compute benchmarking...
    //
    int idx;
    int threadspergpu;
    if (parms->devcount > 0)
      threadspergpu = numthreads / parms->devcount;
    else
      threadspergpu = 1;

    for (idx=tile.start; idx<tile.end; idx++) {
      int myfileidx, fileframeidx;

      switch (filestrategy) {
        case 1:
          myfileidx = (idx / multiframeio) % nfiles;
          fileframeidx = idx % framesperfile;
          break;

        case 2:
          myfileidx = (idx / (multiframeio * threadspergpu)) % nfiles;
          fileframeidx = idx % framesperfile;
          break;

        case 3:
          myfileidx = (threadid / 4) % nfiles;
          fileframeidx = idx % framesperfile;
          break;

        case 0:
        default:
          myfileidx = (threadid / threadspergpu) % nfiles;
          fileframeidx = idx % framesperfile;
          break;
      }

      //
      // compute multi-frame or single-frame I/O offsets and sizes
      //
      long startoffset, foffset, readlen;
      read_js_timestep_index_offsets(parms->jshandles[myfileidx],
                                     natoms, fileframeidx, 0, natoms, NULL,
                                     &startoffset, &foffset, &readlen);
      if (multiframeio) {
        // multi-frame reads use the same starting offset, but the
        // read length is computed from the first and last frames
        // in the group.
        long multistartoffset, multifoffset, multireadlen;
        read_js_timestep_index_offsets(parms->jshandles[myfileidx], natoms,
                                       fileframeidx+multiframeio-1,
                                       0, natoms, NULL,
                                       &multistartoffset, &multifoffset,
                                       &multireadlen);

        multireadlen = (multifoffset + multireadlen) - foffset;

        //printf("** readlen: %ld  multireadlen: %ld\n", readlen, multireadlen);
        readlen = multireadlen;
        idx+=multiframeio-1; // add in the required increment...
      }
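
      // The two I/O paths below move data differently: cuFileRead() DMAs
      // file contents directly into the GPU buffer (GPUDirect Storage),
      // while the host fallback stages data into hostptr with fio_fread()
      // and then copies it to the device with cudaMemcpy().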
      //
      // perform the required I/O via GDS or by host kernel I/O
      //
      long ret=0;
      if (usecufile) {
        ret = cuFileRead(parms->cfh[myfileidx], (char *) devptr, readlen, foffset, 0);
      } else if (hostfds) {
        foffset=0;
        ret=fio_fseek(hostfds[myfileidx], foffset, FIO_SEEK_SET);
        if (ret<0) { printf("fio_fseek() error!\n"); return NULL; }
        ret=fio_fread(hostptr, readlen, 1, hostfds[myfileidx]);
        if (ret<0) { printf("fio_fread() error!\n"); return NULL; }
        cudaMemcpy(devptr, hostptr, readlen, cudaMemcpyHostToDevice);
      } else {
        printf("Inconsistent cufile/hostfds state, aborting!\n");
        return NULL;
      }

      // handle errors if they have occurred
      if (ret < 0) {
        printf("Thr[%2d] Error: cuFileRead(): %ld\n", threadid, ret);
        return NULL; // XXX error handling needs to be done here
      }

      if (verbose) {
        printf("Thr[%2d]F[%d][tile: %d to %d] frame: %d  cuFile len: %ld  off: %ld\n",
               threadid, myfileidx, tile.start, tile.end, idx,
               readlen, foffset);
      }
    }
  }

  cudaFree(crdx1);
  cudaFree(crdy1);
  cudaFree(crdz1);
  cudaFree(crdx2);
  cudaFree(crdy2);
  cudaFree(crdz2);

#if defined(VMDUSECUDAGDS)
  if (usecufile) {
    cuFileBufDeregister(devptr);
  }
#endif

  if (hostfds != NULL) {
    int i;
    for (i=0; i<nfiles; i++) {
      fio_fclose(hostfds[i]);
    }
    free(hostfds);
  }

  if (hostptr != NULL) {
    if (pinhostiobuffer) {
      cudaFreeHost(hostptr);
    } else {
      free(hostptr_unaligned);
    }
  }

  return NULL;
}

#endif


int gpu_ooc_bench(wkf_threadpool_t *devpool, // VMD GPU worker thread pool
                  int nfiles, const char **trjfileset, const AtomSel *sel,
                  int first, int last, int step) {
  printf("gpu_ooc_bench()\n");
  wkf_threadpool_t *bigpool = NULL;

#if defined(VMDUSECUDAGDS)
  int devcount;
  cudaError_t crc = cudaGetDeviceCount(&devcount);
  printf("gpu_ooc_bench) GPU device count: %d\n", devcount);
  if (devcount==0)
    printf("gpu_ooc_bench) No GPU devices, continuing with host only...\n");

  CUfileHandle_t * cfh = (CUfileHandle_t *) calloc(1, nfiles * sizeof(CUfileHandle_t));
  CUfileDescr_t * cfhdesc = (CUfileDescr_t *) calloc(1, nfiles * sizeof(CUfileDescr_t));
  memset(&cfh[0], 0, nfiles * sizeof(CUfileHandle_t));
  memset(&cfhdesc[0], 0, nfiles * sizeof(CUfileDescr_t));

  int natomschk = 0;
  jshandle **jshandles = (jshandle **) calloc(1, nfiles * sizeof(jshandle *));
  fio_fd *directio_fds = (fio_fd *) calloc(1, nfiles * sizeof(fio_fd));

  int i;
  for (i=0; i<nfiles; i++) {
    const char *filename = trjfileset[i];
    printf("gpu_ooc_bench) File[%d] GDS setup, opening '%s'\n", i, filename);
    jshandles[i] = (jshandle *) open_js_read(filename, "js", &natomschk);
    if (!jshandles[i]) {
      printf("gpu_ooc_bench) File[%d] open_js_read failed for file %s\n", i, filename);
      return -1; // deal with error handling later
    }

#if vmdplugin_ABIVERSION > 17
    long blocksz = MOLFILE_DIRECTIO_MIN_BLOCK_SIZE;
    int filepgalignsz = 1;
    read_js_timestep_pagealign_size(jshandles[i], &filepgalignsz);
    if (filepgalignsz != blocksz) {
      printf("gpu_ooc_bench) File[%d] Plugin-returned page alignment size mismatch!\n", i);
    } else {
      printf("gpu_ooc_bench) File[%d] Page alignment size: %d\n", i, filepgalignsz);
    }
#endif
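
    // Direct I/O (and GDS) generally requires file offsets and transfer
    // sizes aligned to the filesystem block size, which is why a plugin
    // page-alignment mismatch is flagged above.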
    read_js_timestep_index_offsets(jshandles[i], natomschk, 0, 0, 0,
                                   &directio_fds[i], NULL, NULL, NULL);

    cfhdesc[i].handle.fd = directio_fds[i]; // typedef of Unix FD
    cfhdesc[i].type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    CUfileError_t cferr = cuFileHandleRegister(&cfh[i], &cfhdesc[i]);

    if (cferr.err != CU_FILE_SUCCESS) {
      printf("gpu_ooc_bench) File[%d] cuFileHandleRegister on fd %d failed!\n",
             i, cfhdesc[i].handle.fd);
      return -1; // XXX error handling needs to be done here
    }
  }


  //
  // copy in per-thread parameters
  //
  gpuoocbenchthreadparms parms;
  memset(&parms, 0, sizeof(parms));
  parms.devcount = devcount;
  parms.nfiles = nfiles;
  parms.trjfileset = trjfileset;
  parms.jshandles = jshandles;
  parms.cfh = cfh;
  parms.natoms = sel->num_atoms;
  parms.sel = sel;
  parms.first = first;
  parms.last = last;
  parms.step = step;

  int framecount = nfiles * (last / step);

  // create timers
  wkf_timerhandle timer;
  timer=wkf_timer_create();

  // spawn child threads to do the work
  wkf_tasktile_t tile;
  tile.start=0;
  tile.end=framecount - 1; // schedule the whole frame range as one set of tiles

  printf("gpu_ooc_bench) tile start: %d  end: %d\n", tile.start, tile.end);

  int gdsthreadspergpu = 1;
  if (getenv("VMDGDSTHREADSPERGPU") != NULL)
    gdsthreadspergpu = atoi(getenv("VMDGDSTHREADSPERGPU"));

  printf("gpu_ooc_bench) gdsthreadspergpu: %d\n", gdsthreadspergpu);
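
  // Running more than one worker thread per GPU keeps multiple reads in
  // flight per device, which can help hide I/O latency at the cost of
  // added contention; that is the purpose of the oversized pool below.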
  if (gdsthreadspergpu > 1) {
    // XXX extra-large GPU device thread pool
    int workercount = devcount * gdsthreadspergpu;

    int *devlist = new int[workercount];
    int k;
    for (k=0; k<workercount; k++) {
      devlist[k] = k / gdsthreadspergpu; // XXX ignores VMD CUDA device masks
    }

    msgInfo << "Creating Multi-worker ("
            << gdsthreadspergpu << " per-GPU) CUDA device pool..." << sendmsg;
    bigpool=wkf_threadpool_create(workercount, devlist);
    delete [] devlist;

    // associate each worker thread with a specific GPU
    if (getenv("VMDCUDAVERBOSE") != NULL)
      wkf_threadpool_launch(bigpool, vmd_cuda_devpool_setdeviceonly, (void*)"VMD CUDA Dev Init", 1);
    else
      wkf_threadpool_launch(bigpool, vmd_cuda_devpool_setdeviceonly, NULL, 1);

    // clear all available device memory on each of the GPUs
    wkf_threadpool_launch(bigpool, vmd_cuda_devpool_clear_device_mem, NULL, 1);

    // XXX override which GPU device pool we're going to use
    devpool = bigpool;
  }

  // XXX affinitize GPU worker threads for best perf
  wkf_threadpool_launch(devpool, vmd_cuda_affinitize_threads, NULL, 1);

  wkf_threadpool_sched_dynamic(devpool, &tile);
  wkf_timer_start(timer);
  wkf_threadpool_launch(devpool, gpu_ooc_bench_thread, &parms, 1);
  wkf_timer_stop(timer);

  double runtime = wkf_timer_time(timer);
  double gbytes = sel->num_atoms * 12L * (tile.end+1) / (1024.0 * 1024.0 * 1024.0);

  printf("gpu_ooc_bench) natoms: %d, fsz: %ld, tsz: %ld\n",
         sel->num_atoms, sel->num_atoms * 12L,
         sel->num_atoms * 12L * (tile.end+1));

  int pinhostiobuffer = 1;
  if (getenv("VMDGDSHOSTNOPIN"))
    pinhostiobuffer=0;

  int hostusedirectio = 1;
  if (getenv("VMDGDSHOSTBUFFEREDIO") != NULL)
    hostusedirectio = 0;

  int usecufile=1;
  if (getenv("VMDGDSUSEHOST"))
    usecufile=0;

  if (usecufile) {
    printf("OOC I/O via GDS + cuFile\n");
  } else {
    printf("OOC I/O via host, %s APIs, %s memory buffers\n",
           (hostusedirectio) ? "Direct I/O" : "Buffered I/O",
           (pinhostiobuffer) ? "pinned" : "unpinned");
  }

  int multiframeio = 0;
  if (getenv("VMDGDSMULTIFRAME"))
    multiframeio = atoi(getenv("VMDGDSMULTIFRAME"));
  if (multiframeio > VMDGDSMAXFRAMEBUF)
    multiframeio = VMDGDSMAXFRAMEBUF;
  if (multiframeio) {
    printf("GDS multi-frame read opt: %d frames per call, %ld bytes\n",
           multiframeio,
           multiframeio * sel->num_atoms * 12L);
  }

  printf("OOC runtime: %.1f, %.2fGB/sec\n", runtime, gbytes/runtime);

  for (i=0; i<nfiles; i++) {
#if defined(VMDUSECUDAGDS)
    cuFileHandleDeregister(cfh[i]);
#endif
    close_js_read(jshandles[i]);
  }
#endif

#if defined(VMDUSECUDAGDS)
  if (cfh != NULL)
    free(cfh);

  if (cfhdesc != NULL)
    free(cfhdesc);

  if (jshandles != NULL)
    free(jshandles);

  if (directio_fds != NULL)
    free(directio_fds);
#endif

  // if we created an extra large-thread-count-per-GPU thread pool, we
  // need to destroy it here...
  if (bigpool != NULL)
    wkf_threadpool_destroy(bigpool);

  return 0;
}