/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: CUDAVolCPotential.cu,v $
 *      $Author: johns $        $Locker:  $                $State: Exp $
 *      $Revision: 1.46 $       $Date: 2020/02/26 20:16:56 $
 *
 ***************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

#include "Inform.h"
#include "utilities.h"
#include "WKFThreads.h"
#include "WKFUtils.h"
#include "CUDAKernels.h"

typedef struct {
  float* atoms;
  float* grideners;
  long int numplane;
  long int numcol;
  long int numpt;
  long int natoms;
  float gridspacing;
} enthrparms;

/* thread prototype */
static void * cudaenergythread(void *);

#if 1
#define CUERR { cudaError_t err; \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
    printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
    printf("Thread aborting...\n"); \
    return NULL; }}
#else
#define CUERR
#endif

// max constant buffer size is 64KB, minus whatever
// the CUDA runtime and compiler are using that we don't know about
// At 16 bytes/atom, 4000 atoms is about the max we can store in
// the constant buffer.
#define MAXATOMS 4000
__constant__ static float4 atominfo[MAXATOMS];


//
// The CUDA kernels calculate coulombic potential at each grid point and
// store the results in the output array.
//
// These versions of the code use the 64KB constant buffer area reloaded
// for each group of MAXATOMS atoms, until the contributions from all
// atoms have been summed into the potential grid.
//
// These implementations use precomputed and unrolled loops of
// (dy^2 + dz^2) values for increased FP arithmetic intensity.
// The X coordinate portion of the loop is unrolled by four or eight,
// allowing the same dy^2 + dz^2 values to be reused multiple times,
// increasing the ratio of FP arithmetic relative to FP loads, and
// eliminating some redundant calculations.
//

//
// Tuned global memory coalescing version, unrolled in X
//
// Benchmark for this version: 291 GFLOPS, 39.5 billion atom evals/sec
//   (Test system: GeForce 8800GTX)
//

#if 1

//
// Tunings for large potential map dimensions (e.g. 384x384x...)
//
// NVCC -cubin says this implementation uses 20 regs
//
#define UNROLLX     8
#define UNROLLY     1
#define BLOCKSIZEX  8  // make large enough to allow coalesced global mem ops
#define BLOCKSIZEY  8  // make as small as possible for finer granularity

#else

//
// Tunings for small potential map dimensions (e.g. 128x128x...)
//
// NVCC -cubin says this implementation uses 16 regs
//
#define UNROLLX     4
#define UNROLLY     1
#define BLOCKSIZEX 16  // make large enough to allow coalesced global mem ops
#define BLOCKSIZEY 16  // make as small as possible for finer granularity

#endif

#define BLOCKSIZE (BLOCKSIZEX*BLOCKSIZEY)

// FLOP counting
#if UNROLLX == 8
#define FLOPSPERATOMEVAL (59.0/8.0)
#elif UNROLLX == 4
#define FLOPSPERATOMEVAL (31.0/4.0)
#endif

__global__ static void cenergy(int numatoms, float gridspacing, float * energygrid) {
  unsigned int xindex  = blockIdx.x * blockDim.x * UNROLLX + threadIdx.x;
  unsigned int yindex  = blockIdx.y * blockDim.y + threadIdx.y;
  unsigned int outaddr = gridDim.x * blockDim.x * UNROLLX * yindex + xindex;

  float coory = gridspacing * yindex;
  float coorx = gridspacing * xindex;

  float energyvalx1=0.0f;
  float energyvalx2=0.0f;
  float energyvalx3=0.0f;
  float energyvalx4=0.0f;
#if UNROLLX > 4
  float energyvalx5=0.0f;
  float energyvalx6=0.0f;
  float energyvalx7=0.0f;
  float energyvalx8=0.0f;
#endif

  float gridspacing_coalesce = gridspacing * BLOCKSIZEX;

  // NOTE: One beneficial side effect of summing groups of 4,000 atoms
  // at a time in multiple passes is that these smaller summation
  // groupings are likely to result in higher precision, since structures
  // are typically stored with atoms in chain order leading to high
  // spatial locality within the groups of 4,000 atoms.  Kernels that
  // sum these subgroups independently of the global sum for each voxel
  // should achieve relatively good floating point precision since large values
  // will tend to be summed with large values, and small values summed with
  // small values, etc.
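
  // The X-unrolling below gives each thread UNROLLX partial sums for grid
  // points spaced BLOCKSIZEX apart in x: the per-atom (dy^2 + dz^2) term
  // (dz is squared on the host and stored in atominfo[].z) is computed once
  // and reused for all UNROLLX distance evaluations, and each of the UNROLLX
  // global memory updates at the end of the kernel remains coalesced across
  // the thread block.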
  int atomid;
  for (atomid=0; atomid<numatoms; atomid++) {
    float dy = coory - atominfo[atomid].y;
    float dyz2 = (dy * dy) + atominfo[atomid].z;

    float dx1 = coorx - atominfo[atomid].x;
    float dx2 = dx1 + gridspacing_coalesce;
    float dx3 = dx2 + gridspacing_coalesce;
    float dx4 = dx3 + gridspacing_coalesce;
#if UNROLLX > 4
    float dx5 = dx4 + gridspacing_coalesce;
    float dx6 = dx5 + gridspacing_coalesce;
    float dx7 = dx6 + gridspacing_coalesce;
    float dx8 = dx7 + gridspacing_coalesce;
#endif

    energyvalx1 += atominfo[atomid].w * rsqrtf(dx1*dx1 + dyz2);
    energyvalx2 += atominfo[atomid].w * rsqrtf(dx2*dx2 + dyz2);
    energyvalx3 += atominfo[atomid].w * rsqrtf(dx3*dx3 + dyz2);
    energyvalx4 += atominfo[atomid].w * rsqrtf(dx4*dx4 + dyz2);
#if UNROLLX > 4
    energyvalx5 += atominfo[atomid].w * rsqrtf(dx5*dx5 + dyz2);
    energyvalx6 += atominfo[atomid].w * rsqrtf(dx6*dx6 + dyz2);
    energyvalx7 += atominfo[atomid].w * rsqrtf(dx7*dx7 + dyz2);
    energyvalx8 += atominfo[atomid].w * rsqrtf(dx8*dx8 + dyz2);
#endif
  }

  energygrid[outaddr             ] += energyvalx1;
  energygrid[outaddr+1*BLOCKSIZEX] += energyvalx2;
  energygrid[outaddr+2*BLOCKSIZEX] += energyvalx3;
  energygrid[outaddr+3*BLOCKSIZEX] += energyvalx4;
#if UNROLLX > 4
  energygrid[outaddr+4*BLOCKSIZEX] += energyvalx5;
  energygrid[outaddr+5*BLOCKSIZEX] += energyvalx6;
  energygrid[outaddr+6*BLOCKSIZEX] += energyvalx7;
  energygrid[outaddr+7*BLOCKSIZEX] += energyvalx8;
#endif
}


// required GPU array padding to match thread block size
// XXX note: this code requires block size dimensions to be a power of two
#define TILESIZEX (BLOCKSIZEX*UNROLLX)
#define TILESIZEY (BLOCKSIZEY*UNROLLY)
#define GPU_X_ALIGNMASK (TILESIZEX - 1)
#define GPU_Y_ALIGNMASK (TILESIZEY - 1)

static int copyatomstoconstbuf(const float *atoms, float *atompre,
                               int count, float zplane) {
  if (count > MAXATOMS) {
    printf("Atom count exceeds constant buffer storage capacity\n");
    return -1;
  }

  int i;
  for (i=0; i<count*4; i+=4) {
    atompre[i    ] = atoms[i    ];
    atompre[i + 1] = atoms[i + 1];
    float dz = zplane - atoms[i + 2];
    atompre[i + 2] = dz*dz;
    atompre[i + 3] = atoms[i + 3];
  }

  cudaMemcpyToSymbol(atominfo, atompre, count * 4 * sizeof(float), 0);

  return 0;
}

int vmd_cuda_vol_cpotential(long int natoms, float* atoms, float* grideners,
                            long int numplane, long int numcol, long int numpt,
                            float gridspacing) {
  enthrparms parms;
  wkf_timerhandle globaltimer;
  double totalruntime;
  int rc=0;

  int numprocs = wkf_thread_numprocessors();
  int deviceCount = 0;
  if (vmd_cuda_num_devices(&deviceCount))
    return -1;
  if (deviceCount < 1)
    return -1;

  /* take the lesser of the number of CPUs and GPUs */
  /* and execute that many threads                  */
  if (deviceCount < numprocs) {
    numprocs = deviceCount;
  }

  printf("Using %d CUDA GPUs\n", numprocs);
  printf("GPU padded grid size: %ld x %ld x %ld\n",
    (numpt  + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK),
    (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK),
    numplane);
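
  // Padding math (worked example added for clarity): GPU_X_ALIGNMASK rounds
  // each row up to the next multiple of TILESIZEX = BLOCKSIZEX*UNROLLX.
  // With the large-map tunings (8*8 = 64), a 100-point row pads to
  // (100 + 63) & ~63 = 128, while a 384-point row is already aligned.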
  parms.atoms = atoms;
  parms.grideners = grideners;
  parms.numplane = numplane;
  parms.numcol = numcol;
  parms.numpt = numpt;
  parms.natoms = natoms;
  parms.gridspacing = gridspacing;

  globaltimer = wkf_timer_create();
  wkf_timer_start(globaltimer);

  /* spawn child threads to do the work */
  wkf_tasktile_t tile;
  tile.start=0;
  tile.end=numplane;
  rc = wkf_threadlaunch(numprocs, &parms, cudaenergythread, &tile);

  // Measure GFLOPS
  wkf_timer_stop(globaltimer);
  totalruntime = wkf_timer_time(globaltimer);
  wkf_timer_destroy(globaltimer);

  if (!rc) {
    double atomevalssec = ((double) numplane * numcol * numpt * natoms) / (totalruntime * 1000000000.0);
    printf(" %g billion atom evals/second, %g GFLOPS\n",
           atomevalssec, atomevalssec * FLOPSPERATOMEVAL);
  } else {
    msgWarn << "A GPU encountered an unrecoverable error." << sendmsg;
    msgWarn << "Calculation will continue using the main CPU." << sendmsg;
  }
  return rc;
}


static void * cudaenergythread(void *voidparms) {
  dim3 volsize, Gsz, Bsz;
  float *devenergy = NULL;
  float *hostenergy = NULL;
  float *atomprebuf = NULL;
  enthrparms *parms = NULL;
  int threadid=0;
  int atomprebufpinned = 0; // try to use pinned/page-locked atom buffer

  wkf_threadlaunch_getid(voidparms, &threadid, NULL);
  wkf_threadlaunch_getdata(voidparms, (void **) &parms);

  /*
   * copy in per-thread parameters
   */
  const float *atoms = parms->atoms;
  float* grideners = parms->grideners;
  const long int numplane = parms->numplane;
  const long int numcol = parms->numcol;
  const long int numpt = parms->numpt;
  const long int natoms = parms->natoms;
  const float gridspacing = parms->gridspacing;
  double lasttime, totaltime;

  cudaError_t rc;
  rc = cudaSetDevice(threadid);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError(); // query last error and reset error state
    if (rc != cudaErrorSetOnActiveProcess)
      return NULL; // abort and return an error
#else
    cudaGetLastError(); // just ignore and reset error state, since older CUDA
                        // revs don't have a cudaErrorSetOnActiveProcess enum
#endif
  }

  // setup energy grid size, padding out arrays for peak GPU memory performance
  volsize.x = (numpt  + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
  volsize.y = (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
  volsize.z = 1;      // we only do one plane at a time

  // setup CUDA grid and block sizes
  Bsz.x = BLOCKSIZEX;
  Bsz.y = BLOCKSIZEY;
  Bsz.z = 1;
  Gsz.x = volsize.x / (Bsz.x * UNROLLX);
  Gsz.y = volsize.y / (Bsz.y * UNROLLY);
  Gsz.z = 1;
  int volmemsz = sizeof(float) * volsize.x * volsize.y * volsize.z;

  printf("Thread %d started for CUDA device %d...\n", threadid, threadid);
  wkf_timerhandle timer = wkf_timer_create();
  wkf_timer_start(timer);
  wkfmsgtimer * msgt = wkf_msg_timer_create(5);
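
  // Note on the launch geometry set up above (illustrative numbers added for
  // clarity, not from the original source): with the large-map tunings, a
  // padded 384x384 plane gives Gsz.x = 384 / (8*8) = 6 and Gsz.y = 384 / 8 = 48,
  // i.e. a 6x48 grid of 8x8-thread blocks, with each thread accumulating
  // UNROLLX=8 x-points per plane row.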

  // Allocate DMA buffers with some extra padding at the end so that
  // multiple GPUs aren't DMAing too close to each other, for NUMA..
#define DMABUFPADSIZE (32 * 1024)

  // allocate atom pre-computation and copy buffer in pinned memory
  // for better host/GPU transfer bandwidth, when enabled
  if (atomprebufpinned) {
    // allocate GPU DMA buffer (with padding)
    if (cudaMallocHost((void**) &atomprebuf, MAXATOMS*4*sizeof(float) + DMABUFPADSIZE) != cudaSuccess) {
      printf("Pinned atom copy buffer allocation failed!\n");
      atomprebufpinned=0;
    }
  }

  // if a pinned allocation failed or we choose not to use
  // pinned memory, fall back to a normal malloc here.
  if (!atomprebufpinned) {
    // allocate GPU DMA buffer (with padding)
    atomprebuf = (float *) malloc(MAXATOMS * 4 * sizeof(float) + DMABUFPADSIZE);
    if (atomprebuf == NULL) {
      printf("Atom copy buffer allocation failed!\n");
      return NULL;
    }
  }


  // allocate and initialize the GPU output array
  cudaMalloc((void**)&devenergy, volmemsz);
  CUERR // check and clear any existing errors

  hostenergy = (float *) malloc(volmemsz); // allocate working buffer

  // For each point in the cube...
  int iterations=0;
  int computedplanes=0;
  wkf_tasktile_t tile;
  while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
    int k;
    for (k=tile.start; k<tile.end; k++) {
      int y;
      int atomstart;
      float zplane = k * (float) gridspacing;
      computedplanes++; // track work done by this GPU for progress reporting

      // Copy energy grid into GPU 16-element padded input
      for (y=0; y<numcol; y++) {
        long eneraddr = k*numcol*numpt + y*numpt;
        memcpy(&hostenergy[y*volsize.x], &grideners[eneraddr], numpt * sizeof(float));
      }

      // Copy the Host input data to the GPU..
      cudaMemcpy(devenergy, hostenergy, volmemsz, cudaMemcpyHostToDevice);
      CUERR // check and clear any existing errors

      lasttime = wkf_timer_timenow(timer);
      for (atomstart=0; atomstart<natoms; atomstart+=MAXATOMS) {
        iterations++;
        int runatoms;
        int atomsremaining = natoms - atomstart;
        if (atomsremaining > MAXATOMS)
          runatoms = MAXATOMS;
        else
          runatoms = atomsremaining;

        // copy the next group of atoms to the GPU
        if (copyatomstoconstbuf(atoms + 4*atomstart, atomprebuf, runatoms, zplane))
          return NULL;

        // RUN the kernel...
        cenergy<<<Gsz, Bsz, 0>>>(runatoms, gridspacing, devenergy);
        CUERR // check and clear any existing errors
      }

      // Copy the GPU output data back to the host and use/store it..
      cudaMemcpy(hostenergy, devenergy, volmemsz, cudaMemcpyDeviceToHost);
      CUERR // check and clear any existing errors

      // Copy GPU blocksize padded array back down to the original size
      for (y=0; y<numcol; y++) {
        long eneraddr = k*numcol*numpt + y*numpt;
        memcpy(&grideners[eneraddr], &hostenergy[y*volsize.x], numpt * sizeof(float));
      }

      totaltime = wkf_timer_timenow(timer);
      if (wkf_msg_timer_timeout(msgt)) {
        // XXX: we have to use printf here as msgInfo is not thread-safe yet.
        printf("thread[%d] plane %d/%ld (%d computed) time %.2f, elapsed %.1f, est. total: %.1f\n",
               threadid, k, numplane, computedplanes,
               totaltime - lasttime, totaltime,
               totaltime * numplane / (k+1));
      }
    }
  }

  wkf_timer_destroy(timer);    // free timer
  wkf_msg_timer_destroy(msgt); // free timer
  free(hostenergy);            // free working buffer
  cudaFree(devenergy);         // free CUDA memory buffer
  CUERR // check and clear any existing errors

  // free pinned GPU copy buffer
  if (atomprebufpinned) {
    if (cudaFreeHost(atomprebuf) != cudaSuccess) {
      printf("Pinned atom buffer deallocation failed!\n");
      return NULL;
    }
  } else {
    free(atomprebuf);
  }

  return NULL;
}