/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: CUDAMeasureRDF.cu,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.28 $       $Date: 2020/02/26 20:16:56 $
 *
 ***************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <cuda.h>

#include "Inform.h"
#include "utilities.h"
#include "WKFThreads.h"
#include "WKFUtils.h"
#include "CUDAKernels.h"

//
// the parameters below are important to tuning the performance of the code
//
#if 0
// original (slow?) default values for all GPU types
// no hardware support for atomic operations, so the shared memory
// tagging scheme is used.
#define NBLOCK 128        // size of an RDF data block
#define MAXBIN 64         // maximum number of bins in a histogram
#elif 1
// optimal values for compute 1.0 (G80/G9x) devices
// no hardware support for atomic operations, so the shared memory
// tagging scheme is used.
#define NBLOCK 32         // size of an RDF data block
#define MAXBIN 1024       // maximum number of bins in a histogram
#elif 1
// optimal values for compute 1.3 (GT200) devices
// uses atomic operations to increment histogram bins
#define NBLOCK 320        // size of an RDF data block
#define MAXBIN 3072       // maximum number of bins in a histogram
#define __USEATOMIC 1     // enable atomic increment operations
#elif 1
// optimal values for compute 2.0 (Fermi) devices
// uses atomic operations to increment histogram bins
// uses Fermi's 3x larger shared memory (48KB) to reduce the number
// of passes required for computation of very large histograms
#define NBLOCK 896        // size of an RDF data block
#define MAXBIN 8192       // maximum number of bins in a histogram
#define __USEATOMIC 1     // enable atomic increment operations
#endif

#define NCUDABLOCKS 256   // the number of blocks to divide the shared memory
                          // dimension into.  Performance increases up to ~8
                          // and then levels off

#define NBLOCKHIST 64     // the number of threads used in the final histogram
                          // kernel.  The summation of the histograms isn't
                          // a bottleneck, so this isn't a critical choice

#define NCONSTBLOCK 5440  // the number of atoms in constant memory.
                          // Can be 4096 (or larger) up to capacity limits.

#define THREADSPERWARP 32 // this is correct for G80/GT200/Fermi GPUs
#define WARP_LOG_SIZE 5   // this is also correct for G80/GT200/Fermi GPUs

#define BIN_OVERFLOW_LIMIT 2147483648 // maximum value a bin can store before
                                      // we need to accumulate and start over
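// Illustrative arithmetic (assuming the active compute-1.0 settings above):
// calculate_rdf() stages 3*NBLOCK floats of coordinates plus its shared
// memory histograms per thread block.  Without __USEATOMIC that is
//   (NBLOCK*MAXBIN/THREADSPERWARP)*sizeof(unsigned int) = 1024*4 = 4 KB
// of per-warp histograms plus 3*32*4 = 384 bytes of coordinate staging,
// which fits comfortably in the 16 KB of shared memory on G80-class
// hardware.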
// This routine prepares data for the calculation of the histogram.
// It zeros out the instances of the histograms in global memory.
__global__ void init_hist(unsigned int* llhistg, // histograms
                          int maxbin)            // number of bins per histogram
{

#ifndef __USEATOMIC
  int io;                // index of histogram instances within a block
#endif

  int iblock;            // index of data blocks

  unsigned int *llhist;  // this pointer will point to the beginning of
                         // each thread's instance of the histogram

  int nofblocks;         // nofblocks is the number of full data blocks
  int nleftover;         // nleftover is the dimension of the block left over
                         // after all full data blocks
  int maxloop;           // the loop condition maximum

  // Initialize llhist to point to an instance of the histogram for each
  // block (atomic path) or warp (tagging path), then zero out the integer
  // histogram copies in global memory.  The floating point histogram
  // (histdev) is zeroed separately by init_hist_f below.
#ifdef __USEATOMIC
  llhist=llhistg+blockIdx.x*maxbin+threadIdx.x;
  nofblocks=maxbin/NBLOCK;
  nleftover=maxbin-nofblocks*NBLOCK;
  maxloop=nofblocks*NBLOCK;

  for (iblock=0; iblock < maxloop; iblock+=NBLOCK) {
    llhist[iblock]=0UL;
  }
  // take care of leftovers
  if (threadIdx.x < nleftover) {
    llhist[iblock]=0UL;
  }

#else
  llhist=llhistg+(((blockIdx.x)*NBLOCK)/THREADSPERWARP)*maxbin+threadIdx.x;
  nofblocks=maxbin/NBLOCK;
  nleftover=maxbin-nofblocks*NBLOCK;
  maxloop=nofblocks*NBLOCK;
  int maxloop2=(NBLOCK / THREADSPERWARP)*maxbin;

  for (io=0; io < maxloop2; io+=maxbin) {
    for (iblock=0; iblock < maxloop; iblock+=NBLOCK) {
      llhist[io + iblock]=0UL;
    }
    // take care of leftovers
    if (threadIdx.x < nleftover) {
      llhist[io + iblock]=0UL;
    }
  }
#endif

  return;
}


/* This routine prepares data for the calculation of the histogram.
 * It zeros out the floating point histogram in global memory.
 */
__global__ void init_hist_f(float* histdev) {
  histdev[threadIdx.x]=0.0f;
  return;
}
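
// A minimal sketch (illustrative, not part of the build) of the histogram
// instance layout that init_hist() assumes: one instance per thread block
// when atomics are available, one per warp otherwise; first_hist_instance
// is a hypothetical helper name.
#if 0
static int first_hist_instance(int blockidx) {
#ifdef __USEATOMIC
  return blockidx;                             // one instance per block
#else
  return (blockidx * NBLOCK) / THREADSPERWARP; // first of the
                                               // NBLOCK/THREADSPERWARP warp
                                               // instances this block zeroes
#endif
}
#endif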
/* This routine prepares data for the calculation of the histogram.
 * It recenters the atoms into a single periodic unit cell.
 * It also chooses positions for the padding atoms such that
 * they don't contribute to the histogram.
 */
__global__ void reimage_xyz(float* xyz,      // XYZ data in global memory
                            int natomsi,     // number of atoms
                            int natomsipad,  // # atoms including padding atoms
                            float3 celld,    // the cell size of each frame
                            float rmax)      // dist. used to space padding atoms
{
  int iblock;  // index of data blocks

  __shared__ float xyzi[3*NBLOCK]; // this array holds xyz data for the
                                   // current data block

  int nofblocks;     // the number of full data blocks this thread
                     // block must process

  float *pxi;        // this pointer points to the portion of xyz currently
                     // being processed

  int threetimesid;  // three times threadIdx.x

  float rtmp;        // a temporary distance used in creating padding atoms

  int ifinalblock;   // the index of the final block

  int loopmax;       // maximum for the loop counter

  // initialize nofblocks, pxi, and threetimesid
  nofblocks=((natomsipad/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
  loopmax=nofblocks*3*NCUDABLOCKS*NBLOCK;
  ifinalblock=(natomsipad / NBLOCK - 1) % NCUDABLOCKS;
  pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
  threetimesid=3*threadIdx.x;

  // shift all atoms into the same unit cell anchored at the origin
  for (iblock=0; iblock<loopmax; iblock+=3*NCUDABLOCKS*NBLOCK) {
    __syncthreads();
    // these reads from global memory should be coalesced
    xyzi[threadIdx.x         ]=pxi[iblock         ];
    xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
    xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
    __syncthreads();

    // Shift the xyz coordinates so that 0 <= xyz < celld.
    // The sign fixup is necessary because of the less than convenient
    // behavior of fmodf for negative xyz values.
    xyzi[threetimesid  ]=fmodf(xyzi[threetimesid  ], celld.x);
    if (xyzi[threetimesid  ] < 0.0f) {
      xyzi[threetimesid  ] += celld.x;
    }
    xyzi[threetimesid+1]=fmodf(xyzi[threetimesid+1], celld.y);
    if (xyzi[threetimesid+1] < 0.0f) {
      xyzi[threetimesid+1] += celld.y;
    }
    xyzi[threetimesid+2]=fmodf(xyzi[threetimesid+2], celld.z);
    if (xyzi[threetimesid+2] < 0.0f) {
      xyzi[threetimesid+2] += celld.z;
    }

    // if this is the final block then we pad
    if (iblock==loopmax-3*NCUDABLOCKS*NBLOCK && blockIdx.x==ifinalblock) {
      // pad the xyz coordinates with atoms which are spaced out such that
      // they cannot contribute to the histogram.  Note that these atoms
      // are NOT reimaged.
      if ((blockDim.x-threadIdx.x) <= (natomsipad - natomsi)) {
        rtmp = -((float)(threadIdx.x+1))*rmax;
        xyzi[threetimesid  ] = rtmp;
        xyzi[threetimesid+1] = rtmp;
        xyzi[threetimesid+2] = rtmp;
      }
    }

    __syncthreads();

    // store the xyz values back to global memory; these stores are coalesced
    pxi[iblock         ]=xyzi[threadIdx.x         ];
    pxi[iblock+  NBLOCK]=xyzi[threadIdx.x+  NBLOCK];
    pxi[iblock+2*NBLOCK]=xyzi[threadIdx.x+2*NBLOCK];

    // increment pxi to the next block of global memory to be processed
    //pxi=pxi+3*NCUDABLOCKS*NBLOCK;
  }
}
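
// Worked example (illustrative) of the sign fixup in reimage_xyz() above:
// fmodf() keeps the sign of its first argument, so for a cell of length 10,
//   fmodf(12.5f, 10.0f) ==  2.5f             (already in [0, 10))
//   fmodf(-1.5f, 10.0f) == -1.5f  -> +10.0f ->  8.5f  (now in [0, 10))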
// Shift the "phantom" padding atoms so they don't interfere in non-periodic
// calculations.
__global__ void phantom_xyz(float* xyz,      // XYZ data in global memory
                            int natomsi,     // number of atoms
                            int natomsipad,  // # atoms including padding atoms
                            float minxyz,    // offset subtracted to push the
                                             // padding atoms far from the data
                            float rmax)      // dist. used to space padding atoms
{
  int iblock;  // index of data blocks

  __shared__ float xyzi[3*NBLOCK]; // this array holds xyz data for the
                                   // current data block

  int nofblocks;     // the number of full data blocks this thread
                     // block must process

  float *pxi;        // this pointer points to the portion of xyz currently
                     // being processed

  int threetimesid;  // three times threadIdx.x

  float rtmp;        // a temporary distance used in creating padding atoms

  int ifinalblock;   // the index of the final block

  int loopmax;       // maximum for the loop counter

  // initialize nofblocks, pxi, and threetimesid
  nofblocks=((natomsipad/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
  loopmax=nofblocks*3*NCUDABLOCKS*NBLOCK;
  ifinalblock=(natomsipad / NBLOCK - 1) % NCUDABLOCKS;
  pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
  threetimesid=3*threadIdx.x;

  // loop over all data blocks assigned to this thread block
  for (iblock=0; iblock<loopmax; iblock+=3*NCUDABLOCKS*NBLOCK) {
    __syncthreads();
    // these reads from global memory should be coalesced
    xyzi[threadIdx.x         ]=pxi[iblock         ];
    xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
    xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
    __syncthreads();

    // if this is the final block then we pad
    if (iblock==loopmax-3*NCUDABLOCKS*NBLOCK && blockIdx.x==ifinalblock) {
      // pad the xyz coordinates with atoms which are spaced out such that
      // they cannot contribute to the histogram.  Note that these atoms
      // are NOT reimaged.
      if ((blockDim.x-threadIdx.x) <= (natomsipad - natomsi)) {
        rtmp = -((float)(threadIdx.x+1))*rmax-minxyz;
        xyzi[threetimesid  ] = rtmp;
        xyzi[threetimesid+1] = rtmp;
        xyzi[threetimesid+2] = rtmp;
      }
    }

    __syncthreads();

    // store the xyz values back to global memory; these stores are coalesced
    pxi[iblock         ]=xyzi[threadIdx.x         ];
    pxi[iblock+  NBLOCK]=xyzi[threadIdx.x+  NBLOCK];
    pxi[iblock+2*NBLOCK]=xyzi[threadIdx.x+2*NBLOCK];

    // increment pxi to the next block of global memory to be processed
    //pxi=pxi+3*NCUDABLOCKS*NBLOCK;
  }
}


/* This kernel calculates a radial distribution function between two
 * selections.  One selection is stored in its entirety in global memory,
 * while the other is staged in pieces through constant memory.  This
 * routine is called to calculate the rdf between all of selection 1 and
 * the portion of selection 2 currently in constant memory.
 * Each atom of selection 1 is associated with its own thread.  To minimize
 * accesses to global memory, selection 1 is divided into chunks of NBLOCK
 * elements.  These chunks are loaded into shared memory simultaneously,
 * and then all possible pairs between those atoms and the atoms in
 * constant memory are computed.
 */
__constant__ static float xyzj[3*NCONSTBLOCK]; // this array stores the
                           // coordinates of selection 2 in constant memory

// this routine is called from the host to move coordinate data to constant mem
void copycoordstoconstbuff(int natoms, const float* xyzh) {
  cudaMemcpyToSymbol(xyzj, xyzh, 3*natoms*sizeof(float));
}
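
// A minimal host-side sketch (illustrative, not part of the build) of how
// rdf_thread() below streams selection 2 through constant memory in
// NCONSTBLOCK-atom chunks; stream_sel2_chunks is a hypothetical helper name.
#if 0
static void stream_sel2_chunks(const float *xyz2, int natoms2) {
  int nconstblocks = natoms2 / NCONSTBLOCK;            // # full chunks
  int nleftover = natoms2 - nconstblocks*NCONSTBLOCK;  // partial final chunk
  for (int i=0; i<=nconstblocks; i++) {
    int blocksz = (i == nconstblocks) ? nleftover : NCONSTBLOCK;
    if (blocksz > 0) {
      copycoordstoconstbuff(blocksz, xyz2 + 3*NCONSTBLOCK*i);
      // ... launch calculate_rdf() against this chunk ...
    }
  }
}
#endif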
// This subroutine is based on a similar routine in the CUDA SDK histogram256
// example.  It adds a count to the histogram in such a way that there are no
// collisions.  If several threads need to update the same element, a tag
// applied to the highest bits of that element is used to determine which
// threads have already updated it.  For devices with compute capability 1.2
// or higher this is not necessary, as atomicAdd can be used instead.

#ifndef __USEATOMIC
__device__ void addData(volatile unsigned int *s_WarpHist, // the first element
                                        // of the histogram to be operated on
                        unsigned int data,      // the bin to add a count to
                        unsigned int threadTag) // tag of the current thread
{
  unsigned int count;  // the count in the bin being operated on

  do {
    count = s_WarpHist[data] & 0x07FFFFFFUL;
    count = threadTag | (count + 1);
    s_WarpHist[data] = count;
  } while (s_WarpHist[data] != count);
}
#endif
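
// Illustrative sketch of the tag layout used by addData() above: each of the
// THREADSPERWARP lanes owns a distinct WARP_LOG_SIZE-bit tag in the top bits
// of a bin, leaving 27 bits (mask 0x07FFFFFF) for the count.  e.g. lane 3:
//   threadTag = 3u << (32 - WARP_LOG_SIZE) == 0x18000000
// A lane's write survives only if no other lane in the warp wrote the same
// bin after it, hence the do/while retry loop.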
// This is the routine that does all the real work.  It takes as input the
// various atomic coordinates and produces one rdf per block (or per warp,
// when atomic operations are unavailable), as described above.  These rdfs
// will be summed later by calculate_histogram.
__global__ void calculate_rdf(int usepbc,   // periodic or non-periodic calc.
                              float* xyz,   // atom XYZ coords, gmem
                              int natomsi,  // # of atoms in sel1, gmem
                              int natomsj,  // # of atoms in sel2, cmem
                              float3 celld, // cell size of each frame
                              unsigned int* llhistg, // histograms, gmem
                              int nbins,    // # of bins in each histogram
                              float rmin,   // minimum distance for histogram
                              float delr_inv) // recip. width of histogram bins
{
  int io;     // loop index used in the zeroing and store loops
  int iblock; // the block of selection 1 currently being processed

#ifdef __USEATOMIC
  unsigned int *llhists1;  // a pointer to the beginning of the part of
                           // llhists operated on by this warp

  __shared__ unsigned int llhists[MAXBIN]; // holds 1 histogram per block
#else
  volatile unsigned int *llhists1; // a pointer to the beginning of the part
                                   // of llhists operated on by this warp

  volatile __shared__ unsigned int llhists[(NBLOCK*MAXBIN)/THREADSPERWARP];
                                   // this array holds 1 histogram per warp
#endif

  __shared__ float xyzi[3*NBLOCK]; // coords for the portion of sel1
                                   // being processed, in shared memory

  float rxij, rxij2, rij; // registers holding intermediate values in
                          // calculating the distance between two atoms

  int ibin,nofblocksi;    // loop counters and upper limits for some loops
  int nleftover;          // # bins left over after full blocks are processed
  float *pxi;             // pointer to the portion of sel1 being processed
  unsigned int joffset;   // the current offset in constant memory
  unsigned int loopmax, loopmax2;  // the limits for several loops
  float rmin2=0.0001f/delr_inv;    // tiny cutoff used to reject self-pairs

#ifndef __USEATOMIC
  // this thread tag is needed for the histogramming method implemented in
  // addData above
  const unsigned int threadTag = ((unsigned int)
                                  ( threadIdx.x & (THREADSPERWARP - 1)))
                                 << (32 - WARP_LOG_SIZE);
#endif

  // Initialize llhists1.  If atomics are used, then each block has its own
  // copy of the histogram in shared memory, which will be stored to its own
  // place in global memory.  If atomics aren't used, then we need 1 histogram
  // per warp.  The number of blocks is also set appropriately so that the
  // histograms may be zeroed out.
#ifdef __USEATOMIC
  llhists1=llhists;
  nofblocksi=nbins/NBLOCK;
  nleftover=nbins-nofblocksi*NBLOCK;
#else
  llhists1=llhists+(threadIdx.x/THREADSPERWARP)*MAXBIN;
  nofblocksi=nbins/THREADSPERWARP;
  nleftover=nbins-nofblocksi*THREADSPERWARP;
#endif

  // Here we also zero out the shared memory used in this routine.
#ifdef __USEATOMIC
  loopmax = nofblocksi*NBLOCK;
  for (io=0; io<loopmax; io+=NBLOCK) {
    llhists1[io + threadIdx.x]=0UL;
  }
  if (threadIdx.x < nleftover) {
    llhists1[io + threadIdx.x]=0UL;
  }

#else
  loopmax = nofblocksi*THREADSPERWARP;
  for (io=0; io<loopmax; io+=THREADSPERWARP) {
    llhists1[io + (threadIdx.x & (THREADSPERWARP - 1))]=0UL;
  }
  if ((threadIdx.x & (THREADSPERWARP - 1)) < nleftover) {
    llhists1[io + (threadIdx.x & (THREADSPERWARP - 1))]=0UL;
  }
#endif

  // initialize nofblocksi and pxi
  nofblocksi=((natomsi/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
  pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;

  loopmax = 3 * natomsj;
  int idxt3 = 3 * threadIdx.x;
  int idxt3p1 = idxt3+1;
  int idxt3p2 = idxt3+2;

  loopmax2 = nofblocksi*3*NCUDABLOCKS*NBLOCK;
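
  // Illustrative note: because reimage_xyz() has already wrapped all
  // coordinates into [0, celld), the periodic branch below can form the
  // minimum-image separation per component as
  //   dx = min(|xi - xj|, celld.x - |xi - xj|)
  // e.g. xi=1, xj=9 in a cell of length 10: |xi - xj| = 8, 10 - 8 = 2, so
  // the minimum-image distance component is 2.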
  // loop over all atoms in constant memory, using either a
  // periodic or non-periodic particle pair distance calculation
  if (usepbc) {
    for (iblock=0; iblock<loopmax2; iblock+=3*NCUDABLOCKS*NBLOCK) {
      __syncthreads();

      // these loads from global memory should be coalesced
      xyzi[threadIdx.x         ]=pxi[iblock         ];
      xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
      xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];

      __syncthreads();

      for (joffset=0; joffset<loopmax; joffset+=3) {
        // calculate the minimum-image distance between two atoms; rxij and
        // rxij2 are reused to minimize the number of registers used
        rxij=fabsf(xyzi[idxt3  ] - xyzj[joffset  ]);
        rxij2=celld.x - rxij;
        rxij=fminf(rxij, rxij2);
        rij=rxij*rxij;
        rxij=fabsf(xyzi[idxt3p1] - xyzj[joffset+1]);
        rxij2=celld.y - rxij;
        rxij=fminf(rxij, rxij2);
        rij+=rxij*rxij;
        rxij=fabsf(xyzi[idxt3p2] - xyzj[joffset+2]);
        rxij2=celld.z - rxij;
        rxij=fminf(rxij, rxij2);
        rij=sqrtf(rij + rxij*rxij);

        // increment the appropriate bin; don't add self-pairs to the
        // zeroth bin
        ibin=__float2int_rd((rij-rmin)*delr_inv);
        if (ibin<nbins && ibin>=0 && rij>rmin2) {
#ifdef __USEATOMIC
          atomicAdd(llhists1+ibin, 1U);
#else
          addData(llhists1, ibin, threadTag);
#endif
        }
      } //joffset
    } //iblock
  } else { // non-periodic
    for (iblock=0; iblock<loopmax2; iblock+=3*NCUDABLOCKS*NBLOCK) {
      __syncthreads();

      // these loads from global memory should be coalesced
      xyzi[threadIdx.x         ]=pxi[iblock         ];
      xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
      xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];

      __syncthreads();

      // loop over all atoms in constant memory
      for (joffset=0; joffset<loopmax; joffset+=3) {
        // calculate the distance between two atoms; rxij is reused
        // to minimize the number of registers used
        rxij=xyzi[idxt3  ] - xyzj[joffset  ];
        rij=rxij*rxij;
        rxij=xyzi[idxt3p1] - xyzj[joffset+1];
        rij+=rxij*rxij;
        rxij=xyzi[idxt3p2] - xyzj[joffset+2];
        rij=sqrtf(rij + rxij*rxij);

        // increment the appropriate bin; don't add self-pairs to the
        // zeroth bin
        ibin=__float2int_rd((rij-rmin)*delr_inv);
        if (ibin<nbins && ibin>=0 && rij>rmin2) {
#ifdef __USEATOMIC
          atomicAdd(llhists1+ibin, 1U);
#else
          addData(llhists1, ibin, threadTag);
#endif
        }
      } //joffset
    } //iblock
  }

  __syncthreads();

  // Below we store the histograms to global memory so that they may be
  // summed in another routine.  Set nofblocksi and nleftover for the
  // bin-store loops.
  nofblocksi=nbins/NBLOCK;
  nleftover=nbins-nofblocksi*NBLOCK;

#ifdef __USEATOMIC
  // reinitialize llhistg1 to point to global memory
  unsigned int *llhistg1=llhistg+blockIdx.x*MAXBIN+threadIdx.x;

  // loop to add all elements to global memory
  loopmax = nofblocksi*NBLOCK;
  for (iblock=0; iblock < loopmax; iblock+=NBLOCK) {
    llhistg1[iblock] += llhists[iblock+threadIdx.x];
  }
  // take care of leftovers
  if (threadIdx.x < nleftover) {
    llhistg1[iblock] += llhists[iblock+threadIdx.x];
  }
#else
  // reinitialize llhistg1 to point to global memory
  unsigned int* llhistg1=llhistg+(((blockIdx.x)*NBLOCK)/THREADSPERWARP)*MAXBIN+threadIdx.x;

  // loop to add all elements to global memory
  loopmax = MAXBIN * (NBLOCK / THREADSPERWARP);
  loopmax2 = nofblocksi * NBLOCK;
  for (io=0; io < loopmax; io+=MAXBIN) {
    for (iblock=0; iblock < loopmax2; iblock+=NBLOCK) {
      llhistg1[io+iblock] += llhists[io+iblock+threadIdx.x];
    }
    // take care of leftovers
    if (threadIdx.x < nleftover) {
      llhistg1[io+iblock] += llhists[io+iblock+threadIdx.x];
    }
  }
#endif

  return;
}


#define ull2float __uint2float_rn
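
// A minimal sketch (illustrative, not compiled) of what calculate_histogram()
// below does to each bin of every histogram instance: strip the warp tag
// kept in the top WARP_LOG_SIZE bits by the non-atomic path, then accumulate
// the count as a float; drain_bin is a hypothetical helper name.
#if 0
static float drain_bin(unsigned int rawbin, float acc) {
  return acc + (float) (rawbin & 0x07FFFFFFUL); // keep the 27 count bits
}
#endif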
/* This routine takes the series of integer histograms in llhistg,
 * strips off any thread tags, converts the counts to floats,
 * and sums them.  The result is accumulated into histdev.
 */
__global__ void calculate_histogram(float* histdev, // histogram to return
                       unsigned int* llhistg, // the histograms stored in
                                    // global memory.  There is one histogram
                                    // per block (warp) if atomicAdds are
                                    // (not) used
                       int nbins)   // # of bins in each histogram
{
  int io;      // index for the inner loop
  int iblock;  // index for the outer loop
  int maxloop; // maximum for the loop condition

  __shared__ unsigned int llhists[NBLOCKHIST]; // block of the int histogram
                                               // used in the final summation

  __shared__ float xi[NBLOCKHIST]; // smem for the floating point histogram

  int nofblocks; // number of data blocks per histogram

  nofblocks=nbins/NBLOCKHIST; // set number of blocks per histogram
  int nleftover=nbins-nofblocks*NBLOCKHIST;
  maxloop = nofblocks*NBLOCKHIST; // set the loop limit

  // loop over all of the blocks in a histogram
  for (iblock=0; iblock<maxloop; iblock+=NBLOCKHIST) {
    xi[threadIdx.x]=0.0f; // zero out xi

    // loop over the histograms created by calculate_rdf
#ifdef __USEATOMIC
    for (io=0; io < MAXBIN*NCUDABLOCKS; io+=MAXBIN) {
#else
    for (io=0; io < MAXBIN * (NCUDABLOCKS*NBLOCK / THREADSPERWARP); io+=MAXBIN) {
#endif
      // load integer histogram data into shared memory (coalesced)
      llhists[threadIdx.x]=llhistg[io+iblock+threadIdx.x];

      // shave off the thread tag that might remain from calculate_rdf
      llhists[threadIdx.x]=llhists[threadIdx.x] & 0x07FFFFFFUL;

      // convert to float
      xi[threadIdx.x]+=ull2float(llhists[threadIdx.x]);
    }

    // ... and accumulate into global memory
    histdev[iblock+threadIdx.x]+=xi[threadIdx.x];
  }

  // take care of leftovers
  if (threadIdx.x < nleftover) {
    xi[threadIdx.x]=0.0f; // zero out xi

    // loop over the histograms created by calculate_rdf
#ifdef __USEATOMIC
    for (io=0; io < MAXBIN*NCUDABLOCKS; io+=MAXBIN) {
#else
    for (io=0; io < MAXBIN * (NCUDABLOCKS*NBLOCK / THREADSPERWARP); io+=MAXBIN) {
#endif
      // load integer histogram data into shared memory (coalesced)
      llhists[threadIdx.x]=llhistg[io+iblock+threadIdx.x];

      // shave off the thread tag that might remain from calculate_rdf
      llhists[threadIdx.x]=llhists[threadIdx.x] & 0x07FFFFFFUL;

      // convert to float
      xi[threadIdx.x]+=ull2float(llhists[threadIdx.x]);
    }

    // ... and accumulate into global memory
    histdev[iblock+threadIdx.x]+=xi[threadIdx.x];
  }

  // calculate_histogram out
  return;
}


/*
 * calculate_histogram_block
 *   Compute one histogram block: zero the integer histograms, bin all
 *   pair distances, then reduce into the float histogram.
 */
void calculate_histogram_block(int usepbc,
                               float *xyz,
                               int natomsi,
                               int natomsj,
                               float3 celld,
                               unsigned int* llhistg,
                               int nbins,
                               float rmin,
                               float delr_inv,
                               float *histdev,
                               int nblockhist) {
  // zero out the histograms in global memory
  init_hist<<<NCUDABLOCKS, NBLOCK>>>(llhistg, MAXBIN);

  // calculate the histograms
  calculate_rdf<<<NCUDABLOCKS, NBLOCK>>>(usepbc, xyz, natomsi, natomsj, celld,
                                         llhistg, nbins, rmin, delr_inv);

  // sum the integer histograms into the floating point histogram
  calculate_histogram<<<1, nblockhist>>>(histdev, llhistg, nbins);
}
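
// Illustrative sketch (an assumption, not code from this file): the float
// histogram produced above still holds raw pair counts; turning it into
// g(r) is left to the caller.  For a bin centered at r with width delr:
//   g(r) = hist[i] * V / (N1 * N2 * 4*pi*r^2 * delr)
// normalize_rdf is a hypothetical helper name.
#if 0
static void normalize_rdf(float *hist, int nbins, float rmin, float delr,
                          float volume, int natoms1, int natoms2) {
  for (int i=0; i<nbins; i++) {
    float r = rmin + (i + 0.5f) * delr;                 // bin-center radius
    float shellvol = 4.0f * 3.14159265f * r * r * delr; // spherical shell vol.
    hist[i] *= volume / ((float) natoms1 * (float) natoms2 * shellvol);
  }
}
#endif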
/*
 * input parameters for the rdf_thread() routine
 */
typedef struct {
  int usepbc;        // periodic or non-periodic calculation
  int natoms1;       // number of atoms in selection 1
  float* xyz;        // coordinates of selection 1, [natoms1][3]
  int natoms2;       // number of atoms in selection 2
  float** xyz2array; // coordinates of selection 2, [natoms2][3] (per-thread)
  float* cell;       // the cell x, y, and z dimensions [3]
  float** histarray; // the final histogram in host mem, [nbins] (per-thread)
  int nbins;         // the number of bins in the histogram
  int nblockhist;    // the size of a block used in the
                     // reduction of the histogram
  float rmin;        // the minimum value of the first bin
  float delr;        // the width of each bin
  int nblocks;       // number of const blocks to compute all-pairs histogram
  int nhblock;       // # MAXBIN-sized blocks needed to calc whole histogram
} rdfthrparms;


/*
 * rdf_thread -- multithreaded version of subroutine_rdf().
 * This routine is called from the CPU to calculate g(r) for a single frame
 * on the GPU.  Right now it performs a brute-force O(N^2) calculation
 * (with all atomic pairs considered).  Future versions may reduce the
 * overall work by performing a grid search.
 */
static void * rdf_thread(void *voidparms) {
  rdfthrparms *parms = NULL;
  int threadid=0;

  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);

#if 0
  printf("rdf thread[%d] launched and running...\n", threadid);
#endif
  cudaSetDevice(threadid);

  /*
   * copy in per-thread parameters
   */
  const int natoms1 = parms->natoms1;
  const float *xyz = parms->xyz;
  const int natoms2 = parms->natoms2;
  const float *cell = parms->cell;
  const int nbins = parms->nbins;
  const int nblockhist = parms->nblockhist;
  const float rmin = parms->rmin;
  const float delr = parms->delr;
  float *xyz2 = parms->xyz2array[threadid];
  float *hist = parms->histarray[threadid];
  const int nhblock = parms->nhblock;
  const int usepbc = parms->usepbc;

  float* xyzdev;          // pointer to xyz data in global memory
  float3 celld;           // cell x, y, and z dimensions
  float* histdev;         // pointer to the float histogram in global memory
  unsigned int* llhistg;  // pointer to the int histograms in global memory

  int nconstblocks;  // # full blocks to be sent to constant memory
  int nleftover;     // # elements in the leftover block of const mem
  int nbigblocks;    // # blocks required to prevent histogram bin overflow
  int ibigblock;     // the index of the current big block
  int nleftoverbig;  // the number of elements in the final big block
  int ntmpbig;       // # atoms of sel1 processed per big block
  int nbinstmp;      // number of bins currently being processed
  int natoms1pad;    // number of atoms in selection 1 after padding
  int natoms2pad;    // number of atoms in selection 2 after padding
  float rmax;        // cutoff radius for atom contributions to histogram

  // natoms1pad is natoms1 rounded up to the next multiple of NBLOCK
  natoms1pad=natoms1;
  if (natoms1%NBLOCK!=0) { natoms1pad = (natoms1 / NBLOCK + 1) * NBLOCK; }
  natoms2pad=natoms2;
  if (natoms2%NBLOCK!=0) { natoms2pad = (natoms2 / NBLOCK + 1) * NBLOCK; }
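
  // Worked example (illustrative): with NBLOCK=32, natoms1=1000 rounds up
  // to natoms1pad=1024, i.e. 24 padding atoms.  reimage_xyz()/phantom_xyz()
  // place those padding atoms at least rmax apart and far from the real
  // data, so they can never land in a histogram bin.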
  // allocate global memory for:

  // the xyz coordinates of selection 1
  cudaMalloc((void**)&xyzdev, 3*max(natoms1pad,natoms2pad)*sizeof(float));

  // the final copy of the histogram
  cudaMalloc((void**)&histdev, nbins*sizeof(float));

  // zero out the floating point histogram
  int ihblock;  // loop counter for histogram blocks
  int maxloop=nbins/nblockhist;
  if (maxloop*nblockhist!=nbins) { maxloop=maxloop+1; }
  for (ihblock=0; ihblock<maxloop; ihblock++) {
    init_hist_f<<<1, nblockhist>>>(histdev+ihblock*nblockhist);
  }

  // the global memory copies of the histogram
#ifdef __USEATOMIC
  cudaMalloc((void**)&llhistg, (NCUDABLOCKS*MAXBIN*sizeof(int)));
#else
  cudaMalloc((void**)&llhistg, (NCUDABLOCKS*NBLOCK*MAXBIN*
                                sizeof(int))/THREADSPERWARP);
#endif

  // set the cell dimensions
  celld.x=cell[0];
  celld.y=cell[1];
  celld.z=cell[2];

  // set rmax.  this is the distance that the padding atoms must be from the
  // unit cell AND from each other
  rmax = 1.1f * (rmin+((float)nbins)*delr);

  // send the second selection to the gpu for reimaging
  cudaMemcpy(xyzdev, xyz2, 3*natoms2*sizeof(float), cudaMemcpyHostToDevice);

  if (usepbc) {
    // reimage atom coords into a single periodic cell
    reimage_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms2,natoms2pad,celld,rmax);
  } else {
    // shift phantom atoms so they don't interfere with the non-periodic calc.
    phantom_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms2,natoms2pad,1.0e38f,rmax);
  }

  // now, unfortunately, we have to move it back because this selection
  // will need to be sent to constant memory (done on the host side)
  cudaMemcpy(xyz2, xyzdev, 3*natoms2*sizeof(float), cudaMemcpyDeviceToHost);

  // send the xyz coords of selection 1 to the gpu
  cudaMemcpy(xyzdev, xyz, 3*natoms1*sizeof(float), cudaMemcpyHostToDevice);

  if (usepbc) {
    // reimage xyz coords back into the periodic box
    reimage_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms1,natoms1pad,celld,rmax);
  } else {
    // shift phantom atoms so they don't interfere with the non-periodic calc.
    phantom_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms1,natoms1pad,1.0e38f,rmax);
  }

  // all blocks except for the final one will have MAXBIN elements
  nbinstmp = MAXBIN;

  // calc # of groups to divide sel2 into to fit it into constant memory
  nconstblocks=natoms2/NCONSTBLOCK;
  nleftover=natoms2-nconstblocks*NCONSTBLOCK;

  // set up the big blocks to avoid bin overflow
  ntmpbig = NBLOCK * ((BIN_OVERFLOW_LIMIT / NCONSTBLOCK) / NBLOCK);
  nbigblocks = natoms1pad / ntmpbig;
  nleftoverbig = natoms1pad - nbigblocks * ntmpbig;
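
  // Worked example (illustrative): with NBLOCK=32 and NCONSTBLOCK=5440,
  //   ntmpbig = 32 * ((2147483648 / 5440) / 32) = 32 * 12336 = 394752
  // so one calculate_histogram_block() pass bins at most
  // ntmpbig * NCONSTBLOCK ~= 2.147e9 <= BIN_OVERFLOW_LIMIT pair distances,
  // and no 32-bit bin can overflow before it is drained into the float
  // histogram.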
  int nbblocks = (nleftoverbig != 0) ? nbigblocks+1 : nbigblocks;

  // parallel loop over all "constblocks"
  wkf_tasktile_t tile; // next batch of work units
  while (wkf_threadpool_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
#if 0
    printf("rdf thread[%d] fetched tile: s: %d e: %d\n", threadid, tile.start, tile.end);
#endif
    int iconstblock = -1;       // loop counter for constblocks, force update
    int blocksz = NCONSTBLOCK;  // # sel2 atoms in the current constblock;
                                // declared outside the work-item loop so the
                                // value persists between work items that
                                // share a constblock
    ihblock = -1;  // initialize to sentinel value to force immediate update
    int workitem;  // current parallel work item
    for (workitem=tile.start; workitem<tile.end; workitem++) {
      // decode work item into ihblock and iconstblock indices
      ihblock = workitem % nhblock;
      int newiconstblock = workitem / nhblock;

      // when moving to the next constblock we must copy in new coordinates
      if (iconstblock != newiconstblock) {
        iconstblock = newiconstblock;

        // take care of the leftovers on the last block
        blocksz = (iconstblock == nconstblocks && nleftover != 0)
                  ? nleftover : NCONSTBLOCK;

        // send a "constblock" of coords from selection 2 to const mem
        copycoordstoconstbuff(blocksz, xyz2+(3*NCONSTBLOCK*iconstblock));
      }

#if 0
      printf("rdf thread[%d] iconstblock: %d ihblock: %d\n", threadid,
             iconstblock, ihblock);
#endif

      // loop over blocks of the histogram:
      // minimum distance for the current histogram block;
      // the first block starts at rmin
      float rmintmp=rmin + (MAXBIN * delr * ihblock);

      nbinstmp = MAXBIN;

      // the final block will have nbins%MAXBIN elements
      if (ihblock==(nhblock-1)) {
        nbinstmp = nbins%MAXBIN;
        // if nbins is an even multiple of MAXBIN, the final block is full
        if (nbinstmp==0) { nbinstmp = MAXBIN; }
      }

      for (ibigblock=0; ibigblock < nbblocks; ibigblock++) {
        // take care of the leftovers on the last block
        int bblocksz = (ibigblock == nbigblocks && nleftoverbig != 0)
                       ? nleftoverbig : ntmpbig;

        calculate_histogram_block(usepbc, (xyzdev+ibigblock*ntmpbig*3),
                                  bblocksz, blocksz, celld,
                                  llhistg, nbinstmp, rmintmp, (1/delr),
                                  histdev+ihblock*MAXBIN, nblockhist);
      }

      cudaDeviceSynchronize();
    }
  } // end of parallel tile fetch while loop

  // retrieve g(r) for this frame
  cudaMemcpy(hist, histdev, nbins*sizeof(float), cudaMemcpyDeviceToHost);

  cudaFree(xyzdev); // free gpu global memory
  cudaFree(histdev);
  cudaFree(llhistg);

#if 0
  printf("thread[%d] finished, releasing resources\n", threadid);
#endif

  return NULL;
}
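
// A minimal sketch (illustrative) of the work-item decoding used inside
// rdf_thread() above; decode_workitem is a hypothetical helper name.
// e.g. with nhblock=2, workitem 5 -> ihblock=1, iconstblock=2, so constant
// memory is reloaded only when the iconstblock half of the index changes.
#if 0
static void decode_workitem(int workitem, int nhblock,
                            int *ihblock, int *iconstblock) {
  *ihblock = workitem % nhblock;     // which MAXBIN-sized histogram window
  *iconstblock = workitem / nhblock; // which constant-memory chunk of sel2
}
#endif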
int rdf_gpu(wkf_threadpool_t *devpool, // VMD GPU worker thread pool
            int usepbc,    // periodic or non-periodic calculation
            int natoms1,   // number of atoms in selection 1
            float *xyz,    // coordinates of selection 1, [natoms1][3]
            int natoms2,   // number of atoms in selection 2
            float *xyz2,   // coordinates of selection 2, [natoms2][3]
            float* cell,   // the cell x, y, and z dimensions [3]
            float* hist,   // the final histogram [nbins]
            int nbins,     // the number of bins in the histogram
            float rmin,    // the minimum value of the first bin
            float delr)    // the width of each bin
{
  int i, ibin;

  if (devpool == NULL) {
    return -1; // abort if no device pool exists
  }
  int numprocs = wkf_threadpool_get_workercount(devpool);

  // multi-thread buffers
  float **xyz2array = (float**) calloc(1, sizeof(float *) * numprocs);
  float **histarray = (float**) calloc(1, sizeof(float *) * numprocs);
  for (i=0; i<numprocs; i++) {
    // allocate the per-thread copy of selection 2 padded out to a full
    // NBLOCK multiple, but copy in only the 3*natoms2 valid coordinates
    xyz2array[i] = (float*) calloc(1, sizeof(float) * 3 * (natoms2/NBLOCK + 1) * NBLOCK);
    memcpy(xyz2array[i], xyz2, sizeof(float) * 3 * natoms2);
    histarray[i] = (float*) calloc(1, sizeof(float) * nbins);
  }

  // setup thread parameters
  rdfthrparms parms;

  parms.usepbc = usepbc;
  parms.natoms1 = natoms1;
  parms.xyz = xyz;
  parms.natoms2 = natoms2;
  parms.cell = cell;
  parms.nbins = nbins;
  parms.nblockhist = NBLOCKHIST; // size of the final reduction block
  parms.rmin = rmin;
  parms.delr = delr;
  parms.xyz2array = xyz2array;   // per-thread input/output data
  parms.histarray = histarray;   // per-thread output data

  // calculate the number of blocks to divide the histogram into
  parms.nhblock = (nbins+MAXBIN-1)/MAXBIN;

  // calc # of groups to divide sel2 into to fit it into constant memory
  int nconstblocks=parms.natoms2/NCONSTBLOCK;
  int nleftoverconst=parms.natoms2 - nconstblocks*NCONSTBLOCK;

  // we have to do one extra cleanup pass if there are leftovers
  parms.nblocks = (nleftoverconst != 0) ? nconstblocks+1 : nconstblocks;

  int parblocks = parms.nblocks * parms.nhblock;
#if 0
  printf("master thread: number of parallel work items: %d\n", parblocks);
#endif

  // spawn child threads to do the work
  wkf_tasktile_t tile;
  tile.start=0;
  tile.end=parblocks;
  wkf_threadpool_sched_dynamic(devpool, &tile);
  wkf_threadpool_launch(devpool, rdf_thread, &parms, 1);

  // collect the independent thread results into the final histogram buffer
  memset(hist, 0, nbins * sizeof(float)); // clear global histogram array
  for (i=0; i<numprocs; i++) {
    for (ibin=0; ibin<nbins; ibin++) {
      hist[ibin] += histarray[i][ibin];
    }
  }

  // release the per-thread buffers
  for (i=0; i<numprocs; i++) {
    free(xyz2array[i]);
    free(histarray[i]);
  }
  free(xyz2array);
  free(histarray);

  return 0;
}
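
// A minimal usage sketch (illustrative, with hypothetical host buffers and
// a hypothetical function name): computing a periodic g(r) with 100 bins
// of width 0.1 starting at r=0.
#if 0
void example_rdf_call(wkf_threadpool_t *devpool,
                      float *sel1, int n1, float *sel2, int n2,
                      float *cell, float *hist) {
  int rc = rdf_gpu(devpool, 1 /* periodic */, n1, sel1, n2, sel2,
                   cell, hist, 100, 0.0f, 0.1f);
  if (rc != 0) {
    // no GPU worker pool was available
  }
}
#endif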