/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: CUDAMeasureRDF.cu,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.28 $       $Date: 2020/02/26 20:16:56 $
 *
 ***************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <cuda.h>

#include "Inform.h"
#include "utilities.h"
#include "WKFThreads.h"
#include "WKFUtils.h"
#include "CUDAKernels.h"

//
// the parameters below are important to tuning the performance of the code
//
#if 0
// original (slow?) default values for all GPU types
// no hardware support for atomic operations, so the shared memory
// tagging scheme is used.
#define NBLOCK 128        // size of an RDF data block
#define MAXBIN 64         // maximum number of bins in a histogram
#elif 1
// optimal values for compute 1.0 (G80/G9x) devices
// no hardware support for atomic operations, so the shared memory
// tagging scheme is used.
#define NBLOCK 32         // size of an RDF data block
#define MAXBIN 1024       // maximum number of bins in a histogram
#elif 1
// optimal values for compute 1.3 (GT200) devices
// uses atomic operations to increment histogram bins
#define NBLOCK 320        // size of an RDF data block
#define MAXBIN 3072       // maximum number of bins in a histogram
#define __USEATOMIC 1     // enable atomic increment operations
#elif 1
// optimal values for compute 2.0 (Fermi) devices
// uses atomic operations to increment histogram bins
// uses Fermi's 3x larger shared memory (48KB) to reduce the number
// of passes required for computation of very large histograms
#define NBLOCK 896        // size of an RDF data block
#define MAXBIN 8192       // maximum number of bins in a histogram
#define __USEATOMIC 1     // enable atomic increment operations
#endif

#define NCUDABLOCKS 256   // the number of blocks to divide the shared memory
                          // dimension into.  Performance increases up to ~8
                          // and then levels off

#define NBLOCKHIST 64     // the number of threads used in the final histogram
                          // kernel.  The summation of the histograms isn't
                          // a bottleneck, so this isn't a critical choice

#define NCONSTBLOCK 5440  // the number of atoms in constant memory.
                          // Can be 4096 (or larger) up to capacity limits.

#define THREADSPERWARP 32 // this is correct for G80/GT200/Fermi GPUs
#define WARP_LOG_SIZE 5   // this is also correct for G80/GT200/Fermi GPUs

#define BIN_OVERFLOW_LIMIT 2147483648 // maximum value a bin can store before
                                      // we need to accumulate and start over
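// Illustrative arithmetic (assuming the active compute-1.0 settings above):
// calculate_rdf() stages 3*NBLOCK floats of coordinates plus its shared
// memory histograms per thread block.  Without __USEATOMIC that is
//   (NBLOCK*MAXBIN/THREADSPERWARP)*sizeof(unsigned int) = 1024*4 = 4 KB
// of per-warp histograms plus 3*32*4 = 384 bytes of coordinate staging,
// which fits comfortably in the 16 KB of shared memory on G80-class
// hardware.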
// This routine prepares data for the calculation of the histogram.
// It zeros out the instances of the histograms in global memory.
__global__ void init_hist(unsigned int* llhistg, // histograms
                          int maxbin)            // number of bins per histogram
{

#ifndef __USEATOMIC
  int io;                // index of histogram instances within a block
#endif

  int iblock;            // index of data blocks

  unsigned int *llhist;  // this pointer will point to the beginning of
                         // each thread's instance of the histogram

  int nofblocks;         // nofblocks is the number of full data blocks
  int nleftover;         // nleftover is the dimension of the block left over
                         // after all full data blocks
  int maxloop;           // the loop condition maximum

  // Initialize llhist to point to an instance of the histogram for each
  // block (atomic path) or warp (tagging path), then zero out the integer
  // histogram copies in global memory.  The floating point histogram
  // (histdev) is zeroed separately by init_hist_f below.
#ifdef __USEATOMIC
  llhist=llhistg+blockIdx.x*maxbin+threadIdx.x;
  nofblocks=maxbin/NBLOCK;
  nleftover=maxbin-nofblocks*NBLOCK;
  maxloop=nofblocks*NBLOCK;

  for (iblock=0; iblock < maxloop; iblock+=NBLOCK) {
    llhist[iblock]=0UL;
  }
  // take care of leftovers
  if (threadIdx.x < nleftover) {
    llhist[iblock]=0UL;
  }

#else
  llhist=llhistg+(((blockIdx.x)*NBLOCK)/THREADSPERWARP)*maxbin+threadIdx.x;
  nofblocks=maxbin/NBLOCK;
  nleftover=maxbin-nofblocks*NBLOCK;
  maxloop=nofblocks*NBLOCK;
  int maxloop2=(NBLOCK / THREADSPERWARP)*maxbin;

  for (io=0; io < maxloop2; io+=maxbin) {
    for (iblock=0; iblock < maxloop; iblock+=NBLOCK) {
      llhist[io + iblock]=0UL;
    }
    // take care of leftovers
    if (threadIdx.x < nleftover) {
      llhist[io + iblock]=0UL;
    }
  }
#endif

  return;
}


/* This routine prepares data for the calculation of the histogram.
 * It zeros out the floating point histogram in global memory.
 */
__global__ void init_hist_f(float* histdev) {
  histdev[threadIdx.x]=0.0f;
  return;
}
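
// A minimal sketch (illustrative, not part of the build) of the histogram
// instance layout that init_hist() assumes: one instance per thread block
// when atomics are available, one per warp otherwise; first_hist_instance
// is a hypothetical helper name.
#if 0
static int first_hist_instance(int blockidx) {
#ifdef __USEATOMIC
  return blockidx;                             // one instance per block
#else
  return (blockidx * NBLOCK) / THREADSPERWARP; // first of the
                                               // NBLOCK/THREADSPERWARP warp
                                               // instances this block zeroes
#endif
}
#endif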
/* This routine prepares data for the calculation of the histogram.
 * It recenters the atoms into a single periodic unit cell.
 * It also chooses positions for the padding atoms such that
 * they don't contribute to the histogram.
 */
__global__ void reimage_xyz(float* xyz,      // XYZ data in global memory
                            int natomsi,     // number of atoms
                            int natomsipad,  // # atoms including padding atoms
                            float3 celld,    // the cell size of each frame
                            float rmax)      // dist. used to space padding atoms
{
  int iblock;  // index of data blocks

  __shared__ float xyzi[3*NBLOCK]; // this array holds xyz data for the
                                   // current data block

  int nofblocks;     // the number of full data blocks this thread
                     // block must process

  float *pxi;        // this pointer points to the portion of xyz currently
                     // being processed

  int threetimesid;  // three times threadIdx.x

  float rtmp;        // a temporary distance used in creating padding atoms

  int ifinalblock;   // the index of the final block

  int loopmax;       // maximum for the loop counter

  // initialize nofblocks, pxi, and threetimesid
  nofblocks=((natomsipad/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
  loopmax=nofblocks*3*NCUDABLOCKS*NBLOCK;
  ifinalblock=(natomsipad / NBLOCK - 1) % NCUDABLOCKS;
  pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
  threetimesid=3*threadIdx.x;

  // shift all atoms into the same unit cell anchored at the origin
  for (iblock=0; iblock<loopmax; iblock+=3*NCUDABLOCKS*NBLOCK) {
    __syncthreads();
    // these reads from global memory should be coalesced
    xyzi[threadIdx.x         ]=pxi[iblock         ];
    xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
    xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
    __syncthreads();

    // Shift the xyz coordinates so that 0 <= xyz < celld.
    // The sign fixup is necessary because of the less than convenient
    // behavior of fmodf for negative xyz values.
    xyzi[threetimesid  ]=fmodf(xyzi[threetimesid  ], celld.x);
    if (xyzi[threetimesid  ] < 0.0f) {
      xyzi[threetimesid  ] += celld.x;
    }
    xyzi[threetimesid+1]=fmodf(xyzi[threetimesid+1], celld.y);
    if (xyzi[threetimesid+1] < 0.0f) {
      xyzi[threetimesid+1] += celld.y;
    }
    xyzi[threetimesid+2]=fmodf(xyzi[threetimesid+2], celld.z);
    if (xyzi[threetimesid+2] < 0.0f) {
      xyzi[threetimesid+2] += celld.z;
    }

    // if this is the final block then we pad
    if (iblock==loopmax-3*NCUDABLOCKS*NBLOCK && blockIdx.x==ifinalblock) {
      // pad the xyz coordinates with atoms which are spaced out such that
      // they cannot contribute to the histogram.  Note that these atoms
      // are NOT reimaged.
      if ((blockDim.x-threadIdx.x) <= (natomsipad - natomsi)) {
        rtmp = -((float)(threadIdx.x+1))*rmax;
        xyzi[threetimesid  ] = rtmp;
        xyzi[threetimesid+1] = rtmp;
        xyzi[threetimesid+2] = rtmp;
      }
    }

    __syncthreads();

    // store the xyz values back to global memory; these stores are coalesced
    pxi[iblock         ]=xyzi[threadIdx.x         ];
    pxi[iblock+  NBLOCK]=xyzi[threadIdx.x+  NBLOCK];
    pxi[iblock+2*NBLOCK]=xyzi[threadIdx.x+2*NBLOCK];

    // increment pxi to the next block of global memory to be processed
    //pxi=pxi+3*NCUDABLOCKS*NBLOCK;
  }
}
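
// Worked example (illustrative) of the sign fixup in reimage_xyz() above:
// fmodf() keeps the sign of its first argument, so for a cell of length 10,
//   fmodf(12.5f, 10.0f) ==  2.5f             (already in [0, 10))
//   fmodf(-1.5f, 10.0f) == -1.5f  -> +10.0f ->  8.5f  (now in [0, 10))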
// Shift the "phantom" padding atoms so they don't interfere in non-periodic
// calculations.
__global__ void phantom_xyz(float* xyz,      // XYZ data in global memory
                            int natomsi,     // number of atoms
                            int natomsipad,  // # atoms including padding atoms
                            float minxyz,    // offset subtracted to push the
                                             // padding atoms far from the data
                            float rmax)      // dist. used to space padding atoms
{
  int iblock;  // index of data blocks

  __shared__ float xyzi[3*NBLOCK]; // this array holds xyz data for the
                                   // current data block

  int nofblocks;     // the number of full data blocks this thread
                     // block must process

  float *pxi;        // this pointer points to the portion of xyz currently
                     // being processed

  int threetimesid;  // three times threadIdx.x

  float rtmp;        // a temporary distance used in creating padding atoms

  int ifinalblock;   // the index of the final block

  int loopmax;       // maximum for the loop counter

  // initialize nofblocks, pxi, and threetimesid
  nofblocks=((natomsipad/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
  loopmax=nofblocks*3*NCUDABLOCKS*NBLOCK;
  ifinalblock=(natomsipad / NBLOCK - 1) % NCUDABLOCKS;
  pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
  threetimesid=3*threadIdx.x;

  // loop over all data blocks assigned to this thread block
  for (iblock=0; iblock<loopmax; iblock+=3*NCUDABLOCKS*NBLOCK) {
    __syncthreads();
    // these reads from global memory should be coalesced
    xyzi[threadIdx.x         ]=pxi[iblock         ];
    xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
    xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
    __syncthreads();

    // if this is the final block then we pad
    if (iblock==loopmax-3*NCUDABLOCKS*NBLOCK && blockIdx.x==ifinalblock) {
      // pad the xyz coordinates with atoms which are spaced out such that
      // they cannot contribute to the histogram.  Note that these atoms
      // are NOT reimaged.
      if ((blockDim.x-threadIdx.x) <= (natomsipad - natomsi)) {
        rtmp = -((float)(threadIdx.x+1))*rmax-minxyz;
        xyzi[threetimesid  ] = rtmp;
        xyzi[threetimesid+1] = rtmp;
        xyzi[threetimesid+2] = rtmp;
      }
    }

    __syncthreads();

    // store the xyz values back to global memory; these stores are coalesced
    pxi[iblock         ]=xyzi[threadIdx.x         ];
    pxi[iblock+  NBLOCK]=xyzi[threadIdx.x+  NBLOCK];
    pxi[iblock+2*NBLOCK]=xyzi[threadIdx.x+2*NBLOCK];

    // increment pxi to the next block of global memory to be processed
    //pxi=pxi+3*NCUDABLOCKS*NBLOCK;
  }
}


/* This kernel calculates a radial distribution function between two
 * selections.  One selection is stored in its entirety in global memory,
 * while the other is staged in pieces through constant memory.  This
 * routine is called to calculate the rdf between all of selection 1 and
 * the portion of selection 2 currently in constant memory.
 * Each atom of selection 1 is associated with its own thread.  To minimize
 * accesses to global memory, selection 1 is divided into chunks of NBLOCK
 * elements.  These chunks are loaded into shared memory simultaneously,
 * and then all possible pairs between those atoms and the atoms in
 * constant memory are computed.
 */
__constant__ static float xyzj[3*NCONSTBLOCK]; // this array stores the
                           // coordinates of selection 2 in constant memory

// this routine is called from the host to move coordinate data to constant mem
void copycoordstoconstbuff(int natoms, const float* xyzh) {
  cudaMemcpyToSymbol(xyzj, xyzh, 3*natoms*sizeof(float));
}
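
// A minimal host-side sketch (illustrative, not part of the build) of how
// rdf_thread() below streams selection 2 through constant memory in
// NCONSTBLOCK-atom chunks; stream_sel2_chunks is a hypothetical helper name.
#if 0
static void stream_sel2_chunks(const float *xyz2, int natoms2) {
  int nconstblocks = natoms2 / NCONSTBLOCK;            // # full chunks
  int nleftover = natoms2 - nconstblocks*NCONSTBLOCK;  // partial final chunk
  for (int i=0; i<=nconstblocks; i++) {
    int blocksz = (i == nconstblocks) ? nleftover : NCONSTBLOCK;
    if (blocksz > 0) {
      copycoordstoconstbuff(blocksz, xyz2 + 3*NCONSTBLOCK*i);
      // ... launch calculate_rdf() against this chunk ...
    }
  }
}
#endif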
// This subroutine is based on a similar routine in the CUDA SDK histogram256
// example.  It adds a count to the histogram in such a way that there are no
// collisions.  If several threads need to update the same element, a tag
// applied to the highest bits of that element is used to determine which
// threads have already updated it.  For devices with compute capability 1.2
// or higher this is not necessary, as atomicAdd can be used instead.

#ifndef __USEATOMIC
__device__ void addData(volatile unsigned int *s_WarpHist, // the first element
                                        // of the histogram to be operated on
                        unsigned int data,      // the bin to add a count to
                        unsigned int threadTag) // tag of the current thread
{
  unsigned int count;  // the count in the bin being operated on

  do {
    count = s_WarpHist[data] & 0x07FFFFFFUL;
    count = threadTag | (count + 1);
    s_WarpHist[data] = count;
  } while (s_WarpHist[data] != count);
}
#endif
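
// Illustrative sketch of the tag layout used by addData() above: each of the
// THREADSPERWARP lanes owns a distinct WARP_LOG_SIZE-bit tag in the top bits
// of a bin, leaving 27 bits (mask 0x07FFFFFF) for the count.  e.g. lane 3:
//   threadTag = 3u << (32 - WARP_LOG_SIZE) == 0x18000000
// A lane's write survives only if no other lane in the warp wrote the same
// bin after it, hence the do/while retry loop.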
// This is the routine that does all the real work.  It takes as input the
// various atomic coordinates and produces one rdf per block (or per warp,
// when atomic operations are unavailable), as described above.  These rdfs
// will be summed later by calculate_histogram.
__global__ void calculate_rdf(int usepbc,   // periodic or non-periodic calc.
                              float* xyz,   // atom XYZ coords, gmem
                              int natomsi,  // # of atoms in sel1, gmem
                              int natomsj,  // # of atoms in sel2, cmem
                              float3 celld, // cell size of each frame
                              unsigned int* llhistg, // histograms, gmem
                              int nbins,    // # of bins in each histogram
                              float rmin,   // minimum distance for histogram
                              float delr_inv) // recip. width of histogram bins
{
  int io;     // loop index used in the zeroing and store loops
  int iblock; // the block of selection 1 currently being processed

#ifdef __USEATOMIC
  unsigned int *llhists1;  // a pointer to the beginning of the part of
                           // llhists operated on by this warp

  __shared__ unsigned int llhists[MAXBIN]; // holds 1 histogram per block
#else
  volatile unsigned int *llhists1; // a pointer to the beginning of the part
                                   // of llhists operated on by this warp

  volatile __shared__ unsigned int llhists[(NBLOCK*MAXBIN)/THREADSPERWARP];
                                   // this array holds 1 histogram per warp
#endif

  __shared__ float xyzi[3*NBLOCK]; // coords for the portion of sel1
                                   // being processed, in shared memory

  float rxij, rxij2, rij; // registers holding intermediate values in
                          // calculating the distance between two atoms

  int ibin,nofblocksi;    // loop counters and upper limits for some loops
  int nleftover;          // # bins left over after full blocks are processed
  float *pxi;             // pointer to the portion of sel1 being processed
  unsigned int joffset;   // the current offset in constant memory
  unsigned int loopmax, loopmax2;  // the limits for several loops
  float rmin2=0.0001f/delr_inv;    // tiny cutoff used to reject self-pairs

#ifndef __USEATOMIC
  // this thread tag is needed for the histogramming method implemented in
  // addData above
  const unsigned int threadTag = ((unsigned int)
                                  ( threadIdx.x & (THREADSPERWARP - 1)))
                                 << (32 - WARP_LOG_SIZE);
#endif

  // Initialize llhists1.  If atomics are used, then each block has its own
  // copy of the histogram in shared memory, which will be stored to its own
  // place in global memory.  If atomics aren't used, then we need 1 histogram
  // per warp.  The number of blocks is also set appropriately so that the
  // histograms may be zeroed out.
#ifdef __USEATOMIC
  llhists1=llhists;
  nofblocksi=nbins/NBLOCK;
  nleftover=nbins-nofblocksi*NBLOCK;
#else
  llhists1=llhists+(threadIdx.x/THREADSPERWARP)*MAXBIN;
  nofblocksi=nbins/THREADSPERWARP;
  nleftover=nbins-nofblocksi*THREADSPERWARP;
#endif

  // Here we also zero out the shared memory used in this routine.
#ifdef __USEATOMIC
  loopmax = nofblocksi*NBLOCK;
  for (io=0; io<loopmax; io+=NBLOCK) {
    llhists1[io + threadIdx.x]=0UL;
  }
  if (threadIdx.x < nleftover) {
    llhists1[io + threadIdx.x]=0UL;
  }

#else
  loopmax = nofblocksi*THREADSPERWARP;
  for (io=0; io<loopmax; io+=THREADSPERWARP) {
    llhists1[io + (threadIdx.x & (THREADSPERWARP - 1))]=0UL;
  }
  if ((threadIdx.x & (THREADSPERWARP - 1)) < nleftover) {
    llhists1[io + (threadIdx.x & (THREADSPERWARP - 1))]=0UL;
  }
#endif

  // initialize nofblocksi and pxi
  nofblocksi=((natomsi/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
  pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;

  loopmax = 3 * natomsj;
  int idxt3 = 3 * threadIdx.x;
  int idxt3p1 = idxt3+1;
  int idxt3p2 = idxt3+2;

  loopmax2 = nofblocksi*3*NCUDABLOCKS*NBLOCK;
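
  // Illustrative note: because reimage_xyz() has already wrapped all
  // coordinates into [0, celld), the periodic branch below can form the
  // minimum-image separation per component as
  //   dx = min(|xi - xj|, celld.x - |xi - xj|)
  // e.g. xi=1, xj=9 in a cell of length 10: |xi - xj| = 8, 10 - 8 = 2, so
  // the minimum-image distance component is 2.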
  // loop over all atoms in constant memory, using either a
  // periodic or non-periodic particle pair distance calculation
  if (usepbc) {
    for (iblock=0; iblock<loopmax2; iblock+=3*NCUDABLOCKS*NBLOCK) {
      __syncthreads();

      // these loads from global memory should be coalesced
      xyzi[threadIdx.x         ]=pxi[iblock         ];
      xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
      xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];

      __syncthreads();

      for (joffset=0; joffset<loopmax; joffset+=3) {
        // calculate the minimum-image distance between two atoms; rxij and
        // rxij2 are reused to minimize the number of registers used
        rxij=fabsf(xyzi[idxt3  ] - xyzj[joffset  ]);
        rxij2=celld.x - rxij;
        rxij=fminf(rxij, rxij2);
        rij=rxij*rxij;
        rxij=fabsf(xyzi[idxt3p1] - xyzj[joffset+1]);
        rxij2=celld.y - rxij;
        rxij=fminf(rxij, rxij2);
        rij+=rxij*rxij;
        rxij=fabsf(xyzi[idxt3p2] - xyzj[joffset+2]);
        rxij2=celld.z - rxij;
        rxij=fminf(rxij, rxij2);
        rij=sqrtf(rij + rxij*rxij);

        // increment the appropriate bin; don't add self-pairs to the
        // zeroth bin
        ibin=__float2int_rd((rij-rmin)*delr_inv);
        if (ibin<nbins && ibin>=0 && rij>rmin2) {
#ifdef __USEATOMIC
          atomicAdd(llhists1+ibin, 1U);
#else
          addData(llhists1, ibin, threadTag);
#endif
        }
      } //joffset
    } //iblock
  } else { // non-periodic
    for (iblock=0; iblock<loopmax2; iblock+=3*NCUDABLOCKS*NBLOCK) {
      __syncthreads();

      // these loads from global memory should be coalesced
      xyzi[threadIdx.x         ]=pxi[iblock         ];
      xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
      xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];

      __syncthreads();

      // loop over all atoms in constant memory
      for (joffset=0; joffset<loopmax; joffset+=3) {
        // calculate the distance between two atoms; rxij is reused
        // to minimize the number of registers used
        rxij=xyzi[idxt3  ] - xyzj[joffset  ];
        rij=rxij*rxij;
        rxij=xyzi[idxt3p1] - xyzj[joffset+1];
        rij+=rxij*rxij;
        rxij=xyzi[idxt3p2] - xyzj[joffset+2];
        rij=sqrtf(rij + rxij*rxij);

        // increment the appropriate bin; don't add self-pairs to the
        // zeroth bin
        ibin=__float2int_rd((rij-rmin)*delr_inv);
        if (ibin<nbins && ibin>=0 && rij>rmin2) {
#ifdef __USEATOMIC
          atomicAdd(llhists1+ibin, 1U);
#else
          addData(llhists1, ibin, threadTag);
#endif
        }
      } //joffset
    } //iblock
  }

  __syncthreads();

  // Below we store the histograms to global memory so that they may be
  // summed in another routine.  Set nofblocksi and nleftover for the
  // bin-store loops.
  nofblocksi=nbins/NBLOCK;
  nleftover=nbins-nofblocksi*NBLOCK;

#ifdef __USEATOMIC
  // reinitialize llhistg1 to point to global memory
  unsigned int *llhistg1=llhistg+blockIdx.x*MAXBIN+threadIdx.x;

  // loop to add all elements to global memory
  loopmax = nofblocksi*NBLOCK;
  for (iblock=0; iblock < loopmax; iblock+=NBLOCK) {
    llhistg1[iblock] += llhists[iblock+threadIdx.x];
  }
  // take care of leftovers
  if (threadIdx.x < nleftover) {
    llhistg1[iblock] += llhists[iblock+threadIdx.x];
  }
#else
  // reinitialize llhistg1 to point to global memory
  unsigned int* llhistg1=llhistg+(((blockIdx.x)*NBLOCK)/THREADSPERWARP)*MAXBIN+threadIdx.x;

  // loop to add all elements to global memory
  loopmax = MAXBIN * (NBLOCK / THREADSPERWARP);
  loopmax2 = nofblocksi * NBLOCK;
  for (io=0; io < loopmax; io+=MAXBIN) {
    for (iblock=0; iblock < loopmax2; iblock+=NBLOCK) {
      llhistg1[io+iblock] += llhists[io+iblock+threadIdx.x];
    }
    // take care of leftovers
    if (threadIdx.x < nleftover) {
      llhistg1[io+iblock] += llhists[io+iblock+threadIdx.x];
    }
  }
#endif

  return;
}


#define ull2float __uint2float_rn
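
// A minimal sketch (illustrative, not compiled) of what calculate_histogram()
// below does to each bin of every histogram instance: strip the warp tag
// kept in the top WARP_LOG_SIZE bits by the non-atomic path, then accumulate
// the count as a float; drain_bin is a hypothetical helper name.
#if 0
static float drain_bin(unsigned int rawbin, float acc) {
  return acc + (float) (rawbin & 0x07FFFFFFUL); // keep the 27 count bits
}
#endif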
/* This routine takes the series of integer histograms in llhistg,
 * strips off any thread tags, converts the counts to floats,
 * and sums them.  The result is accumulated into histdev.
 */
__global__ void calculate_histogram(float* histdev, // histogram to return
                       unsigned int* llhistg, // the histograms stored in
                                    // global memory.  There is one histogram
                                    // per block (warp) if atomicAdds are
                                    // (not) used
                       int nbins)   // # of bins in each histogram
{
  int io;      // index for the inner loop
  int iblock;  // index for the outer loop
  int maxloop; // maximum for the loop condition

  __shared__ unsigned int llhists[NBLOCKHIST]; // block of the int histogram
                                               // used in the final summation

  __shared__ float xi[NBLOCKHIST]; // smem for the floating point histogram

  int nofblocks; // number of data blocks per histogram

  nofblocks=nbins/NBLOCKHIST; // set number of blocks per histogram
  int nleftover=nbins-nofblocks*NBLOCKHIST;
  maxloop = nofblocks*NBLOCKHIST; // set the loop limit

  // loop over all of the blocks in a histogram
  for (iblock=0; iblock<maxloop; iblock+=NBLOCKHIST) {
    xi[threadIdx.x]=0.0f; // zero out xi

    // loop over the histograms created by calculate_rdf
#ifdef __USEATOMIC
    for (io=0; io < MAXBIN*NCUDABLOCKS; io+=MAXBIN) {
#else
    for (io=0; io < MAXBIN * (NCUDABLOCKS*NBLOCK / THREADSPERWARP); io+=MAXBIN) {
#endif
      // load integer histogram data into shared memory (coalesced)
      llhists[threadIdx.x]=llhistg[io+iblock+threadIdx.x];

      // shave off the thread tag that might remain from calculate_rdf
      llhists[threadIdx.x]=llhists[threadIdx.x] & 0x07FFFFFFUL;

      // convert to float
      xi[threadIdx.x]+=ull2float(llhists[threadIdx.x]);
    }

    // ... and accumulate into global memory
    histdev[iblock+threadIdx.x]+=xi[threadIdx.x];
  }

  // take care of leftovers
  if (threadIdx.x < nleftover) {
    xi[threadIdx.x]=0.0f; // zero out xi

    // loop over the histograms created by calculate_rdf
#ifdef __USEATOMIC
    for (io=0; io < MAXBIN*NCUDABLOCKS; io+=MAXBIN) {
#else
    for (io=0; io < MAXBIN * (NCUDABLOCKS*NBLOCK / THREADSPERWARP); io+=MAXBIN) {
#endif
      // load integer histogram data into shared memory (coalesced)
      llhists[threadIdx.x]=llhistg[io+iblock+threadIdx.x];

      // shave off the thread tag that might remain from calculate_rdf
      llhists[threadIdx.x]=llhists[threadIdx.x] & 0x07FFFFFFUL;

      // convert to float
      xi[threadIdx.x]+=ull2float(llhists[threadIdx.x]);
    }

    // ... and accumulate into global memory
    histdev[iblock+threadIdx.x]+=xi[threadIdx.x];
  }

  // calculate_histogram out
  return;
}


/*
 * calculate_histogram_block
 *   Compute one histogram block: zero the integer histograms, bin all
 *   pair distances, then reduce into the float histogram.
 */
void calculate_histogram_block(int usepbc,
                               float *xyz,
                               int natomsi,
                               int natomsj,
                               float3 celld,
                               unsigned int* llhistg,
                               int nbins,
                               float rmin,
                               float delr_inv,
                               float *histdev,
                               int nblockhist) {
  // zero out the histograms in global memory
  init_hist<<<NCUDABLOCKS, NBLOCK>>>(llhistg, MAXBIN);

  // calculate the histograms
  calculate_rdf<<<NCUDABLOCKS, NBLOCK>>>(usepbc, xyz, natomsi, natomsj, celld,
                                         llhistg, nbins, rmin, delr_inv);

  // sum the integer histograms into the floating point histogram
  calculate_histogram<<<1, nblockhist>>>(histdev, llhistg, nbins);
}
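
// Illustrative sketch (an assumption, not code from this file): the float
// histogram produced above still holds raw pair counts; turning it into
// g(r) is left to the caller.  For a bin centered at r with width delr:
//   g(r) = hist[i] * V / (N1 * N2 * 4*pi*r^2 * delr)
// normalize_rdf is a hypothetical helper name.
#if 0
static void normalize_rdf(float *hist, int nbins, float rmin, float delr,
                          float volume, int natoms1, int natoms2) {
  for (int i=0; i<nbins; i++) {
    float r = rmin + (i + 0.5f) * delr;                 // bin-center radius
    float shellvol = 4.0f * 3.14159265f * r * r * delr; // spherical shell vol.
    hist[i] *= volume / ((float) natoms1 * (float) natoms2 * shellvol);
  }
}
#endif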
/*
 * input parameters for the rdf_thread() routine
 */
typedef struct {
  int usepbc;        // periodic or non-periodic calculation
  int natoms1;       // number of atoms in selection 1
  float* xyz;        // coordinates of selection 1, [natoms1][3]
  int natoms2;       // number of atoms in selection 2
  float** xyz2array; // coordinates of selection 2, [natoms2][3] (per-thread)
  float* cell;       // the cell x, y, and z dimensions [3]
  float** histarray; // the final histogram in host mem, [nbins] (per-thread)
  int nbins;         // the number of bins in the histogram
  int nblockhist;    // the size of a block used in the
                     // reduction of the histogram
  float rmin;        // the minimum value of the first bin
  float delr;        // the width of each bin
  int nblocks;       // number of const blocks to compute all-pairs histogram
  int nhblock;       // # MAXBIN-sized blocks needed to calc whole histogram
} rdfthrparms;


/*
 * rdf_thread -- multithreaded version of subroutine_rdf().
 * This routine is called from the CPU to calculate g(r) for a single frame
 * on the GPU.  Right now it performs a brute-force O(N^2) calculation
 * (with all atomic pairs considered).  Future versions may reduce the
 * overall work by performing a grid search.
 */
static void * rdf_thread(void *voidparms) {
  rdfthrparms *parms = NULL;
  int threadid=0;

  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);

#if 0
  printf("rdf thread[%d] launched and running...\n", threadid);
#endif
  cudaSetDevice(threadid);

  /*
   * copy in per-thread parameters
   */
  const int natoms1 = parms->natoms1;
  const float *xyz = parms->xyz;
  const int natoms2 = parms->natoms2;
  const float *cell = parms->cell;
  const int nbins = parms->nbins;
  const int nblockhist = parms->nblockhist;
  const float rmin = parms->rmin;
  const float delr = parms->delr;
  float *xyz2 = parms->xyz2array[threadid];
  float *hist = parms->histarray[threadid];
  const int nhblock = parms->nhblock;
  const int usepbc = parms->usepbc;

  float* xyzdev;          // pointer to xyz data in global memory
  float3 celld;           // cell x, y, and z dimensions
  float* histdev;         // pointer to the float histogram in global memory
  unsigned int* llhistg;  // pointer to the int histograms in global memory

  int nconstblocks;  // # full blocks to be sent to constant memory
  int nleftover;     // # elements in the leftover block of const mem
  int nbigblocks;    // # blocks required to prevent histogram bin overflow
  int ibigblock;     // the index of the current big block
  int nleftoverbig;  // the number of elements in the final big block
  int ntmpbig;       // # atoms of sel1 processed per big block
  int nbinstmp;      // number of bins currently being processed
  int natoms1pad;    // number of atoms in selection 1 after padding
  int natoms2pad;    // number of atoms in selection 2 after padding
  float rmax;        // cutoff radius for atom contributions to histogram

  // natoms1pad is natoms1 rounded up to the next multiple of NBLOCK
  natoms1pad=natoms1;
  if (natoms1%NBLOCK!=0) { natoms1pad = (natoms1 / NBLOCK + 1) * NBLOCK; }
  natoms2pad=natoms2;
  if (natoms2%NBLOCK!=0) { natoms2pad = (natoms2 / NBLOCK + 1) * NBLOCK; }
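
  // Worked example (illustrative): with NBLOCK=32, natoms1=1000 rounds up
  // to natoms1pad=1024, i.e. 24 padding atoms.  reimage_xyz()/phantom_xyz()
  // place those padding atoms at least rmax apart and far from the real
  // data, so they can never land in a histogram bin.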
  // allocate global memory for:

  // the xyz coordinates of selection 1
  cudaMalloc((void**)&xyzdev, 3*max(natoms1pad,natoms2pad)*sizeof(float));

  // the final copy of the histogram
  cudaMalloc((void**)&histdev, nbins*sizeof(float));

  // zero out the floating point histogram
  int ihblock;  // loop counter for histogram blocks
  int maxloop=nbins/nblockhist;
  if (maxloop*nblockhist!=nbins) { maxloop=maxloop+1; }
  for (ihblock=0; ihblock<maxloop; ihblock++) {
    init_hist_f<<<1, nblockhist>>>(histdev+ihblock*nblockhist);
  }

  // the global memory copies of the histogram
#ifdef __USEATOMIC
  cudaMalloc((void**)&llhistg, (NCUDABLOCKS*MAXBIN*sizeof(int)));
#else
  cudaMalloc((void**)&llhistg, (NCUDABLOCKS*NBLOCK*MAXBIN*
                                sizeof(int))/THREADSPERWARP);
#endif

  // set the cell dimensions
  celld.x=cell[0];
  celld.y=cell[1];
  celld.z=cell[2];

  // set rmax.  this is the distance that the padding atoms must be from the
  // unit cell AND from each other
  rmax = 1.1f * (rmin+((float)nbins)*delr);

  // send the second selection to the gpu for reimaging
  cudaMemcpy(xyzdev, xyz2, 3*natoms2*sizeof(float), cudaMemcpyHostToDevice);

  if (usepbc) {
    // reimage atom coords into a single periodic cell
    reimage_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms2,natoms2pad,celld,rmax);
  } else {
    // shift phantom atoms so they don't interfere with the non-periodic calc.
    phantom_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms2,natoms2pad,1.0e38f,rmax);
  }

  // now, unfortunately, we have to move it back because this selection
  // will need to be sent to constant memory (done on the host side)
  cudaMemcpy(xyz2, xyzdev, 3*natoms2*sizeof(float), cudaMemcpyDeviceToHost);

  // send the xyz coords of selection 1 to the gpu
  cudaMemcpy(xyzdev, xyz, 3*natoms1*sizeof(float), cudaMemcpyHostToDevice);

  if (usepbc) {
    // reimage xyz coords back into the periodic box
    reimage_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms1,natoms1pad,celld,rmax);
  } else {
    // shift phantom atoms so they don't interfere with the non-periodic calc.
    phantom_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms1,natoms1pad,1.0e38f,rmax);
  }

  // all blocks except for the final one will have MAXBIN elements
  nbinstmp = MAXBIN;

  // calc # of groups to divide sel2 into to fit it into constant memory
  nconstblocks=natoms2/NCONSTBLOCK;
  nleftover=natoms2-nconstblocks*NCONSTBLOCK;

  // set up the big blocks to avoid bin overflow
  ntmpbig = NBLOCK * ((BIN_OVERFLOW_LIMIT / NCONSTBLOCK) / NBLOCK);
  nbigblocks = natoms1pad / ntmpbig;
  nleftoverbig = natoms1pad - nbigblocks * ntmpbig;
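
  // Worked example (illustrative): with NBLOCK=32 and NCONSTBLOCK=5440,
  //   ntmpbig = 32 * ((2147483648 / 5440) / 32) = 32 * 12336 = 394752
  // so one calculate_histogram_block() pass bins at most
  // ntmpbig * NCONSTBLOCK ~= 2.147e9 <= BIN_OVERFLOW_LIMIT pair distances,
  // and no 32-bit bin can overflow before it is drained into the float
  // histogram.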
  int nbblocks = (nleftoverbig != 0) ? nbigblocks+1 : nbigblocks;

  // parallel loop over all "constblocks"
  wkf_tasktile_t tile; // next batch of work units
  while (wkf_threadpool_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
#if 0
    printf("rdf thread[%d] fetched tile: s: %d e: %d\n", threadid, tile.start, tile.end);
#endif
    int iconstblock = -1;       // loop counter for constblocks, force update
    int blocksz = NCONSTBLOCK;  // # sel2 atoms in the current constblock;
                                // declared outside the work-item loop so the
                                // value persists between work items that
                                // share a constblock
    ihblock = -1;  // initialize to sentinel value to force immediate update
    int workitem;  // current parallel work item
    for (workitem=tile.start; workitem<tile.end; workitem++) {
      // decode work item into ihblock and iconstblock indices
      ihblock = workitem % nhblock;
      int newiconstblock = workitem / nhblock;

      // when moving to the next constblock we must copy in new coordinates
      if (iconstblock != newiconstblock) {
        iconstblock = newiconstblock;

        // take care of the leftovers on the last block
        blocksz = (iconstblock == nconstblocks && nleftover != 0)
                  ? nleftover : NCONSTBLOCK;

        // send a "constblock" of coords from selection 2 to const mem
        copycoordstoconstbuff(blocksz, xyz2+(3*NCONSTBLOCK*iconstblock));
      }

#if 0
      printf("rdf thread[%d] iconstblock: %d ihblock: %d\n", threadid,
             iconstblock, ihblock);
#endif

      // loop over blocks of the histogram:
      // minimum distance for the current histogram block;
      // the first block starts at rmin
      float rmintmp=rmin + (MAXBIN * delr * ihblock);

      nbinstmp = MAXBIN;

      // the final block will have nbins%MAXBIN elements
      if (ihblock==(nhblock-1)) {
        nbinstmp = nbins%MAXBIN;
        // if nbins is an even multiple of MAXBIN, the final block is full
        if (nbinstmp==0) { nbinstmp = MAXBIN; }
      }

      for (ibigblock=0; ibigblock < nbblocks; ibigblock++) {
        // take care of the leftovers on the last block
        int bblocksz = (ibigblock == nbigblocks && nleftoverbig != 0)
                       ? nleftoverbig : ntmpbig;

        calculate_histogram_block(usepbc, (xyzdev+ibigblock*ntmpbig*3),
                                  bblocksz, blocksz, celld,
                                  llhistg, nbinstmp, rmintmp, (1/delr),
                                  histdev+ihblock*MAXBIN, nblockhist);
      }

      cudaDeviceSynchronize();
    }
  } // end of parallel tile fetch while loop

  // retrieve g(r) for this frame
  cudaMemcpy(hist, histdev, nbins*sizeof(float), cudaMemcpyDeviceToHost);

  cudaFree(xyzdev); // free gpu global memory
  cudaFree(histdev);
  cudaFree(llhistg);

#if 0
  printf("thread[%d] finished, releasing resources\n", threadid);
#endif

  return NULL;
}
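
// A minimal sketch (illustrative) of the work-item decoding used inside
// rdf_thread() above; decode_workitem is a hypothetical helper name.
// e.g. with nhblock=2, workitem 5 -> ihblock=1, iconstblock=2, so constant
// memory is reloaded only when the iconstblock half of the index changes.
#if 0
static void decode_workitem(int workitem, int nhblock,
                            int *ihblock, int *iconstblock) {
  *ihblock = workitem % nhblock;     // which MAXBIN-sized histogram window
  *iconstblock = workitem / nhblock; // which constant-memory chunk of sel2
}
#endif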
int rdf_gpu(wkf_threadpool_t *devpool, // VMD GPU worker thread pool
            int usepbc,    // periodic or non-periodic calculation
            int natoms1,   // number of atoms in selection 1
            float *xyz,    // coordinates of selection 1, [natoms1][3]
            int natoms2,   // number of atoms in selection 2
            float *xyz2,   // coordinates of selection 2, [natoms2][3]
            float* cell,   // the cell x, y, and z dimensions [3]
            float* hist,   // the final histogram [nbins]
            int nbins,     // the number of bins in the histogram
            float rmin,    // the minimum value of the first bin
            float delr)    // the width of each bin
{
  int i, ibin;

  if (devpool == NULL) {
    return -1; // abort if no device pool exists
  }
  int numprocs = wkf_threadpool_get_workercount(devpool);

  // multi-thread buffers
  float **xyz2array = (float**) calloc(1, sizeof(float *) * numprocs);
  float **histarray = (float**) calloc(1, sizeof(float *) * numprocs);
  for (i=0; i<numprocs; i++) {
    // allocate the per-thread copy of selection 2 padded out to a full
    // NBLOCK multiple, but copy in only the 3*natoms2 valid coordinates
    xyz2array[i] = (float*) calloc(1, sizeof(float) * 3 * (natoms2/NBLOCK + 1) * NBLOCK);
    memcpy(xyz2array[i], xyz2, sizeof(float) * 3 * natoms2);
    histarray[i] = (float*) calloc(1, sizeof(float) * nbins);
  }

  // setup thread parameters
  rdfthrparms parms;

  parms.usepbc = usepbc;
  parms.natoms1 = natoms1;
  parms.xyz = xyz;
  parms.natoms2 = natoms2;
  parms.cell = cell;
  parms.nbins = nbins;
  parms.nblockhist = NBLOCKHIST; // size of the final reduction block
  parms.rmin = rmin;
  parms.delr = delr;
  parms.xyz2array = xyz2array;   // per-thread input/output data
  parms.histarray = histarray;   // per-thread output data

  // calculate the number of blocks to divide the histogram into
  parms.nhblock = (nbins+MAXBIN-1)/MAXBIN;

  // calc # of groups to divide sel2 into to fit it into constant memory
  int nconstblocks=parms.natoms2/NCONSTBLOCK;
  int nleftoverconst=parms.natoms2 - nconstblocks*NCONSTBLOCK;

  // we have to do one extra cleanup pass if there are leftovers
  parms.nblocks = (nleftoverconst != 0) ? nconstblocks+1 : nconstblocks;

  int parblocks = parms.nblocks * parms.nhblock;
#if 0
  printf("master thread: number of parallel work items: %d\n", parblocks);
#endif

  // spawn child threads to do the work
  wkf_tasktile_t tile;
  tile.start=0;
  tile.end=parblocks;
  wkf_threadpool_sched_dynamic(devpool, &tile);
  wkf_threadpool_launch(devpool, rdf_thread, &parms, 1);

  // collect the independent thread results into the final histogram buffer
  memset(hist, 0, nbins * sizeof(float)); // clear global histogram array
  for (i=0; i<numprocs; i++) {
    for (ibin=0; ibin<nbins; ibin++) {
      hist[ibin] += histarray[i][ibin];
    }
  }

  // release the per-thread buffers
  for (i=0; i<numprocs; i++) {
    free(xyz2array[i]);
    free(histarray[i]);
  }
  free(xyz2array);
  free(histarray);

  return 0;
}
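
// A minimal usage sketch (illustrative, with hypothetical host buffers and
// a hypothetical function name): computing a periodic g(r) with 100 bins
// of width 0.1 starting at r=0.
#if 0
void example_rdf_call(wkf_threadpool_t *devpool,
                      float *sel1, int n1, float *sel2, int n2,
                      float *cell, float *hist) {
  int rc = rdf_gpu(devpool, 1 /* periodic */, n1, sel1, n2, sel2,
                   cell, hist, 100, 0.0f, 0.1f);
  if (rc != 0) {
    // no GPU worker pool was available
  }
}
#endif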