00001 /*************************************************************************** 00002 *cr 00003 *cr (C) Copyright 1995-2019 The Board of Trustees of the 00004 *cr University of Illinois 00005 *cr All Rights Reserved 00006 *cr 00007 ***************************************************************************/ 00008 /*************************************************************************** 00009 * RCS INFORMATION: 00010 * 00011 * $RCSfile: VolCPotential.C,v $ 00012 * $Author: johns $ $Locker: $ $State: Exp $ 00013 * $Revision: 1.30 $ $Date: 2019年01月17日 21:21:02 $ 00014 * 00015 ***************************************************************************/ 00016 /* 00017 * Calculate a coulombic potential map 00018 */ 00019 #include <stdio.h> 00020 #include <stdlib.h> 00021 #include <math.h> 00022 #include <string.h> 00023 00024 #include "config.h" // force recompile when configuration changes 00025 #include "utilities.h" 00026 #include "Inform.h" 00027 #include "WKFThreads.h" 00028 #include "WKFUtils.h" 00029 #include "VolCPotential.h" 00030 #if defined(VMDCUDA) 00031 #include "CUDAKernels.h" 00032 #endif 00033 #if defined(VMDOPENCL) 00034 #include "OpenCLKernels.h" 00035 #endif 00036 00037 typedef struct { 00038 float* atoms; 00039 float* grideners; 00040 long int numplane; 00041 long int numcol; 00042 long int numpt; 00043 long int natoms; 00044 float gridspacing; 00045 } enthrparms; 00046 00047 #if 1 || defined(__INTEL_COMPILER) 00048 00049 #define FLOPSPERATOMEVAL 6.0 00050 extern "C" void * energythread(void *voidparms) { 00051 int threadid; 00052 enthrparms *parms = NULL; 00053 wkf_threadlaunch_getdata(voidparms, (void **) &parms); 00054 wkf_threadlaunch_getid(voidparms, &threadid, NULL); 00055 00056 /* 00057 * copy in per-thread parameters 00058 */ 00059 const float *atoms = parms->atoms; 00060 float* grideners = parms->grideners; 00061 const long int numplane = parms->numplane; 00062 const long int numcol = parms->numcol; 00063 const long int numpt = parms->numpt; 00064 const long int natoms = parms->natoms; 00065 const float gridspacing = parms->gridspacing; 00066 int i, j, k, n; 00067 double lasttime, totaltime; 00068 00069 /* Calculate the coulombic energy at each grid point from each atom 00070 * This is by far the most time consuming part of the process 00071 * We iterate over z,y,x, and then atoms 00072 */ 00073 00074 printf("thread %d started...\n", threadid); 00075 wkf_timerhandle timer = wkf_timer_create(); 00076 wkf_timer_start(timer); 00077 wkfmsgtimer *msgt = wkf_msg_timer_create(5); 00078 00079 // Holds atom x, dy**2+dz**2, and atom q as x/r/q/x/r/q... 00080 float * xrq = (float *) malloc(3*natoms * sizeof(float)); 00081 int maxn = natoms * 3; 00082 00083 // For each point in the cube... 00084 wkf_tasktile_t tile; 00085 while (wkf_threadlaunch_next_tile(voidparms, 2, &tile) != WKF_SCHED_DONE) { 00086 for (k=tile.start; k<tile.end; k++) { 00087 const float z = gridspacing * (float) k; 00088 lasttime = wkf_timer_timenow(timer); 00089 for (j=0; j<numcol; j++) { 00090 const float y = gridspacing * (float) j; 00091 long int voxaddr = numcol*numpt*k + numpt*j; 00092 00093 // Prebuild a table of dy and dz values on a per atom basis 00094 for (n=0; n<natoms; n++) { 00095 int addr3 = n*3; 00096 int addr4 = n*4; 00097 float dy = y - atoms[addr4 + 1]; 00098 float dz = z - atoms[addr4 + 2]; 00099 xrq[addr3 ] = atoms[addr4]; 00100 xrq[addr3 + 1] = dz*dz + dy*dy; 00101 xrq[addr3 + 2] = atoms[addr4 + 3]; 00102 } 00103 00104 // help the vectorizer make reasonable decisions 00105 #if defined(__INTEL_COMPILER) 00106 #pragma vector always 00107 #endif 00108 /* walk through more than one grid point at a time */ 00109 for (i=0; i<numpt; i+=8) { 00110 float energy1 = 0.0f; /* Energy of first grid point */ 00111 float energy2 = 0.0f; /* Energy of second grid point */ 00112 float energy3 = 0.0f; /* Energy of third grid point */ 00113 float energy4 = 0.0f; /* Energy of fourth grid point */ 00114 float energy5 = 0.0f; /* Energy of fourth grid point */ 00115 float energy6 = 0.0f; /* Energy of fourth grid point */ 00116 float energy7 = 0.0f; /* Energy of fourth grid point */ 00117 float energy8 = 0.0f; /* Energy of fourth grid point */ 00118 00119 const float x = gridspacing * (float) i; 00120 00121 // help the vectorizer make reasonable decisions 00122 #if defined(__INTEL_COMPILER) 00123 #pragma vector always 00124 #endif 00125 /* Calculate the interaction with each atom */ 00126 /* SSE allows simultaneous calculations of */ 00127 /* multiple iterations */ 00128 /* 6 flops per grid point */ 00129 for (n=0; n<maxn; n+=3) { 00130 float dy2pdz2 = xrq[n + 1]; 00131 float q = xrq[n + 2]; 00132 00133 float dx1 = x - xrq[n]; 00134 energy1 += q / sqrtf(dx1*dx1 + dy2pdz2); 00135 00136 float dx2 = dx1 + gridspacing; 00137 energy2 += q / sqrtf(dx2*dx2 + dy2pdz2); 00138 00139 float dx3 = dx2 + gridspacing; 00140 energy3 += q / sqrtf(dx3*dx3 + dy2pdz2); 00141 00142 float dx4 = dx3 + gridspacing; 00143 energy4 += q / sqrtf(dx4*dx4 + dy2pdz2); 00144 00145 float dx5 = dx4 + gridspacing; 00146 energy5 += q / sqrtf(dx5*dx5 + dy2pdz2); 00147 00148 float dx6 = dx5 + gridspacing; 00149 energy6 += q / sqrtf(dx6*dx6 + dy2pdz2); 00150 00151 float dx7 = dx6 + gridspacing; 00152 energy7 += q / sqrtf(dx7*dx7 + dy2pdz2); 00153 00154 float dx8 = dx7 + gridspacing; 00155 energy8 += q / sqrtf(dx8*dx8 + dy2pdz2); 00156 } 00157 00158 grideners[voxaddr + i] = energy1; 00159 if (i+1 < numpt) 00160 grideners[voxaddr + i + 1] = energy2; 00161 if (i+2 < numpt) 00162 grideners[voxaddr + i + 2] = energy3; 00163 if (i+3 < numpt) 00164 grideners[voxaddr + i + 3] = energy4; 00165 if (i+4 < numpt) 00166 grideners[voxaddr + i + 4] = energy5; 00167 if (i+5 < numpt) 00168 grideners[voxaddr + i + 5] = energy6; 00169 if (i+6 < numpt) 00170 grideners[voxaddr + i + 6] = energy7; 00171 if (i+7 < numpt) 00172 grideners[voxaddr + i + 7] = energy8; 00173 } 00174 } 00175 totaltime = wkf_timer_timenow(timer); 00176 00177 if (wkf_msg_timer_timeout(msgt)) { 00178 // XXX: we have to use printf here as msgInfo is not thread-safe yet. 00179 printf("thread[%d] plane %d/%ld time %.2f, elapsed %.1f, est. total: %.1f\n", 00180 threadid, k, numplane, 00181 totaltime - lasttime, totaltime, 00182 totaltime * numplane / (k+1)); 00183 } 00184 } 00185 } 00186 00187 wkf_timer_destroy(timer); 00188 wkf_msg_timer_destroy(msgt); 00189 free(xrq); 00190 00191 return NULL; 00192 } 00193 00194 #else 00195 00196 #define FLOPSPERATOMEVAL 6.0 00197 extern "C" void * energythread(void *voidparms) { 00198 int threadid; 00199 enthrparms *parms = NULL; 00200 wkf_threadlaunch_getdata(voidparms, (void **) &parms); 00201 wkf_threadlaunch_getid(voidparms, &threadid, NULL); 00202 00203 /* 00204 * copy in per-thread parameters 00205 */ 00206 const float *atoms = parms->atoms; 00207 float* grideners = parms->grideners; 00208 const long int numplane = parms->numplane; 00209 const long int numcol = parms->numcol; 00210 const long int numpt = parms->numpt; 00211 const long int natoms = parms->natoms; 00212 const float gridspacing = parms->gridspacing; 00213 int i, j, k, n; 00214 double lasttime, totaltime; 00215 00216 /* Calculate the coulombic energy at each grid point from each atom 00217 * This is by far the most time consuming part of the process 00218 * We iterate over z,y,x, and then atoms 00219 */ 00220 00221 printf("thread %d started...\n", threadid); 00222 wkf_timerhandle timer = wkf_timer_create(); 00223 wkf_timer_start(timer); 00224 wkfmsgtimer *msgt = wkf_msg_timer_create(5); 00225 00226 // Holds atom x, dy**2+dz**2, and atom q as x/r/q/x/r/q... 00227 float * xrq = (float *) malloc(3*natoms * sizeof(float)); 00228 int maxn = natoms * 3; 00229 00230 // For each point in the cube... 00231 while (!wkf_threadlaunch_next(voidparms, &k)) { 00232 const float z = gridspacing * (float) k; 00233 lasttime = wkf_timer_timenow(timer); 00234 for (j=0; j<numcol; j++) { 00235 const float y = gridspacing * (float) j; 00236 long int voxaddr = numcol*numpt*k + numpt*j; 00237 00238 // Prebuild a table of dy and dz values on a per atom basis 00239 for (n=0; n<natoms; n++) { 00240 int addr3 = n*3; 00241 int addr4 = n*4; 00242 float dy = y - atoms[addr4 + 1]; 00243 float dz = z - atoms[addr4 + 2]; 00244 xrq[addr3 ] = atoms[addr4]; 00245 xrq[addr3 + 1] = dz*dz + dy*dy; 00246 xrq[addr3 + 2] = atoms[addr4 + 3]; 00247 } 00248 00249 #if defined(__INTEL_COMPILER) 00250 // help the vectorizer make reasonable decisions (used prime to keep it honest) 00251 #pragma loop count(1009) 00252 #endif 00253 for (i=0; i<numpt; i++) { 00254 float energy = grideners[voxaddr + i]; // Energy at current grid point 00255 const float x = gridspacing * (float) i; 00256 00257 #if defined(__INTEL_COMPILER) 00258 // help the vectorizer make reasonable decisions 00259 #pragma vector always 00260 #endif 00261 // Calculate the interaction with each atom 00262 for (n=0; n<maxn; n+=3) { 00263 float dx = x - xrq[n]; 00264 energy += xrq[n + 2] / sqrtf(dx*dx + xrq[n + 1]); 00265 } 00266 grideners[voxaddr + i] = energy; 00267 } 00268 } 00269 totaltime = wkf_timer_timenow(timer); 00270 00271 if (wkf_msg_timer_timeout(msgt)) { 00272 // XXX: we have to use printf here as msgInfo is not thread-safe yet. 00273 printf("thread[%d] plane %d/%ld time %.2f, elapsed %.1f, est. total: %.1f\n", 00274 threadid, k, numplane, 00275 totaltime - lasttime, totaltime, 00276 totaltime * numplane / (k+1)); 00277 } 00278 } 00279 00280 wkf_timer_destroy(timer); 00281 wkf_msg_timer_destroy(msgt); 00282 free(xrq); 00283 00284 return NULL; 00285 } 00286 00287 #endif 00288 00289 00290 static int vol_cpotential_cpu(long int natoms, float* atoms, float* grideners, long int numplane, long int numcol, long int numpt, float gridspacing) { 00291 int rc=0; 00292 enthrparms parms; 00293 wkf_timerhandle globaltimer; 00294 double totalruntime; 00295 00296 #if defined(VMDTHREADS) 00297 int numprocs = wkf_thread_numprocessors(); 00298 #else 00299 int numprocs = 1; 00300 #endif 00301 00302 printf("Using %d %s\n", numprocs, ((numprocs > 1) ? "CPUs" : "CPU")); 00303 00304 parms.atoms = atoms; 00305 parms.grideners = grideners; 00306 parms.numplane = numplane; 00307 parms.numcol = numcol; 00308 parms.numpt = numpt; 00309 parms.natoms = natoms; 00310 parms.gridspacing = gridspacing; 00311 00312 globaltimer = wkf_timer_create(); 00313 wkf_timer_start(globaltimer); 00314 00315 /* spawn child threads to do the work */ 00316 wkf_tasktile_t tile; 00317 tile.start=0; 00318 tile.end=numplane; 00319 rc = wkf_threadlaunch(numprocs, &parms, energythread, &tile); 00320 00321 // Measure GFLOPS 00322 wkf_timer_stop(globaltimer); 00323 totalruntime = wkf_timer_time(globaltimer); 00324 wkf_timer_destroy(globaltimer); 00325 00326 if (!rc) { 00327 double atomevalssec = ((double) numplane * numcol * numpt * natoms) / (totalruntime * 1000000000.0); 00328 printf(" %g billion atom evals/second, %g GFLOPS\n", 00329 atomevalssec, atomevalssec * FLOPSPERATOMEVAL); 00330 } else { 00331 msgWarn << "Encountered an unrecoverable error, calculation terminated." << sendmsg; 00332 } 00333 00334 return rc; 00335 } 00336 00337 00338 int vol_cpotential(long int natoms, float* atoms, float* grideners, long int numplane, long int numcol, long int numpt, float gridspacing) { 00339 int rc = -1; // init rc value to indicate we haven't run yet 00340 wkf_timerhandle timer = wkf_timer_create(); 00341 wkf_timer_start(timer); 00342 00343 #if defined(VMDCUDA) 00344 if (!getenv("VMDNOCUDA")) { 00345 rc = vmd_cuda_vol_cpotential(natoms, atoms, grideners, 00346 numplane, numcol, numpt, gridspacing); 00347 } 00348 #endif 00349 #if defined(VMDOPENCL) 00350 if ((rc != 0) && !getenv("VMDNOOPENCL")) { 00351 rc = vmd_opencl_vol_cpotential(natoms, atoms, grideners, 00352 numplane, numcol, numpt, gridspacing); 00353 } 00354 #endif 00355 00356 // if we tried to run on the GPU and failed, or we haven't run yet, 00357 // run on the CPU 00358 if (rc != 0) 00359 rc = vol_cpotential_cpu(natoms, atoms, grideners, 00360 numplane, numcol, numpt, gridspacing); 00361 00362 double totaltime = wkf_timer_timenow(timer); 00363 wkf_timer_destroy(timer); 00364 00365 msgInfo << "Coulombic potential map calculation complete: " 00366 << totaltime << " seconds" << sendmsg; 00367 00368 return rc; 00369 } 00370 00371 00372