/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/

/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: OpenCLVolCPotential.C,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.32 $       $Date: 2019/01/17 21:38:55 $
 *
 ***************************************************************************
 * DESCRIPTION:
 *   OpenCL accelerated coulombic potential grid calculation
 *
 ***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if defined(__APPLE__)
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#include "Inform.h"
#include "utilities.h"
#include "WKFThreads.h"
#include "WKFUtils.h"
#include "OpenCLKernels.h"
#include "OpenCLUtils.h"

typedef struct {
  float* atoms;
  float* grideners;
  long int numplane;
  long int numcol;
  long int numpt;
  long int natoms;
  float gridspacing;
} enthrparms;

/* thread prototype */
static void * openclenergythread(void *);

#if 1
#define CLERR \
  if (clerr != CL_SUCCESS) { \
    printf("opencl error %d, %s line %d\n", clerr, __FILE__, __LINE__); \
    return NULL; \
  }
#else
#define CLERR
#endif

// The max constant buffer size is 64KB, minus whatever the OpenCL runtime
// and compiler are using that we don't know about.
// At 16 bytes/atom, 4000 atoms is about the max we can store in
// the constant buffer.
#define MAXATOMS 4000


//
// The OpenCL kernels calculate the coulombic potential at each grid point
// and store the results in the output array.
//
// These versions of the code use the 64KB constant buffer area, reloaded
// for each group of MAXATOMS atoms, until the contributions from all
// atoms have been summed into the potential grid.
//
// These implementations use precomputed and unrolled loops of
// (dy^2 + dz^2) values for increased FP arithmetic intensity.
// The X coordinate portion of the loop is unrolled by four or eight,
// allowing the same dy^2 + dz^2 values to be reused multiple times,
// increasing the ratio of FP arithmetic relative to FP loads, and
// eliminating some redundant calculations.
//

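//
// Overview of the indexing scheme used by the kernels below (derived from
// the default tunings defined below, BLOCKSIZEX=8, BLOCKSIZEY=8, UNROLLX=8):
// each work-group covers a 64x8 tile of grid points, and the work-item with
// local id (lx, ly) computes the eight points at x = tilestart + lx + n*BLOCKSIZEX
// for n = 0..7, so each of its eight stores lands next to the stores of its
// neighboring work-items and global memory accesses coalesce.  The
// atominfo[].z slot holds the precomputed (zplane - atom_z)^2 prepared on
// the host by copyatomstoconstbuf() below, so the inner loop only has to
// form dy^2 + dz^2 once per atom per work-item.
//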
//
// Tuned global memory coalescing version, unrolled in X
//

//
// Tunings for large potential map dimensions (e.g. 384x384x...)
//
#define UNROLLX       8
#define UNROLLY       1
#define BLOCKSIZEX    8  // make large enough to allow coalesced global mem ops
#define BLOCKSIZEY    8  // make as small as possible for finer granularity
#define BLOCKSIZE    (BLOCKSIZEX * BLOCKSIZEY)

#define V4UNROLLX     8
#define V4UNROLLY     1
#define V4BLOCKSIZEX  8
#define V4BLOCKSIZEY  8
#define V4BLOCKSIZE  (V4BLOCKSIZEX * V4BLOCKSIZEY)

// FLOP counting
#define FLOPSPERATOMEVAL (59.0/8.0)

// OpenCL source code
const char* clenergysrc =
  "__kernel __attribute__((reqd_work_group_size(BLOCKSIZEX, BLOCKSIZEY, 1))) \n"
  "void clenergy(int numatoms, float gridspacing, __global float *energy, __constant float4 *atominfo) { \n"
  "  unsigned int xindex  = (get_global_id(0) - get_local_id(0)) * UNROLLX + get_local_id(0); \n"
  "  unsigned int yindex  = get_global_id(1); \n"
  "  unsigned int outaddr = get_global_size(0) * UNROLLX * yindex + xindex; \n"
  "  \n"
  "  float coory = gridspacing * yindex; \n"
  "  float coorx = gridspacing * xindex; \n"
  "  \n"
  "  float energyvalx1 = 0.0f; \n"
#if UNROLLX >= 4
  "  float energyvalx2 = 0.0f; \n"
  "  float energyvalx3 = 0.0f; \n"
  "  float energyvalx4 = 0.0f; \n"
#endif
#if UNROLLX == 8
  "  float energyvalx5 = 0.0f; \n"
  "  float energyvalx6 = 0.0f; \n"
  "  float energyvalx7 = 0.0f; \n"
  "  float energyvalx8 = 0.0f; \n"
#endif
  "  \n"
  "  float gridspacing_u = gridspacing * BLOCKSIZEX; \n"
  "  \n"
  "  int atomid; \n"
  "  for (atomid=0; atomid<numatoms; atomid++) { \n"
  "    float dy = coory - atominfo[atomid].y; \n"
  "    float dyz2 = (dy * dy) + atominfo[atomid].z; \n"
  "    \n"
  "    float dx1 = coorx - atominfo[atomid].x; \n"
#if UNROLLX >= 4
  "    float dx2 = dx1 + gridspacing_u; \n"
  "    float dx3 = dx2 + gridspacing_u; \n"
  "    float dx4 = dx3 + gridspacing_u; \n"
#endif
#if UNROLLX == 8
  "    float dx5 = dx4 + gridspacing_u; \n"
  "    float dx6 = dx5 + gridspacing_u; \n"
  "    float dx7 = dx6 + gridspacing_u; \n"
  "    float dx8 = dx7 + gridspacing_u; \n"
#endif
  "    \n"
  "    energyvalx1 += atominfo[atomid].w * native_rsqrt(dx1*dx1 + dyz2); \n"
#if UNROLLX >= 4
  "    energyvalx2 += atominfo[atomid].w * native_rsqrt(dx2*dx2 + dyz2); \n"
  "    energyvalx3 += atominfo[atomid].w * native_rsqrt(dx3*dx3 + dyz2); \n"
  "    energyvalx4 += atominfo[atomid].w * native_rsqrt(dx4*dx4 + dyz2); \n"
#endif
#if UNROLLX == 8
  "    energyvalx5 += atominfo[atomid].w * native_rsqrt(dx5*dx5 + dyz2); \n"
  "    energyvalx6 += atominfo[atomid].w * native_rsqrt(dx6*dx6 + dyz2); \n"
  "    energyvalx7 += atominfo[atomid].w * native_rsqrt(dx7*dx7 + dyz2); \n"
  "    energyvalx8 += atominfo[atomid].w * native_rsqrt(dx8*dx8 + dyz2); \n"
#endif
  "  } \n"
  "  \n"
  "  energy[outaddr             ] += energyvalx1; \n"
#if UNROLLX >= 4
  "  energy[outaddr+1*BLOCKSIZEX] += energyvalx2; \n"
  "  energy[outaddr+2*BLOCKSIZEX] += energyvalx3; \n"
  "  energy[outaddr+3*BLOCKSIZEX] += energyvalx4; \n"
#endif
#if UNROLLX == 8
  "  energy[outaddr+4*BLOCKSIZEX] += energyvalx5; \n"
  "  energy[outaddr+5*BLOCKSIZEX] += energyvalx6; \n"
  "  energy[outaddr+6*BLOCKSIZEX] += energyvalx7; \n"
  "  energy[outaddr+7*BLOCKSIZEX] += energyvalx8; \n"
#endif
  "} \n"
  "  \n"
  "  \n"
  "  \n"
  "__kernel __attribute__((reqd_work_group_size(V4BLOCKSIZEX, V4BLOCKSIZEY, 1))) \n"
  "void clenergy_vec4(int numatoms, float gridspacing, __global float *energy, __constant float4 *atominfo) { \n"
  "  unsigned int xindex  = (get_global_id(0) - get_local_id(0)) * V4UNROLLX + get_local_id(0); \n"
  "  unsigned int yindex  = get_global_id(1); \n"
  "  unsigned int outaddr = get_global_size(0) * V4UNROLLX * yindex + xindex; \n"
  "  \n"
  "  float coory = gridspacing * yindex; \n"
  "  float coorx = gridspacing * xindex; \n"
  "  \n"
  "  float4 energyvalx = 0.f; \n"
#if V4UNROLLX == 8
  "  float4 energyvalx2 = 0.f; \n"
#endif
  "  \n"
  "  float4 gridspacing_u4 = { 0.f, 1.f, 2.f, 3.f }; \n"
  "  gridspacing_u4 *= gridspacing * V4BLOCKSIZEX; \n"
  "  \n"
  "  int atomid; \n"
  "  for (atomid=0; atomid<numatoms; atomid++) { \n"
  "    float dy = coory - atominfo[atomid].y; \n"
  "    float dyz2 = (dy * dy) + atominfo[atomid].z; \n"
  "    \n"
  "    float4 dx = gridspacing_u4 + (coorx - atominfo[atomid].x); \n"
  "    energyvalx += atominfo[atomid].w * native_rsqrt(dx*dx + dyz2); \n"
#if V4UNROLLX == 8
  "    dx += gridspacing * (4.0f * V4BLOCKSIZEX); \n"
  "    energyvalx2 += atominfo[atomid].w * native_rsqrt(dx*dx + dyz2); \n"
#endif
  "  } \n"
  "  \n"
  "  energy[outaddr               ] += energyvalx.x; \n"
  "  energy[outaddr+1*V4BLOCKSIZEX] += energyvalx.y; \n"
  "  energy[outaddr+2*V4BLOCKSIZEX] += energyvalx.z; \n"
  "  energy[outaddr+3*V4BLOCKSIZEX] += energyvalx.w; \n"
#if V4UNROLLX == 8
  "  energy[outaddr+4*V4BLOCKSIZEX] += energyvalx2.x; \n"
  "  energy[outaddr+5*V4BLOCKSIZEX] += energyvalx2.y; \n"
  "  energy[outaddr+6*V4BLOCKSIZEX] += energyvalx2.z; \n"
  "  energy[outaddr+7*V4BLOCKSIZEX] += energyvalx2.w; \n"
#endif
  "} \n"
  "  \n";

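//
// Note: BLOCKSIZEX, UNROLLX, and the other tuning macros referenced inside
// the kernel source above are not defined in the OpenCL C code itself; they
// are passed to the OpenCL kernel compiler as -D options when the program
// is built (see vmd_opencl_compile_volcpotential_pgm() below).
//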
// required GPU array padding to match thread block size
// XXX note: this code requires block size dimensions to be a power of two
#define TILESIZEX (BLOCKSIZEX*UNROLLX)
#define TILESIZEY (BLOCKSIZEY*UNROLLY)
#define GPU_X_ALIGNMASK (TILESIZEX - 1)
#define GPU_Y_ALIGNMASK (TILESIZEY - 1)

#define V4TILESIZEX (V4BLOCKSIZEX*V4UNROLLX)
#define V4TILESIZEY (V4BLOCKSIZEY*V4UNROLLY)
#define V4GPU_X_ALIGNMASK (V4TILESIZEX - 1)
#define V4GPU_Y_ALIGNMASK (V4TILESIZEY - 1)

static int copyatomstoconstbuf(cl_command_queue clcmdq, cl_mem datominfo,
                               const float *atoms, int count, float zplane) {
  if (count > MAXATOMS) {
    printf("Atom count exceeds constant buffer storage capacity\n");
    return -1;
  }

  float atompre[4*MAXATOMS];
  int i;
  for (i=0; i<count*4; i+=4) {
    atompre[i    ] = atoms[i    ];
    atompre[i + 1] = atoms[i + 1];
    float dz = zplane - atoms[i + 2];
    atompre[i + 2] = dz*dz;
    atompre[i + 3] = atoms[i + 3];
  }

  cl_int clerr = CL_SUCCESS;
  clerr = clEnqueueWriteBuffer(clcmdq, datominfo, CL_TRUE, 0, count * sizeof(cl_float4), (void *) atompre, 0, NULL, NULL);
  if (clerr != CL_SUCCESS)
    return -1;

  return 0;
}

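//
// For readers comparing against the OpenCL kernels above, the sketch below
// is a minimal, unoptimized single-plane CPU reference for the same direct
// Coulomb summation: it accumulates atominfo charge / distance at every
// grid point of one Z plane.  It is illustrative only and not part of VMD;
// the function name is a placeholder, it assumes <math.h> for sqrtf(), and
// it is kept disabled.
//
#if 0
static void cpu_refplane(long int natoms, const float *atoms,
                         long int numcol, long int numpt,
                         float gridspacing, float zplane, float *eners) {
  long int i, j, n;
  for (j=0; j<numcol; j++) {
    float coory = gridspacing * j;
    for (i=0; i<numpt; i++) {
      float coorx = gridspacing * i;
      float energyval = 0.0f;
      for (n=0; n<natoms; n++) {
        // atoms[] holds x/y/z/charge quadruples, as in copyatomstoconstbuf()
        float dx = coorx  - atoms[n*4    ];
        float dy = coory  - atoms[n*4 + 1];
        float dz = zplane - atoms[n*4 + 2];
        energyval += atoms[n*4 + 3] / sqrtf(dx*dx + dy*dy + dz*dz);
      }
      eners[j*numpt + i] += energyval;  // same row-major plane layout as grideners
    }
  }
}
#endif
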
int vmd_opencl_vol_cpotential(long int natoms, float* atoms, float* grideners,
                              long int numplane, long int numcol, long int numpt,
                              float gridspacing) {
  enthrparms parms;
  wkf_timerhandle globaltimer;
  double totalruntime;
  int rc=0;
  int deviceCount = 1; // hard coded for now
  int numprocs = 1;    // hard coded for now

  /* take the lesser of the number of CPUs and GPUs */
  /* and execute that many threads                  */
  if (deviceCount < numprocs) {
    numprocs = deviceCount;
  }

  printf("Using %d OpenCL devices\n", numprocs);
  int usevec4=0;
  if (getenv("VMDDCSVEC4")!=NULL)
    usevec4=1;

  if (usevec4) {
    printf("OpenCL padded grid size: %ld x %ld x %ld\n",
           (numpt  + V4GPU_X_ALIGNMASK) & ~(V4GPU_X_ALIGNMASK),
           (numcol + V4GPU_Y_ALIGNMASK) & ~(V4GPU_Y_ALIGNMASK),
           numplane);
  } else {
    printf("OpenCL padded grid size: %ld x %ld x %ld\n",
           (numpt  + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK),
           (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK),
           numplane);
  }

  parms.atoms = atoms;
  parms.grideners = grideners;
  parms.numplane = numplane;
  parms.numcol = numcol;
  parms.numpt = numpt;
  parms.natoms = natoms;
  parms.gridspacing = gridspacing;

  globaltimer = wkf_timer_create();
  wkf_timer_start(globaltimer);

  /* spawn child threads to do the work */
  wkf_tasktile_t tile;
  tile.start=0;
  tile.end=numplane;
  rc = wkf_threadlaunch(numprocs, &parms, openclenergythread, &tile);

  // Measure GFLOPS
  wkf_timer_stop(globaltimer);
  totalruntime = wkf_timer_time(globaltimer);
  wkf_timer_destroy(globaltimer);

  if (!rc) {
    double atomevalssec = ((double) numplane * numcol * numpt * natoms) / (totalruntime * 1000000000.0);
    printf(" %g billion atom evals/second, %g GFLOPS\n",
           atomevalssec, atomevalssec * FLOPSPERATOMEVAL);
  } else {
    msgWarn << "An OpenCL device encountered an unrecoverable error." << sendmsg;
    msgWarn << "Calculation will continue using the main CPU." << sendmsg;
  }
  return rc;
}

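//
// The program build below injects the host-side tuning macros into the
// kernel source via -D options and enables aggressive math optimizations
// (-cl-fast-relaxed-math, -cl-mad-enable, and related flags).  This is
// consistent with the kernels' use of native_rsqrt(): throughput is favored
// over strict IEEE accuracy for this calculation.
//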
cl_program vmd_opencl_compile_volcpotential_pgm(cl_context clctx, cl_device_id *cldevs, int &clerr) {
  cl_program clpgm = NULL;

  clpgm = clCreateProgramWithSource(clctx, 1, &clenergysrc, NULL, &clerr);
  CLERR

  char clcompileflags[4096];
  sprintf(clcompileflags,
          "-DUNROLLX=%d -DUNROLLY=%d -DBLOCKSIZEX=%d -DBLOCKSIZEY=%d -DBLOCKSIZE=%d "
          "-DV4UNROLLX=%d -DV4UNROLLY=%d -DV4BLOCKSIZEX=%d -DV4BLOCKSIZEY=%d -DV4BLOCKSIZE=%d "
          "-cl-fast-relaxed-math -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros",
          UNROLLX, UNROLLY, BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZE,
          V4UNROLLX, V4UNROLLY, V4BLOCKSIZEX, V4BLOCKSIZEY, V4BLOCKSIZE);

  clerr = clBuildProgram(clpgm, 0, NULL, clcompileflags, NULL, NULL);
  if (clerr != CL_SUCCESS)
    printf("  compilation failed!\n");

  if (cldevs) {
    char buildlog[8192];
    size_t len=0;
    clerr = clGetProgramBuildInfo(clpgm, cldevs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildlog), buildlog, &len);
    if (len > 1) {
      printf("OpenCL compilation log:\n");
      printf("  '%s'\n", buildlog);
    }
    CLERR
  }

  return clpgm;
}

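//
// Per-device worker thread.  Each worker builds its own OpenCL context,
// command queue, and kernels, then repeatedly requests batches of Z planes
// from the shared work-tile scheduler, accumulating the Coulomb potential
// into the caller's grid one tile-padded plane at a time.
//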
static void * openclenergythread(void *voidparms) {
  size_t volsize[3], Gsz[3], Bsz[3];
  cl_int clerr = CL_SUCCESS;
  cl_mem devenergy = NULL;
  cl_mem datominfo = NULL;
  float *hostenergy = NULL;
  enthrparms *parms = NULL;

  int threadid=0;

  wkf_threadlaunch_getid(voidparms, &threadid, NULL);
  wkf_threadlaunch_getdata(voidparms, (void **) &parms);

  /*
   * copy in per-thread parameters
   */
  const float *atoms = parms->atoms;
  float* grideners = parms->grideners;
  const long int numplane = parms->numplane;
  const long int numcol = parms->numcol;
  const long int numpt = parms->numpt;
  const long int natoms = parms->natoms;
  const float gridspacing = parms->gridspacing;
  double lasttime, totaltime;

  printf("OpenCL worker[%d] initializing...\n", threadid);
  cl_platform_id clplatid = vmd_cl_get_platform_index(0);
  cl_context_properties clctxprops[] = {(cl_context_properties) CL_CONTEXT_PLATFORM, (cl_context_properties) clplatid, (cl_context_properties) 0};
#if 0
  //
  // On the IBM "Blue Drop" Power 775 supercomputer, there are no GPUs, but
  // by using OpenCL on the CPU device type, we can better exploit the
  // vector units.  The final NSF/NCSA Blue Waters machine ended up being
  // a Cray XE6/XK7, so this code isn't relevant for production use
  // currently, but there may be other cases where this strategy will
  // be useful in the future.
  //
  cl_context clctx = clCreateContextFromType(clctxprops, CL_DEVICE_TYPE_CPU, NULL, NULL, &clerr);
#else
  cl_context clctx = clCreateContextFromType(clctxprops, CL_DEVICE_TYPE_GPU, NULL, NULL, &clerr);
#endif
  CLERR

  size_t parmsz;
  clerr |= clGetContextInfo(clctx, CL_CONTEXT_DEVICES, 0, NULL, &parmsz);
  CLERR

  cl_device_id* cldevs = (cl_device_id *) malloc(parmsz);
  clerr |= clGetContextInfo(clctx, CL_CONTEXT_DEVICES, parmsz, cldevs, NULL);
  CLERR

  cl_command_queue clcmdq = clCreateCommandQueue(clctx, cldevs[0], 0, &clerr);
  CLERR

  cl_program clpgm = vmd_opencl_compile_volcpotential_pgm(clctx, cldevs, clerr);
  CLERR

  cl_kernel clenergy = clCreateKernel(clpgm, "clenergy", &clerr);
  cl_kernel clenergyvec4 = clCreateKernel(clpgm, "clenergy_vec4", &clerr);
  CLERR
  printf("OpenCL worker[%d] ready.\n", threadid);

  // setup OpenCL grid and block sizes
  int usevec4=0;
  if (getenv("VMDDCSVEC4")!=NULL)
    usevec4=1;

  if (usevec4) {
    // setup energy grid size, padding arrays for peak GPU memory performance
    volsize[0] = (numpt  + V4GPU_X_ALIGNMASK) & ~(V4GPU_X_ALIGNMASK);
    volsize[1] = (numcol + V4GPU_Y_ALIGNMASK) & ~(V4GPU_Y_ALIGNMASK);
    volsize[2] = 1;      // we only do one plane at a time
    Bsz[0] = V4BLOCKSIZEX;
    Bsz[1] = V4BLOCKSIZEY;
    Bsz[2] = 1;
    Gsz[0] = volsize[0] / V4UNROLLX;
    Gsz[1] = volsize[1] / V4UNROLLY;
    Gsz[2] = volsize[2];
  } else {
    // setup energy grid size, padding arrays for peak GPU memory performance
    volsize[0] = (numpt  + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
    volsize[1] = (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
    volsize[2] = 1;      // we only do one plane at a time
    Bsz[0] = BLOCKSIZEX;
    Bsz[1] = BLOCKSIZEY;
    Bsz[2] = 1;
    Gsz[0] = volsize[0] / UNROLLX;
    Gsz[1] = volsize[1] / UNROLLY;
    Gsz[2] = volsize[2];
  }


  int volmemsz = sizeof(float) * volsize[0] * volsize[1] * volsize[2];

  printf("Thread %d started for OpenCL device %d...\n", threadid, threadid);
  wkf_timerhandle timer = wkf_timer_create();
  wkf_timer_start(timer);
  wkfmsgtimer * msgt = wkf_msg_timer_create(5);

  // Allocate DMA buffers with some extra padding at the end so that
  // multiple OpenCL devices aren't DMAing too close to each other, for NUMA.
#define DMABUFPADSIZE (32 * 1024)

  hostenergy = (float *) malloc(volmemsz); // allocate working buffer

  devenergy = clCreateBuffer(clctx, CL_MEM_READ_WRITE, volmemsz, NULL, NULL);
  CLERR

  datominfo = clCreateBuffer(clctx, CL_MEM_READ_ONLY, MAXATOMS * sizeof(cl_float4), NULL, NULL);
  CLERR

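  //
  // Buffer roles: hostenergy is the host-side staging copy of one
  // tile-padded plane, devenergy is the device-resident copy of that plane,
  // and datominfo is the __constant-space buffer that receives up to
  // MAXATOMS preprocessed atoms (x, y, dz^2, q) per kernel launch.
  //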
  // For each point in the cube...
  int iterations=0;
  int computedplanes=0;
  wkf_tasktile_t tile;
  while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
    int k;
    for (k=tile.start; k<tile.end; k++) {
      int y;
      int atomstart;
      float zplane = k * (float) gridspacing;
      computedplanes++; // track work done by this GPU for progress reporting

      // Copy energy grid rows into the tile-padded GPU input buffer
      for (y=0; y<numcol; y++) {
        long eneraddr = k*numcol*numpt + y*numpt;
        memcpy(&hostenergy[y*volsize[0]], &grideners[eneraddr], numpt * sizeof(float));
      }

      // Copy the host input data to the GPU..
      clEnqueueWriteBuffer(clcmdq, devenergy, CL_TRUE, 0, volmemsz, hostenergy, 0, NULL, NULL);
      CLERR // check and clear any existing errors

      lasttime = wkf_timer_timenow(timer);
      for (atomstart=0; atomstart<natoms; atomstart+=MAXATOMS) {
        iterations++;
        int runatoms;
        int atomsremaining = natoms - atomstart;
        if (atomsremaining > MAXATOMS)
          runatoms = MAXATOMS;
        else
          runatoms = atomsremaining;

        // copy the next group of atoms to the GPU
        if (copyatomstoconstbuf(clcmdq, datominfo,
                                atoms + 4*atomstart, runatoms, zplane))
          return NULL;

        cl_kernel clkern;
        if (usevec4)
          clkern = clenergyvec4;
        else
          clkern = clenergy;

        // RUN the kernel...
        clerr |= clSetKernelArg(clkern, 0, sizeof(int), &runatoms);
        clerr |= clSetKernelArg(clkern, 1, sizeof(float), &gridspacing);
        clerr |= clSetKernelArg(clkern, 2, sizeof(cl_mem), &devenergy);
        clerr |= clSetKernelArg(clkern, 3, sizeof(cl_mem), &datominfo);
        CLERR
        cl_event event;
#if 0
        printf("Gsz: %ld %ld %ld Bsz: %ld %ld %ld\n",
               Gsz[0], Gsz[1], Gsz[2], Bsz[0], Bsz[1], Bsz[2]);
#endif
        clerr |= clEnqueueNDRangeKernel(clcmdq, clkern, 2, NULL, Gsz, Bsz, 0, NULL, &event);
        CLERR

        clerr |= clWaitForEvents(1, &event);
        clerr |= clReleaseEvent(event);
        CLERR // check and clear any existing errors
      }
      clFinish(clcmdq);

      // Copy the GPU output data back to the host and use/store it..
      clEnqueueReadBuffer(clcmdq, devenergy, CL_TRUE, 0, volmemsz, hostenergy,
                          0, NULL, NULL);

      CLERR // check and clear any existing errors

      // Copy GPU blocksize padded array back down to the original size
      for (y=0; y<numcol; y++) {
        long eneraddr = k*numcol*numpt + y*numpt;
        memcpy(&grideners[eneraddr], &hostenergy[y*volsize[0]], numpt * sizeof(float));
      }

      totaltime = wkf_timer_timenow(timer);
      if (wkf_msg_timer_timeout(msgt)) {
        // XXX: we have to use printf here as msgInfo is not thread-safe yet.
        printf("thread[%d] plane %d/%ld (%d computed) time %.2f, elapsed %.1f, est. total: %.1f\n",
               threadid, k, numplane, computedplanes,
               totaltime - lasttime, totaltime,
               totaltime * numplane / (k+1));
      }
    }
  }

  wkf_timer_destroy(timer);    // free timer
  wkf_msg_timer_destroy(msgt); // free timer
  free(hostenergy);            // free working buffer

  printf("destroying context, programs, etc\n");
  clReleaseMemObject(devenergy);
  clReleaseMemObject(datominfo);
  clReleaseKernel(clenergy);
  clReleaseKernel(clenergyvec4);
  clReleaseProgram(clpgm);
  clReleaseCommandQueue(clcmdq);
  clReleaseContext(clctx);
  free(cldevs);
  printf("done.\n");

  CLERR // check and clear any existing errors

  return NULL;
}

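//
// Illustrative only (not part of VMD): a minimal sketch of how the entry
// point above might be driven with a toy problem, assuming the atom array
// holds x/y/z/charge quadruples and the potential grid is laid out as
// grideners[k*numcol*numpt + y*numpt + x].  The function name is a
// placeholder, error handling and units are omitted, and the block is
// kept disabled.
//
#if 0
static int volcpotential_smoke_test(void) {
  const long int numplane = 4, numcol = 16, numpt = 16;
  const float gridspacing = 0.5f;
  float atoms[2*4] = {
    1.0f, 1.0f, 1.0f,  1.0f,   // atom 0: x, y, z, charge
    3.0f, 2.0f, 0.5f, -1.0f    // atom 1: x, y, z, charge
  };
  float *grideners = (float *) calloc(numplane * numcol * numpt, sizeof(float));
  int rc = vmd_opencl_vol_cpotential(2, atoms, grideners,
                                     numplane, numcol, numpt, gridspacing);
  free(grideners);
  return rc;
}
#endif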