00001 /*************************************************************************** 00002 *cr 00003 *cr (C) Copyright 1995-2019 The Board of Trustees of the 00004 *cr University of Illinois 00005 *cr All Rights Reserved 00006 *cr 00007 ***************************************************************************/ 00008 /*************************************************************************** 00009 * RCS INFORMATION: 00010 * 00011 * $RCSfile: CUDAQuickSurf.cu,v $ 00012 * $Author: johns $ $Locker: $ $State: Exp $ 00013 * $Revision: 1.97 $ $Date: 2021年12月21日 05:30:12 $ 00014 * 00015 ***************************************************************************/ 00042 #include <stdio.h> 00043 #include <stdlib.h> 00044 #include <string.h> 00045 #include <cuda.h> 00046 #if CUDART_VERSION >= 9000 00047 #include <cuda_fp16.h> // need to explicitly include for CUDA 9.0 00048 #endif 00049 00050 #if CUDART_VERSION < 4000 00051 #error The VMD QuickSurf feature requires CUDA 4.0 or later 00052 #endif 00053 00054 #include "Inform.h" 00055 #include "utilities.h" 00056 #include "WKFThreads.h" 00057 #include "WKFUtils.h" 00058 #include "CUDAKernels.h" 00059 #include "CUDASpatialSearch.h" 00060 #include "CUDAMarchingCubes.h" 00061 #include "CUDAQuickSurf.h" 00062 00063 #include "DispCmds.h" 00064 #include "VMDDisplayList.h" 00065 00066 #if 1 00067 #define CUERR { cudaError_t err; \ 00068 if ((err = cudaGetLastError()) != cudaSuccess) { \ 00069 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \ 00070 printf("Thread aborting...\n"); \ 00071 return NULL; }} 00072 #else 00073 #define CUERR 00074 #endif 00075 00076 00077 // 00078 // general operator overload helper routines 00079 // 00080 00081 // "*" operator 00082 inline __host__ __device__ float3 operator*(float3 a, float b) { 00083 return make_float3(b * a.x, b * a.y, b * a.z); 00084 } 00085 00086 // "+=" operator 00087 inline __host__ __device__ void operator+=(float3 &a, float3 b) { 00088 a.x += b.x; 00089 a.y += b.y; 00090 a.z += b.z; 00091 } 00092 00093 // down-convert float4 to float3 00094 inline __device__ float3 make_float3(float4 a) { 00095 float3 b; 00096 b.x = a.x; 00097 b.y = a.y; 00098 b.z = a.z; 00099 return b; 00100 } 00101 00102 00103 // 00104 // density format conversion routines 00105 // 00106 00107 // no-op conversion for float to float 00108 inline __device__ void convert_density(float & df, float df2) { 00109 df = df2; 00110 } 00111 00112 // Convert float (32-bit) to half-precision (16-bit floating point) stored 00113 // into an unsigned short (16-bit integer type). 00114 inline __device__ void convert_density(unsigned short & dh, float df2) { 00115 dh = __float2half_rn(df2); 00116 } 00117 00118 00119 00120 // 00121 // color format conversion routines 00122 // 00123 00124 // conversion for float3 to float3 00125 inline __device__ void convert_color(float3 & cf, float3 cf2, float invisovalue) { 00126 cf = cf2 * invisovalue; 00127 } 00128 00129 00130 // Convert float3 colors to uchar4, performing the necessary bias, scaling, 00131 // and range clamping so we don't encounter integer wraparound, etc. 00132 inline __device__ void convert_color(uchar4 & cu, float3 cf, float invisovalue) { 00133 // conversion to GLubyte format, Table 2.6, p. 44 of OpenGL spec 1.2.1 00134 // c = f * (2^8-1) 00135 00136 #if 1 00137 float3 cftmp = cf * invisovalue; 00138 cu = make_uchar4(fminf(cftmp.x * 255.0f, 255.0f), 00139 fminf(cftmp.y * 255.0f, 255.0f), 00140 fminf(cftmp.z * 255.0f, 255.0f), 00141 255); 00142 #else 00143 // scale color values to prevent overflow, 00144 // and convert to fixed-point representation all at once 00145 float invmaxcolscale = __frcp_rn(fmaxf(fmaxf(fmaxf(cf.x, cf.y), cf.z), 1.0f)) * 255.0f; 00146 00147 // clamp color values to prevent integer wraparound 00148 cu = make_uchar4(cf.x * invmaxcolscale, 00149 cf.y * invmaxcolscale, 00150 cf.z * invmaxcolscale, 00151 255); 00152 #endif 00153 } 00154 00155 00156 // convert uchar4 colors to float3 00157 inline __device__ void convert_color(float3 & cf, uchar4 cu) { 00158 const float i2f = 1.0f / 255.0f; 00159 cf.x = cu.x * i2f; 00160 cf.y = cu.y * i2f; 00161 cf.z = cu.z * i2f; 00162 } 00163 00164 00165 // 00166 // Restrict macro to make it easy to do perf tuning tests 00167 // 00168 #if 1 && (CUDART_VERSION >= 4000) 00169 #define RESTRICT __restrict__ 00170 #else 00171 #define RESTRICT 00172 #endif 00173 00174 // 00175 // Parameters for linear-time range-limited gaussian density kernels 00176 // 00177 #define GGRIDSZ 8.0f 00178 #define GBLOCKSZX 8 00179 #define GBLOCKSZY 8 00180 00181 #if 1 00182 #define GTEXBLOCKSZZ 2 00183 #define GTEXUNROLL 4 00184 #define GBLOCKSZZ 2 00185 #define GUNROLL 4 00186 #else 00187 #define GTEXBLOCKSZZ 8 00188 #define GTEXUNROLL 1 00189 #define GBLOCKSZZ 8 00190 #define GUNROLL 1 00191 #endif 00192 00193 #define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ ) 00194 #if __CUDA_ARCH__ >= 600 00195 #define MINBLOCKDENS 16 00196 #elif __CUDA_ARCH__ >= 300 00197 #define MINBLOCKDENS 16 00198 #elif __CUDA_ARCH__ >= 200 00199 #define MINBLOCKDENS 1 00200 #else 00201 #define MINBLOCKDENS 1 00202 #endif 00203 00204 00205 // 00206 // Templated version of the density map kernel to handle multiple 00207 // data formats for the output density volume and volumetric texture. 00208 // This variant of the density map algorithm normalizes densities so 00209 // that the target isovalue is a density of 1.0. 00210 // 00211 template<class DENSITY, class VOLTEX> 00212 __global__ static void 00213 __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS ) 00214 gaussdensity_fast_tex_norm(int natoms, 00215 const float4 * RESTRICT sorted_xyzr, 00216 const float4 * RESTRICT sorted_color, 00217 int3 volsz, 00218 int3 acncells, 00219 float acgridspacing, 00220 float invacgridspacing, 00221 const uint2 * RESTRICT cellStartEnd, 00222 float gridspacing, unsigned int z, 00223 DENSITY * RESTRICT densitygrid, 00224 VOLTEX * RESTRICT voltexmap, 00225 float invisovalue) { 00226 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x; 00227 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y; 00228 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GTEXUNROLL; 00229 00230 // shave register use slightly 00231 unsigned int outaddr = zindex * volsz.x * volsz.y + 00232 yindex * volsz.x + xindex; 00233 00234 // early exit if this thread is outside of the grid bounds 00235 if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z) 00236 return; 00237 00238 zindex += z; 00239 00240 // compute ac grid index of lower corner minus gaussian radius 00241 int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing; 00242 int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing; 00243 int zabmin = ((z + blockIdx.z * blockDim.z * GTEXUNROLL) * gridspacing - acgridspacing) * invacgridspacing; 00244 00245 // compute ac grid index of upper corner plus gaussian radius 00246 int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing; 00247 int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing; 00248 int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GTEXUNROLL) * gridspacing + acgridspacing) * invacgridspacing; 00249 00250 xabmin = (xabmin < 0) ? 0 : xabmin; 00251 yabmin = (yabmin < 0) ? 0 : yabmin; 00252 zabmin = (zabmin < 0) ? 0 : zabmin; 00253 xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax; 00254 yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax; 00255 zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax; 00256 00257 float coorx = gridspacing * xindex; 00258 float coory = gridspacing * yindex; 00259 float coorz = gridspacing * zindex; 00260 00261 float densityval1=0.0f; 00262 float3 densitycol1=make_float3(0.0f, 0.0f, 0.0f); 00263 #if GTEXUNROLL >= 2 00264 float densityval2=0.0f; 00265 float3 densitycol2=densitycol1; 00266 #endif 00267 #if GTEXUNROLL >= 4 00268 float densityval3=0.0f; 00269 float3 densitycol3=densitycol1; 00270 float densityval4=0.0f; 00271 float3 densitycol4=densitycol1; 00272 #endif 00273 00274 int acplanesz = acncells.x * acncells.y; 00275 int xab, yab, zab; 00276 for (zab=zabmin; zab<=zabmax; zab++) { 00277 for (yab=yabmin; yab<=yabmax; yab++) { 00278 for (xab=xabmin; xab<=xabmax; xab++) { 00279 int abcellidx = zab * acplanesz + yab * acncells.x + xab; 00280 // this biggest latency hotspot in the kernel, if we could improve 00281 // packing of the grid cell map, we'd likely improve performance 00282 uint2 atomstartend = cellStartEnd[abcellidx]; 00283 if (atomstartend.x != GRID_CELL_EMPTY) { 00284 unsigned int atomid; 00285 for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) { 00286 float4 atom = sorted_xyzr[atomid]; 00287 float4 color = sorted_color[atomid]; 00288 float dx = coorx - atom.x; 00289 float dy = coory - atom.y; 00290 float dxy2 = dx*dx + dy*dy; 00291 00292 float dz = coorz - atom.z; 00293 float r21 = (dxy2 + dz*dz) * atom.w; 00294 float tmp1 = exp2f(r21); 00295 densityval1 += tmp1; 00296 densitycol1.x += tmp1 * color.x; 00297 densitycol1.y += tmp1 * color.y; 00298 densitycol1.z += tmp1 * color.z; 00299 00300 #if GTEXUNROLL >= 2 00301 float dz2 = dz + gridspacing; 00302 float r22 = (dxy2 + dz2*dz2) * atom.w; 00303 float tmp2 = exp2f(r22); 00304 densityval2 += tmp2; 00305 densitycol2.x += tmp2 * color.x; 00306 densitycol2.y += tmp2 * color.y; 00307 densitycol2.z += tmp2 * color.z; 00308 #endif 00309 #if GTEXUNROLL >= 4 00310 float dz3 = dz2 + gridspacing; 00311 float r23 = (dxy2 + dz3*dz3) * atom.w; 00312 float tmp3 = exp2f(r23); 00313 densityval3 += tmp3; 00314 densitycol3.x += tmp3 * color.x; 00315 densitycol3.y += tmp3 * color.y; 00316 densitycol3.z += tmp3 * color.z; 00317 00318 float dz4 = dz3 + gridspacing; 00319 float r24 = (dxy2 + dz4*dz4) * atom.w; 00320 float tmp4 = exp2f(r24); 00321 densityval4 += tmp4; 00322 densitycol4.x += tmp4 * color.x; 00323 densitycol4.y += tmp4 * color.y; 00324 densitycol4.z += tmp4 * color.z; 00325 #endif 00326 } 00327 } 00328 } 00329 } 00330 } 00331 00332 DENSITY densityout; 00333 VOLTEX texout; 00334 convert_density(densityout, densityval1 * invisovalue); 00335 densitygrid[outaddr ] = densityout; 00336 convert_color(texout, densitycol1, invisovalue); 00337 voltexmap[outaddr ] = texout; 00338 00339 #if GTEXUNROLL >= 2 00340 int planesz = volsz.x * volsz.y; 00341 convert_density(densityout, densityval2 * invisovalue); 00342 densitygrid[outaddr + planesz] = densityout; 00343 convert_color(texout, densitycol2, invisovalue); 00344 voltexmap[outaddr + planesz] = texout; 00345 #endif 00346 #if GTEXUNROLL >= 4 00347 convert_density(densityout, densityval3 * invisovalue); 00348 densitygrid[outaddr + 2*planesz] = densityout; 00349 convert_color(texout, densitycol3, invisovalue); 00350 voltexmap[outaddr + 2*planesz] = texout; 00351 00352 convert_density(densityout, densityval4 * invisovalue); 00353 densitygrid[outaddr + 3*planesz] = densityout; 00354 convert_color(texout, densitycol4, invisovalue); 00355 voltexmap[outaddr + 3*planesz] = texout; 00356 #endif 00357 } 00358 00359 00360 __global__ static void 00361 __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS ) 00362 gaussdensity_fast_tex3f(int natoms, 00363 const float4 * RESTRICT sorted_xyzr, 00364 const float4 * RESTRICT sorted_color, 00365 int3 volsz, 00366 int3 acncells, 00367 float acgridspacing, 00368 float invacgridspacing, 00369 const uint2 * RESTRICT cellStartEnd, 00370 float gridspacing, unsigned int z, 00371 float * RESTRICT densitygrid, 00372 float3 * RESTRICT voltexmap, 00373 float invisovalue) { 00374 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x; 00375 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y; 00376 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GTEXUNROLL; 00377 00378 // shave register use slightly 00379 unsigned int outaddr = zindex * volsz.x * volsz.y + 00380 yindex * volsz.x + xindex; 00381 00382 // early exit if this thread is outside of the grid bounds 00383 if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z) 00384 return; 00385 00386 zindex += z; 00387 00388 // compute ac grid index of lower corner minus gaussian radius 00389 int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing; 00390 int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing; 00391 int zabmin = ((z + blockIdx.z * blockDim.z * GTEXUNROLL) * gridspacing - acgridspacing) * invacgridspacing; 00392 00393 // compute ac grid index of upper corner plus gaussian radius 00394 int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing; 00395 int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing; 00396 int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GTEXUNROLL) * gridspacing + acgridspacing) * invacgridspacing; 00397 00398 xabmin = (xabmin < 0) ? 0 : xabmin; 00399 yabmin = (yabmin < 0) ? 0 : yabmin; 00400 zabmin = (zabmin < 0) ? 0 : zabmin; 00401 xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax; 00402 yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax; 00403 zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax; 00404 00405 float coorx = gridspacing * xindex; 00406 float coory = gridspacing * yindex; 00407 float coorz = gridspacing * zindex; 00408 00409 float densityval1=0.0f; 00410 float3 densitycol1=make_float3(0.0f, 0.0f, 0.0f); 00411 #if GTEXUNROLL >= 2 00412 float densityval2=0.0f; 00413 float3 densitycol2=densitycol1; 00414 #endif 00415 #if GTEXUNROLL >= 4 00416 float densityval3=0.0f; 00417 float3 densitycol3=densitycol1; 00418 float densityval4=0.0f; 00419 float3 densitycol4=densitycol1; 00420 #endif 00421 00422 int acplanesz = acncells.x * acncells.y; 00423 int xab, yab, zab; 00424 for (zab=zabmin; zab<=zabmax; zab++) { 00425 for (yab=yabmin; yab<=yabmax; yab++) { 00426 for (xab=xabmin; xab<=xabmax; xab++) { 00427 int abcellidx = zab * acplanesz + yab * acncells.x + xab; 00428 // this biggest latency hotspot in the kernel, if we could improve 00429 // packing of the grid cell map, we'd likely improve performance 00430 uint2 atomstartend = cellStartEnd[abcellidx]; 00431 if (atomstartend.x != GRID_CELL_EMPTY) { 00432 unsigned int atomid; 00433 for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) { 00434 float4 atom = sorted_xyzr[atomid]; 00435 float3 color = make_float3(sorted_color[atomid]); 00436 float dx = coorx - atom.x; 00437 float dy = coory - atom.y; 00438 float dxy2 = dx*dx + dy*dy; 00439 00440 float dz = coorz - atom.z; 00441 float r21 = (dxy2 + dz*dz) * atom.w; 00442 float tmp1 = exp2f(r21); 00443 densityval1 += tmp1; 00444 densitycol1 += color * tmp1; 00445 00446 #if GTEXUNROLL >= 2 00447 float dz2 = dz + gridspacing; 00448 float r22 = (dxy2 + dz2*dz2) * atom.w; 00449 float tmp2 = exp2f(r22); 00450 densityval2 += tmp2; 00451 densitycol2 += color * tmp2; 00452 #endif 00453 #if GTEXUNROLL >= 4 00454 float dz3 = dz2 + gridspacing; 00455 float r23 = (dxy2 + dz3*dz3) * atom.w; 00456 float tmp3 = exp2f(r23); 00457 densityval3 += tmp3; 00458 densitycol3 += color * tmp3; 00459 00460 float dz4 = dz3 + gridspacing; 00461 float r24 = (dxy2 + dz4*dz4) * atom.w; 00462 float tmp4 = exp2f(r24); 00463 densityval4 += tmp4; 00464 densitycol4 += color * tmp4; 00465 #endif 00466 } 00467 } 00468 } 00469 } 00470 } 00471 00472 float3 texout; 00473 densitygrid[outaddr ] = densityval1; 00474 convert_color(texout, densitycol1, invisovalue); 00475 voltexmap[outaddr ] = texout; 00476 00477 #if GTEXUNROLL >= 2 00478 int planesz = volsz.x * volsz.y; 00479 densitygrid[outaddr + planesz] = densityval2; 00480 convert_color(texout, densitycol2, invisovalue); 00481 voltexmap[outaddr + planesz] = texout; 00482 #endif 00483 #if GTEXUNROLL >= 4 00484 densitygrid[outaddr + 2*planesz] = densityval3; 00485 convert_color(texout, densitycol3, invisovalue); 00486 voltexmap[outaddr + 2*planesz] = texout; 00487 00488 densitygrid[outaddr + 3*planesz] = densityval4; 00489 convert_color(texout, densitycol4, invisovalue); 00490 voltexmap[outaddr + 3*planesz] = texout; 00491 #endif 00492 } 00493 00494 00495 __global__ static void 00496 // __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS ) 00497 gaussdensity_fast(int natoms, 00498 const float4 * RESTRICT sorted_xyzr, 00499 int3 volsz, 00500 int3 acncells, 00501 float acgridspacing, 00502 float invacgridspacing, 00503 const uint2 * RESTRICT cellStartEnd, 00504 float gridspacing, unsigned int z, 00505 float * RESTRICT densitygrid) { 00506 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x; 00507 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y; 00508 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL; 00509 unsigned int outaddr = zindex * volsz.x * volsz.y + 00510 yindex * volsz.x + 00511 xindex; 00512 00513 // early exit if this thread is outside of the grid bounds 00514 if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z) 00515 return; 00516 00517 zindex += z; 00518 00519 // compute ac grid index of lower corner minus gaussian radius 00520 int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing; 00521 int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing; 00522 int zabmin = ((z + blockIdx.z * blockDim.z * GUNROLL) * gridspacing - acgridspacing) * invacgridspacing; 00523 00524 // compute ac grid index of upper corner plus gaussian radius 00525 int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing; 00526 int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing; 00527 int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GUNROLL) * gridspacing + acgridspacing) * invacgridspacing; 00528 00529 xabmin = (xabmin < 0) ? 0 : xabmin; 00530 yabmin = (yabmin < 0) ? 0 : yabmin; 00531 zabmin = (zabmin < 0) ? 0 : zabmin; 00532 xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax; 00533 yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax; 00534 zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax; 00535 00536 float coorx = gridspacing * xindex; 00537 float coory = gridspacing * yindex; 00538 float coorz = gridspacing * zindex; 00539 00540 float densityval1=0.0f; 00541 #if GUNROLL >= 2 00542 float densityval2=0.0f; 00543 #endif 00544 #if GUNROLL >= 4 00545 float densityval3=0.0f; 00546 float densityval4=0.0f; 00547 #endif 00548 00549 int acplanesz = acncells.x * acncells.y; 00550 int xab, yab, zab; 00551 for (zab=zabmin; zab<=zabmax; zab++) { 00552 for (yab=yabmin; yab<=yabmax; yab++) { 00553 for (xab=xabmin; xab<=xabmax; xab++) { 00554 int abcellidx = zab * acplanesz + yab * acncells.x + xab; 00555 uint2 atomstartend = cellStartEnd[abcellidx]; 00556 if (atomstartend.x != GRID_CELL_EMPTY) { 00557 unsigned int atomid; 00558 for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) { 00559 float4 atom = sorted_xyzr[atomid]; 00560 float dx = coorx - atom.x; 00561 float dy = coory - atom.y; 00562 float dxy2 = dx*dx + dy*dy; 00563 00564 float dz = coorz - atom.z; 00565 float r21 = (dxy2 + dz*dz) * atom.w; 00566 densityval1 += exp2f(r21); 00567 00568 #if GUNROLL >= 2 00569 float dz2 = dz + gridspacing; 00570 float r22 = (dxy2 + dz2*dz2) * atom.w; 00571 densityval2 += exp2f(r22); 00572 #endif 00573 #if GUNROLL >= 4 00574 float dz3 = dz2 + gridspacing; 00575 float r23 = (dxy2 + dz3*dz3) * atom.w; 00576 densityval3 += exp2f(r23); 00577 00578 float dz4 = dz3 + gridspacing; 00579 float r24 = (dxy2 + dz4*dz4) * atom.w; 00580 densityval4 += exp2f(r24); 00581 #endif 00582 } 00583 } 00584 } 00585 } 00586 } 00587 00588 densitygrid[outaddr ] = densityval1; 00589 #if GUNROLL >= 2 00590 int planesz = volsz.x * volsz.y; 00591 densitygrid[outaddr + planesz] = densityval2; 00592 #endif 00593 #if GUNROLL >= 4 00594 densitygrid[outaddr + 2*planesz] = densityval3; 00595 densitygrid[outaddr + 3*planesz] = densityval4; 00596 #endif 00597 } 00598 00599 00600 // per-GPU handle with various memory buffer pointers, etc. 00601 typedef struct { 00602 cudaDeviceProp deviceProp; 00603 int dev; 00604 int verbose; 00605 00607 long int natoms; 00608 int colorperatom; 00609 CUDAQuickSurf::VolTexFormat vtexformat; 00610 int acx; 00611 int acy; 00612 int acz; 00613 int gx; 00614 int gy; 00615 int gz; 00616 00617 CUDAMarchingCubes *mc; 00618 00619 float *devdensity; 00620 void *devvoltexmap; 00621 float4 *xyzr_d; 00622 float4 *sorted_xyzr_d; 00623 float4 *color_d; 00624 float4 *sorted_color_d; 00625 00626 unsigned int *atomIndex_d; 00627 unsigned int *atomHash_d; 00628 uint2 *cellStartEnd_d; 00629 00630 void *safety; 00631 00632 float3 *v3f_d; 00633 float3 *n3f_d; 00634 float3 *c3f_d; 00635 char3 *n3b_d; 00636 uchar4 *c4u_d; 00637 } qsurf_gpuhandle; 00638 00639 00640 CUDAQuickSurf::CUDAQuickSurf() { 00641 voidgpu = calloc(1, sizeof(qsurf_gpuhandle)); 00642 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu; 00643 00644 if (getenv("VMDQUICKSURFVERBOSE") != NULL) { 00645 gpuh->verbose = 1; 00646 int tmp = atoi(getenv("VMDQUICKSURFVERBOSE")); 00647 if (tmp > 0) 00648 gpuh->verbose = tmp; 00649 } 00650 00651 gpuh->vtexformat = RGB3F; // large size allocs by default 00652 00653 if (cudaGetDevice(&gpuh->dev) != cudaSuccess) { 00654 gpuh->dev = -1; // flag GPU as unusable 00655 } 00656 00657 if (cudaGetDeviceProperties(&gpuh->deviceProp, gpuh->dev) != cudaSuccess) { 00658 cudaError_t err = cudaGetLastError(); // eat error so next CUDA op succeeds 00659 gpuh->dev = -1; // flag GPU as unusable 00660 } 00661 } 00662 00663 00664 CUDAQuickSurf::~CUDAQuickSurf() { 00665 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu; 00666 00667 // free all working buffers if not done already 00668 free_bufs(); 00669 00670 // delete marching cubes object 00671 delete gpuh->mc; 00672 00673 free(voidgpu); 00674 } 00675 00676 00677 int CUDAQuickSurf::free_bufs() { 00678 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu; 00679 00680 // zero out max buffer capacities 00681 gpuh->natoms = 0; 00682 gpuh->colorperatom = 0; 00683 gpuh->acx = 0; 00684 gpuh->acy = 0; 00685 gpuh->acz = 0; 00686 gpuh->gx = 0; 00687 gpuh->gy = 0; 00688 gpuh->gz = 0; 00689 00690 if (gpuh->safety != NULL) 00691 cudaFree(gpuh->safety); 00692 gpuh->safety=NULL; 00693 00694 if (gpuh->devdensity != NULL) 00695 cudaFree(gpuh->devdensity); 00696 gpuh->devdensity=NULL; 00697 00698 if (gpuh->devvoltexmap != NULL) 00699 cudaFree(gpuh->devvoltexmap); 00700 gpuh->devvoltexmap=NULL; 00701 00702 if (gpuh->xyzr_d != NULL) 00703 cudaFree(gpuh->xyzr_d); 00704 gpuh->xyzr_d=NULL; 00705 00706 if (gpuh->sorted_xyzr_d != NULL) 00707 cudaFree(gpuh->sorted_xyzr_d); 00708 gpuh->sorted_xyzr_d=NULL; 00709 00710 if (gpuh->color_d != NULL) 00711 cudaFree(gpuh->color_d); 00712 gpuh->color_d=NULL; 00713 00714 if (gpuh->sorted_color_d != NULL) 00715 cudaFree(gpuh->sorted_color_d); 00716 gpuh->sorted_color_d=NULL; 00717 00718 if (gpuh->atomIndex_d != NULL) 00719 cudaFree(gpuh->atomIndex_d); 00720 gpuh->atomIndex_d=NULL; 00721 00722 if (gpuh->atomHash_d != NULL) 00723 cudaFree(gpuh->atomHash_d); 00724 gpuh->atomHash_d=NULL; 00725 00726 if (gpuh->cellStartEnd_d != NULL) 00727 cudaFree(gpuh->cellStartEnd_d); 00728 gpuh->cellStartEnd_d=NULL; 00729 00730 if (gpuh->v3f_d != NULL) 00731 cudaFree(gpuh->v3f_d); 00732 gpuh->v3f_d=NULL; 00733 00734 if (gpuh->n3f_d != NULL) 00735 cudaFree(gpuh->n3f_d); 00736 gpuh->n3f_d=NULL; 00737 00738 if (gpuh->c3f_d != NULL) 00739 cudaFree(gpuh->c3f_d); 00740 gpuh->c3f_d=NULL; 00741 00742 if (gpuh->n3b_d != NULL) 00743 cudaFree(gpuh->n3b_d); 00744 gpuh->n3b_d=NULL; 00745 00746 if (gpuh->c4u_d != NULL) 00747 cudaFree(gpuh->c4u_d); 00748 gpuh->c4u_d=NULL; 00749 00750 00751 return 0; 00752 } 00753 00754 00755 int CUDAQuickSurf::check_bufs(long int natoms, int colorperatom, 00756 VolTexFormat vtexformat, 00757 int acx, int acy, int acz, 00758 int gx, int gy, int gz) { 00759 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu; 00760 00761 // If the current atom count, texturing mode, and total voxel count 00762 // use the same or less storage than the size of the existing buffers, 00763 // we can reuse the same buffers without having to go through the 00764 // complex allocation and validation loops. This is a big performance 00765 // benefit during trajectory animation. 00766 if (natoms <= gpuh->natoms && 00767 colorperatom <= gpuh->colorperatom && 00768 vtexformat == gpuh->vtexformat && 00769 (acx*acy*acz) <= (gpuh->acx * gpuh->acy * gpuh->acz) && 00770 (gx*gy*gz) <= (gpuh->gx * gpuh->gy * gpuh->gz)) 00771 return 0; 00772 00773 return -1; // no existing bufs, or too small to be used 00774 } 00775 00776 00777 int CUDAQuickSurf::alloc_bufs(long int natoms, int colorperatom, 00778 VolTexFormat vtexformat, 00779 int acx, int acy, int acz, 00780 int gx, int gy, int gz) { 00781 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu; 00782 00783 // early exit from allocation call if we've already got existing 00784 // buffers that are large enough to support the request 00785 if (check_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, gx, gy, gz) == 0) 00786 return 0; 00787 00788 // If we have any existing allocations, trash them as they weren't 00789 // usable for this new request and we need to reallocate them from scratch 00790 free_bufs(); 00791 00792 long int acncells = ((long) acx) * ((long) acy) * ((long) acz); 00793 long int ncells = ((long) gx) * ((long) gy) * ((long) gz); 00794 long int volmemsz = ncells * sizeof(float); 00795 long int chunkmaxverts = 3L * ncells; // assume worst case 50% triangle occupancy 00796 long int MCsz = CUDAMarchingCubes::MemUsageMC(gx, gy, gz); 00797 00798 // Allocate all of the memory buffers our algorithms will need up-front, 00799 // so we can retry and gracefully reduce the sizes of various buffers 00800 // to attempt to fit within available GPU memory 00801 long int totalmemsz = 00802 volmemsz + // volume 00803 (2L * natoms * sizeof(unsigned int)) + // bin sort 00804 (acncells * sizeof(uint2)) + // bin sort 00805 (3L * chunkmaxverts * sizeof(float3)) + // MC vertex bufs 00806 natoms*sizeof(float4) + // thrust 00807 8L * gx * gy * sizeof(float) + // thrust 00808 MCsz; // mcubes 00809 00810 cudaMalloc((void**)&gpuh->devdensity, volmemsz); 00811 if (colorperatom) { 00812 int voltexsz = 0; 00813 switch (vtexformat) { 00814 case RGB3F: 00815 voltexsz = ncells * sizeof(float3); 00816 break; 00817 00818 case RGB4U: 00819 voltexsz = ncells * sizeof(uchar4); 00820 break; 00821 } 00822 gpuh->vtexformat = vtexformat; 00823 00824 cudaMalloc((void**)&gpuh->devvoltexmap, voltexsz); 00825 cudaMalloc((void**)&gpuh->color_d, natoms * sizeof(float4)); 00826 cudaMalloc((void**)&gpuh->sorted_color_d, natoms * sizeof(float4)); 00827 totalmemsz += 2 * natoms * sizeof(float4); 00828 } 00829 cudaMalloc((void**)&gpuh->xyzr_d, natoms * sizeof(float4)); 00830 cudaMalloc((void**)&gpuh->sorted_xyzr_d, natoms * sizeof(float4)); 00831 cudaMalloc((void**)&gpuh->atomIndex_d, natoms * sizeof(unsigned int)); 00832 cudaMalloc((void**)&gpuh->atomHash_d, natoms * sizeof(unsigned int)); 00833 cudaMalloc((void**)&gpuh->cellStartEnd_d, acncells * sizeof(uint2)); 00834 00835 // allocate marching cubes output buffers 00836 cudaMalloc((void**)&gpuh->v3f_d, 3 * chunkmaxverts * sizeof(float3)); 00837 #if 1 00838 // Note: The CUDA char3 type is explicitly a signed type 00839 cudaMalloc((void**)&gpuh->n3b_d, 3 * chunkmaxverts * sizeof(char3)); 00840 totalmemsz += 3 * chunkmaxverts * sizeof(char3); // MC normal bufs 00841 #else 00842 cudaMalloc((void**)&gpuh->n3f_d, 3 * chunkmaxverts * sizeof(float3)); 00843 totalmemsz += 3 * chunkmaxverts * sizeof(float3); // MC normal bufs 00844 #endif 00845 #if 1 00846 cudaMalloc((void**)&gpuh->c4u_d, 3 * chunkmaxverts * sizeof(uchar4)); 00847 totalmemsz += 3 * chunkmaxverts * sizeof(uchar4); // MC vertex color bufs 00848 #else 00849 cudaMalloc((void**)&gpuh->c3f_d, 3 * chunkmaxverts * sizeof(float3)); 00850 totalmemsz += 3 * chunkmaxverts * sizeof(float3); // MC vertex color bufs 00851 #endif 00852 00853 // Allocate an extra phantom array to act as a safety net to 00854 // ensure that subsequent allocations performed internally by 00855 // the NVIDIA thrust template library or by our 00856 // marching cubes implementation don't fail, since we can't 00857 // currently pre-allocate all of them. 00858 cudaMalloc(&gpuh->safety, 00859 natoms*sizeof(float4) + // thrust 00860 8 * gx * gy * sizeof(float) + // thrust 00861 MCsz); // mcubes 00862 00863 if (gpuh->verbose > 1) 00864 printf("Total QuickSurf mem size: %ld MB\n", totalmemsz / (1024*1024)); 00865 00866 cudaError_t err = cudaGetLastError(); 00867 if (err != cudaSuccess) 00868 return -1; 00869 00870 // once the allocation has succeeded, we update the GPU handle info 00871 // so that the next test/allocation pass knows the latest state. 00872 gpuh->natoms = natoms; 00873 gpuh->colorperatom = colorperatom; 00874 00875 gpuh->acx = acx; 00876 gpuh->acy = acy; 00877 gpuh->acz = acz; 00878 00879 gpuh->gx = gx; 00880 gpuh->gy = gy; 00881 gpuh->gz = gz; 00882 00883 return 0; 00884 } 00885 00886 00887 int CUDAQuickSurf::get_chunk_bufs(int testexisting, 00888 long int natoms, int colorperatom, 00889 VolTexFormat vtexformat, 00890 int acx, int acy, int acz, 00891 int gx, int gy, int gz, 00892 int &cx, int &cy, int &cz, 00893 int &sx, int &sy, int &sz) { 00894 dim3 Bsz(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ); 00895 if (colorperatom) 00896 Bsz.z = GTEXBLOCKSZZ; 00897 00898 cudaError_t err = cudaGetLastError(); // eat error so next CUDA op succeeds 00899 00900 // enter loop to attempt a single-pass computation, but if the 00901 // allocation fails, cut the chunk size Z dimension by half 00902 // repeatedly until we either run chunks of 8 planes at a time, 00903 // otherwise we assume it is hopeless. 00904 cz <<= 1; // premultiply by two to simplify loop body 00905 int chunkiters = 0; 00906 int chunkallocated = 0; 00907 while (!chunkallocated) { 00908 // Cut the Z chunk size in half 00909 chunkiters++; 00910 cz >>= 1; 00911 00912 // if we've already dropped to a subvolume size, subtract off the 00913 // four extra Z planes from last time before we do the modulo padding 00914 // calculation so we don't hit an infinite loop trying to go below 00915 // 16 planes due the padding math below. 00916 if (cz != gz) 00917 cz-=4; 00918 00919 // Pad the chunk to a multiple of the computational tile size since 00920 // each thread computes multiple elements (unrolled in the Z direction) 00921 cz += (8 - (cz % 8)); 00922 00923 // The density map "slab" size is the chunk size but without the extra 00924 // plane used to copy the last plane of the previous chunk's density 00925 // into the start, for use by the marching cubes. 00926 sx = cx; 00927 sy = cy; 00928 sz = cz; 00929 00930 // Add four extra Z-planes for copying the previous end planes into 00931 // the start of the next chunk. 00932 cz+=4; 00933 00934 #if 0 00935 printf(" Trying slab size: %d (test: %d)\n", sz, testexisting); 00936 #endif 00937 00938 #if 1 00939 // test to see if total number of thread blocks exceeds maximum 00940 // number we can reasonably run prior to a kernel timeout error 00941 dim3 tGsz((sx+Bsz.x-1) / Bsz.x, 00942 (sy+Bsz.y-1) / Bsz.y, 00943 (sz+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL)); 00944 if (colorperatom) { 00945 tGsz.z = (sz+(Bsz.z*GTEXUNROLL)-1) / (Bsz.z * GTEXUNROLL); 00946 } 00947 if (tGsz.x * tGsz.y * tGsz.z > 65535) 00948 continue; 00949 #endif 00950 00951 // Bail out if we can't get enough memory to run at least 00952 // 8 slices in a single pass (making sure we've freed any allocations 00953 // beforehand, so they aren't leaked). 00954 if (sz <= 8) { 00955 return -1; 00956 } 00957 00958 if (testexisting) { 00959 if (check_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, cx, cy, cz) != 0) 00960 continue; 00961 } else { 00962 if (alloc_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, cx, cy, cz) != 0) 00963 continue; 00964 } 00965 00966 chunkallocated=1; 00967 } 00968 00969 return 0; 00970 } 00971 00972 00973 int CUDAQuickSurf::calc_surf(long int natoms, const float *xyzr_f, 00974 const float *colors_f, 00975 int colorperatom, VolTexFormat voltexformat, 00976 float *origin, int *numvoxels, float maxrad, 00977 float radscale, float gridspacing, 00978 float isovalue, float gausslim, 00979 VMDDisplayList *cmdList) { 00980 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu; 00981 00982 // if there were any problems when the constructor tried to get 00983 // GPU hardware capabilities, we consider it to be unusable for all time. 00984 if (gpuh->dev < 0) 00985 return -1; 00986 00987 // This code currently requires compute capability 2.x or greater. 00988 // We absolutely depend on hardware broadcasts for 00989 // global memory reads by multiple threads reading the same element, 00990 // and the code more generally assumes the Fermi L1 cache and prefers 00991 // to launch 3-D grids where possible. 00992 if (gpuh->deviceProp.major < 2) { 00993 return -1; 00994 } 00995 00996 // start timing... 00997 wkf_timerhandle globaltimer = wkf_timer_create(); 00998 wkf_timer_start(globaltimer); 00999 01000 int vtexsize = 0; 01001 // const VolTexFormat voltexformat = RGB4U; // XXX caller may want to set this 01002 // const VolTexFormat voltexformat = RGB3F; // XXX caller may want to set this 01003 switch (voltexformat) { 01004 case RGB3F: 01005 vtexsize = sizeof(float3); 01006 break; 01007 01008 case RGB4U: 01009 vtexsize = sizeof(uchar4); 01010 break; 01011 } 01012 01013 float4 *colors = (float4 *) colors_f; 01014 int chunkmaxverts=0; 01015 int chunknumverts=0; 01016 int numverts=0; 01017 int numfacets=0; 01018 01019 // compute grid spacing for the acceleration grid 01020 float acgridspacing = gausslim * radscale * maxrad; 01021 01022 // ensure acceleration grid spacing >= density grid spacing 01023 if (acgridspacing < gridspacing) 01024 acgridspacing = gridspacing; 01025 01026 // Allocate output arrays for the gaussian density map and 3-D texture map 01027 // We test for errors carefully here since this is the most likely place 01028 // for a memory allocation failure due to the size of the grid. 01029 int3 volsz = make_int3(numvoxels[0], numvoxels[1], numvoxels[2]); 01030 int3 chunksz = volsz; 01031 int3 slabsz = volsz; 01032 01033 // An alternative scheme to minimize the QuickSurf GPU memory footprint 01034 if (getenv("VMDQUICKSURFMINMEM")) { 01035 if (volsz.z > 32) { 01036 slabsz.z = chunksz.z = 16; 01037 } 01038 } 01039 01040 int3 accelcells; 01041 accelcells.x = max(int((volsz.x*gridspacing) / acgridspacing), 1); 01042 accelcells.y = max(int((volsz.y*gridspacing) / acgridspacing), 1); 01043 accelcells.z = max(int((volsz.z*gridspacing) / acgridspacing), 1); 01044 01045 dim3 Bsz(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ); 01046 if (colorperatom) 01047 Bsz.z = GTEXBLOCKSZZ; 01048 01049 // check to see if it's possible to use an existing allocation, 01050 // if so, just leave things as they are, and do the computation 01051 // using the existing buffers 01052 if (gpuh->natoms == 0 || 01053 get_chunk_bufs(1, natoms, colorperatom, voltexformat, 01054 accelcells.x, accelcells.y, accelcells.z, 01055 volsz.x, volsz.y, volsz.z, 01056 chunksz.x, chunksz.y, chunksz.z, 01057 slabsz.x, slabsz.y, slabsz.z) == -1) { 01058 // reset the chunksz and slabsz after failing to try and 01059 // fit them into the existing allocations... 01060 chunksz = volsz; 01061 slabsz = volsz; 01062 01063 // reallocate the chunk buffers from scratch since we weren't 01064 // able to reuse them 01065 if (get_chunk_bufs(0, natoms, colorperatom, voltexformat, 01066 accelcells.x, accelcells.y, accelcells.z, 01067 volsz.x, volsz.y, volsz.z, 01068 chunksz.x, chunksz.y, chunksz.z, 01069 slabsz.x, slabsz.y, slabsz.z) == -1) { 01070 wkf_timer_destroy(globaltimer); 01071 return -1; 01072 } 01073 } 01074 chunkmaxverts = 3 * chunksz.x * chunksz.y * chunksz.z; 01075 01076 // Free the "safety padding" memory we allocate to ensure we dont 01077 // have trouble with thrust calls that allocate their own memory later 01078 if (gpuh->safety != NULL) 01079 cudaFree(gpuh->safety); 01080 gpuh->safety = NULL; 01081 01082 if (gpuh->verbose > 1) { 01083 printf(" Using GPU chunk size: %d\n", chunksz.z); 01084 printf(" Accel grid(%d, %d, %d) spacing %f\n", 01085 accelcells.x, accelcells.y, accelcells.z, acgridspacing); 01086 } 01087 01088 // pre-process the atom coordinates and radii as needed 01089 // short-term fix until a new CUDA kernel takes care of this 01090 int i, i4; 01091 float4 *xyzr = (float4 *) malloc(natoms * sizeof(float4)); 01092 float log2e = log2f(2.718281828f); 01093 for (i=0,i4=0; i<natoms; i++,i4+=4) { 01094 xyzr[i].x = xyzr_f[i4 ]; 01095 xyzr[i].y = xyzr_f[i4 + 1]; 01096 xyzr[i].z = xyzr_f[i4 + 2]; 01097 01098 float scaledrad = xyzr_f[i4 + 3] * radscale; 01099 float arinv = -1.0f * log2e / (2.0f*scaledrad*scaledrad); 01100 01101 xyzr[i].w = arinv; 01102 } 01103 cudaMemcpy(gpuh->xyzr_d, xyzr, natoms * sizeof(float4), cudaMemcpyHostToDevice); 01104 free(xyzr); 01105 01106 if (colorperatom) 01107 cudaMemcpy(gpuh->color_d, colors, natoms * sizeof(float4), cudaMemcpyHostToDevice); 01108 01109 // build uniform grid acceleration structure 01110 if (vmd_cuda_build_density_atom_grid(natoms, gpuh->xyzr_d, gpuh->color_d, 01111 gpuh->sorted_xyzr_d, 01112 gpuh->sorted_color_d, 01113 gpuh->atomIndex_d, gpuh->atomHash_d, 01114 gpuh->cellStartEnd_d, 01115 accelcells, 1.0f / acgridspacing) != 0) { 01116 wkf_timer_destroy(globaltimer); 01117 free_bufs(); 01118 return -1; 01119 } 01120 01121 double sorttime = wkf_timer_timenow(globaltimer); 01122 double lastlooptime = sorttime; 01123 01124 double densitykerneltime = 0.0f; 01125 double densitytime = 0.0f; 01126 double mckerneltime = 0.0f; 01127 double mctime = 0.0f; 01128 double copycalltime = 0.0f; 01129 double copytime = 0.0f; 01130 01131 float *volslab_d = NULL; 01132 void *texslab_d = NULL; 01133 01134 int lzplane = GBLOCKSZZ * GUNROLL; 01135 if (colorperatom) 01136 lzplane = GTEXBLOCKSZZ * GTEXUNROLL; 01137 01138 // initialize CUDA marching cubes class instance or rebuild it if needed 01139 uint3 mgsz = make_uint3(chunksz.x, chunksz.y, chunksz.z); 01140 if (gpuh->mc == NULL) { 01141 gpuh->mc = new CUDAMarchingCubes(); 01142 if (!gpuh->mc->Initialize(mgsz)) { 01143 printf("QuickSurf call to MC Initialize() failed\n"); 01144 } 01145 } else { 01146 uint3 mcmaxgridsize = gpuh->mc->GetMaxGridSize(); 01147 if (slabsz.x > mcmaxgridsize.x || 01148 slabsz.y > mcmaxgridsize.y || 01149 slabsz.z > mcmaxgridsize.z) { 01150 if (gpuh->verbose) 01151 printf("*** QuickSurf Allocating new MC object...\n"); 01152 01153 // delete marching cubes object 01154 delete gpuh->mc; 01155 01156 // create and initialize CUDA marching cubes class instance 01157 gpuh->mc = new CUDAMarchingCubes(); 01158 01159 if (!gpuh->mc->Initialize(mgsz)) { 01160 printf("QuickSurf MC Initialize() call failed to recreate MC object\n"); 01161 } 01162 } 01163 } 01164 01165 int z; 01166 int chunkcount=0; 01167 float invacgridspacing = 1.0f / acgridspacing; 01168 float invisovalue = 1.0f / isovalue; 01169 for (z=0; z<volsz.z; z+=slabsz.z) { 01170 int3 curslab = slabsz; 01171 if (z+curslab.z > volsz.z) 01172 curslab.z = volsz.z - z; 01173 01174 int slabplanesz = curslab.x * curslab.y; 01175 01176 dim3 Gsz((curslab.x+Bsz.x-1) / Bsz.x, 01177 (curslab.y+Bsz.y-1) / Bsz.y, 01178 (curslab.z+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL)); 01179 if (colorperatom) 01180 Gsz.z = (curslab.z+(Bsz.z*GTEXUNROLL)-1) / (Bsz.z * GTEXUNROLL); 01181 01182 // For SM >= 2.x, we can run the entire slab in one pass by launching 01183 // a 3-D grid of thread blocks. 01184 dim3 Gszslice = Gsz; 01185 01186 if (gpuh->verbose > 1) { 01187 printf("CUDA device %d, grid size %dx%dx%d\n", 01188 0, Gsz.x, Gsz.y, Gsz.z); 01189 printf("CUDA: vol(%d,%d,%d) accel(%d,%d,%d)\n", 01190 curslab.x, curslab.y, curslab.z, 01191 accelcells.x, accelcells.y, accelcells.z); 01192 printf("Z=%d, curslab.z=%d\n", z, curslab.z); 01193 } 01194 01195 // For all but the first density slab, we copy the last four 01196 // planes of the previous run into the start of the next run so 01197 // that we can extract the isosurface with no discontinuities 01198 if (z == 0) { 01199 volslab_d = gpuh->devdensity; 01200 if (colorperatom) 01201 texslab_d = gpuh->devvoltexmap; 01202 } else { 01203 int fourplanes = 4 * slabplanesz; 01204 cudaMemcpy(gpuh->devdensity, 01205 volslab_d + (slabsz.z-4) * slabplanesz, 01206 fourplanes * sizeof(float), cudaMemcpyDeviceToDevice); 01207 volslab_d = gpuh->devdensity + fourplanes; 01208 01209 if (colorperatom) { 01210 cudaMemcpy(gpuh->devvoltexmap, 01211 ((unsigned char *) texslab_d) + (slabsz.z-4) * slabplanesz * vtexsize, 01212 fourplanes * vtexsize, cudaMemcpyDeviceToDevice); 01213 texslab_d = ((unsigned char *) gpuh->devvoltexmap) + fourplanes * vtexsize; 01214 } 01215 } 01216 01217 // loop over the planes/slices in a slab and compute density and texture 01218 for (int lz=0; lz<Gsz.z; lz+=Gszslice.z) { 01219 int lzinc = lz * lzplane; 01220 float *volslice_d = volslab_d + lzinc * slabplanesz; 01221 01222 if (colorperatom) { 01223 void *texslice_d = ((unsigned char *) texslab_d) + lzinc * slabplanesz * vtexsize; 01224 switch (voltexformat) { 01225 case RGB3F: 01226 gaussdensity_fast_tex3f<<<Gszslice, Bsz, 0>>>(natoms, 01227 gpuh->sorted_xyzr_d, gpuh->sorted_color_d, 01228 curslab, accelcells, acgridspacing, invacgridspacing, 01229 gpuh->cellStartEnd_d, gridspacing, z+lzinc, 01230 volslice_d, (float3 *) texslice_d, invisovalue); 01231 break; 01232 01233 case RGB4U: 01234 gaussdensity_fast_tex_norm<float, uchar4><<<Gszslice, Bsz, 0>>>(natoms, 01235 gpuh->sorted_xyzr_d, gpuh->sorted_color_d, 01236 curslab, accelcells, acgridspacing, invacgridspacing, 01237 gpuh->cellStartEnd_d, gridspacing, z+lzinc, 01238 volslice_d, (uchar4 *) texslice_d, invisovalue); 01239 break; 01240 } 01241 } else { 01242 gaussdensity_fast<<<Gszslice, Bsz, 0>>>(natoms, gpuh->sorted_xyzr_d, 01243 curslab, accelcells, acgridspacing, invacgridspacing, 01244 gpuh->cellStartEnd_d, gridspacing, z+lzinc, volslice_d); 01245 } 01246 } 01247 cudaDeviceSynchronize(); 01248 densitykerneltime = wkf_timer_timenow(globaltimer); 01249 01250 #if 0 01251 printf(" CUDA mcubes..."); fflush(stdout); 01252 #endif 01253 01254 uint3 gvsz = make_uint3(curslab.x, curslab.y, curslab.z); 01255 01256 // For all but the first density slab, we copy the last four 01257 // planes of the previous run into the start of the next run so 01258 // that we can extract the isosurface with no discontinuities 01259 if (z != 0) 01260 gvsz.z=curslab.z + 4; 01261 01262 float3 bbox = make_float3(gvsz.x * gridspacing, gvsz.y * gridspacing, 01263 gvsz.z * gridspacing); 01264 01265 float3 gorigin = make_float3(origin[0], origin[1], 01266 origin[2] + (z * gridspacing)); 01267 if (z != 0) 01268 gorigin.z = origin[2] + ((z-4) * gridspacing); 01269 01270 #if 0 01271 printf("\n ... vsz: %d %d %d\n", gvsz.x, gvsz.y, gvsz.z); 01272 printf(" ... org: %.2f %.2f %.2f\n", gorigin.x, gorigin.y, gorigin.z); 01273 printf(" ... bxs: %.2f %.2f %.2f\n", bbox.x, bbox.y, bbox.y); 01274 printf(" ... bbe: %.2f %.2f %.2f\n", gorigin.x+bbox.x, gorigin.y+bbox.y, gorigin.z+bbox.z); 01275 #endif 01276 01277 // If we are computing the volume using multiple passes, we have to 01278 // overlap the marching cubes grids and compute a sub-volume to exclude 01279 // the end planes, except for the first and last sub-volume, in order to 01280 // get correct per-vertex normals at the edges of each sub-volume 01281 int skipstartplane=0; 01282 int skipendplane=0; 01283 if (chunksz.z < volsz.z) { 01284 // on any but the first pass, we skip the first Z plane 01285 if (z != 0) 01286 skipstartplane=1; 01287 01288 // on any but the last pass, we skip the last Z plane 01289 if (z+curslab.z < volsz.z) 01290 skipendplane=1; 01291 } 01292 01293 // 01294 // Extract density map isosurface using marching cubes 01295 // 01296 01297 // Choose the isovalue depending on whether the desnity map 01298 // contains normalized or un-normalized density values 01299 if (colorperatom && (voltexformat == RGB4U)) { 01300 // incoming densities are pre-normalized so that the target isovalue 01301 // is represented as a density of 1.0f 01302 gpuh->mc->SetIsovalue(1.0f); 01303 } else { 01304 gpuh->mc->SetIsovalue(isovalue); 01305 } 01306 01307 int mcstat = 0; 01308 switch (voltexformat) { 01309 case RGB3F: 01310 mcstat = gpuh->mc->SetVolumeData(gpuh->devdensity, 01311 (float3 *) gpuh->devvoltexmap, 01312 gvsz, gorigin, bbox, true); 01313 break; 01314 01315 case RGB4U: 01316 mcstat = gpuh->mc->SetVolumeData(gpuh->devdensity, 01317 (uchar4 *) gpuh->devvoltexmap, 01318 gvsz, gorigin, bbox, true); 01319 break; 01320 } 01321 if (!mcstat) { 01322 printf("QuickSurf call to MC SetVolumeData() failed\n"); 01323 } 01324 01325 // set the sub-volume starting/ending indices if needed 01326 if (skipstartplane || skipendplane) { 01327 uint3 volstart = make_uint3(0, 0, 0); 01328 uint3 volend = make_uint3(gvsz.x, gvsz.y, gvsz.z); 01329 01330 if (skipstartplane) 01331 volstart.z = 2; 01332 01333 if (skipendplane) 01334 volend.z = gvsz.z - 2; 01335 01336 gpuh->mc->SetSubVolume(volstart, volend); 01337 } 01338 if (gpuh->n3b_d) { 01339 gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3b_d, 01340 gpuh->c4u_d, chunkmaxverts); 01341 } else if (gpuh->c4u_d) { 01342 gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3f_d, 01343 gpuh->c4u_d, chunkmaxverts); 01344 } else { 01345 gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3f_d, 01346 gpuh->c3f_d, chunkmaxverts); 01347 } 01348 chunknumverts = gpuh->mc->GetVertexCount(); 01349 01350 #if 0 01351 printf("generated %d vertices, max vert limit: %d\n", chunknumverts, chunkmaxverts); 01352 #endif 01353 if (chunknumverts == chunkmaxverts) 01354 printf(" *** QuickSurf exceeded marching cubes vertex limit (%d verts)\n", chunknumverts); 01355 01356 cudaDeviceSynchronize(); 01357 mckerneltime = wkf_timer_timenow(globaltimer); 01358 01359 // Create a triangle mesh 01360 if (chunknumverts > 0) { 01361 DispCmdTriMesh cmdTriMesh; 01362 if (colorperatom) { 01363 // emit triangle mesh with per-vertex colors 01364 if (gpuh->n3b_d) { 01365 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 01366 (const char *) gpuh->n3b_d, 01367 (const unsigned char *) gpuh->c4u_d, 01368 chunknumverts/3, cmdList); 01369 } else if (gpuh->c4u_d) { 01370 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 01371 (const float *) gpuh->n3f_d, 01372 (const unsigned char *) gpuh->c4u_d, 01373 chunknumverts/3, cmdList); 01374 } else { 01375 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 01376 (const float *) gpuh->n3f_d, 01377 (const float *) gpuh->c3f_d, 01378 chunknumverts/3, cmdList); 01379 } 01380 } else { 01381 // emit triangle mesh with no colors, uses current rendering state 01382 if (gpuh->n3b_d) { 01383 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 01384 (const char *) gpuh->n3b_d, 01385 (const unsigned char *) NULL, 01386 chunknumverts/3, cmdList); 01387 } else { 01388 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 01389 (const float *) gpuh->n3f_d, 01390 (const float *) NULL, 01391 chunknumverts/3, cmdList); 01392 } 01393 } 01394 } 01395 numverts+=chunknumverts; 01396 numfacets+=chunknumverts/3; 01397 01398 #if 0 01399 // XXX we'll hold onto this as we'll want to rescue this approach 01400 // for electrostatics coloring where we have to have the 01401 // entire triangle mesh in order to do the calculation 01402 int l; 01403 int vertstart = 3 * numverts; 01404 int vertbufsz = 3 * (numverts + chunknumverts) * sizeof(float); 01405 int facebufsz = (numverts + chunknumverts) * sizeof(int); 01406 int chunkvertsz = 3 * chunknumverts * sizeof(float); 01407 01408 v = (float*) realloc(v, vertbufsz); 01409 n = (float*) realloc(n, vertbufsz); 01410 c = (float*) realloc(c, vertbufsz); 01411 f = (int*) realloc(f, facebufsz); 01412 cudaMemcpy(v+vertstart, gpuh->v3f_d, chunkvertsz, cudaMemcpyDeviceToHost); 01413 cudaMemcpy(n+vertstart, gpuh->n3f_d, chunkvertsz, cudaMemcpyDeviceToHost); 01414 if (colorperatom) { 01415 cudaMemcpy(c+vertstart, gpuh->c3f_d, chunkvertsz, cudaMemcpyDeviceToHost); 01416 } else { 01417 float *color = c+vertstart; 01418 for (l=0; l<chunknumverts*3; l+=3) { 01419 color[l + 0] = colors[0].x; 01420 color[l + 1] = colors[0].y; 01421 color[l + 2] = colors[0].z; 01422 } 01423 } 01424 for (l=numverts; l<numverts+chunknumverts; l++) { 01425 f[l]=l; 01426 } 01427 numverts+=chunknumverts; 01428 numfacets+=chunknumverts/3; 01429 #endif 01430 01431 copycalltime = wkf_timer_timenow(globaltimer); 01432 01433 densitytime += densitykerneltime - lastlooptime; 01434 mctime += mckerneltime - densitykerneltime; 01435 copytime += copycalltime - mckerneltime; 01436 01437 lastlooptime = wkf_timer_timenow(globaltimer); 01438 01439 chunkcount++; // increment number of chunks processed 01440 } 01441 01442 // catch any errors that may have occured so that at the very least, 01443 // all of the subsequent resource deallocation calls will succeed 01444 cudaError_t err = cudaGetLastError(); 01445 01446 wkf_timer_stop(globaltimer); 01447 double totalruntime = wkf_timer_time(globaltimer); 01448 wkf_timer_destroy(globaltimer); 01449 01450 // If an error occured, we print it and return an error code, once 01451 // all of the memory deallocations have completed. 01452 if (err != cudaSuccess) { 01453 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); 01454 return -1; 01455 } 01456 01457 if (gpuh->verbose) { 01458 printf(" GPU generated %d vertices, %d facets, in %d passes\n", 01459 numverts, numfacets, chunkcount); 01460 printf(" GPU time (%s): %.3f [sort: %.3f density %.3f mcubes: %.3f copy: %.3f]\n", 01461 "SM >= 2.x", totalruntime, sorttime, densitytime, mctime, copytime); 01462 } 01463 01464 return 0; 01465 } 01466 01467 01468 01469 01470