Main Page Namespace List Class Hierarchy Alphabetical List Compound List File List Namespace Members Compound Members File Members Related Pages
CUDAQuickSurf.cu

Go to the documentation of this file.
00001 /***************************************************************************
00002 *cr
00003 *cr (C) Copyright 1995-2019 The Board of Trustees of the
00004 *cr University of Illinois
00005 *cr All Rights Reserved
00006 *cr
00007 ***************************************************************************/
00008 /***************************************************************************
00009 * RCS INFORMATION:
00010 *
00011 * $RCSfile: CUDAQuickSurf.cu,v $
00012 * $Author: johns $ $Locker: $ $State: Exp $
00013 * $Revision: 1.97 $ $Date: 2021年12月21日 05:30:12 $
00014 *
00015 ***************************************************************************/
00042 #include <stdio.h>
00043 #include <stdlib.h>
00044 #include <string.h>
00045 #include <cuda.h>
00046 #if CUDART_VERSION >= 9000
00047 #include <cuda_fp16.h> // need to explicitly include for CUDA 9.0
00048 #endif
00049 
00050 #if CUDART_VERSION < 4000
00051 #error The VMD QuickSurf feature requires CUDA 4.0 or later
00052 #endif
00053 
00054 #include "Inform.h"
00055 #include "utilities.h"
00056 #include "WKFThreads.h"
00057 #include "WKFUtils.h"
00058 #include "CUDAKernels.h" 
00059 #include "CUDASpatialSearch.h"
00060 #include "CUDAMarchingCubes.h"
00061 #include "CUDAQuickSurf.h" 
00062 
00063 #include "DispCmds.h"
00064 #include "VMDDisplayList.h"
00065 
00066 #if 1
00067 #define CUERR { cudaError_t err; \
00068 if ((err = cudaGetLastError()) != cudaSuccess) { \
00069 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00070 printf("Thread aborting...\n"); \
00071 return NULL; }}
00072 #else
00073 #define CUERR
00074 #endif
00075 
00076 
00077 //
00078 // general operator overload helper routines
00079 //
00080 
00081 // "*" operator
00082 inline __host__ __device__ float3 operator*(float3 a, float b) {
00083 return make_float3(b * a.x, b * a.y, b * a.z);
00084 }
00085 
00086 // "+=" operator
00087 inline __host__ __device__ void operator+=(float3 &a, float3 b) {
00088 a.x += b.x;
00089 a.y += b.y;
00090 a.z += b.z;
00091 }
00092 
00093 // down-convert float4 to float3
00094 inline __device__ float3 make_float3(float4 a) {
00095 float3 b;
00096 b.x = a.x;
00097 b.y = a.y;
00098 b.z = a.z;
00099 return b;
00100 }
00101 
00102 
00103 //
00104 // density format conversion routines
00105 //
00106 
00107 // no-op conversion for float to float
00108 inline __device__ void convert_density(float & df, float df2) {
00109 df = df2;
00110 }
00111 
00112 // Convert float (32-bit) to half-precision (16-bit floating point) stored
00113 // into an unsigned short (16-bit integer type). 
00114 inline __device__ void convert_density(unsigned short & dh, float df2) {
00115 dh = __float2half_rn(df2);
00116 }
00117 
00118 
00119 
00120 //
00121 // color format conversion routines
00122 //
00123 
00124 // conversion for float3 to float3
00125 inline __device__ void convert_color(float3 & cf, float3 cf2, float invisovalue) {
00126 cf = cf2 * invisovalue;
00127 }
00128 
00129 
00130 // Convert float3 colors to uchar4, performing the necessary bias, scaling, 
00131 // and range clamping so we don't encounter integer wraparound, etc.
00132 inline __device__ void convert_color(uchar4 & cu, float3 cf, float invisovalue) {
00133 // conversion to GLubyte format, Table 2.6, p. 44 of OpenGL spec 1.2.1
00134 // c = f * (2^8-1)
00135 
00136 #if 1
00137 float3 cftmp = cf * invisovalue;
00138 cu = make_uchar4(fminf(cftmp.x * 255.0f, 255.0f),
00139 fminf(cftmp.y * 255.0f, 255.0f),
00140 fminf(cftmp.z * 255.0f, 255.0f),
00141 255); 
00142 #else
00143 // scale color values to prevent overflow, 
00144 // and convert to fixed-point representation all at once
00145 float invmaxcolscale = __frcp_rn(fmaxf(fmaxf(fmaxf(cf.x, cf.y), cf.z), 1.0f)) * 255.0f;
00146 
00147 // clamp color values to prevent integer wraparound
00148 cu = make_uchar4(cf.x * invmaxcolscale,
00149 cf.y * invmaxcolscale,
00150 cf.z * invmaxcolscale,
00151 255);
00152 #endif
00153 }
00154 
00155 
00156 // convert uchar4 colors to float3
00157 inline __device__ void convert_color(float3 & cf, uchar4 cu) {
00158 const float i2f = 1.0f / 255.0f;
00159 cf.x = cu.x * i2f;
00160 cf.y = cu.y * i2f;
00161 cf.z = cu.z * i2f;
00162 }
00163 
00164 
00165 //
00166 // Restrict macro to make it easy to do perf tuning tests
00167 //
00168 #if 1 && (CUDART_VERSION >= 4000)
00169 #define RESTRICT __restrict__
00170 #else
00171 #define RESTRICT
00172 #endif
00173 
00174 // 
00175 // Parameters for linear-time range-limited gaussian density kernels
00176 //
00177 #define GGRIDSZ 8.0f
00178 #define GBLOCKSZX 8
00179 #define GBLOCKSZY 8
00180 
00181 #if 1
00182 #define GTEXBLOCKSZZ 2
00183 #define GTEXUNROLL 4
00184 #define GBLOCKSZZ 2
00185 #define GUNROLL 4
00186 #else
00187 #define GTEXBLOCKSZZ 8
00188 #define GTEXUNROLL 1
00189 #define GBLOCKSZZ 8
00190 #define GUNROLL 1
00191 #endif
00192 
00193 #define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
00194 #if __CUDA_ARCH__ >= 600
00195 #define MINBLOCKDENS 16
00196 #elif __CUDA_ARCH__ >= 300
00197 #define MINBLOCKDENS 16
00198 #elif __CUDA_ARCH__ >= 200
00199 #define MINBLOCKDENS 1
00200 #else
00201 #define MINBLOCKDENS 1
00202 #endif
00203 
00204 
00205 //
00206 // Templated version of the density map kernel to handle multiple 
00207 // data formats for the output density volume and volumetric texture.
00208 // This variant of the density map algorithm normalizes densities so
00209 // that the target isovalue is a density of 1.0.
00210 //
00211 template<class DENSITY, class VOLTEX>
00212 __global__ static void 
00213 __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
00214 gaussdensity_fast_tex_norm(int natoms,
00215 const float4 * RESTRICT sorted_xyzr, 
00216 const float4 * RESTRICT sorted_color, 
00217 int3 volsz,
00218 int3 acncells,
00219 float acgridspacing,
00220 float invacgridspacing,
00221 const uint2 * RESTRICT cellStartEnd,
00222 float gridspacing, unsigned int z, 
00223 DENSITY * RESTRICT densitygrid,
00224 VOLTEX * RESTRICT voltexmap,
00225 float invisovalue) {
00226 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
00227 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
00228 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GTEXUNROLL;
00229 
00230 // shave register use slightly
00231 unsigned int outaddr = zindex * volsz.x * volsz.y + 
00232 yindex * volsz.x + xindex;
00233 
00234 // early exit if this thread is outside of the grid bounds
00235 if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00236 return;
00237 
00238 zindex += z;
00239 
00240 // compute ac grid index of lower corner minus gaussian radius
00241 int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00242 int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00243 int zabmin = ((z + blockIdx.z * blockDim.z * GTEXUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00244 
00245 // compute ac grid index of upper corner plus gaussian radius
00246 int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00247 int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00248 int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GTEXUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00249 
00250 xabmin = (xabmin < 0) ? 0 : xabmin;
00251 yabmin = (yabmin < 0) ? 0 : yabmin;
00252 zabmin = (zabmin < 0) ? 0 : zabmin;
00253 xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
00254 yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
00255 zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
00256 
00257 float coorx = gridspacing * xindex;
00258 float coory = gridspacing * yindex;
00259 float coorz = gridspacing * zindex;
00260 
00261 float densityval1=0.0f;
00262 float3 densitycol1=make_float3(0.0f, 0.0f, 0.0f);
00263 #if GTEXUNROLL >= 2
00264 float densityval2=0.0f;
00265 float3 densitycol2=densitycol1;
00266 #endif
00267 #if GTEXUNROLL >= 4
00268 float densityval3=0.0f;
00269 float3 densitycol3=densitycol1;
00270 float densityval4=0.0f;
00271 float3 densitycol4=densitycol1;
00272 #endif
00273 
00274 int acplanesz = acncells.x * acncells.y;
00275 int xab, yab, zab;
00276 for (zab=zabmin; zab<=zabmax; zab++) {
00277 for (yab=yabmin; yab<=yabmax; yab++) {
00278 for (xab=xabmin; xab<=xabmax; xab++) {
00279 int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00280 // this biggest latency hotspot in the kernel, if we could improve
00281 // packing of the grid cell map, we'd likely improve performance 
00282 uint2 atomstartend = cellStartEnd[abcellidx];
00283 if (atomstartend.x != GRID_CELL_EMPTY) {
00284 unsigned int atomid;
00285 for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00286 float4 atom = sorted_xyzr[atomid];
00287 float4 color = sorted_color[atomid];
00288 float dx = coorx - atom.x;
00289 float dy = coory - atom.y;
00290 float dxy2 = dx*dx + dy*dy;
00291 
00292 float dz = coorz - atom.z;
00293 float r21 = (dxy2 + dz*dz) * atom.w;
00294 float tmp1 = exp2f(r21);
00295 densityval1 += tmp1;
00296 densitycol1.x += tmp1 * color.x;
00297 densitycol1.y += tmp1 * color.y;
00298 densitycol1.z += tmp1 * color.z;
00299 
00300 #if GTEXUNROLL >= 2
00301 float dz2 = dz + gridspacing;
00302 float r22 = (dxy2 + dz2*dz2) * atom.w;
00303 float tmp2 = exp2f(r22);
00304 densityval2 += tmp2;
00305 densitycol2.x += tmp2 * color.x;
00306 densitycol2.y += tmp2 * color.y;
00307 densitycol2.z += tmp2 * color.z;
00308 #endif
00309 #if GTEXUNROLL >= 4
00310 float dz3 = dz2 + gridspacing;
00311 float r23 = (dxy2 + dz3*dz3) * atom.w;
00312 float tmp3 = exp2f(r23);
00313 densityval3 += tmp3;
00314 densitycol3.x += tmp3 * color.x;
00315 densitycol3.y += tmp3 * color.y;
00316 densitycol3.z += tmp3 * color.z;
00317 
00318 float dz4 = dz3 + gridspacing;
00319 float r24 = (dxy2 + dz4*dz4) * atom.w;
00320 float tmp4 = exp2f(r24);
00321 densityval4 += tmp4;
00322 densitycol4.x += tmp4 * color.x;
00323 densitycol4.y += tmp4 * color.y;
00324 densitycol4.z += tmp4 * color.z;
00325 #endif
00326 }
00327 }
00328 }
00329 }
00330 }
00331 
00332 DENSITY densityout;
00333 VOLTEX texout;
00334 convert_density(densityout, densityval1 * invisovalue);
00335 densitygrid[outaddr ] = densityout;
00336 convert_color(texout, densitycol1, invisovalue);
00337 voltexmap[outaddr ] = texout;
00338 
00339 #if GTEXUNROLL >= 2
00340 int planesz = volsz.x * volsz.y;
00341 convert_density(densityout, densityval2 * invisovalue);
00342 densitygrid[outaddr + planesz] = densityout;
00343 convert_color(texout, densitycol2, invisovalue);
00344 voltexmap[outaddr + planesz] = texout;
00345 #endif
00346 #if GTEXUNROLL >= 4
00347 convert_density(densityout, densityval3 * invisovalue);
00348 densitygrid[outaddr + 2*planesz] = densityout;
00349 convert_color(texout, densitycol3, invisovalue);
00350 voltexmap[outaddr + 2*planesz] = texout;
00351 
00352 convert_density(densityout, densityval4 * invisovalue);
00353 densitygrid[outaddr + 3*planesz] = densityout;
00354 convert_color(texout, densitycol4, invisovalue);
00355 voltexmap[outaddr + 3*planesz] = texout;
00356 #endif
00357 }
00358 
00359 
00360 __global__ static void 
00361 __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
00362 gaussdensity_fast_tex3f(int natoms,
00363 const float4 * RESTRICT sorted_xyzr, 
00364 const float4 * RESTRICT sorted_color, 
00365 int3 volsz,
00366 int3 acncells,
00367 float acgridspacing,
00368 float invacgridspacing,
00369 const uint2 * RESTRICT cellStartEnd,
00370 float gridspacing, unsigned int z, 
00371 float * RESTRICT densitygrid,
00372 float3 * RESTRICT voltexmap,
00373 float invisovalue) {
00374 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
00375 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
00376 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GTEXUNROLL;
00377 
00378 // shave register use slightly
00379 unsigned int outaddr = zindex * volsz.x * volsz.y + 
00380 yindex * volsz.x + xindex;
00381 
00382 // early exit if this thread is outside of the grid bounds
00383 if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00384 return;
00385 
00386 zindex += z;
00387 
00388 // compute ac grid index of lower corner minus gaussian radius
00389 int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00390 int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00391 int zabmin = ((z + blockIdx.z * blockDim.z * GTEXUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00392 
00393 // compute ac grid index of upper corner plus gaussian radius
00394 int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00395 int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00396 int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GTEXUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00397 
00398 xabmin = (xabmin < 0) ? 0 : xabmin;
00399 yabmin = (yabmin < 0) ? 0 : yabmin;
00400 zabmin = (zabmin < 0) ? 0 : zabmin;
00401 xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
00402 yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
00403 zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
00404 
00405 float coorx = gridspacing * xindex;
00406 float coory = gridspacing * yindex;
00407 float coorz = gridspacing * zindex;
00408 
00409 float densityval1=0.0f;
00410 float3 densitycol1=make_float3(0.0f, 0.0f, 0.0f);
00411 #if GTEXUNROLL >= 2
00412 float densityval2=0.0f;
00413 float3 densitycol2=densitycol1;
00414 #endif
00415 #if GTEXUNROLL >= 4
00416 float densityval3=0.0f;
00417 float3 densitycol3=densitycol1;
00418 float densityval4=0.0f;
00419 float3 densitycol4=densitycol1;
00420 #endif
00421 
00422 int acplanesz = acncells.x * acncells.y;
00423 int xab, yab, zab;
00424 for (zab=zabmin; zab<=zabmax; zab++) {
00425 for (yab=yabmin; yab<=yabmax; yab++) {
00426 for (xab=xabmin; xab<=xabmax; xab++) {
00427 int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00428 // this biggest latency hotspot in the kernel, if we could improve
00429 // packing of the grid cell map, we'd likely improve performance 
00430 uint2 atomstartend = cellStartEnd[abcellidx];
00431 if (atomstartend.x != GRID_CELL_EMPTY) {
00432 unsigned int atomid;
00433 for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00434 float4 atom = sorted_xyzr[atomid];
00435 float3 color = make_float3(sorted_color[atomid]);
00436 float dx = coorx - atom.x;
00437 float dy = coory - atom.y;
00438 float dxy2 = dx*dx + dy*dy;
00439 
00440 float dz = coorz - atom.z;
00441 float r21 = (dxy2 + dz*dz) * atom.w;
00442 float tmp1 = exp2f(r21);
00443 densityval1 += tmp1;
00444 densitycol1 += color * tmp1;
00445 
00446 #if GTEXUNROLL >= 2
00447 float dz2 = dz + gridspacing;
00448 float r22 = (dxy2 + dz2*dz2) * atom.w;
00449 float tmp2 = exp2f(r22);
00450 densityval2 += tmp2;
00451 densitycol2 += color * tmp2;
00452 #endif
00453 #if GTEXUNROLL >= 4
00454 float dz3 = dz2 + gridspacing;
00455 float r23 = (dxy2 + dz3*dz3) * atom.w;
00456 float tmp3 = exp2f(r23);
00457 densityval3 += tmp3;
00458 densitycol3 += color * tmp3;
00459 
00460 float dz4 = dz3 + gridspacing;
00461 float r24 = (dxy2 + dz4*dz4) * atom.w;
00462 float tmp4 = exp2f(r24);
00463 densityval4 += tmp4;
00464 densitycol4 += color * tmp4;
00465 #endif
00466 }
00467 }
00468 }
00469 }
00470 }
00471 
00472 float3 texout;
00473 densitygrid[outaddr ] = densityval1;
00474 convert_color(texout, densitycol1, invisovalue);
00475 voltexmap[outaddr ] = texout;
00476 
00477 #if GTEXUNROLL >= 2
00478 int planesz = volsz.x * volsz.y;
00479 densitygrid[outaddr + planesz] = densityval2;
00480 convert_color(texout, densitycol2, invisovalue);
00481 voltexmap[outaddr + planesz] = texout;
00482 #endif
00483 #if GTEXUNROLL >= 4
00484 densitygrid[outaddr + 2*planesz] = densityval3;
00485 convert_color(texout, densitycol3, invisovalue);
00486 voltexmap[outaddr + 2*planesz] = texout;
00487 
00488 densitygrid[outaddr + 3*planesz] = densityval4;
00489 convert_color(texout, densitycol4, invisovalue);
00490 voltexmap[outaddr + 3*planesz] = texout;
00491 #endif
00492 }
00493 
00494 
00495 __global__ static void 
00496 // __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
00497 gaussdensity_fast(int natoms,
00498 const float4 * RESTRICT sorted_xyzr, 
00499 int3 volsz,
00500 int3 acncells,
00501 float acgridspacing,
00502 float invacgridspacing,
00503 const uint2 * RESTRICT cellStartEnd,
00504 float gridspacing, unsigned int z, 
00505 float * RESTRICT densitygrid) {
00506 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
00507 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
00508 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
00509 unsigned int outaddr = zindex * volsz.x * volsz.y + 
00510 yindex * volsz.x + 
00511 xindex;
00512 
00513 // early exit if this thread is outside of the grid bounds
00514 if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00515 return;
00516 
00517 zindex += z;
00518 
00519 // compute ac grid index of lower corner minus gaussian radius
00520 int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00521 int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00522 int zabmin = ((z + blockIdx.z * blockDim.z * GUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00523 
00524 // compute ac grid index of upper corner plus gaussian radius
00525 int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00526 int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00527 int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00528 
00529 xabmin = (xabmin < 0) ? 0 : xabmin;
00530 yabmin = (yabmin < 0) ? 0 : yabmin;
00531 zabmin = (zabmin < 0) ? 0 : zabmin;
00532 xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
00533 yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
00534 zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
00535 
00536 float coorx = gridspacing * xindex;
00537 float coory = gridspacing * yindex;
00538 float coorz = gridspacing * zindex;
00539 
00540 float densityval1=0.0f;
00541 #if GUNROLL >= 2
00542 float densityval2=0.0f;
00543 #endif
00544 #if GUNROLL >= 4
00545 float densityval3=0.0f;
00546 float densityval4=0.0f;
00547 #endif
00548 
00549 int acplanesz = acncells.x * acncells.y;
00550 int xab, yab, zab;
00551 for (zab=zabmin; zab<=zabmax; zab++) {
00552 for (yab=yabmin; yab<=yabmax; yab++) {
00553 for (xab=xabmin; xab<=xabmax; xab++) {
00554 int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00555 uint2 atomstartend = cellStartEnd[abcellidx];
00556 if (atomstartend.x != GRID_CELL_EMPTY) {
00557 unsigned int atomid;
00558 for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00559 float4 atom = sorted_xyzr[atomid];
00560 float dx = coorx - atom.x;
00561 float dy = coory - atom.y;
00562 float dxy2 = dx*dx + dy*dy;
00563 
00564 float dz = coorz - atom.z;
00565 float r21 = (dxy2 + dz*dz) * atom.w;
00566 densityval1 += exp2f(r21);
00567 
00568 #if GUNROLL >= 2
00569 float dz2 = dz + gridspacing;
00570 float r22 = (dxy2 + dz2*dz2) * atom.w;
00571 densityval2 += exp2f(r22);
00572 #endif
00573 #if GUNROLL >= 4
00574 float dz3 = dz2 + gridspacing;
00575 float r23 = (dxy2 + dz3*dz3) * atom.w;
00576 densityval3 += exp2f(r23);
00577 
00578 float dz4 = dz3 + gridspacing;
00579 float r24 = (dxy2 + dz4*dz4) * atom.w;
00580 densityval4 += exp2f(r24);
00581 #endif
00582 }
00583 }
00584 }
00585 }
00586 }
00587 
00588 densitygrid[outaddr ] = densityval1;
00589 #if GUNROLL >= 2
00590 int planesz = volsz.x * volsz.y;
00591 densitygrid[outaddr + planesz] = densityval2;
00592 #endif
00593 #if GUNROLL >= 4
00594 densitygrid[outaddr + 2*planesz] = densityval3;
00595 densitygrid[outaddr + 3*planesz] = densityval4;
00596 #endif
00597 }
00598 
00599 
00600 // per-GPU handle with various memory buffer pointers, etc.
00601 typedef struct {
00602 cudaDeviceProp deviceProp; 
00603 int dev; 
00604 int verbose; 
00605 
00607 long int natoms; 
00608 int colorperatom; 
00609 CUDAQuickSurf::VolTexFormat vtexformat; 
00610 int acx; 
00611 int acy; 
00612 int acz; 
00613 int gx; 
00614 int gy; 
00615 int gz; 
00616 
00617 CUDAMarchingCubes *mc; 
00618 
00619 float *devdensity; 
00620 void *devvoltexmap; 
00621 float4 *xyzr_d; 
00622 float4 *sorted_xyzr_d; 
00623 float4 *color_d; 
00624 float4 *sorted_color_d; 
00625 
00626 unsigned int *atomIndex_d; 
00627 unsigned int *atomHash_d; 
00628 uint2 *cellStartEnd_d; 
00629 
00630 void *safety; 
00631 
00632 float3 *v3f_d; 
00633 float3 *n3f_d; 
00634 float3 *c3f_d; 
00635 char3 *n3b_d; 
00636 uchar4 *c4u_d; 
00637 } qsurf_gpuhandle;
00638 
00639 
00640 CUDAQuickSurf::CUDAQuickSurf() {
00641 voidgpu = calloc(1, sizeof(qsurf_gpuhandle));
00642 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00643 
00644 if (getenv("VMDQUICKSURFVERBOSE") != NULL) {
00645 gpuh->verbose = 1;
00646 int tmp = atoi(getenv("VMDQUICKSURFVERBOSE"));
00647 if (tmp > 0)
00648 gpuh->verbose = tmp;
00649 }
00650 
00651 gpuh->vtexformat = RGB3F; // large size allocs by default
00652 
00653 if (cudaGetDevice(&gpuh->dev) != cudaSuccess) {
00654 gpuh->dev = -1; // flag GPU as unusable
00655 }
00656 
00657 if (cudaGetDeviceProperties(&gpuh->deviceProp, gpuh->dev) != cudaSuccess) {
00658 cudaError_t err = cudaGetLastError(); // eat error so next CUDA op succeeds
00659 gpuh->dev = -1; // flag GPU as unusable
00660 }
00661 }
00662 
00663 
00664 CUDAQuickSurf::~CUDAQuickSurf() {
00665 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00666 
00667 // free all working buffers if not done already
00668 free_bufs();
00669 
00670 // delete marching cubes object
00671 delete gpuh->mc;
00672 
00673 free(voidgpu);
00674 }
00675 
00676 
00677 int CUDAQuickSurf::free_bufs() {
00678 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00679 
00680 // zero out max buffer capacities
00681 gpuh->natoms = 0;
00682 gpuh->colorperatom = 0;
00683 gpuh->acx = 0;
00684 gpuh->acy = 0;
00685 gpuh->acz = 0;
00686 gpuh->gx = 0;
00687 gpuh->gy = 0;
00688 gpuh->gz = 0;
00689 
00690 if (gpuh->safety != NULL)
00691 cudaFree(gpuh->safety);
00692 gpuh->safety=NULL;
00693 
00694 if (gpuh->devdensity != NULL)
00695 cudaFree(gpuh->devdensity);
00696 gpuh->devdensity=NULL;
00697 
00698 if (gpuh->devvoltexmap != NULL)
00699 cudaFree(gpuh->devvoltexmap);
00700 gpuh->devvoltexmap=NULL;
00701 
00702 if (gpuh->xyzr_d != NULL)
00703 cudaFree(gpuh->xyzr_d);
00704 gpuh->xyzr_d=NULL;
00705 
00706 if (gpuh->sorted_xyzr_d != NULL)
00707 cudaFree(gpuh->sorted_xyzr_d); 
00708 gpuh->sorted_xyzr_d=NULL;
00709 
00710 if (gpuh->color_d != NULL)
00711 cudaFree(gpuh->color_d);
00712 gpuh->color_d=NULL;
00713 
00714 if (gpuh->sorted_color_d != NULL)
00715 cudaFree(gpuh->sorted_color_d);
00716 gpuh->sorted_color_d=NULL;
00717 
00718 if (gpuh->atomIndex_d != NULL)
00719 cudaFree(gpuh->atomIndex_d);
00720 gpuh->atomIndex_d=NULL;
00721 
00722 if (gpuh->atomHash_d != NULL)
00723 cudaFree(gpuh->atomHash_d);
00724 gpuh->atomHash_d=NULL;
00725 
00726 if (gpuh->cellStartEnd_d != NULL)
00727 cudaFree(gpuh->cellStartEnd_d);
00728 gpuh->cellStartEnd_d=NULL;
00729 
00730 if (gpuh->v3f_d != NULL)
00731 cudaFree(gpuh->v3f_d);
00732 gpuh->v3f_d=NULL;
00733 
00734 if (gpuh->n3f_d != NULL)
00735 cudaFree(gpuh->n3f_d);
00736 gpuh->n3f_d=NULL;
00737 
00738 if (gpuh->c3f_d != NULL)
00739 cudaFree(gpuh->c3f_d);
00740 gpuh->c3f_d=NULL;
00741 
00742 if (gpuh->n3b_d != NULL)
00743 cudaFree(gpuh->n3b_d);
00744 gpuh->n3b_d=NULL;
00745 
00746 if (gpuh->c4u_d != NULL)
00747 cudaFree(gpuh->c4u_d);
00748 gpuh->c4u_d=NULL;
00749 
00750 
00751 return 0;
00752 }
00753 
00754 
00755 int CUDAQuickSurf::check_bufs(long int natoms, int colorperatom, 
00756 VolTexFormat vtexformat, 
00757 int acx, int acy, int acz,
00758 int gx, int gy, int gz) {
00759 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00760 
00761 // If the current atom count, texturing mode, and total voxel count
00762 // use the same or less storage than the size of the existing buffers,
00763 // we can reuse the same buffers without having to go through the 
00764 // complex allocation and validation loops. This is a big performance
00765 // benefit during trajectory animation.
00766 if (natoms <= gpuh->natoms &&
00767 colorperatom <= gpuh->colorperatom &&
00768 vtexformat == gpuh->vtexformat &&
00769 (acx*acy*acz) <= (gpuh->acx * gpuh->acy * gpuh->acz) && 
00770 (gx*gy*gz) <= (gpuh->gx * gpuh->gy * gpuh->gz))
00771 return 0;
00772 
00773 return -1; // no existing bufs, or too small to be used 
00774 }
00775 
00776 
00777 int CUDAQuickSurf::alloc_bufs(long int natoms, int colorperatom, 
00778 VolTexFormat vtexformat, 
00779 int acx, int acy, int acz,
00780 int gx, int gy, int gz) {
00781 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00782 
00783 // early exit from allocation call if we've already got existing
00784 // buffers that are large enough to support the request
00785 if (check_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, gx, gy, gz) == 0)
00786 return 0;
00787 
00788 // If we have any existing allocations, trash them as they weren't
00789 // usable for this new request and we need to reallocate them from scratch
00790 free_bufs();
00791 
00792 long int acncells = ((long) acx) * ((long) acy) * ((long) acz);
00793 long int ncells = ((long) gx) * ((long) gy) * ((long) gz);
00794 long int volmemsz = ncells * sizeof(float);
00795 long int chunkmaxverts = 3L * ncells; // assume worst case 50% triangle occupancy
00796 long int MCsz = CUDAMarchingCubes::MemUsageMC(gx, gy, gz);
00797 
00798 // Allocate all of the memory buffers our algorithms will need up-front,
00799 // so we can retry and gracefully reduce the sizes of various buffers
00800 // to attempt to fit within available GPU memory 
00801 long int totalmemsz = 
00802 volmemsz + // volume
00803 (2L * natoms * sizeof(unsigned int)) + // bin sort
00804 (acncells * sizeof(uint2)) + // bin sort
00805 (3L * chunkmaxverts * sizeof(float3)) + // MC vertex bufs 
00806 natoms*sizeof(float4) + // thrust
00807 8L * gx * gy * sizeof(float) + // thrust
00808 MCsz; // mcubes
00809 
00810 cudaMalloc((void**)&gpuh->devdensity, volmemsz);
00811 if (colorperatom) {
00812 int voltexsz = 0;
00813 switch (vtexformat) {
00814 case RGB3F:
00815 voltexsz = ncells * sizeof(float3);
00816 break;
00817 
00818 case RGB4U:
00819 voltexsz = ncells * sizeof(uchar4);
00820 break;
00821 }
00822 gpuh->vtexformat = vtexformat;
00823 
00824 cudaMalloc((void**)&gpuh->devvoltexmap, voltexsz);
00825 cudaMalloc((void**)&gpuh->color_d, natoms * sizeof(float4));
00826 cudaMalloc((void**)&gpuh->sorted_color_d, natoms * sizeof(float4));
00827 totalmemsz += 2 * natoms * sizeof(float4);
00828 }
00829 cudaMalloc((void**)&gpuh->xyzr_d, natoms * sizeof(float4));
00830 cudaMalloc((void**)&gpuh->sorted_xyzr_d, natoms * sizeof(float4));
00831 cudaMalloc((void**)&gpuh->atomIndex_d, natoms * sizeof(unsigned int));
00832 cudaMalloc((void**)&gpuh->atomHash_d, natoms * sizeof(unsigned int));
00833 cudaMalloc((void**)&gpuh->cellStartEnd_d, acncells * sizeof(uint2));
00834 
00835 // allocate marching cubes output buffers
00836 cudaMalloc((void**)&gpuh->v3f_d, 3 * chunkmaxverts * sizeof(float3));
00837 #if 1
00838 // Note: The CUDA char3 type is explicitly a signed type
00839 cudaMalloc((void**)&gpuh->n3b_d, 3 * chunkmaxverts * sizeof(char3));
00840 totalmemsz += 3 * chunkmaxverts * sizeof(char3); // MC normal bufs 
00841 #else
00842 cudaMalloc((void**)&gpuh->n3f_d, 3 * chunkmaxverts * sizeof(float3));
00843 totalmemsz += 3 * chunkmaxverts * sizeof(float3); // MC normal bufs 
00844 #endif
00845 #if 1
00846 cudaMalloc((void**)&gpuh->c4u_d, 3 * chunkmaxverts * sizeof(uchar4));
00847 totalmemsz += 3 * chunkmaxverts * sizeof(uchar4); // MC vertex color bufs 
00848 #else
00849 cudaMalloc((void**)&gpuh->c3f_d, 3 * chunkmaxverts * sizeof(float3));
00850 totalmemsz += 3 * chunkmaxverts * sizeof(float3); // MC vertex color bufs 
00851 #endif
00852 
00853 // Allocate an extra phantom array to act as a safety net to
00854 // ensure that subsequent allocations performed internally by 
00855 // the NVIDIA thrust template library or by our 
00856 // marching cubes implementation don't fail, since we can't 
00857 // currently pre-allocate all of them.
00858 cudaMalloc(&gpuh->safety, 
00859 natoms*sizeof(float4) + // thrust
00860 8 * gx * gy * sizeof(float) + // thrust
00861 MCsz); // mcubes
00862 
00863 if (gpuh->verbose > 1)
00864 printf("Total QuickSurf mem size: %ld MB\n", totalmemsz / (1024*1024));
00865 
00866 cudaError_t err = cudaGetLastError();
00867 if (err != cudaSuccess)
00868 return -1;
00869 
00870 // once the allocation has succeeded, we update the GPU handle info 
00871 // so that the next test/allocation pass knows the latest state.
00872 gpuh->natoms = natoms;
00873 gpuh->colorperatom = colorperatom;
00874 
00875 gpuh->acx = acx;
00876 gpuh->acy = acy;
00877 gpuh->acz = acz;
00878 
00879 gpuh->gx = gx;
00880 gpuh->gy = gy;
00881 gpuh->gz = gz;
00882 
00883 return 0;
00884 }
00885 
00886 
00887 int CUDAQuickSurf::get_chunk_bufs(int testexisting,
00888 long int natoms, int colorperatom, 
00889 VolTexFormat vtexformat,
00890 int acx, int acy, int acz,
00891 int gx, int gy, int gz,
00892 int &cx, int &cy, int &cz,
00893 int &sx, int &sy, int &sz) {
00894 dim3 Bsz(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ);
00895 if (colorperatom)
00896 Bsz.z = GTEXBLOCKSZZ;
00897 
00898 cudaError_t err = cudaGetLastError(); // eat error so next CUDA op succeeds
00899 
00900 // enter loop to attempt a single-pass computation, but if the
00901 // allocation fails, cut the chunk size Z dimension by half
00902 // repeatedly until we either run chunks of 8 planes at a time,
00903 // otherwise we assume it is hopeless.
00904 cz <<= 1; // premultiply by two to simplify loop body
00905 int chunkiters = 0;
00906 int chunkallocated = 0;
00907 while (!chunkallocated) {
00908 // Cut the Z chunk size in half
00909 chunkiters++;
00910 cz >>= 1;
00911 
00912 // if we've already dropped to a subvolume size, subtract off the
00913 // four extra Z planes from last time before we do the modulo padding
00914 // calculation so we don't hit an infinite loop trying to go below 
00915 // 16 planes due the padding math below.
00916 if (cz != gz)
00917 cz-=4;
00918 
00919 // Pad the chunk to a multiple of the computational tile size since
00920 // each thread computes multiple elements (unrolled in the Z direction)
00921 cz += (8 - (cz % 8));
00922 
00923 // The density map "slab" size is the chunk size but without the extra
00924 // plane used to copy the last plane of the previous chunk's density
00925 // into the start, for use by the marching cubes.
00926 sx = cx;
00927 sy = cy;
00928 sz = cz;
00929 
00930 // Add four extra Z-planes for copying the previous end planes into 
00931 // the start of the next chunk.
00932 cz+=4;
00933 
00934 #if 0
00935 printf(" Trying slab size: %d (test: %d)\n", sz, testexisting);
00936 #endif
00937 
00938 #if 1
00939 // test to see if total number of thread blocks exceeds maximum
00940 // number we can reasonably run prior to a kernel timeout error
00941 dim3 tGsz((sx+Bsz.x-1) / Bsz.x, 
00942 (sy+Bsz.y-1) / Bsz.y,
00943 (sz+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL));
00944 if (colorperatom) {
00945 tGsz.z = (sz+(Bsz.z*GTEXUNROLL)-1) / (Bsz.z * GTEXUNROLL);
00946 }
00947 if (tGsz.x * tGsz.y * tGsz.z > 65535)
00948 continue; 
00949 #endif
00950 
00951 // Bail out if we can't get enough memory to run at least
00952 // 8 slices in a single pass (making sure we've freed any allocations
00953 // beforehand, so they aren't leaked).
00954 if (sz <= 8) {
00955 return -1;
00956 }
00957 
00958 if (testexisting) {
00959 if (check_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, cx, cy, cz) != 0)
00960 continue;
00961 } else {
00962 if (alloc_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, cx, cy, cz) != 0)
00963 continue;
00964 }
00965 
00966 chunkallocated=1;
00967 }
00968 
00969 return 0;
00970 }
00971 
00972 
00973 int CUDAQuickSurf::calc_surf(long int natoms, const float *xyzr_f, 
00974 const float *colors_f,
00975 int colorperatom, VolTexFormat voltexformat,
00976 float *origin, int *numvoxels, float maxrad,
00977 float radscale, float gridspacing, 
00978 float isovalue, float gausslim,
00979 VMDDisplayList *cmdList) {
00980 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00981 
00982 // if there were any problems when the constructor tried to get
00983 // GPU hardware capabilities, we consider it to be unusable for all time.
00984 if (gpuh->dev < 0)
00985 return -1;
00986 
00987 // This code currently requires compute capability 2.x or greater.
00988 // We absolutely depend on hardware broadcasts for 
00989 // global memory reads by multiple threads reading the same element,
00990 // and the code more generally assumes the Fermi L1 cache and prefers
00991 // to launch 3-D grids where possible. 
00992 if (gpuh->deviceProp.major < 2) {
00993 return -1;
00994 }
00995 
00996 // start timing...
00997 wkf_timerhandle globaltimer = wkf_timer_create();
00998 wkf_timer_start(globaltimer);
00999 
01000 int vtexsize = 0;
01001 // const VolTexFormat voltexformat = RGB4U; // XXX caller may want to set this
01002 // const VolTexFormat voltexformat = RGB3F; // XXX caller may want to set this
01003 switch (voltexformat) {
01004 case RGB3F: 
01005 vtexsize = sizeof(float3);
01006 break;
01007 
01008 case RGB4U: 
01009 vtexsize = sizeof(uchar4);
01010 break;
01011 }
01012 
01013 float4 *colors = (float4 *) colors_f;
01014 int chunkmaxverts=0;
01015 int chunknumverts=0; 
01016 int numverts=0;
01017 int numfacets=0;
01018 
01019 // compute grid spacing for the acceleration grid
01020 float acgridspacing = gausslim * radscale * maxrad;
01021 
01022 // ensure acceleration grid spacing >= density grid spacing
01023 if (acgridspacing < gridspacing)
01024 acgridspacing = gridspacing;
01025 
01026 // Allocate output arrays for the gaussian density map and 3-D texture map
01027 // We test for errors carefully here since this is the most likely place
01028 // for a memory allocation failure due to the size of the grid.
01029 int3 volsz = make_int3(numvoxels[0], numvoxels[1], numvoxels[2]);
01030 int3 chunksz = volsz;
01031 int3 slabsz = volsz;
01032 
01033 // An alternative scheme to minimize the QuickSurf GPU memory footprint
01034 if (getenv("VMDQUICKSURFMINMEM")) {
01035 if (volsz.z > 32) {
01036 slabsz.z = chunksz.z = 16;
01037 }
01038 }
01039 
01040 int3 accelcells;
01041 accelcells.x = max(int((volsz.x*gridspacing) / acgridspacing), 1);
01042 accelcells.y = max(int((volsz.y*gridspacing) / acgridspacing), 1);
01043 accelcells.z = max(int((volsz.z*gridspacing) / acgridspacing), 1);
01044 
01045 dim3 Bsz(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ);
01046 if (colorperatom)
01047 Bsz.z = GTEXBLOCKSZZ;
01048 
01049 // check to see if it's possible to use an existing allocation,
01050 // if so, just leave things as they are, and do the computation 
01051 // using the existing buffers
01052 if (gpuh->natoms == 0 ||
01053 get_chunk_bufs(1, natoms, colorperatom, voltexformat,
01054 accelcells.x, accelcells.y, accelcells.z,
01055 volsz.x, volsz.y, volsz.z,
01056 chunksz.x, chunksz.y, chunksz.z,
01057 slabsz.x, slabsz.y, slabsz.z) == -1) {
01058 // reset the chunksz and slabsz after failing to try and
01059 // fit them into the existing allocations...
01060 chunksz = volsz;
01061 slabsz = volsz;
01062 
01063 // reallocate the chunk buffers from scratch since we weren't
01064 // able to reuse them
01065 if (get_chunk_bufs(0, natoms, colorperatom, voltexformat,
01066 accelcells.x, accelcells.y, accelcells.z,
01067 volsz.x, volsz.y, volsz.z,
01068 chunksz.x, chunksz.y, chunksz.z,
01069 slabsz.x, slabsz.y, slabsz.z) == -1) {
01070 wkf_timer_destroy(globaltimer);
01071 return -1;
01072 }
01073 }
01074 chunkmaxverts = 3 * chunksz.x * chunksz.y * chunksz.z;
01075 
01076 // Free the "safety padding" memory we allocate to ensure we dont
01077 // have trouble with thrust calls that allocate their own memory later
01078 if (gpuh->safety != NULL)
01079 cudaFree(gpuh->safety);
01080 gpuh->safety = NULL;
01081 
01082 if (gpuh->verbose > 1) {
01083 printf(" Using GPU chunk size: %d\n", chunksz.z);
01084 printf(" Accel grid(%d, %d, %d) spacing %f\n",
01085 accelcells.x, accelcells.y, accelcells.z, acgridspacing);
01086 }
01087 
01088 // pre-process the atom coordinates and radii as needed
01089 // short-term fix until a new CUDA kernel takes care of this
01090 int i, i4;
01091 float4 *xyzr = (float4 *) malloc(natoms * sizeof(float4));
01092 float log2e = log2f(2.718281828f);
01093 for (i=0,i4=0; i<natoms; i++,i4+=4) {
01094 xyzr[i].x = xyzr_f[i4 ];
01095 xyzr[i].y = xyzr_f[i4 + 1];
01096 xyzr[i].z = xyzr_f[i4 + 2];
01097 
01098 float scaledrad = xyzr_f[i4 + 3] * radscale;
01099 float arinv = -1.0f * log2e / (2.0f*scaledrad*scaledrad);
01100 
01101 xyzr[i].w = arinv;
01102 }
01103 cudaMemcpy(gpuh->xyzr_d, xyzr, natoms * sizeof(float4), cudaMemcpyHostToDevice);
01104 free(xyzr);
01105 
01106 if (colorperatom)
01107 cudaMemcpy(gpuh->color_d, colors, natoms * sizeof(float4), cudaMemcpyHostToDevice);
01108 
01109 // build uniform grid acceleration structure
01110 if (vmd_cuda_build_density_atom_grid(natoms, gpuh->xyzr_d, gpuh->color_d,
01111 gpuh->sorted_xyzr_d,
01112 gpuh->sorted_color_d,
01113 gpuh->atomIndex_d, gpuh->atomHash_d,
01114 gpuh->cellStartEnd_d, 
01115 accelcells, 1.0f / acgridspacing) != 0) {
01116 wkf_timer_destroy(globaltimer);
01117 free_bufs();
01118 return -1;
01119 }
01120 
01121 double sorttime = wkf_timer_timenow(globaltimer);
01122 double lastlooptime = sorttime;
01123 
01124 double densitykerneltime = 0.0f;
01125 double densitytime = 0.0f;
01126 double mckerneltime = 0.0f;
01127 double mctime = 0.0f; 
01128 double copycalltime = 0.0f;
01129 double copytime = 0.0f;
01130 
01131 float *volslab_d = NULL;
01132 void *texslab_d = NULL;
01133 
01134 int lzplane = GBLOCKSZZ * GUNROLL;
01135 if (colorperatom)
01136 lzplane = GTEXBLOCKSZZ * GTEXUNROLL;
01137 
01138 // initialize CUDA marching cubes class instance or rebuild it if needed
01139 uint3 mgsz = make_uint3(chunksz.x, chunksz.y, chunksz.z);
01140 if (gpuh->mc == NULL) {
01141 gpuh->mc = new CUDAMarchingCubes(); 
01142 if (!gpuh->mc->Initialize(mgsz)) {
01143 printf("QuickSurf call to MC Initialize() failed\n");
01144 }
01145 } else {
01146 uint3 mcmaxgridsize = gpuh->mc->GetMaxGridSize();
01147 if (slabsz.x > mcmaxgridsize.x ||
01148 slabsz.y > mcmaxgridsize.y ||
01149 slabsz.z > mcmaxgridsize.z) {
01150 if (gpuh->verbose)
01151 printf("*** QuickSurf Allocating new MC object...\n");
01152 
01153 // delete marching cubes object
01154 delete gpuh->mc;
01155 
01156 // create and initialize CUDA marching cubes class instance
01157 gpuh->mc = new CUDAMarchingCubes(); 
01158 
01159 if (!gpuh->mc->Initialize(mgsz)) {
01160 printf("QuickSurf MC Initialize() call failed to recreate MC object\n");
01161 }
01162 } 
01163 }
01164 
01165 int z;
01166 int chunkcount=0;
01167 float invacgridspacing = 1.0f / acgridspacing;
01168 float invisovalue = 1.0f / isovalue;
01169 for (z=0; z<volsz.z; z+=slabsz.z) {
01170 int3 curslab = slabsz;
01171 if (z+curslab.z > volsz.z)
01172 curslab.z = volsz.z - z; 
01173 
01174 int slabplanesz = curslab.x * curslab.y;
01175 
01176 dim3 Gsz((curslab.x+Bsz.x-1) / Bsz.x, 
01177 (curslab.y+Bsz.y-1) / Bsz.y,
01178 (curslab.z+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL));
01179 if (colorperatom)
01180 Gsz.z = (curslab.z+(Bsz.z*GTEXUNROLL)-1) / (Bsz.z * GTEXUNROLL);
01181 
01182 // For SM >= 2.x, we can run the entire slab in one pass by launching
01183 // a 3-D grid of thread blocks.
01184 dim3 Gszslice = Gsz;
01185 
01186 if (gpuh->verbose > 1) {
01187 printf("CUDA device %d, grid size %dx%dx%d\n", 
01188 0, Gsz.x, Gsz.y, Gsz.z);
01189 printf("CUDA: vol(%d,%d,%d) accel(%d,%d,%d)\n",
01190 curslab.x, curslab.y, curslab.z,
01191 accelcells.x, accelcells.y, accelcells.z);
01192 printf("Z=%d, curslab.z=%d\n", z, curslab.z);
01193 }
01194 
01195 // For all but the first density slab, we copy the last four 
01196 // planes of the previous run into the start of the next run so
01197 // that we can extract the isosurface with no discontinuities
01198 if (z == 0) {
01199 volslab_d = gpuh->devdensity;
01200 if (colorperatom)
01201 texslab_d = gpuh->devvoltexmap;
01202 } else {
01203 int fourplanes = 4 * slabplanesz;
01204 cudaMemcpy(gpuh->devdensity,
01205 volslab_d + (slabsz.z-4) * slabplanesz, 
01206 fourplanes * sizeof(float), cudaMemcpyDeviceToDevice);
01207 volslab_d = gpuh->devdensity + fourplanes;
01208 
01209 if (colorperatom) {
01210 cudaMemcpy(gpuh->devvoltexmap,
01211 ((unsigned char *) texslab_d) + (slabsz.z-4) * slabplanesz * vtexsize, 
01212 fourplanes * vtexsize, cudaMemcpyDeviceToDevice);
01213 texslab_d = ((unsigned char *) gpuh->devvoltexmap) + fourplanes * vtexsize;
01214 }
01215 }
01216 
01217 // loop over the planes/slices in a slab and compute density and texture
01218 for (int lz=0; lz<Gsz.z; lz+=Gszslice.z) {
01219 int lzinc = lz * lzplane;
01220 float *volslice_d = volslab_d + lzinc * slabplanesz;
01221 
01222 if (colorperatom) {
01223 void *texslice_d = ((unsigned char *) texslab_d) + lzinc * slabplanesz * vtexsize;
01224 switch (voltexformat) {
01225 case RGB3F:
01226 gaussdensity_fast_tex3f<<<Gszslice, Bsz, 0>>>(natoms, 
01227 gpuh->sorted_xyzr_d, gpuh->sorted_color_d, 
01228 curslab, accelcells, acgridspacing, invacgridspacing, 
01229 gpuh->cellStartEnd_d, gridspacing, z+lzinc,
01230 volslice_d, (float3 *) texslice_d, invisovalue);
01231 break;
01232 
01233 case RGB4U:
01234 gaussdensity_fast_tex_norm<float, uchar4><<<Gszslice, Bsz, 0>>>(natoms, 
01235 gpuh->sorted_xyzr_d, gpuh->sorted_color_d, 
01236 curslab, accelcells, acgridspacing, invacgridspacing, 
01237 gpuh->cellStartEnd_d, gridspacing, z+lzinc,
01238 volslice_d, (uchar4 *) texslice_d, invisovalue);
01239 break;
01240 }
01241 } else {
01242 gaussdensity_fast<<<Gszslice, Bsz, 0>>>(natoms, gpuh->sorted_xyzr_d, 
01243 curslab, accelcells, acgridspacing, invacgridspacing, 
01244 gpuh->cellStartEnd_d, gridspacing, z+lzinc, volslice_d);
01245 }
01246 }
01247 cudaDeviceSynchronize(); 
01248 densitykerneltime = wkf_timer_timenow(globaltimer);
01249 
01250 #if 0
01251 printf(" CUDA mcubes..."); fflush(stdout);
01252 #endif
01253 
01254 uint3 gvsz = make_uint3(curslab.x, curslab.y, curslab.z);
01255 
01256 // For all but the first density slab, we copy the last four
01257 // planes of the previous run into the start of the next run so
01258 // that we can extract the isosurface with no discontinuities
01259 if (z != 0)
01260 gvsz.z=curslab.z + 4;
01261 
01262 float3 bbox = make_float3(gvsz.x * gridspacing, gvsz.y * gridspacing,
01263 gvsz.z * gridspacing);
01264 
01265 float3 gorigin = make_float3(origin[0], origin[1], 
01266 origin[2] + (z * gridspacing));
01267 if (z != 0)
01268 gorigin.z = origin[2] + ((z-4) * gridspacing);
01269 
01270 #if 0
01271 printf("\n ... vsz: %d %d %d\n", gvsz.x, gvsz.y, gvsz.z);
01272 printf(" ... org: %.2f %.2f %.2f\n", gorigin.x, gorigin.y, gorigin.z);
01273 printf(" ... bxs: %.2f %.2f %.2f\n", bbox.x, bbox.y, bbox.y);
01274 printf(" ... bbe: %.2f %.2f %.2f\n", gorigin.x+bbox.x, gorigin.y+bbox.y, gorigin.z+bbox.z);
01275 #endif
01276 
01277 // If we are computing the volume using multiple passes, we have to 
01278 // overlap the marching cubes grids and compute a sub-volume to exclude
01279 // the end planes, except for the first and last sub-volume, in order to
01280 // get correct per-vertex normals at the edges of each sub-volume 
01281 int skipstartplane=0;
01282 int skipendplane=0;
01283 if (chunksz.z < volsz.z) {
01284 // on any but the first pass, we skip the first Z plane
01285 if (z != 0)
01286 skipstartplane=1;
01287 
01288 // on any but the last pass, we skip the last Z plane
01289 if (z+curslab.z < volsz.z)
01290 skipendplane=1;
01291 }
01292 
01293 //
01294 // Extract density map isosurface using marching cubes
01295 //
01296 
01297 // Choose the isovalue depending on whether the desnity map 
01298 // contains normalized or un-normalized density values
01299 if (colorperatom && (voltexformat == RGB4U)) {
01300 // incoming densities are pre-normalized so that the target isovalue
01301 // is represented as a density of 1.0f
01302 gpuh->mc->SetIsovalue(1.0f);
01303 } else {
01304 gpuh->mc->SetIsovalue(isovalue);
01305 }
01306 
01307 int mcstat = 0;
01308 switch (voltexformat) {
01309 case RGB3F:
01310 mcstat = gpuh->mc->SetVolumeData(gpuh->devdensity, 
01311 (float3 *) gpuh->devvoltexmap,
01312 gvsz, gorigin, bbox, true);
01313 break;
01314 
01315 case RGB4U:
01316 mcstat = gpuh->mc->SetVolumeData(gpuh->devdensity, 
01317 (uchar4 *) gpuh->devvoltexmap,
01318 gvsz, gorigin, bbox, true);
01319 break;
01320 }
01321 if (!mcstat) {
01322 printf("QuickSurf call to MC SetVolumeData() failed\n");
01323 }
01324 
01325 // set the sub-volume starting/ending indices if needed
01326 if (skipstartplane || skipendplane) {
01327 uint3 volstart = make_uint3(0, 0, 0);
01328 uint3 volend = make_uint3(gvsz.x, gvsz.y, gvsz.z);
01329 
01330 if (skipstartplane)
01331 volstart.z = 2;
01332 
01333 if (skipendplane)
01334 volend.z = gvsz.z - 2;
01335 
01336 gpuh->mc->SetSubVolume(volstart, volend);
01337 }
01338 if (gpuh->n3b_d) {
01339 gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3b_d, 
01340 gpuh->c4u_d, chunkmaxverts);
01341 } else if (gpuh->c4u_d) {
01342 gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3f_d, 
01343 gpuh->c4u_d, chunkmaxverts);
01344 } else {
01345 gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3f_d, 
01346 gpuh->c3f_d, chunkmaxverts);
01347 }
01348 chunknumverts = gpuh->mc->GetVertexCount();
01349 
01350 #if 0
01351 printf("generated %d vertices, max vert limit: %d\n", chunknumverts, chunkmaxverts);
01352 #endif
01353 if (chunknumverts == chunkmaxverts)
01354 printf(" *** QuickSurf exceeded marching cubes vertex limit (%d verts)\n", chunknumverts);
01355 
01356 cudaDeviceSynchronize(); 
01357 mckerneltime = wkf_timer_timenow(globaltimer);
01358 
01359 // Create a triangle mesh
01360 if (chunknumverts > 0) {
01361 DispCmdTriMesh cmdTriMesh;
01362 if (colorperatom) {
01363 // emit triangle mesh with per-vertex colors
01364 if (gpuh->n3b_d) {
01365 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 
01366 (const char *) gpuh->n3b_d, 
01367 (const unsigned char *) gpuh->c4u_d,
01368 chunknumverts/3, cmdList);
01369 } else if (gpuh->c4u_d) {
01370 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 
01371 (const float *) gpuh->n3f_d, 
01372 (const unsigned char *) gpuh->c4u_d,
01373 chunknumverts/3, cmdList);
01374 } else {
01375 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 
01376 (const float *) gpuh->n3f_d, 
01377 (const float *) gpuh->c3f_d,
01378 chunknumverts/3, cmdList);
01379 }
01380 } else {
01381 // emit triangle mesh with no colors, uses current rendering state
01382 if (gpuh->n3b_d) {
01383 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 
01384 (const char *) gpuh->n3b_d, 
01385 (const unsigned char *) NULL,
01386 chunknumverts/3, cmdList);
01387 } else {
01388 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d,
01389 (const float *) gpuh->n3f_d, 
01390 (const float *) NULL,
01391 chunknumverts/3, cmdList);
01392 }
01393 }
01394 }
01395 numverts+=chunknumverts;
01396 numfacets+=chunknumverts/3;
01397 
01398 #if 0
01399 // XXX we'll hold onto this as we'll want to rescue this approach
01400 // for electrostatics coloring where we have to have the 
01401 // entire triangle mesh in order to do the calculation
01402 int l;
01403 int vertstart = 3 * numverts;
01404 int vertbufsz = 3 * (numverts + chunknumverts) * sizeof(float);
01405 int facebufsz = (numverts + chunknumverts) * sizeof(int);
01406 int chunkvertsz = 3 * chunknumverts * sizeof(float);
01407 
01408 v = (float*) realloc(v, vertbufsz);
01409 n = (float*) realloc(n, vertbufsz);
01410 c = (float*) realloc(c, vertbufsz);
01411 f = (int*) realloc(f, facebufsz);
01412 cudaMemcpy(v+vertstart, gpuh->v3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01413 cudaMemcpy(n+vertstart, gpuh->n3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01414 if (colorperatom) {
01415 cudaMemcpy(c+vertstart, gpuh->c3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01416 } else {
01417 float *color = c+vertstart;
01418 for (l=0; l<chunknumverts*3; l+=3) {
01419 color[l + 0] = colors[0].x;
01420 color[l + 1] = colors[0].y;
01421 color[l + 2] = colors[0].z;
01422 }
01423 }
01424 for (l=numverts; l<numverts+chunknumverts; l++) {
01425 f[l]=l;
01426 }
01427 numverts+=chunknumverts;
01428 numfacets+=chunknumverts/3;
01429 #endif
01430 
01431 copycalltime = wkf_timer_timenow(globaltimer);
01432 
01433 densitytime += densitykerneltime - lastlooptime;
01434 mctime += mckerneltime - densitykerneltime;
01435 copytime += copycalltime - mckerneltime;
01436 
01437 lastlooptime = wkf_timer_timenow(globaltimer);
01438 
01439 chunkcount++; // increment number of chunks processed
01440 }
01441 
01442 // catch any errors that may have occured so that at the very least,
01443 // all of the subsequent resource deallocation calls will succeed
01444 cudaError_t err = cudaGetLastError();
01445 
01446 wkf_timer_stop(globaltimer);
01447 double totalruntime = wkf_timer_time(globaltimer);
01448 wkf_timer_destroy(globaltimer);
01449 
01450 // If an error occured, we print it and return an error code, once
01451 // all of the memory deallocations have completed.
01452 if (err != cudaSuccess) { 
01453 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
01454 return -1;
01455 }
01456 
01457 if (gpuh->verbose) {
01458 printf(" GPU generated %d vertices, %d facets, in %d passes\n", 
01459 numverts, numfacets, chunkcount);
01460 printf(" GPU time (%s): %.3f [sort: %.3f density %.3f mcubes: %.3f copy: %.3f]\n", 
01461 "SM >= 2.x", totalruntime, sorttime, densitytime, mctime, copytime);
01462 }
01463 
01464 return 0;
01465 }
01466 
01467 
01468 
01469 
01470