00001 /*************************************************************************** 00002 *cr 00003 *cr (C) Copyright 1995-2019 The Board of Trustees of the 00004 *cr University of Illinois 00005 *cr All Rights Reserved 00006 *cr 00007 ***************************************************************************/ 00008 00009 /*************************************************************************** 00010 * RCS INFORMATION: 00011 * 00012 * $RCSfile: QuickSurf.C,v $ 00013 * $Author: johns $ $Locker: $ $State: Exp $ 00014 * $Revision: 1.136 $ $Date: 2022年05月23日 19:10:01 $ 00015 * 00016 *************************************************************************** 00017 * DESCRIPTION: 00018 * Fast gaussian surface representation 00019 ***************************************************************************/ 00020 00021 // pgcc 2016 has troubles with hand-vectorized x86 intrinsics presently 00022 #if !defined(__PGIC__) 00023 00024 // Intel x86 hardware 00025 #if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64)) 00026 #if !defined(__SSE2__) && defined(_WIN64) 00027 #define __SSE2__ 1 /* MSVC fails to define SSE macros */ 00028 #endif 00029 #endif 00030 00031 #define VMDQSURFUSESSE 1 00032 #endif 00033 00034 // The OpenPOWER VSX code path runs on POWER8 and later hardware, but is 00035 // untested on older platforms that support VSX instructions. 00036 // XXX GCC 4.8.5 breaks with conflicts between vec_xxx() routines 00037 // defined in utilities.h vs. VSX intrinsics in altivec.h and similar. 00038 // For now, we disable VSX for GCC for this source file. 00039 // IBM Power 8/9/10 Altivec/VSX instructions: 00040 // https://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf 00041 #if !defined(__GNUC__) && defined(__VEC__) 00042 #define VMDQSURFUSEVSX 1 00043 #endif 00044 00045 #include <stdio.h> 00046 #include <stdlib.h> 00047 #if VMDQSURFUSESSE && defined(__SSE2__) 00048 #include <emmintrin.h> 00049 #endif 00050 00051 #if (defined(VMDQSURFUSEVSX) && defined(__VSX__)) 00052 #if defined(__GNUC__) && defined(__VEC__) 00053 #include <altivec.h> 00054 #endif 00055 #endif 00056 00057 #include <string.h> 00058 #include <math.h> 00059 #include "QuickSurf.h" 00060 #if defined(VMDCUDA) 00061 #include "CUDAQuickSurf.h" 00062 #endif 00063 #include "Measure.h" 00064 #include "Inform.h" 00065 #include "utilities.h" 00066 #include "WKFUtils.h" 00067 #include "VolumetricData.h" 00068 00069 #include "VMDDisplayList.h" 00070 #include "Displayable.h" 00071 #include "DispCmds.h" 00072 #include "ProfileHooks.h" 00073 #include "VMDApp.h" // access global CPU insn flags for dispatch 00074 00075 #define MIN(X,Y) (((X)<(Y))? (X) : (Y)) 00076 #define MAX(X,Y) (((X)>(Y))? (X) : (Y)) 00077 00078 // fctn prototypes for runtime dispatched AVX2 kernels, etc 00079 void vmd_gaussdensity_avx2(int verbose, 00080 int natoms, const float *xyzr, 00081 const float *atomicnum, 00082 const float *colors, 00083 float *densitymap, float *voltexmap, 00084 const int *numvoxels, 00085 float radscale, float gridspacing, 00086 float isovalue, float gausslim); 00087 00088 void vmd_gaussdensity_neon(int verbose, 00089 int natoms, const float *xyzr, 00090 const float *atomicnum, 00091 const float *colors, 00092 float *densitymap, float *voltexmap, 00093 const int *numvoxels, 00094 float radscale, float gridspacing, 00095 float isovalue, float gausslim); 00096 00097 /* 00098 * David J. Hardy 00099 * 12 Dec 2008 00100 * 00101 * aexpfnx() - Approximate expf() for negative x. 00102 * 00103 * Assumes that x <= 0. 00104 * 00105 * Assumes IEEE format for single precision float, specifically: 00106 * 1 sign bit, 8 exponent bits biased by 127, and 23 mantissa bits. 00107 * 00108 * Interpolates exp() on interval (-1/log2(e), 0], then shifts it by 00109 * multiplication of a fast calculation for 2^(-N). The interpolation 00110 * uses a linear blending of 3rd degree Taylor polynomials at the end 00111 * points, so the approximation is once differentiable. 00112 * 00113 * The error is small (max relative error per interval is calculated 00114 * to be 0.131%, with a max absolute error of -0.000716). 00115 * 00116 * The cutoff is chosen so as to speed up the computation by early 00117 * exit from function, with the value chosen to give less than the 00118 * the max absolute error. Use of a cutoff is unnecessary, except 00119 * for needing to shift smallest floating point numbers to zero, 00120 * i.e. you could remove cutoff and replace by: 00121 * 00122 * #define MINXNZ -88.0296919311130 // -127 * log(2) 00123 * 00124 * if (x < MINXNZ) return 0.f; 00125 * 00126 * Use of a cutoff causes a discontinuity which can be eliminated 00127 * through the use of a switching function. 00128 * 00129 * We can obtain arbitrarily smooth approximation by taking k+1 nodes on 00130 * the interval and weighting their respective Taylor polynomials by the 00131 * kth order Lagrange interpolant through those nodes. The wiggle in the 00132 * polynomial interpolation due to equidistant nodes (Runge's phenomenon) 00133 * can be reduced by using Chebyshev nodes. 00134 */ 00135 00136 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER) 00137 #define __align(X) __attribute__((aligned(X) )) 00138 #if (__GNUC__ < 4) 00139 #define MISSING_mm_cvtsd_f64 00140 #endif 00141 #else 00142 #define __align(X) __declspec(align(X) ) 00143 #endif 00144 00145 #define MLOG2EF -1.44269504088896f 00146 00147 /* 00148 * Interpolating coefficients for linear blending of the 00149 * 3rd degree Taylor expansion of 2^x about 0 and -1. 00150 */ 00151 #define SCEXP0 1.0000000000000000f 00152 #define SCEXP1 0.6987082824680118f 00153 #define SCEXP2 0.2633174272827404f 00154 #define SCEXP3 0.0923611991471395f 00155 #define SCEXP4 0.0277520543324108f 00156 00157 /* for single precision float */ 00158 #define EXPOBIAS 127 00159 #define EXPOSHIFT 23 00160 00161 /* cutoff is optional, but can help avoid unnecessary work */ 00162 #define ACUTOFF -10 00163 00164 typedef union flint_t { 00165 float f; 00166 int n; 00167 } flint; 00168 00169 #if VMDQSURFUSESSE && defined(__SSE2__) 00170 // SSE variant of the 'flint' union above 00171 typedef union SSEreg_t { 00172 __m128 f; // 4x float (SSE) 00173 __m128i i; // 4x 32-bit int (SSE2) 00174 } SSEreg; 00175 #endif 00176 00177 00178 #if 0 00179 static float aexpfnx(float x) { 00180 /* assume x <= 0 */ 00181 float mb; 00182 int mbflr; 00183 float d; 00184 float sy; 00185 flint scalfac; 00186 00187 if (x < ACUTOFF) return 0.f; 00188 00189 mb = x * MLOG2EF; /* change base to 2, mb >= 0 */ 00190 mbflr = (int) mb; /* get int part, floor() */ 00191 d = mbflr - mb; /* remaining exponent, -1 < d <= 0 */ 00192 sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4))); 00193 /* approx with linear blend of Taylor polys */ 00194 scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT; /* 2^(-mbflr) */ 00195 return (sy * scalfac.f); /* scaled approx */ 00196 } 00197 00198 00199 static void vmd_gaussdensity(int verbose, 00200 int natoms, const float *xyzr, 00201 const float *atomicnum, 00202 const float *colors, 00203 float *densitymap, float *voltexmap, 00204 const int *numvoxels, 00205 float radscale, float gridspacing, 00206 float isovalue, float gausslim) { 00207 int i, x, y, z; 00208 int maxvoxel[3]; 00209 maxvoxel[0] = numvoxels[0]-1; 00210 maxvoxel[1] = numvoxels[1]-1; 00211 maxvoxel[2] = numvoxels[2]-1; 00212 const float invgridspacing = 1.0f / gridspacing; 00213 00214 // compute colors only if necessary, since they are costly 00215 if (voltexmap != NULL) { 00216 float invisovalue = 1.0f / isovalue; 00217 // compute both density map and floating point color texture map 00218 for (i=0; i<natoms; i++) { 00219 if (verbose && ((i & 0x3fff) == 0)) { 00220 printf("."); 00221 fflush(stdout); 00222 } 00223 00224 ptrdiff_t ind = i*4L; 00225 float scaledrad = xyzr[ind + 3L] * radscale; 00226 00227 // MDFF atomic number weighted density factor 00228 float atomicnumfactor = 1.0f; 00229 if (atomicnum != NULL) { 00230 atomicnumfactor = atomicnum[i]; 00231 } 00232 00233 float arinv = 1.0f/(2.0f*scaledrad*scaledrad); 00234 float radlim = gausslim * scaledrad; 00235 float radlim2 = radlim * radlim; // cutoff test done in cartesian coords 00236 radlim *= invgridspacing; 00237 00238 float tmp; 00239 tmp = xyzr[ind ] * invgridspacing; 00240 int xmin = MAX((int) (tmp - radlim), 0); 00241 int xmax = MIN((int) (tmp + radlim), maxvoxel[0]); 00242 tmp = xyzr[ind+1] * invgridspacing; 00243 int ymin = MAX((int) (tmp - radlim), 0); 00244 int ymax = MIN((int) (tmp + radlim), maxvoxel[1]); 00245 tmp = xyzr[ind+2] * invgridspacing; 00246 int zmin = MAX((int) (tmp - radlim), 0); 00247 int zmax = MIN((int) (tmp + radlim), maxvoxel[2]); 00248 00249 float dz = zmin*gridspacing - xyzr[ind+2]; 00250 for (z=zmin; z<=zmax; z++,dz+=gridspacing) { 00251 float dy = ymin*gridspacing - xyzr[ind+1]; 00252 for (y=ymin; y<=ymax; y++,dy+=gridspacing) { 00253 float dy2dz2 = dy*dy + dz*dz; 00254 00255 // early-exit when outside the cutoff radius in the Y-Z plane 00256 if (dy2dz2 >= radlim2) 00257 continue; 00258 00259 int addr = z * numvoxels[0] * numvoxels[1] + y * numvoxels[0]; 00260 float dx = xmin*gridspacing - xyzr[ind]; 00261 for (x=xmin; x<=xmax; x++,dx+=gridspacing) { 00262 float r2 = dx*dx + dy2dz2; 00263 float expval = -r2 * arinv; 00264 #if VMDUSEFULLEXP 00265 // use the math library exponential routine 00266 float density = exp(expval); 00267 #else 00268 // use our (much faster) fast exponential approximation 00269 float density = aexpfnx(expval); 00270 #endif 00271 00272 density *= atomicnumfactor; // MDFF Cryo-EM atomic number density 00273 00274 // accumulate density value to density map 00275 densitymap[addr + x] += density; 00276 00277 // Accumulate density-weighted color to texture map. 00278 // Pre-multiply colors by the inverse isovalue we will extract 00279 // the surface on, to cause the final color to be normalized. 00280 density *= invisovalue; 00281 ptrdiff_t caddr = (addr + x) * 3L; 00282 00283 // color by atom colors 00284 voltexmap[caddr ] += density * colors[ind ]; 00285 voltexmap[caddr + 1] += density * colors[ind + 1]; 00286 voltexmap[caddr + 2] += density * colors[ind + 2]; 00287 } 00288 } 00289 } 00290 } 00291 } else { 00292 // compute density map only 00293 for (i=0; i<natoms; i++) { 00294 if (verbose && ((i & 0x3fff) == 0)) { 00295 printf("."); 00296 fflush(stdout); 00297 } 00298 00299 ptrdiff_t ind = i*4L; 00300 float scaledrad = xyzr[ind + 3] * radscale; 00301 00302 // MDFF atomic number weighted density factor 00303 float atomicnumfactor = 1.0f; 00304 if (atomicnum != NULL) { 00305 atomicnumfactor = atomicnum[i]; 00306 } 00307 00308 float arinv = 1.0f/(2.0f*scaledrad*scaledrad); 00309 float radlim = gausslim * scaledrad; 00310 float radlim2 = radlim * radlim; // cutoff test done in cartesian coords 00311 radlim *= invgridspacing; 00312 00313 float tmp; 00314 tmp = xyzr[ind ] * invgridspacing; 00315 int xmin = MAX((int) (tmp - radlim), 0); 00316 int xmax = MIN((int) (tmp + radlim), maxvoxel[0]); 00317 tmp = xyzr[ind+1] * invgridspacing; 00318 int ymin = MAX((int) (tmp - radlim), 0); 00319 int ymax = MIN((int) (tmp + radlim), maxvoxel[1]); 00320 tmp = xyzr[ind+2] * invgridspacing; 00321 int zmin = MAX((int) (tmp - radlim), 0); 00322 int zmax = MIN((int) (tmp + radlim), maxvoxel[2]); 00323 00324 float dz = zmin*gridspacing - xyzr[ind+2]; 00325 for (z=zmin; z<=zmax; z++,dz+=gridspacing) { 00326 float dy = ymin*gridspacing - xyzr[ind+1]; 00327 for (y=ymin; y<=ymax; y++,dy+=gridspacing) { 00328 float dy2dz2 = dy*dy + dz*dz; 00329 00330 // early-exit when outside the cutoff radius in the Y-Z plane 00331 if (dy2dz2 >= radlim2) 00332 continue; 00333 00334 int addr = z * numvoxels[0] * numvoxels[1] + y * numvoxels[0]; 00335 float dx = xmin*gridspacing - xyzr[ind]; 00336 for (x=xmin; x<=xmax; x++,dx+=gridspacing) { 00337 float r2 = dx*dx + dy2dz2; 00338 float expval = -r2 * arinv; 00339 #if VMDUSEFULLEXP 00340 // use the math library exponential routine 00341 float density = exp(expval); 00342 #else 00343 // use our (much faster) fast exponential approximation 00344 float density = aexpfnx(expval); 00345 #endif 00346 00347 density *= atomicnumfactor; // MDFF Cryo-EM atomic number density 00348 00349 // accumulate density value to density map 00350 densitymap[addr + x] += density; 00351 } 00352 } 00353 } 00354 } 00355 } 00356 } 00357 #endif 00358 00359 00360 00361 static void vmd_gaussdensity_opt(wkf_cpu_caps_t *cpucaps, int verbose, 00362 int natoms, const float *xyzr, 00363 const float *atomicnum, 00364 const float *colors, 00365 float *densitymap, float *voltexmap, 00366 const int *numvoxels, 00367 float radscale, float gridspacing, 00368 float isovalue, float gausslim) { 00369 int i, x, y, z; 00370 int maxvoxel[3]; 00371 maxvoxel[0] = numvoxels[0]-1; 00372 maxvoxel[1] = numvoxels[1]-1; 00373 maxvoxel[2] = numvoxels[2]-1; 00374 const float invgridspacing = 1.0f / gridspacing; 00375 00376 // 00377 // runtime CPU dispatch 00378 // check for optional vector instructions and execute custom kernels 00379 // for the fastest code path supported by the detected hardware 00380 // 00381 #if defined(VMDCPUDISPATCH) 00382 00383 // Intel x86 00384 #if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64)) 00385 00386 if ((cpucaps != NULL) && 00387 (cpucaps->flags & CPU_FMA) && (cpucaps->flags & CPU_AVX2) && 00388 (getenv("VMDNOAVX2")==NULL)) { 00389 if (verbose) 00390 printf("vmd_gaussdensity_avx2()\n"); 00391 00392 vmd_gaussdensity_avx2(verbose, natoms, xyzr, atomicnum, colors, 00393 densitymap, voltexmap, numvoxels, radscale, 00394 gridspacing, isovalue, gausslim); 00395 return; 00396 } 00397 #endif 00398 00399 #if defined(VMDUSENEON) 00400 // if ((cpucaps->flags & CPU_NEON) && (getenv("VMDNONEON") == NULL)) { 00401 if (1 && (getenv("VMDNONEON") == NULL)) { 00402 if (verbose) 00403 printf("vmd_gaussdensity_neon()\n"); 00404 00405 vmd_gaussdensity_neon(verbose, natoms, xyzr, atomicnum, colors, 00406 densitymap, voltexmap, numvoxels, radscale, 00407 gridspacing, isovalue, gausslim); 00408 return; 00409 } 00410 #endif 00411 00412 00413 if (verbose) 00414 printf("vmd_gaussdensity_opt()\n"); 00415 #endif 00416 00417 #if VMDQSURFUSESSE && defined(__SSE2__) 00418 int usesse=1; 00419 if (getenv("VMDNOSSE")) { 00420 usesse=0; 00421 } 00422 #endif 00423 00424 #if VMDQSURFUSEVSX && defined(__VEC__) 00425 int usevsx=1; 00426 if (getenv("VMDNOVSX")) { 00427 usevsx=0; 00428 } 00429 #endif 00430 00431 #if VMDQSURFUSESSE && defined(__SSE2__) 00432 // Variables for SSE optimized inner loop 00433 __m128 gridspacing4_4; 00434 __align(16) float sxdelta4[4]; // 16-byte aligned for SSE 00435 00436 if (usesse) { 00437 gridspacing4_4 = _mm_set1_ps(gridspacing * 4.0f); 00438 for (x=0; x<4; x++) 00439 sxdelta4[x] = ((float) x) * gridspacing; 00440 } 00441 #endif 00442 00443 #if VMDQSURFUSEVSX && defined(__VEC__) 00444 // Variables for VSX optimized inner loop 00445 vector float gridspacing4_4; 00446 __attribute__((aligned(16))) float sxdelta4[4]; // 16-byte aligned for VSX 00447 00448 if (usevsx) { 00449 gridspacing4_4 = vec_splats(gridspacing * 4.0f); 00450 for (x=0; x<4; x++) 00451 sxdelta4[x] = ((float) x) * gridspacing; 00452 } 00453 #endif 00454 00455 // compute colors only if necessary, since they are costly 00456 if (voltexmap != NULL) { 00457 float invisovalue = 1.0f / isovalue; 00458 // compute both density map and floating point color texture map 00459 for (i=0; i<natoms; i++) { 00460 if (verbose && ((i & 0x3fff) == 0)) { 00461 printf("."); 00462 fflush(stdout); 00463 } 00464 00465 ptrdiff_t ind = i*4L; 00466 float scaledrad = xyzr[ind + 3] * radscale; 00467 00468 // MDFF atomic number weighted density factor 00469 float atomicnumfactor = 1.0f; 00470 if (atomicnum != NULL) { 00471 atomicnumfactor = atomicnum[i]; 00472 } 00473 00474 // negate, precompute reciprocal, and change to base 2 from the outset 00475 float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF; 00476 float radlim = gausslim * scaledrad; 00477 float radlim2 = radlim * radlim; // cutoff test done in cartesian coords 00478 radlim *= invgridspacing; 00479 00480 #if VMDQSURFUSESSE && defined(__SSE2__) 00481 __m128 atomicnumfactor_4 = { 0 }; 00482 __m128 arinv_4; 00483 if (usesse) { 00484 atomicnumfactor_4 = _mm_set1_ps(atomicnumfactor); 00485 #if VMDUSESVMLEXP 00486 // Use of Intel's SVML requires changing the pre-scaling factor 00487 arinv_4 = _mm_set1_ps(arinv * (2.718281828f/2.0f) / MLOG2EF); 00488 #else 00489 // Use our fully inlined exp approximation 00490 arinv_4 = _mm_set1_ps(arinv); 00491 #endif 00492 } 00493 #endif 00494 00495 float tmp; 00496 tmp = xyzr[ind ] * invgridspacing; 00497 int xmin = MAX((int) (tmp - radlim), 0); 00498 int xmax = MIN((int) (tmp + radlim), maxvoxel[0]); 00499 tmp = xyzr[ind+1] * invgridspacing; 00500 int ymin = MAX((int) (tmp - radlim), 0); 00501 int ymax = MIN((int) (tmp + radlim), maxvoxel[1]); 00502 tmp = xyzr[ind+2] * invgridspacing; 00503 int zmin = MAX((int) (tmp - radlim), 0); 00504 int zmax = MIN((int) (tmp + radlim), maxvoxel[2]); 00505 00506 float dz = zmin*gridspacing - xyzr[ind+2]; 00507 for (z=zmin; z<=zmax; z++,dz+=gridspacing) { 00508 float dy = ymin*gridspacing - xyzr[ind+1]; 00509 for (y=ymin; y<=ymax; y++,dy+=gridspacing) { 00510 float dy2dz2 = dy*dy + dz*dz; 00511 00512 // early-exit when outside the cutoff radius in the Y-Z plane 00513 if (dy2dz2 >= radlim2) 00514 continue; 00515 00516 ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y * numvoxels[0]); 00517 float dx = xmin*gridspacing - xyzr[ind]; 00518 x=xmin; 00519 00520 #if VMDQSURFUSESSE && defined(__SSE2__) 00521 // Use SSE when we have a multiple-of-4 to compute 00522 // finish all remaining density map points with regular non-SSE loop 00523 if (usesse) { 00524 __align(16) SSEreg n; 00525 __align(16) SSEreg y; 00526 __m128 dy2dz2_4 = _mm_set1_ps(dy2dz2); 00527 __m128 dx_4 = _mm_add_ps(_mm_set1_ps(dx), _mm_load_ps(&sxdelta4[0])); 00528 00529 for (; (x+3)<=xmax; x+=4,dx_4=_mm_add_ps(dx_4, gridspacing4_4)) { 00530 __m128 r2 = _mm_add_ps(_mm_mul_ps(dx_4, dx_4), dy2dz2_4); 00531 __m128 d; 00532 #if VMDUSESVMLEXP 00533 // use Intel's SVML exp2() routine 00534 y.f = _mm_exp2_ps(_mm_mul_ps(r2, arinv_4)); 00535 #else 00536 // use our (much faster) fully inlined exponential approximation 00537 y.f = _mm_mul_ps(r2, arinv_4); /* already negated and in base 2 */ 00538 n.i = _mm_cvttps_epi32(y.f); 00539 d = _mm_cvtepi32_ps(n.i); 00540 d = _mm_sub_ps(d, y.f); 00541 00542 // Approximate 2^{-d}, 0 <= d < 1, by interpolation. 00543 // Perform Horner's method to evaluate interpolating polynomial. 00544 #if 0 00545 // SSE 4.x FMADD instructions are not universally available 00546 y.f = _mm_fmadd_ps(d, _mm_set1_ps(SCEXP4), _mm_set1_ps(SCEXP3)); 00547 y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP2)); 00548 y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP1)); 00549 y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP0)); 00550 #else 00551 y.f = _mm_mul_ps(d, _mm_set_ps1(SCEXP4)); /* for x^4 term */ 00552 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP3)); /* for x^3 term */ 00553 y.f = _mm_mul_ps(y.f, d); 00554 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP2)); /* for x^2 term */ 00555 y.f = _mm_mul_ps(y.f, d); 00556 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP1)); /* for x^1 term */ 00557 y.f = _mm_mul_ps(y.f, d); 00558 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP0)); /* for x^0 term */ 00559 #endif 00560 00561 // Calculate 2^N exactly by directly manipulating floating point exponent, 00562 // then use it to scale y for the final result. 00563 n.i = _mm_sub_epi32(_mm_set1_epi32(EXPOBIAS), n.i); 00564 n.i = _mm_slli_epi32(n.i, EXPOSHIFT); 00565 y.f = _mm_mul_ps(y.f, n.f); 00566 #endif 00567 00568 // At present, we do unaligned loads/stores since we can't guarantee 00569 // that the X-dimension is always a multiple of 4. 00570 float *ufptr = &densitymap[addr + x]; 00571 d = _mm_loadu_ps(ufptr); 00572 y.f = _mm_mul_ps(y.f, atomicnumfactor_4); // MDFF density maps 00573 _mm_storeu_ps(ufptr, _mm_add_ps(d, y.f)); 00574 00575 // Accumulate density-weighted color to texture map. 00576 // Pre-multiply colors by the inverse isovalue we will extract 00577 // the surface on, to cause the final color to be normalized. 00578 d = _mm_mul_ps(y.f, _mm_set_ps1(invisovalue)); 00579 ptrdiff_t caddr = (addr + x) * 3L; 00580 00581 #if 1 00582 float *txptr = &voltexmap[caddr]; 00583 // unaligned load of 4 consecutive rgb3f texture map texels 00584 __m128 r0g0b0r1 = _mm_loadu_ps(txptr+0); 00585 __m128 g1b1r2g2 = _mm_loadu_ps(txptr+4); 00586 __m128 b2r3g3b3 = _mm_loadu_ps(txptr+8); 00587 00588 // convert rgb3f AOS format to 4-element SOA vectors using shuffle instructions 00589 __m128 r2g2r3g3 = _mm_shuffle_ps(g1b1r2g2, b2r3g3b3, _MM_SHUFFLE(2, 1, 3, 2)); 00590 __m128 g0b0g1b1 = _mm_shuffle_ps(r0g0b0r1, g1b1r2g2, _MM_SHUFFLE(1, 0, 2, 1)); 00591 __m128 r = _mm_shuffle_ps(r0g0b0r1, r2g2r3g3, _MM_SHUFFLE(2, 0, 3, 0)); // r0r1r2r3 00592 __m128 g = _mm_shuffle_ps(g0b0g1b1, r2g2r3g3, _MM_SHUFFLE(3, 1, 2, 0)); // g0g1g2g3 00593 __m128 b = _mm_shuffle_ps(g0b0g1b1, b2r3g3b3, _MM_SHUFFLE(3, 0, 3, 1)); // b0g1b2b3 00594 00595 // accumulate density-scaled colors into texels 00596 r = _mm_add_ps(r, _mm_mul_ps(d, _mm_set_ps1(colors[ind ]))); 00597 g = _mm_add_ps(g, _mm_mul_ps(d, _mm_set_ps1(colors[ind + 1]))); 00598 b = _mm_add_ps(b, _mm_mul_ps(d, _mm_set_ps1(colors[ind + 2]))); 00599 00600 // convert 4-element SOA vectors to rgb3f AOS format using shuffle instructions 00601 __m128 r0r2g0g2 = _mm_shuffle_ps(r, g, _MM_SHUFFLE(2, 0, 2, 0)); 00602 __m128 g1g3b1b3 = _mm_shuffle_ps(g, b, _MM_SHUFFLE(3, 1, 3, 1)); 00603 __m128 b0b2r1r3 = _mm_shuffle_ps(b, r, _MM_SHUFFLE(3, 1, 2, 0)); 00604 00605 __m128 rr0g0b0r1 = _mm_shuffle_ps(r0r2g0g2, b0b2r1r3, _MM_SHUFFLE(2, 0, 2, 0)); 00606 __m128 rg1b1r2g2 = _mm_shuffle_ps(g1g3b1b3, r0r2g0g2, _MM_SHUFFLE(3, 1, 2, 0)); 00607 __m128 rb2r3g3b3 = _mm_shuffle_ps(b0b2r1r3, g1g3b1b3, _MM_SHUFFLE(3, 1, 3, 1)); 00608 00609 // unaligned store of 4 consecutive rgb3f texture map texels 00610 _mm_storeu_ps(txptr+0, rr0g0b0r1); 00611 _mm_storeu_ps(txptr+4, rg1b1r2g2); 00612 _mm_storeu_ps(txptr+8, rb2r3g3b3); 00613 00614 #else 00615 00616 // color by atom colors 00617 float r, g, b; 00618 r = colors[ind ]; 00619 g = colors[ind + 1]; 00620 b = colors[ind + 2]; 00621 00622 SSEreg tmp; 00623 tmp.f = d; 00624 float density; 00625 density = tmp.floatreg.r0; 00626 voltexmap[caddr ] += density * r; 00627 voltexmap[caddr + 1] += density * g; 00628 voltexmap[caddr + 2] += density * b; 00629 00630 density = tmp.floatreg.r1; 00631 voltexmap[caddr + 3] += density * r; 00632 voltexmap[caddr + 4] += density * g; 00633 voltexmap[caddr + 5] += density * b; 00634 00635 density = tmp.floatreg.r2; 00636 voltexmap[caddr + 6] += density * r; 00637 voltexmap[caddr + 7] += density * g; 00638 voltexmap[caddr + 8] += density * b; 00639 00640 density = tmp.floatreg.r3; 00641 voltexmap[caddr + 9] += density * r; 00642 voltexmap[caddr + 10] += density * g; 00643 voltexmap[caddr + 11] += density * b; 00644 #endif 00645 } 00646 } 00647 #endif 00648 00649 // finish all remaining density map points with regular non-SSE loop 00650 for (; x<=xmax; x++,dx+=gridspacing) { 00651 float r2 = dx*dx + dy2dz2; 00652 00653 // use our (much faster) fully inlined exponential approximation 00654 float mb = r2 * arinv; /* already negated and in base 2 */ 00655 int mbflr = (int) mb; /* get int part, floor() */ 00656 float d = mbflr - mb; /* remaining exponent, -1 < d <= 0 */ 00657 00658 /* approx with linear blend of Taylor polys */ 00659 float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4))); 00660 00661 /* 2^(-mbflr) */ 00662 flint scalfac; 00663 scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT; 00664 00665 // XXX assume we are never beyond the cutoff value in this loop 00666 float density = (sy * scalfac.f); 00667 00668 density *= atomicnumfactor; // MDFF Cryo-EM atomic number density 00669 00670 // accumulate density value to density map 00671 densitymap[addr + x] += density; 00672 00673 // Accumulate density-weighted color to texture map. 00674 // Pre-multiply colors by the inverse isovalue we will extract 00675 // the surface on, to cause the final color to be normalized. 00676 density *= invisovalue; 00677 ptrdiff_t caddr = (addr + x) * 3L; 00678 00679 // color by atom colors 00680 voltexmap[caddr ] += density * colors[ind ]; 00681 voltexmap[caddr + 1] += density * colors[ind + 1]; 00682 voltexmap[caddr + 2] += density * colors[ind + 2]; 00683 } 00684 } 00685 } 00686 } 00687 } else { 00688 // compute density map only 00689 for (i=0; i<natoms; i++) { 00690 if (verbose && ((i & 0x3fff) == 0)) { 00691 printf("."); 00692 fflush(stdout); 00693 } 00694 00695 ptrdiff_t ind = i*4L; 00696 float scaledrad = xyzr[ind+3] * radscale; 00697 00698 // MDFF atomic number weighted density factor 00699 float atomicnumfactor = 1.0f; 00700 if (atomicnum != NULL) { 00701 atomicnumfactor = atomicnum[i]; 00702 } 00703 00704 // negate, precompute reciprocal, and change to base 2 from the outset 00705 float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF; 00706 float radlim = gausslim * scaledrad; 00707 float radlim2 = radlim * radlim; // cutoff test done in cartesian coords 00708 radlim *= invgridspacing; 00709 00710 #if VMDQSURFUSESSE && defined(__SSE2__) 00711 __m128 atomicnumfactor_4 = { 0 }; 00712 __m128 arinv_4; 00713 if (usesse) { 00714 atomicnumfactor_4 = _mm_set1_ps(atomicnumfactor); 00715 #if VMDUSESVMLEXP 00716 // Use of Intel's SVML requires changing the pre-scaling factor 00717 arinv_4 = _mm_set1_ps(arinv * (2.718281828f/2.0f) / MLOG2EF); 00718 #else 00719 // Use our fully inlined exp approximation 00720 arinv_4 = _mm_set1_ps(arinv); 00721 #endif 00722 } 00723 #endif 00724 00725 #if VMDQSURFUSEVSX && defined(__VEC__) 00726 vector float atomicnumfactor_4; 00727 vector float arinv_4; 00728 if (usevsx) { 00729 atomicnumfactor_4 = vec_splats(atomicnumfactor); 00730 00731 // Use our fully inlined exp approximation 00732 arinv_4 = vec_splats(arinv); 00733 } 00734 #endif 00735 00736 float tmp; 00737 tmp = xyzr[ind ] * invgridspacing; 00738 int xmin = MAX((int) (tmp - radlim), 0); 00739 int xmax = MIN((int) (tmp + radlim), maxvoxel[0]); 00740 tmp = xyzr[ind+1] * invgridspacing; 00741 int ymin = MAX((int) (tmp - radlim), 0); 00742 int ymax = MIN((int) (tmp + radlim), maxvoxel[1]); 00743 tmp = xyzr[ind+2] * invgridspacing; 00744 int zmin = MAX((int) (tmp - radlim), 0); 00745 int zmax = MIN((int) (tmp + radlim), maxvoxel[2]); 00746 00747 float dz = zmin*gridspacing - xyzr[ind+2]; 00748 for (z=zmin; z<=zmax; z++,dz+=gridspacing) { 00749 float dy = ymin*gridspacing - xyzr[ind+1]; 00750 for (y=ymin; y<=ymax; y++,dy+=gridspacing) { 00751 float dy2dz2 = dy*dy + dz*dz; 00752 00753 // early-exit when outside the cutoff radius in the Y-Z plane 00754 if (dy2dz2 >= radlim2) 00755 continue; 00756 00757 ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y * numvoxels[0]); 00758 float dx = xmin*gridspacing - xyzr[ind]; 00759 x=xmin; 00760 00761 #if VMDQSURFUSESSE && defined(__SSE2__) 00762 // Use SSE when we have a multiple-of-4 to compute 00763 // finish all remaining density map points with regular non-SSE loop 00764 if (usesse) { 00765 __align(16) SSEreg n; 00766 __align(16) SSEreg y; 00767 __m128 dy2dz2_4 = _mm_set1_ps(dy2dz2); 00768 __m128 dx_4 = _mm_add_ps(_mm_set1_ps(dx), _mm_load_ps(&sxdelta4[0])); 00769 00770 for (; (x+3)<=xmax; x+=4,dx_4=_mm_add_ps(dx_4, gridspacing4_4)) { 00771 __m128 r2 = _mm_add_ps(_mm_mul_ps(dx_4, dx_4), dy2dz2_4); 00772 __m128 d; 00773 #if VMDUSESVMLEXP 00774 // use Intel's SVML exp2() routine 00775 y.f = _mm_exp2_ps(_mm_mul_ps(r2, arinv_4)); 00776 #else 00777 // use our (much faster) fully inlined exponential approximation 00778 y.f = _mm_mul_ps(r2, arinv_4); /* already negated and in base 2 */ 00779 n.i = _mm_cvttps_epi32(y.f); 00780 d = _mm_cvtepi32_ps(n.i); 00781 d = _mm_sub_ps(d, y.f); 00782 00783 // Approximate 2^{-d}, 0 <= d < 1, by interpolation. 00784 // Perform Horner's method to evaluate interpolating polynomial. 00785 #if 0 00786 // SSE 4.x FMADD instructions are not universally available 00787 y.f = _mm_fmadd_ps(d, _mm_set1_ps(SCEXP4), _mm_set1_ps(SCEXP3)); 00788 y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP2)); 00789 y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP1)); 00790 y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP0)); 00791 #else 00792 y.f = _mm_mul_ps(d, _mm_set_ps1(SCEXP4)); /* for x^4 term */ 00793 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP3)); /* for x^3 term */ 00794 y.f = _mm_mul_ps(y.f, d); 00795 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP2)); /* for x^2 term */ 00796 y.f = _mm_mul_ps(y.f, d); 00797 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP1)); /* for x^1 term */ 00798 y.f = _mm_mul_ps(y.f, d); 00799 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP0)); /* for x^0 term */ 00800 #endif 00801 00802 // Calculate 2^N exactly by directly manipulating floating point exponent, 00803 // then use it to scale y for the final result. 00804 n.i = _mm_sub_epi32(_mm_set1_epi32(EXPOBIAS), n.i); 00805 n.i = _mm_slli_epi32(n.i, EXPOSHIFT); 00806 y.f = _mm_mul_ps(y.f, n.f); 00807 y.f = _mm_mul_ps(y.f, atomicnumfactor_4); // MDFF density maps 00808 #endif 00809 00810 // At present, we do unaligned loads/stores since we can't guarantee 00811 // that the X-dimension is always a multiple of 4. 00812 float *ufptr = &densitymap[addr + x]; 00813 d = _mm_loadu_ps(ufptr); 00814 _mm_storeu_ps(ufptr, _mm_add_ps(d, y.f)); 00815 } 00816 } 00817 #endif 00818 00819 00820 #if VMDQSURFUSEVSX && defined(__VEC__) 00821 // Use VSX when we have a multiple-of-4 to compute 00822 // finish all remaining density map points with regular non-VSX loop 00823 // 00824 // XXX it may be useful to compare the speed/accuracy of the 00825 // polynomial approximation vs. the hardware-provided 00826 // exp2f() approximation: vec_expte() 00827 // 00828 if (usevsx) { 00829 vector float dy2dz2_4 = vec_splats(dy2dz2); 00830 vector float tmpvsxdelta4 = *((__vector float *) &sxdelta4[0]); 00831 vector float dx_4 = vec_add(vec_splats(dx), tmpvsxdelta4); 00832 00833 for (; (x+3)<=xmax; x+=4,dx_4=vec_add(dx_4, gridspacing4_4)) { 00834 vector float r2 = vec_add(vec_mul(dx_4, dx_4), dy2dz2_4); 00835 00836 // use our (much faster) fully inlined exponential approximation 00837 vector float mb = vec_mul(r2, arinv_4); /* already negated and in base 2 */ 00838 vector float mbflr = vec_floor(mb); 00839 vector float d = vec_sub(mbflr, mb); 00840 vector float y; 00841 00842 // Approximate 2^{-d}, 0 <= d < 1, by interpolation. 00843 // Perform Horner's method to evaluate interpolating polynomial. 00844 y = vec_madd(d, vec_splats(SCEXP4), vec_splats(SCEXP3)); // x^4 00845 y = vec_madd(y, d, vec_splats(SCEXP2)); // x^2 00846 y = vec_madd(y, d, vec_splats(SCEXP1)); // x^1 00847 y = vec_madd(y, d, vec_splats(SCEXP0)); // x^0 00848 00849 // Calculate 2^N exactly via vec_expte() 00850 // then use it to scale y for the final result. 00851 y = vec_mul(y, vec_expte(-mbflr)); 00852 y = vec_mul(y, atomicnumfactor_4); // MDFF density maps 00853 00854 // At present, we do unaligned loads/stores since we can't 00855 // guarantee that the X-dimension is always a multiple of 4. 00856 float *ufptr = &densitymap[addr + x]; 00857 d = *((__vector float *) &ufptr[0]); 00858 // XXX there must be a cleaner way to implement this 00859 // d = _mm_loadu_ps(ufptr); 00860 // _mm_storeu_ps(ufptr, _mm_add_ps(d, y.f)); 00861 d = vec_add(d, y); 00862 00863 ufptr[0] = d[0]; 00864 ufptr[1] = d[1]; 00865 ufptr[2] = d[2]; 00866 ufptr[3] = d[3]; 00867 } 00868 } 00869 #endif 00870 00871 // finish all remaining density map points with regular non-SSE loop 00872 for (; x<=xmax; x++,dx+=gridspacing) { 00873 float r2 = dx*dx + dy2dz2; 00874 00875 // use our (much faster) fully inlined exponential approximation 00876 float mb = r2 * arinv; /* already negated and in base 2 */ 00877 int mbflr = (int) mb; /* get int part, floor() */ 00878 float d = mbflr - mb; /* remaining exponent, -1 < d <= 0 */ 00879 00880 /* approx with linear blend of Taylor polys */ 00881 float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4))); 00882 00883 /* 2^(-mbflr) */ 00884 flint scalfac; 00885 scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT; 00886 00887 // XXX assume we are never beyond the cutoff value in this loop 00888 float density = (sy * scalfac.f); 00889 00890 density *= atomicnumfactor; // MDFF Cryo-EM atomic number density 00891 00892 densitymap[addr + x] += density; 00893 } 00894 } 00895 } 00896 } 00897 } 00898 } 00899 00900 00901 typedef struct { 00902 wkf_cpu_caps_t *cpucaps; 00903 int verbose; 00904 int natoms; 00905 float radscale; 00906 float gridspacing; 00907 float isovalue; 00908 float gausslim; 00909 const int *numvoxels; 00910 const float *xyzr; 00911 const float *atomicnum; 00912 const float *colors; 00913 float **thrdensitymaps; 00914 float **thrvoltexmaps; 00915 } densitythrparms; 00916 00917 00918 static void * densitythread(void *voidparms) { 00919 wkf_tasktile_t tile; 00920 densitythrparms *parms = NULL; 00921 int threadid; 00922 00923 wkf_threadlaunch_getid(voidparms, &threadid, NULL); 00924 wkf_threadlaunch_getdata(voidparms, (void **) &parms); 00925 00926 while (wkf_threadlaunch_next_tile(voidparms, 16384, &tile) != WKF_SCHED_DONE) { 00927 int natoms = tile.end-tile.start; 00928 const float *atomicnum = (parms->atomicnum == NULL) ? NULL : &parms->atomicnum[tile.start]; 00929 vmd_gaussdensity_opt(parms->cpucaps, 00930 parms->verbose, natoms, 00931 &parms->xyzr[4L*tile.start], 00932 atomicnum, 00933 (parms->thrvoltexmaps[0]!=NULL) ? &parms->colors[4L*tile.start] : NULL, 00934 parms->thrdensitymaps[threadid], 00935 parms->thrvoltexmaps[threadid], 00936 parms->numvoxels, 00937 parms->radscale, 00938 parms->gridspacing, 00939 parms->isovalue, 00940 parms->gausslim); 00941 } 00942 00943 return NULL; 00944 } 00945 00946 00947 static void * reductionthread(void *voidparms) { 00948 wkf_tasktile_t tile; 00949 densitythrparms *parms = NULL; 00950 int threadid, numthreads; 00951 00952 wkf_threadlaunch_getid(voidparms, &threadid, &numthreads); 00953 wkf_threadlaunch_getdata(voidparms, (void **) &parms); 00954 00955 while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) { 00956 // do a reduction over each of the individual density grids 00957 ptrdiff_t planesz = ptrdiff_t(parms->numvoxels[0]) * ptrdiff_t(parms->numvoxels[1]); 00958 ptrdiff_t start = ptrdiff_t(tile.start) * planesz; 00959 ptrdiff_t end = ptrdiff_t(tile.end) * planesz; 00960 00961 ptrdiff_t i, x; 00962 for (x=start; x<end; x++) { 00963 float tmp = 0.0f; 00964 for (i=1; i<numthreads; i++) { 00965 tmp += parms->thrdensitymaps[i][x]; 00966 } 00967 parms->thrdensitymaps[0][x] += tmp; 00968 } 00969 00970 // do a reduction over each of the individual texture grids 00971 if (parms->thrvoltexmaps[0] != NULL) { 00972 for (x=start*3L; x<end*3L; x++) { 00973 float tmp = 0.0f; 00974 for (i=1; i<numthreads; i++) { 00975 tmp += parms->thrvoltexmaps[i][x]; 00976 } 00977 parms->thrvoltexmaps[0][x] += tmp; 00978 } 00979 } 00980 } 00981 00982 return NULL; 00983 } 00984 00985 00986 static int vmd_gaussdensity_threaded(wkf_cpu_caps_t *cpucaps, int verbose, 00987 int natoms, const float *xyzr, 00988 const float *atomicnum, 00989 const float *colors, 00990 float *densitymap, float *voltexmap, 00991 const int *numvoxels, 00992 float radscale, float gridspacing, 00993 float isovalue, float gausslim) { 00994 densitythrparms parms; 00995 memset(&parms, 0, sizeof(parms)); 00996 00997 parms.cpucaps = cpucaps; 00998 parms.verbose = verbose; 00999 parms.natoms = natoms; 01000 parms.radscale = radscale; 01001 parms.gridspacing = gridspacing; 01002 parms.isovalue = isovalue; 01003 parms.gausslim = gausslim; 01004 parms.numvoxels = numvoxels; 01005 parms.xyzr = xyzr; 01006 parms.atomicnum = atomicnum; 01007 parms.colors = colors; 01008 01009 int physprocs = wkf_thread_numprocessors(); 01010 int maxprocs = physprocs; 01011 01012 // We can productively use only a few cores per socket due to the 01013 // limited memory bandwidth per socket. Also, hyperthreading 01014 // actually hurts performance. These two considerations combined 01015 // with the linear increase in memory use prevent us from using large 01016 // numbers of cores with this simple approach, so if we've got more 01017 // than 8 CPU cores, we'll iteratively cutting the core count in 01018 // half until we're under 8 cores. 01019 while (maxprocs > 8) 01020 maxprocs /= 2; 01021 01022 // Limit the number of CPU cores used so we don't run the 01023 // machine out of memory during surface computation. 01024 // Use either a dynamic or hard-coded heuristic to limit the 01025 // number of CPU threads we will spawn so that we don't run 01026 // the machine out of memory. 01027 ptrdiff_t volsz = ptrdiff_t(numvoxels[0]) * 01028 ptrdiff_t(numvoxels[1]) * ptrdiff_t(numvoxels[2]); 01029 ptrdiff_t volmemsz = sizeof(float) * volsz; 01030 ptrdiff_t volmemszkb = volmemsz / 1024; 01031 ptrdiff_t volmemtexszkb = volmemszkb + ((voltexmap != NULL) ? 3L*volmemszkb : 0); 01032 01033 // Platforms that don't have a means of determining available 01034 // physical memory will return -1, in which case we fall back to the 01035 // simple hard-coded 2GB-max-per-core heuristic. 01036 ptrdiff_t vmdcorefree = -1; 01037 01038 #if defined(ARCH_BLUEWATERS) || defined(ARCH_CRAY_XC) || defined(ARCH_CRAY_XK) || defined(ARCH_LINUXAMD64) || defined(ARCH_SOLARIS2_64) || defined(ARCH_SOLARISX86_64) || defined(ARCH_AIX6_64) || defined(MACOSXARM64) || defined(ARCH_MACOSXX86_64) 01039 // XXX The core-free query scheme has one weakness in that we might have a 01040 // 32-bit version of VMD running on a 64-bit machine, where the available 01041 // physical memory may be much larger than is possible for a 01042 // 32-bit VMD process to address. To do this properly we must therefore 01043 // use conditional compilation safety checks here until we have a better 01044 // way of determining this with a standardized helper routine. 01045 vmdcorefree = vmd_get_avail_physmem_mb(); 01046 #endif 01047 01048 if (vmdcorefree >= 0) { 01049 // Make sure QuickSurf uses no more than a fraction of the free memory 01050 // as an upper bound alternative to the hard-coded heuristic. 01051 // This should be highly preferable to the fixed-size heuristic 01052 // we had used in all cases previously. 01053 while ((volmemtexszkb * maxprocs) > (1024L*vmdcorefree/4)) { 01054 maxprocs /= 2; 01055 } 01056 } else { 01057 // Set a practical per-core maximum memory use limit to 2GB, for all cores 01058 while ((volmemtexszkb * maxprocs) > (2L * 1024L * 1024L)) 01059 maxprocs /= 2; 01060 } 01061 01062 if (maxprocs < 1) 01063 maxprocs = 1; 01064 01065 // Loop over number of physical processors and try to create 01066 // per-thread volumetric maps for each of them. 01067 parms.thrdensitymaps = (float **) calloc(1,maxprocs * sizeof(float *)); 01068 parms.thrvoltexmaps = (float **) calloc(1, maxprocs * sizeof(float *)); 01069 01070 // first thread is already ready to go 01071 parms.thrdensitymaps[0] = densitymap; 01072 parms.thrvoltexmaps[0] = voltexmap; 01073 01074 int i; 01075 int numprocs = maxprocs; // ever the optimist 01076 for (i=1; i<maxprocs; i++) { 01077 parms.thrdensitymaps[i] = (float *) calloc(1, volmemsz); 01078 if (parms.thrdensitymaps[i] == NULL) { 01079 numprocs = i; 01080 break; 01081 } 01082 if (voltexmap != NULL) { 01083 parms.thrvoltexmaps[i] = (float *) calloc(1, 3L * volmemsz); 01084 if (parms.thrvoltexmaps[i] == NULL) { 01085 free(parms.thrdensitymaps[i]); 01086 parms.thrdensitymaps[i] = NULL; 01087 numprocs = i; 01088 break; 01089 } 01090 } 01091 } 01092 01093 // launch independent thread calculations 01094 wkf_tasktile_t tile; 01095 tile.start = 0; 01096 tile.end = natoms; 01097 wkf_threadlaunch(numprocs, &parms, densitythread, &tile); 01098 01099 // do a parallel reduction of the resulting density maps 01100 tile.start = 0; 01101 tile.end = numvoxels[2]; 01102 wkf_threadlaunch(numprocs, &parms, reductionthread, &tile); 01103 01104 // free work area 01105 for (i=1; i<maxprocs; i++) { 01106 if (parms.thrdensitymaps[i] != NULL) 01107 free(parms.thrdensitymaps[i]); 01108 01109 if (parms.thrvoltexmaps[i] != NULL) 01110 free(parms.thrvoltexmaps[i]); 01111 } 01112 free(parms.thrdensitymaps); 01113 free(parms.thrvoltexmaps); 01114 01115 return 0; 01116 } 01117 01118 QuickSurf::QuickSurf(int forcecpuonly) { 01119 volmap = NULL; 01120 voltexmap = NULL; 01121 s.clear(); 01122 isovalue = 0.5f; 01123 01124 numvoxels[0] = 128; 01125 numvoxels[1] = 128; 01126 numvoxels[2] = 128; 01127 01128 origin[0] = 0.0f; 01129 origin[1] = 0.0f; 01130 origin[2] = 0.0f; 01131 01132 xaxis[0] = 1.0f; 01133 xaxis[1] = 0.0f; 01134 xaxis[2] = 0.0f; 01135 01136 yaxis[0] = 0.0f; 01137 yaxis[1] = 1.0f; 01138 yaxis[2] = 0.0f; 01139 01140 zaxis[0] = 0.0f; 01141 zaxis[1] = 0.0f; 01142 zaxis[2] = 1.0f; 01143 01144 cudaqs = NULL; 01145 force_cpuonly = forcecpuonly; 01146 #if defined(VMDCUDA) 01147 if (!force_cpuonly && !getenv("VMDNOCUDA")) { 01148 cudaqs = new CUDAQuickSurf(); 01149 } 01150 #endif 01151 01152 timer = wkf_timer_create(); 01153 } 01154 01155 01156 void QuickSurf::free_gpu_memory(void) { 01157 if (cudaqs) { 01158 #if defined(VMDCUDA) 01159 delete cudaqs; 01160 #endif 01161 cudaqs = NULL; 01162 } 01163 } 01164 01165 01166 int QuickSurf::calc_surf(AtomSel *atomSel, DrawMolecule *mol, 01167 const float *atompos, const float *atomradii, 01168 int quality, float radscale, float gridspacing, 01169 float isoval, const int *colidx, const float *cmap, 01170 VMDDisplayList *cmdList) { 01171 PROFILE_PUSH_RANGE("QuickSurf", 3); 01172 01173 wkf_timer_start(timer); 01174 int colorperatom = (colidx != NULL && cmap != NULL); 01175 int usebeads=0; 01176 01177 int verbose = (getenv("VMDQUICKSURFVERBOSE") != NULL); 01178 01179 // Disable MDFF atomic number weighted densities until we implement 01180 // GUI controls for this if it turns out to be useful for more than 01181 // than just analytical usage. 01182 const float *atomicnum = NULL; 01183 01184 // clean up any existing CPU arrays before going any further... 01185 if (voltexmap != NULL) 01186 free(voltexmap); 01187 voltexmap = NULL; 01188 01189 ResizeArray<float> beadpos(64 + (3L * atomSel->selected) / 20); 01190 ResizeArray<float> beadradii(64 + (3L * atomSel->selected) / 20); 01191 ResizeArray<float> beadcolors(64 + (3L * atomSel->selected) / 20); 01192 01193 if (getenv("VMDQUICKSURFBEADS")) { 01194 usebeads=1; 01195 if (verbose) 01196 printf("QuickSurf using residue beads representation...\n"); 01197 } 01198 01199 int numbeads = 0; 01200 if (usebeads) { 01201 int i, resid, numres; 01202 01203 // draw a bead for each residue 01204 numres = mol->residueList.num(); 01205 for (resid=0; resid<numres; resid++) { 01206 float com[3] = {0.0, 0.0, 0.0}; 01207 const ResizeArray<int> &atoms = mol->residueList[resid]->atoms; 01208 int numatoms = atoms.num(); 01209 int oncount = 0; 01210 01211 // find COM for residue 01212 for (i=0; i<numatoms; i++) { 01213 int idx = atoms[i]; 01214 if (atomSel->on[idx]) { 01215 oncount++; 01216 vec_add(com, com, atompos + 3L*idx); 01217 } 01218 } 01219 01220 if (oncount < 1) 01221 continue; // exit if there weren't any atoms 01222 01223 vec_scale(com, 1.0f / (float) oncount, com); 01224 01225 // find radius of bounding sphere and save last atom index for color 01226 int atomcolorindex=0; // initialize, to please compilers 01227 float boundradsq = 0.0f; 01228 for (i=0; i<numatoms; i++) { 01229 int idx = atoms[i]; 01230 if (atomSel->on[idx]) { 01231 float tmpdist[3]; 01232 atomcolorindex = idx; 01233 vec_sub(tmpdist, com, atompos + 3L*idx); 01234 float distsq = dot_prod(tmpdist, tmpdist); 01235 if (distsq > boundradsq) { 01236 boundradsq = distsq; 01237 } 01238 } 01239 } 01240 beadpos.append3(&com[0]); 01241 beadradii.append(sqrtf(boundradsq) + 1.0f); 01242 01243 if (colorperatom) { 01244 const float *cp = &cmap[colidx[atomcolorindex] * 3L]; 01245 beadcolors.append3(&cp[0]); 01246 } 01247 01248 // XXX still need to add pick points... 01249 } 01250 01251 numbeads = beadpos.num() / 3; 01252 } 01253 01254 // initialize class variables 01255 isovalue=isoval; 01256 01257 // If no volumetric texture will be computed we will use the cmap 01258 // parameter to pass in the solid color to be applied to all vertices 01259 // Since QS can now also be called by MDFF, we have to check whether 01260 // display related parms are set or not before using them. 01261 if (cmap != NULL) 01262 vec_copy(solidcolor, cmap); 01263 01264 // compute min/max atom radius, build list of selected atom radii, 01265 // and compute bounding box for the selected atoms 01266 float minx, miny, minz, maxx, maxy, maxz; 01267 float minrad, maxrad; 01268 int i; 01269 if (usebeads) { 01270 minx = maxx = beadpos[0]; 01271 miny = maxy = beadpos[1]; 01272 minz = maxz = beadpos[2]; 01273 minrad = maxrad = beadradii[0]; 01274 for (i=0; i<numbeads; i++) { 01275 ptrdiff_t ind = i * 3L; 01276 float tmpx = beadpos[ind ]; 01277 float tmpy = beadpos[ind+1]; 01278 float tmpz = beadpos[ind+2]; 01279 01280 minx = (tmpx < minx) ? tmpx : minx; 01281 maxx = (tmpx > maxx) ? tmpx : maxx; 01282 01283 miny = (tmpy < miny) ? tmpy : miny; 01284 maxy = (tmpy > maxy) ? tmpy : maxy; 01285 01286 minz = (tmpz < minz) ? tmpz : minz; 01287 maxz = (tmpz > maxz) ? tmpz : maxz; 01288 01289 // we always have to compute the rmin/rmax for beads 01290 // since these radii are defined on-the-fly 01291 float r = beadradii[i]; 01292 minrad = (r < minrad) ? r : minrad; 01293 maxrad = (r > maxrad) ? r : maxrad; 01294 } 01295 } else { 01296 minx = maxx = atompos[atomSel->firstsel*3L ]; 01297 miny = maxy = atompos[atomSel->firstsel*3L+1]; 01298 minz = maxz = atompos[atomSel->firstsel*3L+2]; 01299 01300 // Query min/max atom radii for the entire molecule 01301 mol->get_radii_minmax(minrad, maxrad); 01302 01303 // We only compute rmin/rmax for the actual group of selected atoms if 01304 // (rmax/rmin > 2.5) for the whole molecule, otherwise it's a small 01305 // enough range that we don't care since it won't hurt our performance. 01306 if (minrad <= 0.001 || maxrad/minrad > 2.5) { 01307 minrad = maxrad = atomradii[atomSel->firstsel]; 01308 for (i=atomSel->firstsel; i<=atomSel->lastsel; i++) { 01309 if (atomSel->on[i]) { 01310 ptrdiff_t ind = i * 3L; 01311 float tmpx = atompos[ind ]; 01312 float tmpy = atompos[ind+1]; 01313 float tmpz = atompos[ind+2]; 01314 01315 minx = (tmpx < minx) ? tmpx : minx; 01316 maxx = (tmpx > maxx) ? tmpx : maxx; 01317 01318 miny = (tmpy < miny) ? tmpy : miny; 01319 maxy = (tmpy > maxy) ? tmpy : maxy; 01320 01321 minz = (tmpz < minz) ? tmpz : minz; 01322 maxz = (tmpz > maxz) ? tmpz : maxz; 01323 01324 float r = atomradii[i]; 01325 minrad = (r < minrad) ? r : minrad; 01326 maxrad = (r > maxrad) ? r : maxrad; 01327 } 01328 } 01329 } else { 01330 #if 1 01331 float fmin[3], fmax[3]; 01332 minmax_selected_3fv_aligned(atompos, atomSel->on, atomSel->num_atoms, 01333 atomSel->firstsel, atomSel->lastsel, 01334 fmin, fmax); 01335 minx = fmin[0]; 01336 miny = fmin[1]; 01337 minz = fmin[2]; 01338 01339 maxx = fmax[0]; 01340 maxy = fmax[1]; 01341 maxz = fmax[2]; 01342 #else 01343 for (i=atomSel->firstsel; i<=atomSel->lastsel; i++) { 01344 if (atomSel->on[i]) { 01345 ptrdiff_t ind = i * 3L; 01346 float tmpx = atompos[ind ]; 01347 float tmpy = atompos[ind+1]; 01348 float tmpz = atompos[ind+2]; 01349 01350 minx = (tmpx < minx) ? tmpx : minx; 01351 maxx = (tmpx > maxx) ? tmpx : maxx; 01352 01353 miny = (tmpy < miny) ? tmpy : miny; 01354 maxy = (tmpy > maxy) ? tmpy : maxy; 01355 01356 minz = (tmpz < minz) ? tmpz : minz; 01357 maxz = (tmpz > maxz) ? tmpz : maxz; 01358 } 01359 } 01360 #endif 01361 } 01362 } 01363 01364 float mincoord[3], maxcoord[3]; 01365 mincoord[0] = minx; 01366 mincoord[1] = miny; 01367 mincoord[2] = minz; 01368 maxcoord[0] = maxx; 01369 maxcoord[1] = maxy; 01370 maxcoord[2] = maxz; 01371 01372 // crude estimate of the grid padding we require to prevent the 01373 // resulting isosurface from being clipped 01374 float gridpadding = radscale * maxrad * 1.70f; 01375 float padrad = gridpadding; 01376 padrad = 0.65f * sqrtf(4.0f/3.0f*((float) VMD_PI)*padrad*padrad*padrad); 01377 gridpadding = MAX(gridpadding, padrad); 01378 01379 // Handle coarse-grained structures and whole-cell models 01380 // XXX The switch at 4.0A from an assumed all-atom scale structure to 01381 // CG or cell models is a simple heuristic at a somewhat arbitrary 01382 // threshold value. 01383 // For all-atom models the units shown in the GUI are in Angstroms 01384 // and are absolute, but for CG or cell models the units in the GUI 01385 // are relative to the atom with the minimum radius. 01386 // This code doesn't do anything to handle structures with a minrad 01387 // of zero, where perhaps only one particle has an unset radius. 01388 if (minrad > 4.0f) { 01389 gridspacing *= minrad; 01390 } 01391 01392 if (verbose) { 01393 printf("QuickSurf: R*%.1f, I=%.1f, H=%.1f Pad: %.1f minR: %.1f maxR: %.1f)\n", 01394 radscale, isovalue, gridspacing, gridpadding, minrad, maxrad); 01395 } 01396 01397 mincoord[0] -= gridpadding; 01398 mincoord[1] -= gridpadding; 01399 mincoord[2] -= gridpadding; 01400 maxcoord[0] += gridpadding; 01401 maxcoord[1] += gridpadding; 01402 maxcoord[2] += gridpadding; 01403 01404 // compute the real grid dimensions from the selected atoms 01405 numvoxels[0] = (int) ceil((maxcoord[0]-mincoord[0]) / gridspacing); 01406 numvoxels[1] = (int) ceil((maxcoord[1]-mincoord[1]) / gridspacing); 01407 numvoxels[2] = (int) ceil((maxcoord[2]-mincoord[2]) / gridspacing); 01408 01409 // recalc the grid dimensions from rounded/padded voxel counts 01410 xaxis[0] = (numvoxels[0]-1) * gridspacing; 01411 yaxis[1] = (numvoxels[1]-1) * gridspacing; 01412 zaxis[2] = (numvoxels[2]-1) * gridspacing; 01413 maxcoord[0] = mincoord[0] + xaxis[0]; 01414 maxcoord[1] = mincoord[1] + yaxis[1]; 01415 maxcoord[2] = mincoord[2] + zaxis[2]; 01416 01417 int boundserr=0; 01418 for (i=0; i<3; i++) { 01419 if (isnan(mincoord[i]) || isnan(maxcoord[i]) || (numvoxels[i] < 1)) 01420 boundserr = 1; 01421 } 01422 01423 if (boundserr) 01424 msgErr << "QuickSurf) NaN or illegal bounding box, grid size" << sendmsg; 01425 01426 if (verbose || boundserr) { 01427 printf(" GridSZ: (%4d %4d %4d) BBox: (%.1f %.1f %.1f)->(%.1f %.1f %.1f)\n", 01428 numvoxels[0], numvoxels[1], numvoxels[2], 01429 mincoord[0], mincoord[1], mincoord[2], 01430 maxcoord[0], maxcoord[1], maxcoord[2]); 01431 } 01432 01433 if (boundserr) 01434 return -1; 01435 01436 vec_copy(origin, mincoord); 01437 01438 // build compacted lists of bead coordinates, radii, and colors 01439 float *xyzr = NULL; 01440 float *colors = NULL; 01441 if (usebeads) { 01442 int ind =0; 01443 int ind4=0; 01444 xyzr = (float *) malloc(numbeads * sizeof(float) * 4L); 01445 if (colorperatom) { 01446 colors = (float *) malloc(numbeads * sizeof(float) * 4L); 01447 01448 // build compacted lists of bead coordinates, radii, and colors 01449 for (i=0; i<numbeads; i++) { 01450 const float *fp = &beadpos[0] + ind; 01451 xyzr[ind4 ] = fp[0]-origin[0]; 01452 xyzr[ind4 + 1] = fp[1]-origin[1]; 01453 xyzr[ind4 + 2] = fp[2]-origin[2]; 01454 xyzr[ind4 + 3] = beadradii[i]; 01455 01456 const float *cp = &beadcolors[0] + ind; 01457 colors[ind4 ] = cp[0]; 01458 colors[ind4 + 1] = cp[1]; 01459 colors[ind4 + 2] = cp[2]; 01460 colors[ind4 + 3] = 1.0f; 01461 ind4 += 4; 01462 ind += 3; 01463 } 01464 } else { 01465 // build compacted lists of bead coordinates and radii only 01466 for (i=0; i<numbeads; i++) { 01467 const float *fp = &beadpos[0] + ind; 01468 xyzr[ind4 ] = fp[0]-origin[0]; 01469 xyzr[ind4 + 1] = fp[1]-origin[1]; 01470 xyzr[ind4 + 2] = fp[2]-origin[2]; 01471 xyzr[ind4 + 3] = beadradii[i]; 01472 ind4 += 4; 01473 ind += 3; 01474 } 01475 } 01476 } else { 01477 ptrdiff_t ind = atomSel->firstsel * 3L; 01478 ptrdiff_t ind4=0; 01479 xyzr = (float *) malloc(atomSel->selected * sizeof(float) * 4L); 01480 if (colorperatom) { 01481 colors = (float *) malloc(atomSel->selected * sizeof(float) * 4L); 01482 01483 // build compacted lists of atom coordinates, radii, and colors 01484 for (i=atomSel->firstsel; i <= atomSel->lastsel; i++) { 01485 if (atomSel->on[i]) { 01486 const float *fp = atompos + ind; 01487 xyzr[ind4 ] = fp[0]-origin[0]; 01488 xyzr[ind4 + 1] = fp[1]-origin[1]; 01489 xyzr[ind4 + 2] = fp[2]-origin[2]; 01490 xyzr[ind4 + 3] = atomradii[i]; 01491 01492 const float *cp = &cmap[colidx[i] * 3L]; 01493 colors[ind4 ] = cp[0]; 01494 colors[ind4 + 1] = cp[1]; 01495 colors[ind4 + 2] = cp[2]; 01496 colors[ind4 + 3] = 1.0f; 01497 ind4 += 4; 01498 } 01499 ind += 3; 01500 } 01501 } else { 01502 // build compacted lists of atom coordinates and radii only 01503 for (i=atomSel->firstsel; i <= atomSel->lastsel; i++) { 01504 if (atomSel->on[i]) { 01505 const float *fp = atompos + ind; 01506 xyzr[ind4 ] = fp[0]-origin[0]; 01507 xyzr[ind4 + 1] = fp[1]-origin[1]; 01508 xyzr[ind4 + 2] = fp[2]-origin[2]; 01509 xyzr[ind4 + 3] = atomradii[i]; 01510 ind4 += 4; 01511 } 01512 ind += 3; 01513 } 01514 } 01515 } 01516 01517 // set gaussian window size based on user-specified quality parameter 01518 float gausslim = 2.0f; 01519 switch (quality) { 01520 case 3: gausslim = 4.0f; break; // max quality 01521 01522 case 2: gausslim = 3.0f; break; // high quality 01523 01524 case 1: gausslim = 2.5f; break; // medium quality 01525 01526 case 0: 01527 default: gausslim = 2.0f; // low quality 01528 break; 01529 } 01530 01531 pretime = wkf_timer_timenow(timer); 01532 01533 #if defined(VMDCUDA) 01534 if (!force_cpuonly && !getenv("VMDNOCUDA")) { 01535 // allocate a new CUDAQuickSurf object if we destroyed the old one... 01536 if (cudaqs == NULL) 01537 cudaqs = new CUDAQuickSurf(); 01538 01539 // Assign texture format according to quality 01540 CUDAQuickSurf::VolTexFormat voltexformat = CUDAQuickSurf::RGB3F; 01541 switch (quality) { 01542 case 3: voltexformat = CUDAQuickSurf::RGB3F; break; // max quality 01543 01544 case 2: voltexformat = CUDAQuickSurf::RGB3F; break; // high quality 01545 01546 case 1: voltexformat = CUDAQuickSurf::RGB3F; break; // medium quality 01547 01548 case 0: 01549 default: voltexformat = CUDAQuickSurf::RGB4U; // low quality 01550 break; 01551 } 01552 01553 // compute both density map and floating point color texture map 01554 int pcount = (usebeads) ? numbeads : atomSel->selected; 01555 int rc = cudaqs->calc_surf(pcount, &xyzr[0], 01556 (colorperatom) ? &colors[0] : &cmap[0], 01557 colorperatom, voltexformat, 01558 origin, numvoxels, maxrad, 01559 radscale, gridspacing, isovalue, gausslim, 01560 cmdList); 01561 01562 // If we're running in a memory-limited scenario, we can force 01563 // VMD to dump the QuickSurf GPU data to prevent out-of-memory 01564 // problems later on, either during other surface calcs or when 01565 // using the GPU for things like OptiX ray tracing 01566 if (getenv("VMDQUICKSURFMINMEM")) { 01567 free_gpu_memory(); 01568 } 01569 01570 if (rc == 0) { 01571 free(xyzr); 01572 if (colors) 01573 free(colors); 01574 01575 voltime = wkf_timer_timenow(timer); 01576 01577 PROFILE_POP_RANGE(); // first return point 01578 01579 return 0; 01580 } 01581 } 01582 #endif 01583 01584 if (verbose) { 01585 printf(" Computing density map grid on CPUs "); 01586 } 01587 01588 ptrdiff_t volsz = numvoxels[0] * numvoxels[1] * numvoxels[2]; 01589 volmap = new float[volsz]; 01590 if (colidx != NULL && cmap != NULL) { 01591 voltexmap = (float*) calloc(1, 3L * sizeof(float) * numvoxels[0] * numvoxels[1] * numvoxels[2]); 01592 } 01593 01594 fflush(stdout); 01595 memset(volmap, 0, sizeof(float) * volsz); 01596 if ((volsz * atomSel->selected) > 20000000) { 01597 vmd_gaussdensity_threaded(mol->app->cpucaps, verbose, 01598 atomSel->selected, &xyzr[0], atomicnum, 01599 (voltexmap!=NULL) ? &colors[0] : NULL, 01600 volmap, voltexmap, numvoxels, radscale, 01601 gridspacing, isovalue, gausslim); 01602 } else { 01603 vmd_gaussdensity_opt(mol->app->cpucaps, verbose, 01604 atomSel->selected, &xyzr[0], atomicnum, 01605 (voltexmap!=NULL) ? &colors[0] : NULL, 01606 volmap, voltexmap, numvoxels, radscale, 01607 gridspacing, isovalue, gausslim); 01608 } 01609 01610 free(xyzr); 01611 if (colors) 01612 free(colors); 01613 01614 voltime = wkf_timer_timenow(timer); 01615 01616 // draw the surface if the caller provided the display list 01617 if (cmdList != NULL) { 01618 draw_trimesh(cmdList); 01619 } 01620 01621 if (verbose) { 01622 printf(" Done.\n"); 01623 } 01624 01625 PROFILE_POP_RANGE(); // second return point 01626 01627 return 0; 01628 } 01629 01630 01631 // compute synthetic density map, but nothing else 01632 VolumetricData * QuickSurf::calc_density_map(AtomSel * atomSel, 01633 DrawMolecule *mymol, 01634 const float *atompos, 01635 const float *atomradii, 01636 int quality, float radscale, 01637 float gridspacing) { 01638 if (!calc_surf(atomSel, mymol, atompos, atomradii, quality, 01639 radscale, gridspacing, 1.0f, NULL, NULL, NULL)) { 01640 VolumetricData *surfvol; 01641 surfvol = new VolumetricData("density map", origin, xaxis, yaxis, zaxis, 01642 numvoxels[0], numvoxels[1], numvoxels[2], 01643 volmap); 01644 return surfvol; 01645 } 01646 01647 return NULL; 01648 } 01649 01650 01651 // Extract the isosurface from the QuickSurf density map 01652 int QuickSurf::get_trimesh(int &numverts, 01653 float *&v3fv, float *&n3fv, float *&c3fv, 01654 int &numfacets, int *&fiv) { 01655 01656 int verbose = (getenv("VMDQUICKSURFVERBOSE") != NULL); 01657 01658 if (verbose) 01659 printf("Running marching cubes on CPU...\n"); 01660 01661 VolumetricData *surfvol; 01662 surfvol = new VolumetricData("molecular surface", 01663 origin, xaxis, yaxis, zaxis, 01664 numvoxels[0], numvoxels[1], numvoxels[2], 01665 volmap); 01666 01667 // XXX we should calculate the volume gradient only for those 01668 // vertices we extract, since for this rep any changes to settings 01669 // will require recomputation of the entire volume 01670 surfvol->compute_volume_gradient(); // calc gradients: smooth vertex normals 01671 gradtime = wkf_timer_timenow(timer); 01672 01673 // trimesh polygonalized surface, max of 6 triangles per voxel 01674 const int stepsize = 1; 01675 s.clear(); // initialize isosurface data 01676 s.compute(surfvol, isovalue, stepsize); // compute the isosurface 01677 01678 mctime = wkf_timer_timenow(timer); 01679 01680 s.vertexfusion(9, 9); // eliminate duplicated vertices 01681 s.normalize(); // normalize interpolated gradient/surface normals 01682 01683 if (s.numtriangles > 0) { 01684 if (voltexmap != NULL) { 01685 // assign per-vertex colors by a 3-D texture map 01686 s.set_color_voltex_rgb3fv(voltexmap); 01687 } else { 01688 // use a single color for the entire mesh 01689 s.set_color_rgb3fv(solidcolor); 01690 } 01691 } 01692 01693 numverts = s.v.num() / 3; 01694 v3fv=&s.v[0]; 01695 n3fv=&s.n[0]; 01696 c3fv=&s.c[0]; 01697 01698 numfacets = s.numtriangles; 01699 fiv=&s.f[0]; 01700 01701 delete surfvol; 01702 01703 mcverttime = wkf_timer_timenow(timer); 01704 reptime = mcverttime; 01705 01706 if (verbose) { 01707 char strmsg[1024]; 01708 sprintf(strmsg, "QuickSurf: %.3f [pre:%.3f vol:%.3f gr:%.3f mc:%.2f mcv:%.3f]", 01709 reptime, pretime, voltime-pretime, gradtime-voltime, 01710 mctime-gradtime, mcverttime-mctime); 01711 01712 msgInfo << strmsg << sendmsg; 01713 } 01714 01715 return 0; 01716 } 01717 01718 01719 int QuickSurf::draw_trimesh(VMDDisplayList *cmdList) { 01720 DispCmdTriMesh cmdTriMesh; 01721 01722 int numverts=0; 01723 float *v=NULL, *n=NULL, *c=NULL; 01724 int numfacets=0; 01725 int *f=NULL; 01726 01727 get_trimesh(numverts, v, n, c, numfacets, f); 01728 01729 // Create a triangle mesh 01730 if (numfacets > 0) { 01731 cmdTriMesh.putdata(v, n, c, numverts, f, numfacets, 0, cmdList); 01732 } 01733 01734 return 0; 01735 } 01736 01737 01738 QuickSurf::~QuickSurf() { 01739 #if defined(VMDCUDA) 01740 free_gpu_memory(); 01741 #endif 01742 01743 if (voltexmap != NULL) 01744 free(voltexmap); 01745 voltexmap = NULL; 01746 01747 wkf_timer_destroy(timer); 01748 } 01749 01750