Main Page Namespace List Class Hierarchy Alphabetical List Compound List File List Namespace Members Compound Members File Members Related Pages
QuickSurf.C

Go to the documentation of this file.
00001 /***************************************************************************
00002 *cr 
00003 *cr (C) Copyright 1995-2019 The Board of Trustees of the 
00004 *cr University of Illinois 
00005 *cr All Rights Reserved 
00006 *cr 
00007 ***************************************************************************/
00008 
00009 /***************************************************************************
00010 * RCS INFORMATION:
00011 *
00012 * $RCSfile: QuickSurf.C,v $
00013 * $Author: johns $ $Locker: $ $State: Exp $
00014 * $Revision: 1.136 $ $Date: 2022/05/23 19:10:01 $
00015 *
00016 ***************************************************************************
00017 * DESCRIPTION:
00018 * Fast gaussian surface representation
00019 ***************************************************************************/
00020 
00021 // pgcc 2016 has troubles with hand-vectorized x86 intrinsics presently
00022 #if !defined(__PGIC__)
00023 
00024 // Intel x86 hardware
00025 #if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
00026 #if !defined(__SSE2__) && defined(_WIN64)
00027 #define __SSE2__ 1 /* MSVC fails to define SSE macros */
00028 #endif
00029 #endif
00030 
00031 #define VMDQSURFUSESSE 1
00032 #endif
00033 
00034 // The OpenPOWER VSX code path runs on POWER8 and later hardware, but is
00035 // untested on older platforms that support VSX instructions.
00036 // XXX GCC 4.8.5 breaks with conflicts between vec_xxx() routines
00037 // defined in utilities.h vs. VSX intrinsics in altivec.h and similar.
00038 // For now, we disable VSX for GCC for this source file.
00039 // IBM Power 8/9/10 Altivec/VSX instructions:
00040 // https://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
00041 #if !defined(__GNUC__) && defined(__VEC__)
00042 #define VMDQSURFUSEVSX 1
00043 #endif
00044 
00045 #include <stdio.h>
00046 #include <stdlib.h>
00047 #if VMDQSURFUSESSE && defined(__SSE2__) 
00048 #include <emmintrin.h>
00049 #endif
00050 
00051 #if (defined(VMDQSURFUSEVSX) && defined(__VSX__))
00052 #if defined(__GNUC__) && defined(__VEC__)
00053 #include <altivec.h>
00054 #endif
00055 #endif
00056 
00057 #include <string.h>
00058 #include <math.h>
00059 #include "QuickSurf.h"
00060 #if defined(VMDCUDA)
00061 #include "CUDAQuickSurf.h"
00062 #endif
00063 #include "Measure.h"
00064 #include "Inform.h"
00065 #include "utilities.h"
00066 #include "WKFUtils.h"
00067 #include "VolumetricData.h"
00068 
00069 #include "VMDDisplayList.h"
00070 #include "Displayable.h"
00071 #include "DispCmds.h"
00072 #include "ProfileHooks.h"
00073 #include "VMDApp.h" // access global CPU insn flags for dispatch
00074 
00075 #define MIN(X,Y) (((X)<(Y))? (X) : (Y))
00076 #define MAX(X,Y) (((X)>(Y))? (X) : (Y))
00077 
00078 // fctn prototypes for runtime dispatched AVX2 kernels, etc
00079 void vmd_gaussdensity_avx2(int verbose,
00080 int natoms, const float *xyzr,
00081 const float *atomicnum,
00082 const float *colors,
00083 float *densitymap, float *voltexmap,
00084 const int *numvoxels,
00085 float radscale, float gridspacing,
00086 float isovalue, float gausslim);
00087 
00088 void vmd_gaussdensity_neon(int verbose,
00089 int natoms, const float *xyzr,
00090 const float *atomicnum,
00091 const float *colors,
00092 float *densitymap, float *voltexmap,
00093 const int *numvoxels,
00094 float radscale, float gridspacing,
00095 float isovalue, float gausslim);
00096 
00097 /*
00098 * David J. Hardy
00099 * 12 Dec 2008
00100 *
00101 * aexpfnx() - Approximate expf() for negative x.
00102 *
00103 * Assumes that x <= 0.
00104 *
00105 * Assumes IEEE format for single precision float, specifically:
00106 * 1 sign bit, 8 exponent bits biased by 127, and 23 mantissa bits.
00107 *
00108 * Interpolates exp() on interval (-1/log2(e), 0], then shifts it by
00109 * multiplication of a fast calculation for 2^(-N). The interpolation
00110 * uses a linear blending of 3rd degree Taylor polynomials at the end
00111 * points, so the approximation is once differentiable.
00112 *
00113 * The error is small (max relative error per interval is calculated
00114 * to be 0.131%, with a max absolute error of -0.000716).
00115 *
00116 * The cutoff is chosen so as to speed up the computation by early
00117 * exit from function, with the value chosen to give less than the
00118 * max absolute error. Use of a cutoff is unnecessary, except
00119 * for needing to shift smallest floating point numbers to zero,
00120 * i.e. you could remove cutoff and replace by:
00121 *
00122 * #define MINXNZ -88.0296919311130 // -127 * log(2)
00123 *
00124 * if (x < MINXNZ) return 0.f;
00125 *
00126 * Use of a cutoff causes a discontinuity which can be eliminated
00127 * through the use of a switching function.
00128 *
00129 * We can obtain arbitrarily smooth approximation by taking k+1 nodes on
00130 * the interval and weighting their respective Taylor polynomials by the
00131 * kth order Lagrange interpolant through those nodes. The wiggle in the
00132 * polynomial interpolation due to equidistant nodes (Runge's phenomenon)
00133 * can be reduced by using Chebyshev nodes.
00134 */
00135 
00136 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER)
00137 #define __align(X) __attribute__((aligned(X) ))
00138 #if (__GNUC__ < 4)
00139 #define MISSING_mm_cvtsd_f64
00140 #endif
00141 #else
00142 #define __align(X) __declspec(align(X) )
00143 #endif
00144 
00145 #define MLOG2EF -1.44269504088896f
00146 
00147 /*
00148 * Interpolating coefficients for linear blending of the
00149 * 3rd degree Taylor expansion of 2^x about 0 and -1.
00150 */
00151 #define SCEXP0 1.0000000000000000f
00152 #define SCEXP1 0.6987082824680118f
00153 #define SCEXP2 0.2633174272827404f
00154 #define SCEXP3 0.0923611991471395f
00155 #define SCEXP4 0.0277520543324108f
00156 
00157 /* for single precision float */
00158 #define EXPOBIAS 127
00159 #define EXPOSHIFT 23
00160 
00161 /* cutoff is optional, but can help avoid unnecessary work */
00162 #define ACUTOFF -10
00163 
00164 typedef union flint_t { // overlays a float and an int on the same storage
00165 float f; // the value read back as a single precision float
00166 int n; // raw bits; the exp code stores (EXPOBIAS - N) << EXPOSHIFT here to form 2^(-N)
00167 } flint;
00168 
00169 #if VMDQSURFUSESSE && defined(__SSE2__)
00170 // SSE variant of the 'flint' union above
00171 typedef union SSEreg_t { // one 128-bit register viewed either as floats or as integers
00172 __m128 f; // 4x float (SSE)
00173 __m128i i; // 4x 32-bit int (SSE2); the exp code builds exponent bits here, then reads them via f
00174 } SSEreg;
00175 #endif
00176 
00177 
00178 #if 0
00179 static float aexpfnx(float x) {
00180 /* approximate expf(x); the caller must pass x <= 0 */
00181 if (x < ACUTOFF) return 0.f; /* past the cutoff the result is taken as zero */
00182 
00183 const float base2exp = x * MLOG2EF; /* exponent converted to base 2, >= 0 */
00184 const int whole = (int) base2exp; /* integer part; truncation acts as floor() here */
00185 const float frac = whole - base2exp; /* leftover exponent, -1 < frac <= 0 */
00186 
00187 /* linear blend of Taylor polynomials, evaluated by Horner's method */
00188 const float poly =
00189 SCEXP0 + frac*(SCEXP1 + frac*(SCEXP2 + frac*(SCEXP3 + frac*SCEXP4)));
00190 
00191 /* build 2^(-whole) by writing the biased exponent field directly */
00192 flint pow2;
00193 pow2.n = (EXPOBIAS - whole) << EXPOSHIFT;
00194 
00195 return (poly * pow2.f); /* polynomial scaled by the power of two */
00196 }
00197 
00198 
00199 static void vmd_gaussdensity(int verbose, // reference (non-vectorized) kernel: adds each atom's Gaussian to densitymap
00200 int natoms, const float *xyzr, // xyzr: 4 floats per atom (x, y, z, radius)
00201 const float *atomicnum, // optional per-atom density weight, NULL for 1.0
00202 const float *colors, // per-atom colors, read with the same 4-float stride as xyzr
00203 float *densitymap, float *voltexmap, // voltexmap: optional 3-float-per-voxel color map, NULL to skip
00204 const int *numvoxels, // grid dimensions in x, y, z
00205 float radscale, float gridspacing, 
00206 float isovalue, float gausslim) {
00207 int i, x, y, z;
00208 int maxvoxel[3];
00209 maxvoxel[0] = numvoxels[0]-1; 
00210 maxvoxel[1] = numvoxels[1]-1; 
00211 maxvoxel[2] = numvoxels[2]-1; 
00212 const float invgridspacing = 1.0f / gridspacing;
00213 
00214 // compute colors only if necessary, since they are costly
00215 if (voltexmap != NULL) {
00216 float invisovalue = 1.0f / isovalue;
00217 // compute both density map and floating point color texture map
00218 for (i=0; i<natoms; i++) {
00219 if (verbose && ((i & 0x3fff) == 0)) {
00220 printf("."); 
00221 fflush(stdout);
00222 }
00223 
00224 ptrdiff_t ind = i*4L;
00225 float scaledrad = xyzr[ind + 3L] * radscale;
00226 
00227 // MDFF atomic number weighted density factor
00228 float atomicnumfactor = 1.0f;
00229 if (atomicnum != NULL) {
00230 atomicnumfactor = atomicnum[i];
00231 }
00232 
00233 float arinv = 1.0f/(2.0f*scaledrad*scaledrad);
00234 float radlim = gausslim * scaledrad;
00235 float radlim2 = radlim * radlim; // cutoff test done in cartesian coords
00236 radlim *= invgridspacing;
00237 
00238 // clamp the atom's bounding box of voxels to the grid extents
00239 float tmp = xyzr[ind ] * invgridspacing;
00240 int xmin = MAX((int) (tmp - radlim), 0);
00241 int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00242 tmp = xyzr[ind+1] * invgridspacing;
00243 int ymin = MAX((int) (tmp - radlim), 0);
00244 int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00245 tmp = xyzr[ind+2] * invgridspacing;
00246 int zmin = MAX((int) (tmp - radlim), 0);
00247 int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00248 
00249 float dz = zmin*gridspacing - xyzr[ind+2];
00250 for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00251 float dy = ymin*gridspacing - xyzr[ind+1];
00252 for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00253 float dy2dz2 = dy*dy + dz*dz;
00254 
00255 // early-exit when outside the cutoff radius in the Y-Z plane
00256 if (dy2dz2 >= radlim2) 
00257 continue;
00258 
00259 ptrdiff_t addr = ptrdiff_t(z) * ptrdiff_t(numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y) * ptrdiff_t(numvoxels[0]); // widened so large grids cannot overflow int
00260 float dx = xmin*gridspacing - xyzr[ind];
00261 for (x=xmin; x<=xmax; x++,dx+=gridspacing) {
00262 float r2 = dx*dx + dy2dz2;
00263 float expval = -r2 * arinv;
00264 #if VMDUSEFULLEXP
00265 // use the math library exponential routine
00266 float density = exp(expval);
00267 #else
00268 // use our (much faster) fast exponential approximation
00269 float density = aexpfnx(expval);
00270 #endif
00271 
00272 density *= atomicnumfactor; // MDFF Cryo-EM atomic number density
00273 
00274 // accumulate density value to density map
00275 densitymap[addr + x] += density;
00276 
00277 // Accumulate density-weighted color to texture map.
00278 // Pre-multiply colors by the inverse isovalue we will extract 
00279 // the surface on, to cause the final color to be normalized.
00280 density *= invisovalue;
00281 ptrdiff_t caddr = (addr + x) * 3L;
00282 
00283 // color by atom colors
00284 voltexmap[caddr ] += density * colors[ind ];
00285 voltexmap[caddr + 1] += density * colors[ind + 1];
00286 voltexmap[caddr + 2] += density * colors[ind + 2];
00287 }
00288 }
00289 }
00290 }
00291 } else {
00292 // compute density map only
00293 for (i=0; i<natoms; i++) {
00294 if (verbose && ((i & 0x3fff) == 0)) {
00295 printf("."); 
00296 fflush(stdout);
00297 }
00298 
00299 ptrdiff_t ind = i*4L;
00300 float scaledrad = xyzr[ind + 3] * radscale;
00301 
00302 // MDFF atomic number weighted density factor
00303 float atomicnumfactor = 1.0f;
00304 if (atomicnum != NULL) {
00305 atomicnumfactor = atomicnum[i];
00306 }
00307 
00308 float arinv = 1.0f/(2.0f*scaledrad*scaledrad);
00309 float radlim = gausslim * scaledrad;
00310 float radlim2 = radlim * radlim; // cutoff test done in cartesian coords
00311 radlim *= invgridspacing;
00312 
00313 // clamp the atom's bounding box of voxels to the grid extents
00314 float tmp = xyzr[ind ] * invgridspacing;
00315 int xmin = MAX((int) (tmp - radlim), 0);
00316 int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00317 tmp = xyzr[ind+1] * invgridspacing;
00318 int ymin = MAX((int) (tmp - radlim), 0);
00319 int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00320 tmp = xyzr[ind+2] * invgridspacing;
00321 int zmin = MAX((int) (tmp - radlim), 0);
00322 int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00323 
00324 float dz = zmin*gridspacing - xyzr[ind+2];
00325 for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00326 float dy = ymin*gridspacing - xyzr[ind+1];
00327 for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00328 float dy2dz2 = dy*dy + dz*dz;
00329 
00330 // early-exit when outside the cutoff radius in the Y-Z plane
00331 if (dy2dz2 >= radlim2) 
00332 continue;
00333 
00334 ptrdiff_t addr = ptrdiff_t(z) * ptrdiff_t(numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y) * ptrdiff_t(numvoxels[0]); // widened so large grids cannot overflow int
00335 float dx = xmin*gridspacing - xyzr[ind];
00336 for (x=xmin; x<=xmax; x++,dx+=gridspacing) {
00337 float r2 = dx*dx + dy2dz2;
00338 float expval = -r2 * arinv;
00339 #if VMDUSEFULLEXP
00340 // use the math library exponential routine
00341 float density = exp(expval);
00342 #else
00343 // use our (much faster) fast exponential approximation
00344 float density = aexpfnx(expval);
00345 #endif
00346 
00347 density *= atomicnumfactor; // MDFF Cryo-EM atomic number density
00348 
00349 // accumulate density value to density map
00350 densitymap[addr + x] += density;
00351 }
00352 }
00353 }
00354 }
00355 }
00356 }
00357 #endif
00358 
00359 
00360 
00361 static void vmd_gaussdensity_opt(wkf_cpu_caps_t *cpucaps, int verbose, // cpucaps: CPU feature flags used for runtime kernel dispatch (NULL-checked on the x86 path); verbose: print kernel name and progress dots
00362 int natoms, const float *xyzr, // xyzr: 4 floats per atom (x, y, z, radius)
00363 const float *atomicnum, // optional per-atom density weight, NULL for 1.0
00364 const float *colors, // per-atom colors, read with the same 4-float stride as xyzr
00365 float *densitymap, float *voltexmap, // both maps are accumulated into (+=); voltexmap may be NULL to skip colors
00366 const int *numvoxels, // grid dimensions in x, y, z
00367 float radscale, float gridspacing, 
00368 float isovalue, float gausslim) {
00369 int i, x, y, z;
00370 int maxvoxel[3];
00371 maxvoxel[0] = numvoxels[0]-1; 
00372 maxvoxel[1] = numvoxels[1]-1; 
00373 maxvoxel[2] = numvoxels[2]-1; 
00374 const float invgridspacing = 1.0f / gridspacing;
00375 
00376 //
00377 // runtime CPU dispatch
00378 // check for optional vector instructions and execute custom kernels
00379 // for the fastest code path supported by the detected hardware
00380 //
00381 #if defined(VMDCPUDISPATCH)
00382 
00383 // Intel x86
00384 #if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
00385 
00386 if ((cpucaps != NULL) && 
00387 (cpucaps->flags & CPU_FMA) && (cpucaps->flags & CPU_AVX2) && 
00388 (getenv("VMDNOAVX2")==NULL)) {
00389 if (verbose)
00390 printf("vmd_gaussdensity_avx2()\n");
00391 
00392 vmd_gaussdensity_avx2(verbose, natoms, xyzr, atomicnum, colors, 
00393 densitymap, voltexmap, numvoxels, radscale, 
00394 gridspacing, isovalue, gausslim);
00395 return;
00396 }
00397 #endif
00398 
00399 #if defined(VMDUSENEON)
00400 // if ((cpucaps->flags & CPU_NEON) && (getenv("VMDNONEON") == NULL)) {
00401 if (1 && (getenv("VMDNONEON") == NULL)) {
00402 if (verbose)
00403 printf("vmd_gaussdensity_neon()\n");
00404 
00405 vmd_gaussdensity_neon(verbose, natoms, xyzr, atomicnum, colors, 
00406 densitymap, voltexmap, numvoxels, radscale, 
00407 gridspacing, isovalue, gausslim);
00408 return;
00409 }
00410 #endif
00411 
00412 
00413 if (verbose)
00414 printf("vmd_gaussdensity_opt()\n");
00415 #endif
00416 
00417 #if VMDQSURFUSESSE && defined(__SSE2__)
00418 int usesse=1;
00419 if (getenv("VMDNOSSE")) {
00420 usesse=0;
00421 }
00422 #endif
00423 
00424 #if VMDQSURFUSEVSX && defined(__VEC__)
00425 int usevsx=1;
00426 if (getenv("VMDNOVSX")) {
00427 usevsx=0;
00428 }
00429 #endif
00430 
00431 #if VMDQSURFUSESSE && defined(__SSE2__)
00432 // Variables for SSE optimized inner loop
00433 __m128 gridspacing4_4;
00434 __align(16) float sxdelta4[4]; // 16-byte aligned for SSE
00435 
00436 if (usesse) {
00437 gridspacing4_4 = _mm_set1_ps(gridspacing * 4.0f);
00438 for (x=0; x<4; x++)
00439 sxdelta4[x] = ((float) x) * gridspacing;
00440 }
00441 #endif
00442 
00443 #if VMDQSURFUSEVSX && defined(__VEC__)
00444 // Variables for VSX optimized inner loop
00445 vector float gridspacing4_4;
00446 __attribute__((aligned(16))) float sxdelta4[4]; // 16-byte aligned for VSX
00447 
00448 if (usevsx) {
00449 gridspacing4_4 = vec_splats(gridspacing * 4.0f);
00450 for (x=0; x<4; x++)
00451 sxdelta4[x] = ((float) x) * gridspacing;
00452 }
00453 #endif
00454 
00455 // compute colors only if necessary, since they are costly
00456 if (voltexmap != NULL) {
00457 float invisovalue = 1.0f / isovalue;
00458 // compute both density map and floating point color texture map
00459 for (i=0; i<natoms; i++) {
00460 if (verbose && ((i & 0x3fff) == 0)) {
00461 printf("."); 
00462 fflush(stdout);
00463 }
00464 
00465 ptrdiff_t ind = i*4L;
00466 float scaledrad = xyzr[ind + 3] * radscale;
00467 
00468 // MDFF atomic number weighted density factor
00469 float atomicnumfactor = 1.0f;
00470 if (atomicnum != NULL) {
00471 atomicnumfactor = atomicnum[i];
00472 }
00473 
00474 // negate, precompute reciprocal, and change to base 2 from the outset
00475 float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
00476 float radlim = gausslim * scaledrad;
00477 float radlim2 = radlim * radlim; // cutoff test done in cartesian coords
00478 radlim *= invgridspacing;
00479 
00480 #if VMDQSURFUSESSE && defined(__SSE2__)
00481 __m128 atomicnumfactor_4 = { 0 };
00482 __m128 arinv_4;
00483 if (usesse) {
00484 atomicnumfactor_4 = _mm_set1_ps(atomicnumfactor);
00485 #if VMDUSESVMLEXP
00486 // Use of Intel's SVML requires changing the pre-scaling factor
00487 arinv_4 = _mm_set1_ps(arinv * (2.718281828f/2.0f) / MLOG2EF); 
00488 #else
00489 // Use our fully inlined exp approximation
00490 arinv_4 = _mm_set1_ps(arinv);
00491 #endif
00492 }
00493 #endif
00494 
00495 float tmp;
00496 tmp = xyzr[ind ] * invgridspacing;
00497 int xmin = MAX((int) (tmp - radlim), 0);
00498 int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00499 tmp = xyzr[ind+1] * invgridspacing;
00500 int ymin = MAX((int) (tmp - radlim), 0);
00501 int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00502 tmp = xyzr[ind+2] * invgridspacing;
00503 int zmin = MAX((int) (tmp - radlim), 0);
00504 int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00505 
00506 float dz = zmin*gridspacing - xyzr[ind+2];
00507 for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00508 float dy = ymin*gridspacing - xyzr[ind+1];
00509 for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00510 float dy2dz2 = dy*dy + dz*dz;
00511 
00512 // early-exit when outside the cutoff radius in the Y-Z plane
00513 if (dy2dz2 >= radlim2) 
00514 continue;
00515 
00516 ptrdiff_t addr = ptrdiff_t(z) * ptrdiff_t(numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y) * ptrdiff_t(numvoxels[0]); // widen before multiplying
00517 float dx = xmin*gridspacing - xyzr[ind];
00518 x=xmin;
00519 
00520 #if VMDQSURFUSESSE && defined(__SSE2__)
00521 // Use SSE when we have a multiple-of-4 to compute
00522 // finish all remaining density map points with regular non-SSE loop
00523 if (usesse) {
00524 __align(16) SSEreg n;
00525 __align(16) SSEreg y; // NOTE: shadows the int loop variable y inside this block
00526 __m128 dy2dz2_4 = _mm_set1_ps(dy2dz2);
00527 __m128 dx_4 = _mm_add_ps(_mm_set1_ps(dx), _mm_load_ps(&sxdelta4[0]));
00528 
00529 for (; (x+3)<=xmax; x+=4,dx_4=_mm_add_ps(dx_4, gridspacing4_4)) {
00530 __m128 r2 = _mm_add_ps(_mm_mul_ps(dx_4, dx_4), dy2dz2_4);
00531 __m128 d;
00532 #if VMDUSESVMLEXP
00533 // use Intel's SVML exp2() routine
00534 y.f = _mm_exp2_ps(_mm_mul_ps(r2, arinv_4));
00535 #else
00536 // use our (much faster) fully inlined exponential approximation
00537 y.f = _mm_mul_ps(r2, arinv_4); /* already negated and in base 2 */
00538 n.i = _mm_cvttps_epi32(y.f);
00539 d = _mm_cvtepi32_ps(n.i);
00540 d = _mm_sub_ps(d, y.f);
00541 
00542 // Approximate 2^{-d}, 0 <= d < 1, by interpolation.
00543 // Perform Horner's method to evaluate interpolating polynomial.
00544 #if 0
00545 // SSE 4.x FMADD instructions are not universally available
00546 y.f = _mm_fmadd_ps(d, _mm_set1_ps(SCEXP4), _mm_set1_ps(SCEXP3)); 
00547 y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP2));
00548 y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP1));
00549 y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP0));
00550 #else
00551 y.f = _mm_mul_ps(d, _mm_set_ps1(SCEXP4)); /* for x^4 term */
00552 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP3)); /* for x^3 term */
00553 y.f = _mm_mul_ps(y.f, d);
00554 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP2)); /* for x^2 term */
00555 y.f = _mm_mul_ps(y.f, d);
00556 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP1)); /* for x^1 term */
00557 y.f = _mm_mul_ps(y.f, d);
00558 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP0)); /* for x^0 term */
00559 #endif
00560 
00561 // Calculate 2^N exactly by directly manipulating floating point exponent,
00562 // then use it to scale y for the final result.
00563 n.i = _mm_sub_epi32(_mm_set1_epi32(EXPOBIAS), n.i);
00564 n.i = _mm_slli_epi32(n.i, EXPOSHIFT);
00565 y.f = _mm_mul_ps(y.f, n.f);
00566 #endif
00567 
00568 // At present, we do unaligned loads/stores since we can't guarantee
00569 // that the X-dimension is always a multiple of 4.
00570 float *ufptr = &densitymap[addr + x];
00571 d = _mm_loadu_ps(ufptr);
00572 y.f = _mm_mul_ps(y.f, atomicnumfactor_4); // MDFF density maps
00573 _mm_storeu_ps(ufptr, _mm_add_ps(d, y.f)); 
00574 
00575 // Accumulate density-weighted color to texture map.
00576 // Pre-multiply colors by the inverse isovalue we will extract 
00577 // the surface on, to cause the final color to be normalized.
00578 d = _mm_mul_ps(y.f, _mm_set_ps1(invisovalue));
00579 ptrdiff_t caddr = (addr + x) * 3L;
00580 
00581 #if 1
00582 float *txptr = &voltexmap[caddr];
00583 // unaligned load of 4 consecutive rgb3f texture map texels
00584 __m128 r0g0b0r1 = _mm_loadu_ps(txptr+0);
00585 __m128 g1b1r2g2 = _mm_loadu_ps(txptr+4); 
00586 __m128 b2r3g3b3 = _mm_loadu_ps(txptr+8);
00587 
00588 // convert rgb3f AOS format to 4-element SOA vectors using shuffle instructions
00589 __m128 r2g2r3g3 = _mm_shuffle_ps(g1b1r2g2, b2r3g3b3, _MM_SHUFFLE(2, 1, 3, 2)); 
00590 __m128 g0b0g1b1 = _mm_shuffle_ps(r0g0b0r1, g1b1r2g2, _MM_SHUFFLE(1, 0, 2, 1));
00591 __m128 r = _mm_shuffle_ps(r0g0b0r1, r2g2r3g3, _MM_SHUFFLE(2, 0, 3, 0)); // r0r1r2r3
00592 __m128 g = _mm_shuffle_ps(g0b0g1b1, r2g2r3g3, _MM_SHUFFLE(3, 1, 2, 0)); // g0g1g2g3
00593 __m128 b = _mm_shuffle_ps(g0b0g1b1, b2r3g3b3, _MM_SHUFFLE(3, 0, 3, 1)); // b0b1b2b3
00594 
00595 // accumulate density-scaled colors into texels
00596 r = _mm_add_ps(r, _mm_mul_ps(d, _mm_set_ps1(colors[ind ])));
00597 g = _mm_add_ps(g, _mm_mul_ps(d, _mm_set_ps1(colors[ind + 1])));
00598 b = _mm_add_ps(b, _mm_mul_ps(d, _mm_set_ps1(colors[ind + 2])));
00599 
00600 // convert 4-element SOA vectors to rgb3f AOS format using shuffle instructions
00601 __m128 r0r2g0g2 = _mm_shuffle_ps(r, g, _MM_SHUFFLE(2, 0, 2, 0));
00602 __m128 g1g3b1b3 = _mm_shuffle_ps(g, b, _MM_SHUFFLE(3, 1, 3, 1));
00603 __m128 b0b2r1r3 = _mm_shuffle_ps(b, r, _MM_SHUFFLE(3, 1, 2, 0));
00604 
00605 __m128 rr0g0b0r1 = _mm_shuffle_ps(r0r2g0g2, b0b2r1r3, _MM_SHUFFLE(2, 0, 2, 0)); 
00606 __m128 rg1b1r2g2 = _mm_shuffle_ps(g1g3b1b3, r0r2g0g2, _MM_SHUFFLE(3, 1, 2, 0)); 
00607 __m128 rb2r3g3b3 = _mm_shuffle_ps(b0b2r1r3, g1g3b1b3, _MM_SHUFFLE(3, 1, 3, 1)); 
00608 
00609 // unaligned store of 4 consecutive rgb3f texture map texels
00610 _mm_storeu_ps(txptr+0, rr0g0b0r1);
00611 _mm_storeu_ps(txptr+4, rg1b1r2g2);
00612 _mm_storeu_ps(txptr+8, rb2r3g3b3);
00613 
00614 #else
00615 
00616 // disabled per-texel variant; NOTE it reads tmp.floatreg, a member SSEreg does not declare
00617 float r, g, b;
00618 r = colors[ind ];
00619 g = colors[ind + 1];
00620 b = colors[ind + 2];
00621 
00622 SSEreg tmp; 
00623 tmp.f = d;
00624 float density;
00625 density = tmp.floatreg.r0; 
00626 voltexmap[caddr ] += density * r;
00627 voltexmap[caddr + 1] += density * g;
00628 voltexmap[caddr + 2] += density * b;
00629 
00630 density = tmp.floatreg.r1; 
00631 voltexmap[caddr + 3] += density * r;
00632 voltexmap[caddr + 4] += density * g;
00633 voltexmap[caddr + 5] += density * b;
00634 
00635 density = tmp.floatreg.r2; 
00636 voltexmap[caddr + 6] += density * r;
00637 voltexmap[caddr + 7] += density * g;
00638 voltexmap[caddr + 8] += density * b;
00639 
00640 density = tmp.floatreg.r3; 
00641 voltexmap[caddr + 9] += density * r;
00642 voltexmap[caddr + 10] += density * g;
00643 voltexmap[caddr + 11] += density * b;
00644 #endif
00645 }
00646 }
00647 #endif
00648 
00649 // finish all remaining density map points with regular non-SSE loop
00650 for (; x<=xmax; x++,dx+=gridspacing) {
00651 float r2 = dx*dx + dy2dz2;
00652 
00653 // use our (much faster) fully inlined exponential approximation
00654 float mb = r2 * arinv; /* already negated and in base 2 */
00655 int mbflr = (int) mb; /* get int part, floor() */
00656 float d = mbflr - mb; /* remaining exponent, -1 < d <= 0 */
00657 
00658 /* approx with linear blend of Taylor polys */
00659 float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00660 
00661 /* 2^(-mbflr) */
00662 flint scalfac;
00663 scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT; 
00664 
00665 // XXX assume we are never beyond the cutoff value in this loop
00666 float density = (sy * scalfac.f);
00667 
00668 density *= atomicnumfactor; // MDFF Cryo-EM atomic number density
00669 
00670 // accumulate density value to density map
00671 densitymap[addr + x] += density;
00672 
00673 // Accumulate density-weighted color to texture map.
00674 // Pre-multiply colors by the inverse isovalue we will extract 
00675 // the surface on, to cause the final color to be normalized.
00676 density *= invisovalue;
00677 ptrdiff_t caddr = (addr + x) * 3L;
00678 
00679 // color by atom colors
00680 voltexmap[caddr ] += density * colors[ind ];
00681 voltexmap[caddr + 1] += density * colors[ind + 1];
00682 voltexmap[caddr + 2] += density * colors[ind + 2];
00683 }
00684 }
00685 }
00686 }
00687 } else {
00688 // compute density map only
00689 for (i=0; i<natoms; i++) {
00690 if (verbose && ((i & 0x3fff) == 0)) {
00691 printf("."); 
00692 fflush(stdout);
00693 }
00694 
00695 ptrdiff_t ind = i*4L;
00696 float scaledrad = xyzr[ind+3] * radscale;
00697 
00698 // MDFF atomic number weighted density factor
00699 float atomicnumfactor = 1.0f;
00700 if (atomicnum != NULL) {
00701 atomicnumfactor = atomicnum[i];
00702 }
00703 
00704 // negate, precompute reciprocal, and change to base 2 from the outset
00705 float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
00706 float radlim = gausslim * scaledrad;
00707 float radlim2 = radlim * radlim; // cutoff test done in cartesian coords
00708 radlim *= invgridspacing;
00709 
00710 #if VMDQSURFUSESSE && defined(__SSE2__)
00711 __m128 atomicnumfactor_4 = { 0 };
00712 __m128 arinv_4;
00713 if (usesse) {
00714 atomicnumfactor_4 = _mm_set1_ps(atomicnumfactor);
00715 #if VMDUSESVMLEXP
00716 // Use of Intel's SVML requires changing the pre-scaling factor
00717 arinv_4 = _mm_set1_ps(arinv * (2.718281828f/2.0f) / MLOG2EF); 
00718 #else
00719 // Use our fully inlined exp approximation
00720 arinv_4 = _mm_set1_ps(arinv);
00721 #endif
00722 }
00723 #endif
00724 
00725 #if VMDQSURFUSEVSX && defined(__VEC__)
00726 vector float atomicnumfactor_4;
00727 vector float arinv_4;
00728 if (usevsx) {
00729 atomicnumfactor_4 = vec_splats(atomicnumfactor);
00730 
00731 // Use our fully inlined exp approximation
00732 arinv_4 = vec_splats(arinv);
00733 }
00734 #endif
00735 
00736 float tmp;
00737 tmp = xyzr[ind ] * invgridspacing;
00738 int xmin = MAX((int) (tmp - radlim), 0);
00739 int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00740 tmp = xyzr[ind+1] * invgridspacing;
00741 int ymin = MAX((int) (tmp - radlim), 0);
00742 int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00743 tmp = xyzr[ind+2] * invgridspacing;
00744 int zmin = MAX((int) (tmp - radlim), 0);
00745 int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00746 
00747 float dz = zmin*gridspacing - xyzr[ind+2];
00748 for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00749 float dy = ymin*gridspacing - xyzr[ind+1];
00750 for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00751 float dy2dz2 = dy*dy + dz*dz;
00752 
00753 // early-exit when outside the cutoff radius in the Y-Z plane
00754 if (dy2dz2 >= radlim2) 
00755 continue;
00756 
00757 ptrdiff_t addr = ptrdiff_t(z) * ptrdiff_t(numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y) * ptrdiff_t(numvoxels[0]); // widen before multiplying
00758 float dx = xmin*gridspacing - xyzr[ind];
00759 x=xmin;
00760 
00761 #if VMDQSURFUSESSE && defined(__SSE2__)
00762 // Use SSE when we have a multiple-of-4 to compute
00763 // finish all remaining density map points with regular non-SSE loop
00764 if (usesse) {
00765 __align(16) SSEreg n;
00766 __align(16) SSEreg y; // NOTE: shadows the int loop variable y inside this block
00767 __m128 dy2dz2_4 = _mm_set1_ps(dy2dz2);
00768 __m128 dx_4 = _mm_add_ps(_mm_set1_ps(dx), _mm_load_ps(&sxdelta4[0]));
00769 
00770 for (; (x+3)<=xmax; x+=4,dx_4=_mm_add_ps(dx_4, gridspacing4_4)) {
00771 __m128 r2 = _mm_add_ps(_mm_mul_ps(dx_4, dx_4), dy2dz2_4);
00772 __m128 d;
00773 #if VMDUSESVMLEXP
00774 // use Intel's SVML exp2() routine
00775 y.f = _mm_exp2_ps(_mm_mul_ps(r2, arinv_4));
00776 #else
00777 // use our (much faster) fully inlined exponential approximation
00778 y.f = _mm_mul_ps(r2, arinv_4); /* already negated and in base 2 */
00779 n.i = _mm_cvttps_epi32(y.f);
00780 d = _mm_cvtepi32_ps(n.i);
00781 d = _mm_sub_ps(d, y.f);
00782 
00783 // Approximate 2^{-d}, 0 <= d < 1, by interpolation.
00784 // Perform Horner's method to evaluate interpolating polynomial.
00785 #if 0
00786 // SSE 4.x FMADD instructions are not universally available
00787 y.f = _mm_fmadd_ps(d, _mm_set1_ps(SCEXP4), _mm_set1_ps(SCEXP3)); 
00788 y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP2));
00789 y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP1));
00790 y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP0));
00791 #else
00792 y.f = _mm_mul_ps(d, _mm_set_ps1(SCEXP4)); /* for x^4 term */
00793 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP3)); /* for x^3 term */
00794 y.f = _mm_mul_ps(y.f, d);
00795 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP2)); /* for x^2 term */
00796 y.f = _mm_mul_ps(y.f, d);
00797 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP1)); /* for x^1 term */
00798 y.f = _mm_mul_ps(y.f, d);
00799 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP0)); /* for x^0 term */
00800 #endif
00801 
00802 // Calculate 2^N exactly by directly manipulating floating point exponent,
00803 // then use it to scale y for the final result.
00804 n.i = _mm_sub_epi32(_mm_set1_epi32(EXPOBIAS), n.i);
00805 n.i = _mm_slli_epi32(n.i, EXPOSHIFT);
00806 y.f = _mm_mul_ps(y.f, n.f);
00807 #endif
00808 y.f = _mm_mul_ps(y.f, atomicnumfactor_4); // MDFF density maps; applied for both exp variants, as in the color path
00809 
00810 // At present, we do unaligned loads/stores since we can't guarantee
00811 // that the X-dimension is always a multiple of 4.
00812 float *ufptr = &densitymap[addr + x];
00813 d = _mm_loadu_ps(ufptr); 
00814 _mm_storeu_ps(ufptr, _mm_add_ps(d, y.f)); 
00815 }
00816 }
00817 #endif
00818 
00819 
00820 #if VMDQSURFUSEVSX && defined(__VEC__)
00821 // Use VSX when we have a multiple-of-4 to compute
00822 // finish all remaining density map points with regular non-VSX loop
00823 //
00824 // XXX it may be useful to compare the speed/accuracy of the
00825 // polynomial approximation vs. the hardware-provided 
00826 // exp2f() approximation: vec_expte()
00827 //
00828 if (usevsx) {
00829 vector float dy2dz2_4 = vec_splats(dy2dz2);
00830 vector float tmpvsxdelta4 = *((__vector float *) &sxdelta4[0]);
00831 vector float dx_4 = vec_add(vec_splats(dx), tmpvsxdelta4);
00832 
00833 for (; (x+3)<=xmax; x+=4,dx_4=vec_add(dx_4, gridspacing4_4)) {
00834 vector float r2 = vec_add(vec_mul(dx_4, dx_4), dy2dz2_4);
00835 
00836 // use our (much faster) fully inlined exponential approximation
00837 vector float mb = vec_mul(r2, arinv_4); /* already negated and in base 2 */
00838 vector float mbflr = vec_floor(mb);
00839 vector float d = vec_sub(mbflr, mb);
00840 vector float y;
00841 
00842 // Approximate 2^{-d}, 0 <= d < 1, by interpolation.
00843 // Perform Horner's method to evaluate interpolating polynomial.
00844 y = vec_madd(d, vec_splats(SCEXP4), vec_splats(SCEXP3)); // x^4 and x^3 terms
00845 y = vec_madd(y, d, vec_splats(SCEXP2)); // x^2 term
00846 y = vec_madd(y, d, vec_splats(SCEXP1)); // x^1 term
00847 y = vec_madd(y, d, vec_splats(SCEXP0)); // x^0 term
00848 
00849 // Calculate 2^N exactly via vec_expte()
00850 // then use it to scale y for the final result.
00851 y = vec_mul(y, vec_expte(-mbflr));
00852 y = vec_mul(y, atomicnumfactor_4); // MDFF density maps
00853 
00854 // At present, we do unaligned loads/stores since we can't 
00855 // guarantee that the X-dimension is always a multiple of 4.
00856 float *ufptr = &densitymap[addr + x];
00857 d = *((__vector float *) &ufptr[0]);
00858 // XXX there must be a cleaner way to implement this
00859 // d = _mm_loadu_ps(ufptr); 
00860 // _mm_storeu_ps(ufptr, _mm_add_ps(d, y.f)); 
00861 d = vec_add(d, y);
00862 
00863 ufptr[0] = d[0];
00864 ufptr[1] = d[1];
00865 ufptr[2] = d[2];
00866 ufptr[3] = d[3];
00867 }
00868 }
00869 #endif
00870 
00871 // finish all remaining density map points with the regular scalar loop
00872 for (; x<=xmax; x++,dx+=gridspacing) {
00873 float r2 = dx*dx + dy2dz2;
00874 
00875 // use our (much faster) fully inlined exponential approximation
00876 float mb = r2 * arinv; /* already negated and in base 2 */
00877 int mbflr = (int) mb; /* get int part, floor() */
00878 float d = mbflr - mb; /* remaining exponent, -1 < d <= 0 */
00879 
00880 /* approx with linear blend of Taylor polys */
00881 float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00882 
00883 /* 2^(-mbflr) */
00884 flint scalfac;
00885 scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT; 
00886 
00887 // XXX assume we are never beyond the cutoff value in this loop
00888 float density = (sy * scalfac.f);
00889 
00890 density *= atomicnumfactor; // MDFF Cryo-EM atomic number density
00891 
00892 densitymap[addr + x] += density;
00893 }
00894 }
00895 }
00896 }
00897 }
00898 }
00899 
00900 
00901 typedef struct {
00902 wkf_cpu_caps_t *cpucaps;
00903 int verbose;
00904 int natoms;
00905 float radscale;
00906 float gridspacing;
00907 float isovalue;
00908 float gausslim;
00909 const int *numvoxels;
00910 const float *xyzr; 
00911 const float *atomicnum;
00912 const float *colors;
00913 float **thrdensitymaps;
00914 float **thrvoltexmaps;
00915 } densitythrparms;
00916 
00917 
00918 static void * densitythread(void *voidparms) {
00919 wkf_tasktile_t tile;
00920 densitythrparms *parms = NULL;
00921 int threadid;
00922 
00923 wkf_threadlaunch_getid(voidparms, &threadid, NULL);
00924 wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00925 
00926 while (wkf_threadlaunch_next_tile(voidparms, 16384, &tile) != WKF_SCHED_DONE) {
00927 int natoms = tile.end-tile.start;
00928 const float *atomicnum = (parms->atomicnum == NULL) ? NULL : &parms->atomicnum[tile.start]; 
00929 vmd_gaussdensity_opt(parms->cpucaps,
00930 parms->verbose, natoms, 
00931 &parms->xyzr[4L*tile.start],
00932 atomicnum,
00933 (parms->thrvoltexmaps[0]!=NULL) ? &parms->colors[4L*tile.start] : NULL,
00934 parms->thrdensitymaps[threadid], 
00935 parms->thrvoltexmaps[threadid], 
00936 parms->numvoxels, 
00937 parms->radscale, 
00938 parms->gridspacing, 
00939 parms->isovalue, 
00940 parms->gausslim);
00941 }
00942 
00943 return NULL;
00944 }
00945 
00946 
00947 static void * reductionthread(void *voidparms) {
00948 wkf_tasktile_t tile;
00949 densitythrparms *parms = NULL;
00950 int threadid, numthreads;
00951 
00952 wkf_threadlaunch_getid(voidparms, &threadid, &numthreads);
00953 wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00954 
00955 while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
00956 // do a reduction over each of the individual density grids
00957 ptrdiff_t planesz = ptrdiff_t(parms->numvoxels[0]) * ptrdiff_t(parms->numvoxels[1]);
00958 ptrdiff_t start = ptrdiff_t(tile.start) * planesz;
00959 ptrdiff_t end = ptrdiff_t(tile.end) * planesz;
00960 
00961 ptrdiff_t i, x;
00962 for (x=start; x<end; x++) {
00963 float tmp = 0.0f;
00964 for (i=1; i<numthreads; i++) {
00965 tmp += parms->thrdensitymaps[i][x];
00966 }
00967 parms->thrdensitymaps[0][x] += tmp;
00968 }
00969 
00970 // do a reduction over each of the individual texture grids
00971 if (parms->thrvoltexmaps[0] != NULL) {
00972 for (x=start*3L; x<end*3L; x++) {
00973 float tmp = 0.0f;
00974 for (i=1; i<numthreads; i++) {
00975 tmp += parms->thrvoltexmaps[i][x];
00976 }
00977 parms->thrvoltexmaps[0][x] += tmp;
00978 }
00979 }
00980 }
00981 
00982 return NULL;
00983 }
00984 
00985 
00986 static int vmd_gaussdensity_threaded(wkf_cpu_caps_t *cpucaps, int verbose, 
00987 int natoms, const float *xyzr,
00988 const float *atomicnum,
00989 const float *colors,
00990 float *densitymap, float *voltexmap, 
00991 const int *numvoxels, 
00992 float radscale, float gridspacing, 
00993 float isovalue, float gausslim) {
00994 densitythrparms parms;
00995 memset(&parms, 0, sizeof(parms));
00996 
00997 parms.cpucaps = cpucaps;
00998 parms.verbose = verbose;
00999 parms.natoms = natoms;
01000 parms.radscale = radscale;
01001 parms.gridspacing = gridspacing;
01002 parms.isovalue = isovalue;
01003 parms.gausslim = gausslim;
01004 parms.numvoxels = numvoxels;
01005 parms.xyzr = xyzr;
01006 parms.atomicnum = atomicnum;
01007 parms.colors = colors;
01008 
01009 int physprocs = wkf_thread_numprocessors();
01010 int maxprocs = physprocs;
01011 
01012 // We can productively use only a few cores per socket due to the
01013 // limited memory bandwidth per socket. Also, hyperthreading
01014 // actually hurts performance. These two considerations combined
01015 // with the linear increase in memory use prevent us from using large
01016 // numbers of cores with this simple approach, so if we've got more 
01017 // than 8 CPU cores, we'll iteratively cutting the core count in 
01018 // half until we're under 8 cores.
01019 while (maxprocs > 8) 
01020 maxprocs /= 2;
01021 
01022 // Limit the number of CPU cores used so we don't run the 
01023 // machine out of memory during surface computation.
01024 // Use either a dynamic or hard-coded heuristic to limit the
01025 // number of CPU threads we will spawn so that we don't run
01026 // the machine out of memory. 
01027 ptrdiff_t volsz = ptrdiff_t(numvoxels[0]) * 
01028 ptrdiff_t(numvoxels[1]) * ptrdiff_t(numvoxels[2]);
01029 ptrdiff_t volmemsz = sizeof(float) * volsz;
01030 ptrdiff_t volmemszkb = volmemsz / 1024;
01031 ptrdiff_t volmemtexszkb = volmemszkb + ((voltexmap != NULL) ? 3L*volmemszkb : 0);
01032 
01033 // Platforms that don't have a means of determining available
01034 // physical memory will return -1, in which case we fall back to the
01035 // simple hard-coded 2GB-max-per-core heuristic.
01036 ptrdiff_t vmdcorefree = -1;
01037 
01038 #if defined(ARCH_BLUEWATERS) || defined(ARCH_CRAY_XC) || defined(ARCH_CRAY_XK) || defined(ARCH_LINUXAMD64) || defined(ARCH_SOLARIS2_64) || defined(ARCH_SOLARISX86_64) || defined(ARCH_AIX6_64) || defined(MACOSXARM64) || defined(ARCH_MACOSXX86_64) 
01039 // XXX The core-free query scheme has one weakness in that we might have a 
01040 // 32-bit version of VMD running on a 64-bit machine, where the available
01041 // physical memory may be much larger than is possible for a 
01042 // 32-bit VMD process to address. To do this properly we must therefore
01043 // use conditional compilation safety checks here until we have a better
01044 // way of determining this with a standardized helper routine.
01045 vmdcorefree = vmd_get_avail_physmem_mb();
01046 #endif
01047 
01048 if (vmdcorefree >= 0) {
01049 // Make sure QuickSurf uses no more than a fraction of the free memory
01050 // as an upper bound alternative to the hard-coded heuristic.
01051 // This should be highly preferable to the fixed-size heuristic
01052 // we had used in all cases previously.
01053 while ((volmemtexszkb * maxprocs) > (1024L*vmdcorefree/4)) {
01054 maxprocs /= 2;
01055 }
01056 } else {
01057 // Set a practical per-core maximum memory use limit to 2GB, for all cores
01058 while ((volmemtexszkb * maxprocs) > (2L * 1024L * 1024L))
01059 maxprocs /= 2;
01060 }
01061 
01062 if (maxprocs < 1) 
01063 maxprocs = 1;
01064 
01065 // Loop over number of physical processors and try to create 
01066 // per-thread volumetric maps for each of them.
01067 parms.thrdensitymaps = (float **) calloc(1,maxprocs * sizeof(float *));
01068 parms.thrvoltexmaps = (float **) calloc(1, maxprocs * sizeof(float *));
01069 
01070 // first thread is already ready to go
01071 parms.thrdensitymaps[0] = densitymap;
01072 parms.thrvoltexmaps[0] = voltexmap;
01073 
01074 int i;
01075 int numprocs = maxprocs; // ever the optimist
01076 for (i=1; i<maxprocs; i++) {
01077 parms.thrdensitymaps[i] = (float *) calloc(1, volmemsz);
01078 if (parms.thrdensitymaps[i] == NULL) {
01079 numprocs = i;
01080 break;
01081 }
01082 if (voltexmap != NULL) {
01083 parms.thrvoltexmaps[i] = (float *) calloc(1, 3L * volmemsz);
01084 if (parms.thrvoltexmaps[i] == NULL) {
01085 free(parms.thrdensitymaps[i]);
01086 parms.thrdensitymaps[i] = NULL;
01087 numprocs = i;
01088 break;
01089 }
01090 }
01091 }
01092 
01093 // launch independent thread calculations
01094 wkf_tasktile_t tile;
01095 tile.start = 0;
01096 tile.end = natoms;
01097 wkf_threadlaunch(numprocs, &parms, densitythread, &tile);
01098 
01099 // do a parallel reduction of the resulting density maps
01100 tile.start = 0;
01101 tile.end = numvoxels[2];
01102 wkf_threadlaunch(numprocs, &parms, reductionthread, &tile);
01103 
01104 // free work area
01105 for (i=1; i<maxprocs; i++) {
01106 if (parms.thrdensitymaps[i] != NULL)
01107 free(parms.thrdensitymaps[i]);
01108 
01109 if (parms.thrvoltexmaps[i] != NULL)
01110 free(parms.thrvoltexmaps[i]);
01111 }
01112 free(parms.thrdensitymaps);
01113 free(parms.thrvoltexmaps);
01114 
01115 return 0;
01116 }
01117 
01118 QuickSurf::QuickSurf(int forcecpuonly) {
01119 volmap = NULL;
01120 voltexmap = NULL;
01121 s.clear();
01122 isovalue = 0.5f;
01123 
01124 numvoxels[0] = 128;
01125 numvoxels[1] = 128;
01126 numvoxels[2] = 128;
01127 
01128 origin[0] = 0.0f;
01129 origin[1] = 0.0f;
01130 origin[2] = 0.0f;
01131 
01132 xaxis[0] = 1.0f;
01133 xaxis[1] = 0.0f;
01134 xaxis[2] = 0.0f;
01135 
01136 yaxis[0] = 0.0f;
01137 yaxis[1] = 1.0f;
01138 yaxis[2] = 0.0f;
01139 
01140 zaxis[0] = 0.0f;
01141 zaxis[1] = 0.0f;
01142 zaxis[2] = 1.0f;
01143 
01144 cudaqs = NULL;
01145 force_cpuonly = forcecpuonly;
01146 #if defined(VMDCUDA)
01147 if (!force_cpuonly && !getenv("VMDNOCUDA")) {
01148 cudaqs = new CUDAQuickSurf();
01149 }
01150 #endif
01151 
01152 timer = wkf_timer_create();
01153 }
01154 
01155 
01156 void QuickSurf::free_gpu_memory(void) {
01157 if (cudaqs) {
01158 #if defined(VMDCUDA)
01159 delete cudaqs; 
01160 #endif
01161 cudaqs = NULL; 
01162 }
01163 }
01164 
01165 
01166 int QuickSurf::calc_surf(AtomSel *atomSel, DrawMolecule *mol,
01167 const float *atompos, const float *atomradii,
01168 int quality, float radscale, float gridspacing,
01169 float isoval, const int *colidx, const float *cmap,
01170 VMDDisplayList *cmdList) {
01171 PROFILE_PUSH_RANGE("QuickSurf", 3);
01172 
01173 wkf_timer_start(timer);
01174 int colorperatom = (colidx != NULL && cmap != NULL);
01175 int usebeads=0;
01176 
01177 int verbose = (getenv("VMDQUICKSURFVERBOSE") != NULL);
01178 
01179 // Disable MDFF atomic number weighted densities until we implement
01180 // GUI controls for this if it turns out to be useful for more than
01181 // than just analytical usage.
01182 const float *atomicnum = NULL;
01183 
01184 // clean up any existing CPU arrays before going any further...
01185 if (voltexmap != NULL)
01186 free(voltexmap);
01187 voltexmap = NULL;
01188 
01189 ResizeArray<float> beadpos(64 + (3L * atomSel->selected) / 20);
01190 ResizeArray<float> beadradii(64 + (3L * atomSel->selected) / 20);
01191 ResizeArray<float> beadcolors(64 + (3L * atomSel->selected) / 20);
01192 
01193 if (getenv("VMDQUICKSURFBEADS")) {
01194 usebeads=1;
01195 if (verbose)
01196 printf("QuickSurf using residue beads representation...\n");
01197 }
01198 
01199 int numbeads = 0;
01200 if (usebeads) {
01201 int i, resid, numres;
01202 
01203 // draw a bead for each residue
01204 numres = mol->residueList.num();
01205 for (resid=0; resid<numres; resid++) {
01206 float com[3] = {0.0, 0.0, 0.0};
01207 const ResizeArray<int> &atoms = mol->residueList[resid]->atoms;
01208 int numatoms = atoms.num();
01209 int oncount = 0;
01210 
01211 // find COM for residue
01212 for (i=0; i<numatoms; i++) {
01213 int idx = atoms[i];
01214 if (atomSel->on[idx]) {
01215 oncount++;
01216 vec_add(com, com, atompos + 3L*idx);
01217 }
01218 }
01219 
01220 if (oncount < 1)
01221 continue; // exit if there weren't any atoms
01222 
01223 vec_scale(com, 1.0f / (float) oncount, com);
01224 
01225 // find radius of bounding sphere and save last atom index for color
01226 int atomcolorindex=0; // initialize, to please compilers
01227 float boundradsq = 0.0f;
01228 for (i=0; i<numatoms; i++) {
01229 int idx = atoms[i];
01230 if (atomSel->on[idx]) {
01231 float tmpdist[3];
01232 atomcolorindex = idx;
01233 vec_sub(tmpdist, com, atompos + 3L*idx);
01234 float distsq = dot_prod(tmpdist, tmpdist);
01235 if (distsq > boundradsq) {
01236 boundradsq = distsq;
01237 }
01238 }
01239 }
01240 beadpos.append3(&com[0]);
01241 beadradii.append(sqrtf(boundradsq) + 1.0f);
01242 
01243 if (colorperatom) {
01244 const float *cp = &cmap[colidx[atomcolorindex] * 3L];
01245 beadcolors.append3(&cp[0]);
01246 }
01247 
01248 // XXX still need to add pick points...
01249 }
01250 
01251 numbeads = beadpos.num() / 3;
01252 }
01253 
01254 // initialize class variables
01255 isovalue=isoval;
01256 
01257 // If no volumetric texture will be computed we will use the cmap
01258 // parameter to pass in the solid color to be applied to all vertices
01259 // Since QS can now also be called by MDFF, we have to check whether
01260 // display related parms are set or not before using them.
01261 if (cmap != NULL)
01262 vec_copy(solidcolor, cmap);
01263 
01264 // compute min/max atom radius, build list of selected atom radii,
01265 // and compute bounding box for the selected atoms
01266 float minx, miny, minz, maxx, maxy, maxz;
01267 float minrad, maxrad;
01268 int i;
01269 if (usebeads) {
01270 minx = maxx = beadpos[0];
01271 miny = maxy = beadpos[1];
01272 minz = maxz = beadpos[2];
01273 minrad = maxrad = beadradii[0];
01274 for (i=0; i<numbeads; i++) {
01275 ptrdiff_t ind = i * 3L;
01276 float tmpx = beadpos[ind ];
01277 float tmpy = beadpos[ind+1];
01278 float tmpz = beadpos[ind+2];
01279 
01280 minx = (tmpx < minx) ? tmpx : minx;
01281 maxx = (tmpx > maxx) ? tmpx : maxx;
01282 
01283 miny = (tmpy < miny) ? tmpy : miny;
01284 maxy = (tmpy > maxy) ? tmpy : maxy;
01285 
01286 minz = (tmpz < minz) ? tmpz : minz;
01287 maxz = (tmpz > maxz) ? tmpz : maxz;
01288 
01289 // we always have to compute the rmin/rmax for beads
01290 // since these radii are defined on-the-fly
01291 float r = beadradii[i];
01292 minrad = (r < minrad) ? r : minrad;
01293 maxrad = (r > maxrad) ? r : maxrad;
01294 }
01295 } else {
01296 minx = maxx = atompos[atomSel->firstsel*3L ];
01297 miny = maxy = atompos[atomSel->firstsel*3L+1];
01298 minz = maxz = atompos[atomSel->firstsel*3L+2];
01299 
01300 // Query min/max atom radii for the entire molecule
01301 mol->get_radii_minmax(minrad, maxrad);
01302 
01303 // We only compute rmin/rmax for the actual group of selected atoms if 
01304 // (rmax/rmin > 2.5) for the whole molecule, otherwise it's a small 
01305 // enough range that we don't care since it won't hurt our performance. 
01306 if (minrad <= 0.001 || maxrad/minrad > 2.5) {
01307 minrad = maxrad = atomradii[atomSel->firstsel];
01308 for (i=atomSel->firstsel; i<=atomSel->lastsel; i++) {
01309 if (atomSel->on[i]) {
01310 ptrdiff_t ind = i * 3L;
01311 float tmpx = atompos[ind ];
01312 float tmpy = atompos[ind+1];
01313 float tmpz = atompos[ind+2];
01314 
01315 minx = (tmpx < minx) ? tmpx : minx;
01316 maxx = (tmpx > maxx) ? tmpx : maxx;
01317 
01318 miny = (tmpy < miny) ? tmpy : miny;
01319 maxy = (tmpy > maxy) ? tmpy : maxy;
01320 
01321 minz = (tmpz < minz) ? tmpz : minz;
01322 maxz = (tmpz > maxz) ? tmpz : maxz;
01323 
01324 float r = atomradii[i];
01325 minrad = (r < minrad) ? r : minrad;
01326 maxrad = (r > maxrad) ? r : maxrad;
01327 }
01328 }
01329 } else {
01330 #if 1
01331 float fmin[3], fmax[3];
01332 minmax_selected_3fv_aligned(atompos, atomSel->on, atomSel->num_atoms,
01333 atomSel->firstsel, atomSel->lastsel,
01334 fmin, fmax);
01335 minx = fmin[0];
01336 miny = fmin[1];
01337 minz = fmin[2];
01338 
01339 maxx = fmax[0]; 
01340 maxy = fmax[1]; 
01341 maxz = fmax[2]; 
01342 #else
01343 for (i=atomSel->firstsel; i<=atomSel->lastsel; i++) {
01344 if (atomSel->on[i]) {
01345 ptrdiff_t ind = i * 3L;
01346 float tmpx = atompos[ind ];
01347 float tmpy = atompos[ind+1];
01348 float tmpz = atompos[ind+2];
01349 
01350 minx = (tmpx < minx) ? tmpx : minx;
01351 maxx = (tmpx > maxx) ? tmpx : maxx;
01352 
01353 miny = (tmpy < miny) ? tmpy : miny;
01354 maxy = (tmpy > maxy) ? tmpy : maxy;
01355 
01356 minz = (tmpz < minz) ? tmpz : minz;
01357 maxz = (tmpz > maxz) ? tmpz : maxz;
01358 }
01359 }
01360 #endif
01361 }
01362 }
01363 
01364 float mincoord[3], maxcoord[3];
01365 mincoord[0] = minx;
01366 mincoord[1] = miny;
01367 mincoord[2] = minz;
01368 maxcoord[0] = maxx;
01369 maxcoord[1] = maxy;
01370 maxcoord[2] = maxz;
01371 
01372 // crude estimate of the grid padding we require to prevent the
01373 // resulting isosurface from being clipped
01374 float gridpadding = radscale * maxrad * 1.70f;
01375 float padrad = gridpadding;
01376 padrad = 0.65f * sqrtf(4.0f/3.0f*((float) VMD_PI)*padrad*padrad*padrad);
01377 gridpadding = MAX(gridpadding, padrad);
01378 
01379 // Handle coarse-grained structures and whole-cell models
01380 // XXX The switch at 4.0A from an assumed all-atom scale structure to 
01381 // CG or cell models is a simple heuristic at a somewhat arbitrary 
01382 // threshold value. 
01383 // For all-atom models the units shown in the GUI are in Angstroms
01384 // and are absolute, but for CG or cell models the units in the GUI 
01385 // are relative to the atom with the minimum radius.
01386 // This code doesn't do anything to handle structures with a minrad 
01387 // of zero, where perhaps only one particle has an unset radius.
01388 if (minrad > 4.0f) {
01389 gridspacing *= minrad;
01390 }
01391 
01392 if (verbose) {
01393 printf("QuickSurf: R*%.1f, I=%.1f, H=%.1f Pad: %.1f minR: %.1f maxR: %.1f)\n",
01394 radscale, isovalue, gridspacing, gridpadding, minrad, maxrad);
01395 }
01396 
01397 mincoord[0] -= gridpadding;
01398 mincoord[1] -= gridpadding;
01399 mincoord[2] -= gridpadding;
01400 maxcoord[0] += gridpadding;
01401 maxcoord[1] += gridpadding;
01402 maxcoord[2] += gridpadding;
01403 
01404 // compute the real grid dimensions from the selected atoms
01405 numvoxels[0] = (int) ceil((maxcoord[0]-mincoord[0]) / gridspacing);
01406 numvoxels[1] = (int) ceil((maxcoord[1]-mincoord[1]) / gridspacing);
01407 numvoxels[2] = (int) ceil((maxcoord[2]-mincoord[2]) / gridspacing);
01408 
01409 // recalc the grid dimensions from rounded/padded voxel counts
01410 xaxis[0] = (numvoxels[0]-1) * gridspacing;
01411 yaxis[1] = (numvoxels[1]-1) * gridspacing;
01412 zaxis[2] = (numvoxels[2]-1) * gridspacing;
01413 maxcoord[0] = mincoord[0] + xaxis[0];
01414 maxcoord[1] = mincoord[1] + yaxis[1];
01415 maxcoord[2] = mincoord[2] + zaxis[2];
01416 
01417 int boundserr=0;
01418 for (i=0; i<3; i++) {
01419 if (isnan(mincoord[i]) || isnan(maxcoord[i]) || (numvoxels[i] < 1))
01420 boundserr = 1;
01421 }
01422 
01423 if (boundserr)
01424 msgErr << "QuickSurf) NaN or illegal bounding box, grid size" << sendmsg;
01425 
01426 if (verbose || boundserr) {
01427 printf(" GridSZ: (%4d %4d %4d) BBox: (%.1f %.1f %.1f)->(%.1f %.1f %.1f)\n",
01428 numvoxels[0], numvoxels[1], numvoxels[2],
01429 mincoord[0], mincoord[1], mincoord[2],
01430 maxcoord[0], maxcoord[1], maxcoord[2]);
01431 }
01432 
01433 if (boundserr)
01434 return -1; 
01435 
01436 vec_copy(origin, mincoord);
01437 
01438 // build compacted lists of bead coordinates, radii, and colors
01439 float *xyzr = NULL;
01440 float *colors = NULL;
01441 if (usebeads) { 
01442 int ind =0;
01443 int ind4=0; 
01444 xyzr = (float *) malloc(numbeads * sizeof(float) * 4L);
01445 if (colorperatom) {
01446 colors = (float *) malloc(numbeads * sizeof(float) * 4L);
01447 
01448 // build compacted lists of bead coordinates, radii, and colors
01449 for (i=0; i<numbeads; i++) {
01450 const float *fp = &beadpos[0] + ind;
01451 xyzr[ind4 ] = fp[0]-origin[0];
01452 xyzr[ind4 + 1] = fp[1]-origin[1];
01453 xyzr[ind4 + 2] = fp[2]-origin[2];
01454 xyzr[ind4 + 3] = beadradii[i];
01455 
01456 const float *cp = &beadcolors[0] + ind;
01457 colors[ind4 ] = cp[0];
01458 colors[ind4 + 1] = cp[1];
01459 colors[ind4 + 2] = cp[2];
01460 colors[ind4 + 3] = 1.0f;
01461 ind4 += 4;
01462 ind += 3;
01463 }
01464 } else {
01465 // build compacted lists of bead coordinates and radii only
01466 for (i=0; i<numbeads; i++) {
01467 const float *fp = &beadpos[0] + ind;
01468 xyzr[ind4 ] = fp[0]-origin[0];
01469 xyzr[ind4 + 1] = fp[1]-origin[1];
01470 xyzr[ind4 + 2] = fp[2]-origin[2];
01471 xyzr[ind4 + 3] = beadradii[i];
01472 ind4 += 4;
01473 ind += 3;
01474 }
01475 }
01476 } else {
01477 ptrdiff_t ind = atomSel->firstsel * 3L;
01478 ptrdiff_t ind4=0; 
01479 xyzr = (float *) malloc(atomSel->selected * sizeof(float) * 4L);
01480 if (colorperatom) {
01481 colors = (float *) malloc(atomSel->selected * sizeof(float) * 4L);
01482 
01483 // build compacted lists of atom coordinates, radii, and colors
01484 for (i=atomSel->firstsel; i <= atomSel->lastsel; i++) {
01485 if (atomSel->on[i]) {
01486 const float *fp = atompos + ind;
01487 xyzr[ind4 ] = fp[0]-origin[0];
01488 xyzr[ind4 + 1] = fp[1]-origin[1];
01489 xyzr[ind4 + 2] = fp[2]-origin[2];
01490 xyzr[ind4 + 3] = atomradii[i];
01491 
01492 const float *cp = &cmap[colidx[i] * 3L];
01493 colors[ind4 ] = cp[0];
01494 colors[ind4 + 1] = cp[1];
01495 colors[ind4 + 2] = cp[2];
01496 colors[ind4 + 3] = 1.0f;
01497 ind4 += 4;
01498 }
01499 ind += 3;
01500 }
01501 } else {
01502 // build compacted lists of atom coordinates and radii only
01503 for (i=atomSel->firstsel; i <= atomSel->lastsel; i++) {
01504 if (atomSel->on[i]) {
01505 const float *fp = atompos + ind;
01506 xyzr[ind4 ] = fp[0]-origin[0];
01507 xyzr[ind4 + 1] = fp[1]-origin[1];
01508 xyzr[ind4 + 2] = fp[2]-origin[2];
01509 xyzr[ind4 + 3] = atomradii[i];
01510 ind4 += 4;
01511 }
01512 ind += 3;
01513 }
01514 }
01515 }
01516 
01517 // set gaussian window size based on user-specified quality parameter
01518 float gausslim = 2.0f;
01519 switch (quality) {
01520 case 3: gausslim = 4.0f; break; // max quality
01521 
01522 case 2: gausslim = 3.0f; break; // high quality
01523 
01524 case 1: gausslim = 2.5f; break; // medium quality
01525 
01526 case 0: 
01527 default: gausslim = 2.0f; // low quality
01528 break;
01529 }
01530 
01531 pretime = wkf_timer_timenow(timer);
01532 
01533 #if defined(VMDCUDA)
01534 if (!force_cpuonly && !getenv("VMDNOCUDA")) {
01535 // allocate a new CUDAQuickSurf object if we destroyed the old one...
01536 if (cudaqs == NULL)
01537 cudaqs = new CUDAQuickSurf();
01538 
01539 // Assign texture format according to quality
01540 CUDAQuickSurf::VolTexFormat voltexformat = CUDAQuickSurf::RGB3F;
01541 switch (quality) {
01542 case 3: voltexformat = CUDAQuickSurf::RGB3F; break; // max quality
01543 
01544 case 2: voltexformat = CUDAQuickSurf::RGB3F; break; // high quality
01545 
01546 case 1: voltexformat = CUDAQuickSurf::RGB3F; break; // medium quality
01547 
01548 case 0: 
01549 default: voltexformat = CUDAQuickSurf::RGB4U; // low quality
01550 break;
01551 }
01552 
01553 // compute both density map and floating point color texture map
01554 int pcount = (usebeads) ? numbeads : atomSel->selected; 
01555 int rc = cudaqs->calc_surf(pcount, &xyzr[0],
01556 (colorperatom) ? &colors[0] : &cmap[0],
01557 colorperatom, voltexformat, 
01558 origin, numvoxels, maxrad,
01559 radscale, gridspacing, isovalue, gausslim,
01560 cmdList);
01561 
01562 // If we're running in a memory-limited scenario, we can force
01563 // VMD to dump the QuickSurf GPU data to prevent out-of-memory
01564 // problems later on, either during other surface calcs or when 
01565 // using the GPU for things like OptiX ray tracing
01566 if (getenv("VMDQUICKSURFMINMEM")) {
01567 free_gpu_memory();
01568 }
01569 
01570 if (rc == 0) {
01571 free(xyzr);
01572 if (colors)
01573 free(colors);
01574 
01575 voltime = wkf_timer_timenow(timer);
01576 
01577 PROFILE_POP_RANGE(); // first return point
01578 
01579 return 0;
01580 }
01581 }
01582 #endif
01583 
01584 if (verbose) {
01585 printf(" Computing density map grid on CPUs ");
01586 }
01587 
01588 ptrdiff_t volsz = numvoxels[0] * numvoxels[1] * numvoxels[2];
01589 volmap = new float[volsz];
01590 if (colidx != NULL && cmap != NULL) {
01591 voltexmap = (float*) calloc(1, 3L * sizeof(float) * numvoxels[0] * numvoxels[1] * numvoxels[2]);
01592 }
01593 
01594 fflush(stdout);
01595 memset(volmap, 0, sizeof(float) * volsz);
01596 if ((volsz * atomSel->selected) > 20000000) {
01597 vmd_gaussdensity_threaded(mol->app->cpucaps, verbose, 
01598 atomSel->selected, &xyzr[0], atomicnum,
01599 (voltexmap!=NULL) ? &colors[0] : NULL,
01600 volmap, voltexmap, numvoxels, radscale, 
01601 gridspacing, isovalue, gausslim);
01602 } else {
01603 vmd_gaussdensity_opt(mol->app->cpucaps, verbose, 
01604 atomSel->selected, &xyzr[0], atomicnum,
01605 (voltexmap!=NULL) ? &colors[0] : NULL,
01606 volmap, voltexmap, numvoxels, radscale, 
01607 gridspacing, isovalue, gausslim);
01608 }
01609 
01610 free(xyzr);
01611 if (colors)
01612 free(colors);
01613 
01614 voltime = wkf_timer_timenow(timer);
01615 
01616 // draw the surface if the caller provided the display list
01617 if (cmdList != NULL) {
01618 draw_trimesh(cmdList);
01619 }
01620 
01621 if (verbose) {
01622 printf(" Done.\n");
01623 }
01624 
01625 PROFILE_POP_RANGE(); // second return point
01626 
01627 return 0;
01628 }
01629 
01630 
01631 // compute synthetic density map, but nothing else
01632 VolumetricData * QuickSurf::calc_density_map(AtomSel * atomSel, 
01633 DrawMolecule *mymol, 
01634 const float *atompos, 
01635 const float *atomradii,
01636 int quality, float radscale, 
01637 float gridspacing) {
01638 if (!calc_surf(atomSel, mymol, atompos, atomradii, quality, 
01639 radscale, gridspacing, 1.0f, NULL, NULL, NULL)) {
01640 VolumetricData *surfvol;
01641 surfvol = new VolumetricData("density map", origin, xaxis, yaxis, zaxis,
01642 numvoxels[0], numvoxels[1], numvoxels[2],
01643 volmap);
01644 return surfvol;
01645 }
01646 
01647 return NULL;
01648 }
01649 
01650 
01651 // Extract the isosurface from the QuickSurf density map
01652 int QuickSurf::get_trimesh(int &numverts, 
01653 float *&v3fv, float *&n3fv, float *&c3fv, 
01654 int &numfacets, int *&fiv) {
01655 
01656 int verbose = (getenv("VMDQUICKSURFVERBOSE") != NULL);
01657 
01658 if (verbose)
01659 printf("Running marching cubes on CPU...\n");
01660 
01661 VolumetricData *surfvol; 
01662 surfvol = new VolumetricData("molecular surface",
01663 origin, xaxis, yaxis, zaxis,
01664 numvoxels[0], numvoxels[1], numvoxels[2],
01665 volmap);
01666 
01667 // XXX we should calculate the volume gradient only for those
01668 // vertices we extract, since for this rep any changes to settings
01669 // will require recomputation of the entire volume
01670 surfvol->compute_volume_gradient(); // calc gradients: smooth vertex normals
01671 gradtime = wkf_timer_timenow(timer);
01672 
01673 // trimesh polygonalized surface, max of 6 triangles per voxel
01674 const int stepsize = 1;
01675 s.clear(); // initialize isosurface data
01676 s.compute(surfvol, isovalue, stepsize); // compute the isosurface
01677 
01678 mctime = wkf_timer_timenow(timer);
01679 
01680 s.vertexfusion(9, 9); // eliminate duplicated vertices
01681 s.normalize(); // normalize interpolated gradient/surface normals
01682 
01683 if (s.numtriangles > 0) {
01684 if (voltexmap != NULL) {
01685 // assign per-vertex colors by a 3-D texture map
01686 s.set_color_voltex_rgb3fv(voltexmap);
01687 } else {
01688 // use a single color for the entire mesh
01689 s.set_color_rgb3fv(solidcolor);
01690 }
01691 }
01692 
01693 numverts = s.v.num() / 3;
01694 v3fv=&s.v[0];
01695 n3fv=&s.n[0];
01696 c3fv=&s.c[0];
01697 
01698 numfacets = s.numtriangles;
01699 fiv=&s.f[0];
01700 
01701 delete surfvol;
01702 
01703 mcverttime = wkf_timer_timenow(timer);
01704 reptime = mcverttime;
01705 
01706 if (verbose) {
01707 char strmsg[1024];
01708 sprintf(strmsg, "QuickSurf: %.3f [pre:%.3f vol:%.3f gr:%.3f mc:%.2f mcv:%.3f]",
01709 reptime, pretime, voltime-pretime, gradtime-voltime, 
01710 mctime-gradtime, mcverttime-mctime);
01711 
01712 msgInfo << strmsg << sendmsg;
01713 }
01714 
01715 return 0;
01716 }
01717 
01718 
01719 int QuickSurf::draw_trimesh(VMDDisplayList *cmdList) {
01720 DispCmdTriMesh cmdTriMesh;
01721 
01722 int numverts=0;
01723 float *v=NULL, *n=NULL, *c=NULL;
01724 int numfacets=0;
01725 int *f=NULL;
01726 
01727 get_trimesh(numverts, v, n, c, numfacets, f);
01728 
01729 // Create a triangle mesh
01730 if (numfacets > 0) {
01731 cmdTriMesh.putdata(v, n, c, numverts, f, numfacets, 0, cmdList);
01732 }
01733 
01734 return 0;
01735 }
01736 
01737 
01738 QuickSurf::~QuickSurf() {
01739 #if defined(VMDCUDA)
01740 free_gpu_memory();
01741 #endif
01742 
01743 if (voltexmap != NULL)
01744 free(voltexmap);
01745 voltexmap = NULL;
01746 
01747 wkf_timer_destroy(timer);
01748 }
01749 
01750