Main Page Namespace List Class Hierarchy Alphabetical List Compound List File List Namespace Members Compound Members File Members Related Pages
Orbital.C

Go to the documentation of this file.
00001 /***************************************************************************
00002 *cr 
00003 *cr (C) Copyright 1995-2019 The Board of Trustees of the 
00004 *cr University of Illinois 
00005 *cr All Rights Reserved 
00006 *cr 
00007 ***************************************************************************/
00008 /***************************************************************************
00009 * RCS INFORMATION:
00010 *
00011 * $RCSfile: Orbital.C,v $
00012 * $Author: johns $ $Locker: $ $State: Exp $
00013 * $Revision: 1.166 $ $Date: 2022年05月23日 19:10:01 $
00014 *
00015 ***************************************************************************/
00021 // Intel x86 hardware 
00022 #if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
00023 #if !defined(__SSE2__) && defined(_WIN64)
00024 #define __SSE2__ 1 /* MSVC fails to define SSE macros */
00025 #endif
00026 #if defined(__SSE2__)
00027 #include <emmintrin.h>
00028 #define VMDORBUSESSE 1 /* build SSE code for static launch */
00029 #endif
00030 
00031 #if defined(VMDCPUDISPATCH)
00032 #define VECPADSZ 16 /* max vec size is for x86 AVX512F or AVX512ER */
00033 #else
00034 #define VECPADSZ 4 /* fall-back to x86 SSE vector size */
00035 #endif
00036 
00037 // IBM Power 8/9/10 Altivec/VSX instructions:
00038 // https://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
00039 #elif defined(__VSX__)
00040 #if defined(__GNUC__) && defined(__VEC__)
00041 #include <altivec.h>
00042 #endif
00043 // The OpenPOWER VSX code path runs on POWER8 and later hardware, but is
00044 // untested on older platforms that support VSX instructions.
00045 // XXX GCC 4.8.5 breaks with conflicts between vec_xxx() routines 
00046 // defined in utilities.h vs. VSX intrinsics in altivec.h and similar.
00047 // For now, we disable VSX for GCC for this source file.
00048 #define VECPADSZ 4 /* IBM POWER VSX vector size */
00049 #define VMDORBUSEVSX 1 /* build POWER VSX code for static launch */
00050 
00051 // ARM64 SVE... 
00052 #elif defined(__ARM_ARCH_ISA_A64) && !defined(ARCH_MACOSXARM64)
00053 #define VMDUSESVE 1 /* build SVE code for static launch */
00054 
00055 #if defined(VMDCPUDISPATCH)
00056 #define VECPADSZ 16 /* max vec size is for Fujitsu A64fx */
00057 #else
00058 #define VECPADSZ 4 /* fall-back to NEON vector size */
00059 #endif
00060 
00061 // generic scalar C++ code
00062 #else
00063 #define VECPADSZ 1 /* scalar code, no padding */
00064 #endif
00065 
00066 // padding mask derived from the hardware-specific vector padding size
00067 #define VECPADMASK (VECPADSZ - 1)
00068 
00069 // #define DEBUGORBS 1
00070 
00071 #include <math.h>
00072 #include <stdio.h>
00073 #include "VMDApp.h"
00074 #include "Orbital.h"
00075 #include "DrawMolecule.h"
00076 #include "utilities.h"
00077 #include "Inform.h"
00078 #include "WKFThreads.h"
00079 #include "WKFUtils.h"
00080 #if defined(VMDCUDA)
00081 #include "CUDAOrbital.h"
00082 #endif
00083 #if defined(VMDOPENCL)
00084 #include "OpenCLUtils.h"
00085 #include "OpenCLKernels.h"
00086 #endif
00087 #include "ProfileHooks.h"
00088 
00089 //
00090 // fctn prototypes for CPU runtime dispatch kernels
00091 //
00092 
00093 // AVX-512ER implementation for Xeon Phi w/ special fctn units
00094 extern int evaluate_grid_avx512er(int numatoms,
00095 const float *wave_f, const float *basis_array,
00096 const float *atompos,
00097 const int *atom_basis,
00098 const int *num_shells_per_atom,
00099 const int *num_prim_per_shell,
00100 const int *shell_types,
00101 const int *numvoxels,
00102 float voxelsize,
00103 const float *origin,
00104 int density,
00105 float * orbitalgrid);
00106 
00107 // AVX-512F implementation for CPUs without exponent/reciprocal support
00108 extern int evaluate_grid_avx512f(int numatoms,
00109 const float *wave_f, const float *basis_array,
00110 const float *atompos,
00111 const int *atom_basis,
00112 const int *num_shells_per_atom,
00113 const int *num_prim_per_shell,
00114 const int *shell_types,
00115 const int *numvoxels,
00116 float voxelsize,
00117 const float *origin,
00118 int density,
00119 float * orbitalgrid);
00120 
00121 // AVX2 implementation 
00122 extern int evaluate_grid_avx2(int numatoms,
00123 const float *wave_f, const float *basis_array,
00124 const float *atompos,
00125 const int *atom_basis,
00126 const int *num_shells_per_atom,
00127 const int *num_prim_per_shell,
00128 const int *shell_types,
00129 const int *numvoxels,
00130 float voxelsize,
00131 const float *origin,
00132 int density,
00133 float * orbitalgrid);
00134 
00135 // ARM NEON implementation 
00136 extern int evaluate_grid_neon(int numatoms,
00137 const float *wave_f, const float *basis_array,
00138 const float *atompos,
00139 const int *atom_basis,
00140 const int *num_shells_per_atom,
00141 const int *num_prim_per_shell,
00142 const int *shell_types,
00143 const int *numvoxels,
00144 float voxelsize,
00145 const float *origin,
00146 int density,
00147 float * orbitalgrid);
00148 
00149 // ARM SVE implementation 
00150 extern int evaluate_grid_sve(int numatoms,
00151 const float *wave_f, const float *basis_array,
00152 const float *atompos,
00153 const int *atom_basis,
00154 const int *num_shells_per_atom,
00155 const int *num_prim_per_shell,
00156 const int *shell_types,
00157 const int *numvoxels,
00158 float voxelsize,
00159 const float *origin,
00160 int density,
00161 float * orbitalgrid);
00162 
00163 
00164 #define ANGS_TO_BOHR 1.88972612478289694072f
00165 
00167 Orbital::Orbital(const float *pos,
00168 const float *wfn,
00169 const float *barray,
00170 const basis_atom_t *bset,
00171 const int *types,
00172 const int *asort,
00173 const int *abasis,
00174 const float **norm,
00175 const int *nshells,
00176 const int *nprimshell,
00177 const int *shelltypes, 
00178 int natoms, int ntypes, int numwave, int numbasis, 
00179 int orbid) :
00180 numatoms(natoms), atompos(pos),
00181 num_wave_f(numwave),
00182 wave_f(NULL),
00183 num_basis_funcs(numbasis),
00184 basis_array(barray),
00185 numtypes(ntypes), 
00186 basis_set(bset),
00187 atom_types(types),
00188 atom_sort(asort),
00189 atom_basis(abasis),
00190 norm_factors(norm),
00191 num_shells_per_atom(nshells),
00192 num_prim_per_shell(nprimshell),
00193 shell_types(shelltypes), 
00194 grid_data(NULL)
00195 {
00196 origin[0] = origin[1] = origin[2] = 0.0;
00197 
00198 // Multiply wavefunction coefficients with the
00199 // angular momentum dependent part of the basis set
00200 // normalization factors.
00201 normalize_wavefunction(wfn + num_wave_f*orbid);
00202 
00203 //print_wavefunction();
00204 }
00205 
00207 Orbital::~Orbital() {
00208 if (wave_f) delete [] wave_f;
00209 }
00210 
00211 
00212 // Multiply wavefunction coefficients with the
00213 // basis set normalization factors. We do this
00214 // here rather than normalizing the basisset itself
00215 // because we need different factors for the different
00216 // cartesian components of a shell and the basis set
00217 // stores data only per shell.
00218 // By doing the multiplication here we save a lot of
00219 // flops during orbital rendering.
00220 void Orbital::normalize_wavefunction(const float *wfn) {
00221 #ifdef DEBUGORBS
00222 char shellname[8] = {'S', 'P', 'D', 'F', 'G', 'H', 'I', 'K'};
00223 #endif
00224 int i, j, k;
00225 // Get size of the symmetry-expanded wavefunction array
00226 // int wave_size = 0;
00227 // for (i=0; i<numatoms; i++) {
00228 // printf("atom[%d]: type = %d\n", i, atom_types[i]);
00229 // const basis_atom_t *basis_atom = &basis_set[atom_types[i]];
00230 // for (j=0; j<basis_atom->numshells; j++) {
00231 // wave_size += basis_atom->shell[j].num_cart_func;
00232 // }
00233 // }
00234 // printf("num_wave_f/wave_size = %d/%d\n", num_wave_f, wave_size);
00235 
00236 wave_f = new float[num_wave_f];
00237 int ifunc = 0;
00238 for (i=0; i<numatoms; i++) {
00239 const basis_atom_t *basis_atom = &basis_set[atom_types[i]];
00240 for (j=0; j<basis_atom->numshells; j++) {
00241 int stype = basis_atom->shell[j].type;
00242 #ifdef DEBUGORBS
00243 printf("atom %i/%i, %i/%i %c-shell\n", i+1, numatoms, j+1, basis_atom->numshells, shellname[stype]);
00244 #endif
00245 for (k=0; k<basis_atom->shell[j].num_cart_func; k++) {
00246 wave_f[ifunc] = wfn[ifunc] * norm_factors[stype][k];
00247 
00248 #ifdef DEBUGORBS
00249 printf("%3i %c %2i wave_f[%3i]=% f norm=%.3f normwave=% f\n",
00250 i, shellname[stype], k, ifunc, wfn[ifunc],
00251 norm_factors[stype][k], wave_f[ifunc]);
00252 #endif
00253 ifunc++;
00254 }
00255 }
00256 }
00257 }
00258 
00259 
00260 // Sets the grid dimensions to the bounding box of the given
00261 // set of atoms *pos including a padding in all dimensions.
00262 // The resulting grid dimensions will be rounded to a multiple
00263 // of the voxel size.
00264 int Orbital::set_grid_to_bbox(const float *pos, float padding,
00265 float resolution) {
00266 int i = 0;
00267 float xyzdim[3];
00268 
00269 /* set initial values of temp values to the coordinates
00270 * of the first atom. */
00271 origin[0] = xyzdim[0] = pos[0];
00272 origin[1] = xyzdim[1] = pos[1];
00273 origin[2] = xyzdim[2] = pos[2];
00274 
00275 /* now loop over the rest of the atoms to check if there's
00276 * something larger/smaller for the maximum and minimum
00277 * respectively */
00278 for(i=1; i<numatoms; i++) {
00279 if (pos[3*i ] < origin[0]) origin[0] = pos[3*i];
00280 if (pos[3*i+1] < origin[1]) origin[1] = pos[3*i+1];
00281 if (pos[3*i+2] < origin[2]) origin[2] = pos[3*i+2];
00282 if (pos[3*i ] > xyzdim[0]) xyzdim[0] = pos[3*i];
00283 if (pos[3*i+1] > xyzdim[1]) xyzdim[1] = pos[3*i+1];
00284 if (pos[3*i+2] > xyzdim[2]) xyzdim[2] = pos[3*i+2];
00285 }
00286 
00287 // Apply padding in each direction
00288 origin[0] -= padding;
00289 origin[1] -= padding;
00290 origin[2] -= padding;
00291 gridsize[0] = xyzdim[0] + padding - origin[0];
00292 gridsize[1] = xyzdim[1] + padding - origin[1];
00293 gridsize[2] = xyzdim[2] + padding - origin[2]; 
00294 
00295 set_resolution(resolution);
00296 
00297 return TRUE;
00298 }
00299 
00300 
00301 // Set the dimensions and resolution of the grid for which 
00302 // the orbital shall be computed.
00303 // The given grid dimensions will be rounded to a multiple
00304 // of the voxel size.
00305 void Orbital::set_grid(float newori[3], float newdim[3], float newvoxelsize) {
00306 origin[0] = newori[0];
00307 origin[1] = newori[1];
00308 origin[2] = newori[2];
00309 gridsize[0] = newdim[0];
00310 gridsize[1] = newdim[1];
00311 gridsize[2] = newdim[2];
00312 set_resolution(newvoxelsize);
00313 }
00314 
00315 // Change the resolution of the grid
00316 void Orbital::set_resolution(float resolution) {
00317 voxelsize = resolution;
00318 int i;
00319 for (i=0; i<3; i++) {
00320 numvoxels[i] = (int)(gridsize[i]/voxelsize) + 1;
00321 gridsize[i] = voxelsize*(numvoxels[i]-1);
00322 }
00323 }
00324 
00325 #define XNEG 0
00326 #define YNEG 1
00327 #define ZNEG 2
00328 #define XPOS 3
00329 #define YPOS 4
00330 #define ZPOS 5
00331 
00332 // Check if all values in the boundary plane given by dir 
00333 // are below threshold.
00334 // If not, jump back, decrease the stepsize and test again.
00335 int Orbital::check_plane(int dir, float threshold, int minstepsize,
00336 int &stepsize) {
00337 bool repeat=0;
00338 int u, v, w, nu, nv;
00339 // w is the dimension we want to adjust,
00340 // u and v are the other two, i.e. the plane in which we test
00341 // the orbital values. 
00342 u = (dir+1)%3;
00343 v = (dir+2)%3;
00344 w = dir%3;
00345 
00346 // for debugging
00347 //char axis[3] = {'X', 'Y', 'Z'};
00348 //char sign[2] = {'-', '+'};
00349 //printf("%c%c: ", sign[dir/3], axis[w]);
00350 
00351 do {
00352 int success = 0;
00353 int gridstep = stepsize;
00354 
00355 if (repeat) {
00356 // We are repeating the test on the previous slate but with
00357 // twice the resolution. Hence we only have to test the new
00358 // grid points lying between the old ones.
00359 gridstep = 2*stepsize;
00360 }
00361 
00362 
00363 float grid[3];
00364 grid[w] = origin[w] + (dir/3)*(numvoxels[w]-1) * voxelsize;
00365 
00366 // Search for a value of the wave function larger than threshold.
00367 for (nu=0; nu<numvoxels[u]; nu+=gridstep) {
00368 grid[u] = origin[u] + nu * voxelsize;
00369 
00370 for (nv=0; nv<numvoxels[v]; nv+=gridstep) {
00371 grid[v] = origin[v] + nv * voxelsize;
00372 
00373 if (fabs(evaluate_grid_point(grid[0], grid[1], grid[2])) > threshold) {
00374 success = 1;
00375 break;
00376 }
00377 }
00378 if (success) break;
00379 }
00380 
00381 if (success) {
00382 // Found an orbital value higher than the threshold.
00383 // We want the threshold isosurface to be completely inside the grid.
00384 // The boundary must be between the previous and this plane.
00385 if (!(dir/3)) origin[w] -= stepsize*voxelsize;
00386 numvoxels[w] += stepsize;
00387 if (stepsize<=minstepsize) {
00388 //printf("success!\n");
00389 return 1;
00390 }
00391 stepsize /=2;
00392 repeat = 1;
00393 //printf("increase by %i, reduce stepsize to %i.\n", 2*stepsize, stepsize);
00394 
00395 } else {
00396 // All values lower than threshold, we decrease the grid size.
00397 if (!(dir/3)) origin[w] += stepsize*voxelsize;
00398 numvoxels[w] -= stepsize;
00399 //printf("decrease by %i\n", stepsize);
00400 repeat = 0;
00401 if (numvoxels[w] <= 1) {
00402 // Here we ended up with a zero grid size.
00403 // We must have missed something. Let's increase grid again and 
00404 // try a smaller step size.
00405 numvoxels[w] = stepsize; 
00406 if (!(dir/3)) origin[w] -= stepsize*voxelsize;
00407 stepsize /=2;
00408 repeat = 1;
00409 //printf("zero grid size - increase to %i, reduce stepsize to %i.\n", 2*stepsize, stepsize);
00410 }
00411 }
00412 
00413 } while (repeat);
00414 
00415 return 0;
00416 }
00417 
00418 
00419 // Optimize position and dimension of current grid so that all orbital
00420 // values higher than threshold are contained in the grid.
00421 //
00422 // Algorithm:
00423 // Based on the idea that the wave function trails off in a distance of
00424 // a few Angstroms from the molecule.
00425 // We start from the current grid size (which could be for instance the
00426 // molecular bounding box plus a padding region) and test the values on
00427 // each of the six boundary planes. If there is no value larger than the
00428 // given threshold in a plane then we shrink the system along the plane
00429 // normal. In the distance the wave function tends to be smoother so we
00430 // start the testing on a coarser grid. A parameter maxstepsize=4 means
00431 // to begin with a grid using a four times higher voxel side length than
00432 // the original grid. When we find the first value above the threshold we
00433 // jump back one step and continue with half of the previous stepsize.
00434 // When stepsize has reached minstepsize then we consider the corresponding
00435 // boundary plane optimal. Note that starting out with a too coarse
00436 // grid one might miss some features of the wave function.
00437 // If you want to be sure not to miss anything then use the voxelsize
00438 // for both minstepsize and maxstepsize.
00439 void Orbital::find_optimal_grid(float threshold, 
00440 int minstepsize, int maxstepsize) {
00441 int optimal[6] = {0, 0, 0, 0, 0, 0};
00442 int stepsize[6];
00443 int i;
00444 for (i=0; i<6; i++) stepsize[i] = maxstepsize;
00445 
00446 #ifdef DEBUGORBS
00447 printf("origin = {%f %f %f}\n", origin[0], origin[1], origin[2]);
00448 printf("gridsize = {%f %f %f}\n", gridsize[0], gridsize[1], gridsize[2]);
00449 #endif 
00450 
00451 
00452 // Loop until we have optimal grid boundaries in all
00453 // dimensions
00454 int iter = 0;
00455 while ( !optimal[0] || !optimal[1] || !optimal[2] ||
00456 !optimal[3] || !optimal[4] || !optimal[5] )
00457 {
00458 if (iter>100) {
00459 msgInfo << "WARNING: Could not optimize orbital grid boundaries in"
00460 << iter << "steps!" << sendmsg; 
00461 break;
00462 }
00463 iter++;
00464 
00465 // Examine the current grid boundaries and shrink if
00466 // all values are smaller than threshold .
00467 if (!optimal[XNEG])
00468 optimal[XNEG] = check_plane(XNEG, threshold, minstepsize, stepsize[XNEG]);
00469 
00470 if (!optimal[XPOS])
00471 optimal[XPOS] = check_plane(XPOS, threshold, minstepsize, stepsize[XPOS]);
00472 
00473 if (!optimal[YNEG])
00474 optimal[YNEG] = check_plane(YNEG, threshold, minstepsize, stepsize[YNEG]);
00475 
00476 if (!optimal[YPOS])
00477 optimal[YPOS] = check_plane(YPOS, threshold, minstepsize, stepsize[YPOS]);
00478 
00479 if (!optimal[ZNEG])
00480 optimal[ZNEG] = check_plane(ZNEG, threshold, minstepsize, stepsize[ZNEG]);
00481 
00482 if (!optimal[ZPOS])
00483 optimal[ZPOS] = check_plane(ZPOS, threshold, minstepsize, stepsize[ZPOS]);
00484 
00485 #if defined(DEBUGORBS)
00486 printf("origin {%f %f %f}\n", origin[0], origin[1], origin[2]);
00487 printf("ngrid {%i %i %i}\n", numvoxels[0], numvoxels[1], numvoxels[2]);
00488 printf("stepsize {%i %i %i %i %i %i}\n", stepsize[0], stepsize[1], stepsize[2],
00489 stepsize[3], stepsize[4], stepsize[5]);
00490 #endif
00491 }
00492 
00493 
00494 gridsize[0] = numvoxels[0]*voxelsize;
00495 gridsize[1] = numvoxels[1]*voxelsize;
00496 gridsize[2] = numvoxels[2]*voxelsize;
00497 }
00498 
00499 
00500 // this function creates the orbital grid given the system dimensions
00501 int Orbital::calculate_mo(DrawMolecule *mol, int density) {
00502 PROFILE_PUSH_RANGE("Orbital", 4);
00503 
00504 wkf_timerhandle timer=wkf_timer_create();
00505 wkf_timer_start(timer);
00506 
00507 //
00508 // Force vectorized N-element padding for the X dimension to prevent
00509 // the possibility of an out-of-bounds orbital grid read/write operation
00510 //
00511 int vecpadmask = VECPADMASK;
00512 if ((mol->app->cpucaps != NULL) && (mol->app->cpucaps->flags & CPU_AVX2)) {
00513 vecpadmask = 7; // AVX2 kernels pad to multiples of 8 
00514 }
00515 if ((mol->app->cpucaps != NULL) && (mol->app->cpucaps->flags & (CPU_AVX512F | CPU_AVX512ER))) {
00516 vecpadmask = 15; // AVX512 kernels pad to multiples of 16
00517 }
00518 
00519 // pad the grid X dimension for vector-multiple length and memory alignment 
00520 numvoxels[0] = (numvoxels[0] + vecpadmask) & ~(vecpadmask);
00521 gridsize[0] = numvoxels[0]*voxelsize;
00522 
00523 // Allocate memory for the volumetric grid
00524 int numgridpoints = numvoxels[0] * numvoxels[1] * numvoxels[2];
00525 grid_data = new float[numgridpoints];
00526 
00527 #if defined(DEBUGORBS)
00528 printf("num_wave_f=%i\n", num_wave_f);
00529 
00530 int i=0;
00531 for (i=0; i<num_wave_f; i++) {
00532 printf("wave_f[%i] = %f\n", i, wave_f[i]);
00533 }
00534 
00535 // perhaps give the user a warning, since the calculation
00536 // could take a while, otherwise they might think the system is borked 
00537 printf("Calculating %ix%ix%i orbital grid.\n", 
00538 numvoxels[0], numvoxels[1], numvoxels[2]);
00539 #endif
00540 
00541 
00542 int rc=-1; // initialize to sentinel value
00543 
00544 // Calculate the value of the orbital at each gridpoint
00545 #if defined(VMDCUDA)
00546 // The CUDA kernel currently only handles up to "G" shells,
00547 // and up to 32 primitives per basis function
00548 if ((max_shell_type() <= G_SHELL) &&
00549 (max_primitives() <= 32) &&
00550 (!getenv("VMDNOCUDA"))) {
00551 rc = vmd_cuda_evaluate_orbital_grid(mol->cuda_devpool(), 
00552 numatoms, wave_f, num_wave_f,
00553 basis_array, num_basis_funcs,
00554 atompos, atom_basis,
00555 num_shells_per_atom, 
00556 num_prim_per_shell,
00557 shell_types, total_shells(),
00558 numvoxels, voxelsize, 
00559 origin, density, grid_data);
00560 }
00561 #endif
00562 #if defined(VMDOPENCL)
00563 // The OpenCL kernel currently only handles up to "G" shells,
00564 // and up to 32 primitives per basis function
00565 if (rc!=0 &&
00566 (max_shell_type() <= G_SHELL) &&
00567 (max_primitives() <= 32) &&
00568 (!getenv("VMDNOOPENCL"))) {
00569 
00570 #if 1
00571 // XXX this would be done during app startup normally...
00572 static vmd_opencl_orbital_handle *orbh = NULL;
00573 static cl_context clctx = NULL;
00574 static cl_command_queue clcmdq = NULL;
00575 static cl_device_id *cldevs = NULL;
00576 if (orbh == NULL) {
00577 printf("Attaching OpenCL device:\n");
00578 wkf_timer_start(timer);
00579 cl_int clerr = CL_SUCCESS;
00580 
00581 cl_platform_id clplatid = vmd_cl_get_platform_index(0);
00582 cl_context_properties clctxprops[] = {(cl_context_properties) CL_CONTEXT_PLATFORM, (cl_context_properties) clplatid, (cl_context_properties) 0};
00583 clctx = clCreateContextFromType(clctxprops, CL_DEVICE_TYPE_GPU, NULL, NULL, &clerr);
00584 
00585 size_t parmsz;
00586 clerr |= clGetContextInfo(clctx, CL_CONTEXT_DEVICES, 0, NULL, &parmsz);
00587 if (clerr != CL_SUCCESS) return -1;
00588 cldevs = (cl_device_id *) malloc(parmsz);
00589 if (clerr != CL_SUCCESS) return -1;
00590 clerr |= clGetContextInfo(clctx, CL_CONTEXT_DEVICES, parmsz, cldevs, NULL);
00591 if (clerr != CL_SUCCESS) return -1;
00592 clcmdq = clCreateCommandQueue(clctx, cldevs[0], 0, &clerr);
00593 if (clerr != CL_SUCCESS) return -1;
00594 wkf_timer_stop(timer);
00595 printf(" OpenCL context creation time: %.3f sec\n", wkf_timer_time(timer));
00596 
00597 wkf_timer_start(timer);
00598 orbh = vmd_opencl_create_orbital_handle(clctx, clcmdq, cldevs);
00599 wkf_timer_stop(timer);
00600 printf(" OpenCL kernel compilation time: %.3f sec\n", wkf_timer_time(timer));
00601 
00602 wkf_timer_start(timer);
00603 }
00604 #endif
00605 
00606 rc = vmd_opencl_evaluate_orbital_grid(mol->cuda_devpool(), orbh,
00607 numatoms, wave_f, num_wave_f,
00608 basis_array, num_basis_funcs,
00609 atompos, atom_basis,
00610 num_shells_per_atom, 
00611 num_prim_per_shell,
00612 shell_types, total_shells(),
00613 numvoxels, voxelsize, 
00614 origin, density, grid_data);
00615 
00616 #if 0
00617 // XXX this would normally be done at program shutdown
00618 vmd_opencl_destroy_orbital_handle(parms.orbh);
00619 clReleaseCommandQueue(clcmdq);
00620 clReleaseContext(clctx);
00621 free(cldevs);
00622 #endif
00623 }
00624 #endif
00625 #if 0
00626 int numprocs = 1;
00627 if (getenv("VMDDUMPORBITALS")) {
00628 write_orbital_data(getenv("VMDDUMPORBITALS"), numatoms,
00629 wave_f, num_wave_f, basis_array, num_basis,
00630 atompos, atom_basis, num_shells_per_atom,
00631 num_prim_per_shell, shell_types,
00632 num_shells, numvoxels, voxelsize, origin);
00633 
00634 read_calc_orbitals(devpool, getenv("VMDDUMPORBITALS"));
00635 }
00636 #endif
00637 
00638 
00639 #if !defined(VMDORBUSETHRPOOL)
00640 #if defined(VMDTHREADS)
00641 int numcputhreads = wkf_thread_numprocessors();
00642 #else
00643 int numcputhreads = 1;
00644 #endif
00645 #endif
00646 if (rc!=0) rc = evaluate_grid_fast(mol->app->cpucaps,
00647 #if defined(VMDORBUSETHRPOOL)
00648 mol->cpu_threadpool(), 
00649 #else
00650 numcputhreads,
00651 #endif
00652 numatoms, wave_f, basis_array,
00653 atompos, atom_basis,
00654 num_shells_per_atom, num_prim_per_shell,
00655 shell_types, numvoxels, voxelsize, 
00656 origin, density, grid_data);
00657 
00658 if (rc!=0) {
00659 msgErr << "Error computing orbital grid" << sendmsg;
00660 delete [] grid_data;
00661 grid_data=NULL;
00662 
00663 PROFILE_POP_RANGE(); // first return point
00664 
00665 return FALSE;
00666 }
00667 
00668 wkf_timer_stop(timer);
00669 
00670 #if 1
00671 if (getenv("VMDORBTIMING") != NULL) { 
00672 double gflops = (numgridpoints * flops_per_gridpoint()) / (wkf_timer_time(timer) * 1000000000.0);
00673 
00674 char strbuf[1024];
00675 sprintf(strbuf, "Orbital calc. time %.3f secs, %.2f gridpoints/sec, %.2f GFLOPS",
00676 wkf_timer_time(timer), 
00677 (((double) numgridpoints) / wkf_timer_time(timer)),
00678 gflops);
00679 msgInfo << strbuf << sendmsg;
00680 }
00681 #endif
00682 
00683 wkf_timer_destroy(timer);
00684 
00685 PROFILE_POP_RANGE(); // second return point
00686 
00687 return TRUE;
00688 }
00689 
00690 
00691 /*********************************************************
00692 *
00693 * This function calculates the value of the wavefunction
00694 * corresponding to a particular orbital at grid point
00695 * grid_x, grid_y, grid_z.
00696 
00697 
00698 Here's an example of a basis set definition for one atom:
00699 
00700 SHELL TYPE PRIMITIVE EXPONENT CONTRACTION COEFFICIENT(S)
00701 
00702 Oxygen
00703 
00704 1 S 1 5484.6716600 0.001831074430
00705 1 S 2 825.2349460 0.013950172200
00706 1 S 3 188.0469580 0.068445078098
00707 1 S 4 52.9645000 0.232714335992
00708 1 S 5 16.8975704 0.470192897984
00709 1 S 6 5.7996353 0.358520852987
00710 
00711 2 L 7 15.5396162 -0.110777549525 0.070874268231
00712 2 L 8 3.5999336 -0.148026262701 0.339752839147
00713 2 L 9 1.0137618 1.130767015354 0.727158577316
00714 
00715 3 L 10 0.2700058 1.000000000000 1.000000000000
00716 
00717 *********************************************************/
00718 float Orbital::evaluate_grid_point(float grid_x, float grid_y, float grid_z) {
00719 int at;
00720 int prim, shell;
00721 
00722 // initialize value of orbital at gridpoint
00723 float value = 0.0;
00724 
00725 // initialize the wavefunction and shell counters
00726 int ifunc = 0; 
00727 int shell_counter = 0;
00728 
00729 // loop over all the QM atoms
00730 for (at=0; at<numatoms; at++) {
00731 int maxshell = num_shells_per_atom[at];
00732 int prim_counter = atom_basis[at];
00733 
00734 // calculate distance between grid point and center of atom
00735 float xdist = (grid_x - atompos[3*at ])*ANGS_TO_BOHR;
00736 float ydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR;
00737 float zdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR;
00738 float dist2 = xdist*xdist + ydist*ydist + zdist*zdist;
00739 
00740 // loop over the shells belonging to this atom
00741 // XXX this is maybe a misnomer because in split valence
00742 // basis sets like 6-31G we have more than one basis
00743 // function per (valence-)shell and we are actually
00744 // looping over the individual contracted GTOs
00745 for (shell=0; shell < maxshell; shell++) {
00746 float contracted_gto = 0.0f;
00747 
00748 // Loop over the Gaussian primitives of this contracted 
00749 // basis function to build the atomic orbital
00750 int maxprim = num_prim_per_shell[shell_counter];
00751 int shelltype = shell_types[shell_counter];
00752 for (prim=0; prim < maxprim; prim++) {
00753 float exponent = basis_array[prim_counter ];
00754 float contract_coeff = basis_array[prim_counter + 1];
00755 contracted_gto += contract_coeff * expf(-exponent*dist2);
00756 prim_counter += 2;
00757 }
00758 
00759 /* multiply with the appropriate wavefunction coefficient */
00760 float tmpshell=0;
00761 // Loop over the cartesian angular momenta of the shell.
00762 // avoid unnecessary branching and minimize use of pow()
00763 int i, j; 
00764 float xdp, ydp, zdp;
00765 float xdiv = 1.0f / xdist;
00766 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
00767 int imax = shelltype - j; 
00768 for (i=0, ydp=1.0f, xdp=powf(xdist, float(imax)); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
00769 tmpshell += wave_f[ifunc++] * xdp * ydp * zdp;
00770 }
00771 }
00772 value += tmpshell * contracted_gto;
00773 
00774 shell_counter++;
00775 } 
00776 }
00777 
00778 /* return the final value at grid point */
00779 return value;
00780 }
00781 
00782 
00783 //
00784 // Return the max number of primitives that occur in a basis function
00785 //
00786 int Orbital::max_primitives(void) {
00787 int maxprim=-1;
00788 
00789 int shell_counter = 0;
00790 for (int at=0; at<numatoms; at++) {
00791 for (int shell=0; shell < num_shells_per_atom[at]; shell++) {
00792 int numprim = num_prim_per_shell[shell_counter];
00793 if (numprim > maxprim)
00794 maxprim = numprim; 
00795 }
00796 }
00797 
00798 return maxprim;
00799 }
00800 
00801 
00802 //
00803 // Return the maximum shell type used
00804 //
00805 int Orbital::max_shell_type(void) {
00806 int maxshell=-1;
00807 
00808 int shell_counter = 0;
00809 for (int at=0; at<numatoms; at++) {
00810 for (int shell=0; shell < num_shells_per_atom[at]; shell++) {
00811 int shelltype = shell_types[shell_counter];
00812 shell_counter++;
00813 if (shelltype > maxshell)
00814 maxshell=shelltype;
00815 }
00816 }
00817 
00818 return maxshell;
00819 }
00820 
00821 
00822 //
00823 // count the maximum number of wavefunction coefficient accesses
00824 // required for the highest shell types contained in this orbital
00825 //
00826 int Orbital::max_wave_f_count(void) {
00827 int maxcount=0;
00828 
00829 int shell_counter = 0;
00830 for (int at=0; at<numatoms; at++) {
00831 for (int shell=0; shell < num_shells_per_atom[at]; shell++) {
00832 int shelltype = shell_types[shell_counter];
00833 int i, j; 
00834 int count=0;
00835 for (i=0; i<=shelltype; i++) {
00836 int jmax = shelltype - i; 
00837 for (j=0; j<=jmax; j++) {
00838 count++;
00839 }
00840 }
00841 shell_counter++;
00842 if (count > maxcount)
00843 maxcount=count;
00844 }
00845 }
00846 
00847 return maxcount;
00848 }
00849 
00850 
00851 //
00852 // compute the FLOPS per grid point for performance measurement purposes
00853 //
00854 double Orbital::flops_per_gridpoint() {
00855 double flops=0.0;
00856 
00857 int shell_counter = 0;
00858 for (int at=0; at<numatoms; at++) {
00859 flops += 7;
00860 
00861 for (int shell=0; shell < num_shells_per_atom[at]; shell++) {
00862 for (int prim=0; prim < num_prim_per_shell[shell_counter]; prim++)
00863 flops += 4; // expf() costs far more, but we count as one.
00864 
00865 int shelltype = shell_types[shell_counter];
00866 
00867 switch (shelltype) {
00868 // separately count for the hand-optimized cases
00869 case S_SHELL: flops += 2; break;
00870 case P_SHELL: flops += 8; break;
00871 case D_SHELL: flops += 17; break;
00872 case F_SHELL: flops += 30; break;
00873 case G_SHELL: flops += 50; break;
00874 
00875 // count up for catch-all loop
00876 default:
00877 int i, j; 
00878 for (i=0; i<=shelltype; i++) {
00879 int jmax = shelltype - i; 
00880 flops += 1;
00881 for (j=0; j<=jmax; j++) {
00882 flops += 6;
00883 }
00884 }
00885 break;
00886 }
00887 
00888 shell_counter++;
00889 } 
00890 }
00891 
00892 return flops;
00893 }
00894 
00895 
//
// Fast single-precision expf() implementation
// Adapted from the free cephes math library on Netlib
//   http://www.netlib.org/cephes/
//
// Cephes Math Library Release 2.2: June, 1992
// Copyright 1984, 1987, 1989 by Stephen L. Moshier
// Direct inquiries to 30 Frost Street, Cambridge, MA 02140
//
static const float MAXNUMF = 3.4028234663852885981170418348451692544e38f;
static const float MAXLOGF = 88.72283905206835f;
static const float MINLOGF = -103.278929903431851103f; /* log(2^-149) */
static const float LOG2EF = 1.44269504088896341f;
static const float C1 = 0.693359375f;
static const float C2 = -2.12194440e-4f;

// Returns an approximation of e^x.  Arguments above MAXLOGF yield
// MAXNUMF and arguments below MINLOGF yield zero.
static inline float cephesfastexpf(float x) {
  // clamp arguments outside the representable range
  if (x > MAXLOGF)
    return MAXNUMF;
  if (x < MINLOGF)
    return 0.0f;

  // Express e^x = e^g 2^n = e^g e^(n loge(2)) = e^(g + n loge(2))
  // n is x/loge(2) rounded to the nearest integer (floor of value + 0.5);
  // loge(2) is subtracted in two pieces, C1 and C2.
  const float fn = floorf(LOG2EF * x + 0.5f);
  float g = x;
  g -= fn * C1;
  g -= fn * C2;
  const int n = (int) fn;

  // Polynomial for e^g, evaluated by Horner's rule.
  // Theoretical peak relative error in [-0.5, +0.5] is 4.2e-9.
  float poly = 1.9875691500E-4f;
  poly = poly * g + 1.3981999507E-3f;
  poly = poly * g + 8.3334519073E-3f;
  poly = poly * g + 4.1665795894E-2f;
  poly = poly * g + 1.6666665459E-1f;
  poly = poly * g + 5.0000001201E-1f;

  const float gsq = g * g;
  const float expg = poly * gsq + g + 1.0f;

  return ldexpf(expg, n); // multiply by power of 2
}
00937 
00938 
00939 
00940 
00941 /*
00942 * David J. Hardy
00943 * 12 Dec 2008
00944 *
00945 * aexpfnx() - Approximate expf() for negative x.
00946 *
00947 * Assumes that x <= 0.
00948 *
00949 * Assumes IEEE format for single precision float, specifically:
00950 * 1 sign bit, 8 exponent bits biased by 127, and 23 mantissa bits.
00951 *
00952 * Interpolates exp() on interval (-1/log2(e), 0], then shifts it by
00953 * multiplication of a fast calculation for 2^(-N). The interpolation
00954 * uses a linear blending of 3rd degree Taylor polynomials at the end
00955 * points, so the approximation is once differentiable.
00956 *
00957 * The error is small (max relative error per interval is calculated
00958 * to be 0.131%, with a max absolute error of -0.000716).
00959 *
00960 * The cutoff is chosen so as to speed up the computation by early
00961 * exit from function, with the value chosen to give less than the
00962 * max absolute error. Use of a cutoff is unnecessary, except
00963 * for needing to shift smallest floating point numbers to zero,
00964 * i.e. you could remove cutoff and replace by:
00965 *
00966 * #define MINXNZ -88.0296919311130 // -127 * log(2)
00967 *
00968 * if (x < MINXNZ) return 0.f;
00969 *
00970 * Use of a cutoff causes a discontinuity which can be eliminated
00971 * through the use of a switching function.
00972 *
00973 * We can obtain arbitrarily smooth approximation by taking k+1 nodes on
00974 * the interval and weighting their respective Taylor polynomials by the
00975 * kth order Lagrange interpolant through those nodes. The wiggle in the
00976 * polynomial interpolation due to equidistant nodes (Runge's phenomenon)
00977 * can be reduced by using Chebyshev nodes.
00978 */
00979 
/* -log2(e): multiplying a non-positive x by this gives a non-negative base-2 exponent */
#define MLOG2EF    -1.44269504088896f

/*
 * Interpolating coefficients for linear blending of the
 * 3rd degree Taylor expansion of 2^x about 0 and -1.
 */
#define SCEXP0     1.0000000000000000f
#define SCEXP1     0.6987082824680118f
#define SCEXP2     0.2633174272827404f
#define SCEXP3     0.0923611991471395f
#define SCEXP4     0.0277520543324108f

/* for single precision float */
#define EXPOBIAS   127
#define EXPOSHIFT   23

/* cutoff is optional, but can help avoid unnecessary work */
#define ACUTOFF    -10

/* Overlay of a float and an int, used to assemble a float from raw bits */
typedef union flint_t {
  float f;
  int n;
} flint;

/*
 * aexpfnx() - approximate expf(x) for x <= 0.
 *
 * Returns 0 for any x below ACUTOFF.  Otherwise exp(x) is rewritten as
 * 2^(-N) * 2^(d) with integer N >= 0 and -1 < d <= 0; the 2^d factor is
 * interpolated with a 4th degree polynomial and 2^(-N) is built directly
 * from the IEEE single precision exponent field.
 */
float aexpfnx(float x) {
  if (x < ACUTOFF)
    return 0.f;

  const float log2arg = x * MLOG2EF;    /* exponent in base 2, >= 0 for x <= 0 */
  const int whole = (int) log2arg;      /* integer part N (truncation == floor here) */
  const float frac = whole - log2arg;   /* remaining exponent d, -1 < d <= 0 */

  /* Horner evaluation of the interpolating polynomial for 2^d */
  const float poly =
    SCEXP0 + frac*(SCEXP1 + frac*(SCEXP2 + frac*(SCEXP3 + frac*SCEXP4)));

  /* write the biased exponent straight into the float's bit pattern: 2^(-N) */
  flint pow2;
  pow2.n = (EXPOBIAS - whole) << EXPOSHIFT;

  return poly * pow2.f;
}
01022 
01023 
01024 
01025 //
01026 // Optimized molecular orbital grid evaluation code
01027 //
// Shell type codes, compared against the shell_types[] entries in the
// switch statements of the grid evaluation routines in this file.
// The numeric value is also used by the generic (default) shell branch
// of evaluate_grid() as the total power of the Cartesian monomials
// x^a y^b z^c generated for that shell.
01028 #define S_SHELL 0
01029 #define P_SHELL 1
01030 #define D_SHELL 2
01031 #define F_SHELL 3
01032 #define G_SHELL 4
01033 #define H_SHELL 5
01034 
01035 int evaluate_grid(int numatoms,
01036 const float *wave_f, const float *basis_array,
01037 const float *atompos,
01038 const int *atom_basis,
01039 const int *num_shells_per_atom,
01040 const int *num_prim_per_shell,
01041 const int *shell_types,
01042 const int *numvoxels,
01043 float voxelsize,
01044 const float *origin,
01045 int density,
01046 float * orbitalgrid) {
01047 if (!orbitalgrid)
01048 return -1;
01049 
01050 int nx, ny, nz;
01051 // Calculate the value of the orbital at each gridpoint and store in 
01052 // the current oribtalgrid array
01053 int numgridxy = numvoxels[0]*numvoxels[1];
01054 for (nz=0; nz<numvoxels[2]; nz++) {
01055 float grid_x, grid_y, grid_z;
01056 grid_z = origin[2] + nz * voxelsize;
01057 for (ny=0; ny<numvoxels[1]; ny++) {
01058 grid_y = origin[1] + ny * voxelsize;
01059 int gaddrzy = ny*numvoxels[0] + nz*numgridxy;
01060 for (nx=0; nx<numvoxels[0]; nx++) {
01061 grid_x = origin[0] + nx * voxelsize;
01062 
01063 // calculate the value of the wavefunction of the
01064 // selected orbital at the current grid point
01065 int at;
01066 int prim, shell;
01067 
01068 // initialize value of orbital at gridpoint
01069 float value = 0.0;
01070 
01071 // initialize the wavefunction and shell counters
01072 int ifunc = 0; 
01073 int shell_counter = 0;
01074 
01075 // loop over all the QM atoms
01076 for (at=0; at<numatoms; at++) {
01077 int maxshell = num_shells_per_atom[at];
01078 int prim_counter = atom_basis[at];
01079 
01080 // calculate distance between grid point and center of atom
01081 float xdist = (grid_x - atompos[3*at ])*ANGS_TO_BOHR;
01082 float ydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR;
01083 float zdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR;
01084 
01085 float xdist2 = xdist*xdist;
01086 float ydist2 = ydist*ydist;
01087 float zdist2 = zdist*zdist;
01088 float xdist3 = xdist2*xdist;
01089 float ydist3 = ydist2*ydist;
01090 float zdist3 = zdist2*zdist;
01091 
01092 float dist2 = xdist2 + ydist2 + zdist2;
01093 
01094 // loop over the shells belonging to this atom
01095 // XXX this is maybe a misnomer because in split valence
01096 // basis sets like 6-31G we have more than one basis
01097 // function per (valence-)shell and we are actually
01098 // looping over the individual contracted GTOs
01099 for (shell=0; shell < maxshell; shell++) {
01100 float contracted_gto = 0.0f;
01101 
01102 // Loop over the Gaussian primitives of this contracted 
01103 // basis function to build the atomic orbital
01104 // 
01105 // XXX there's a significant opportunity here for further
01106 // speedup if we replace the entire set of primitives
01107 // with the single gaussian that they are attempting 
01108 // to model. This could give us another 6x speedup in 
01109 // some of the common/simple cases.
01110 int maxprim = num_prim_per_shell[shell_counter];
01111 int shelltype = shell_types[shell_counter];
01112 for (prim=0; prim<maxprim; prim++) {
01113 float exponent = basis_array[prim_counter ];
01114 float contract_coeff = basis_array[prim_counter + 1];
01115 
01116 // XXX By premultiplying the stored exponent factors etc,
01117 // we should be able to use exp2f() rather than exp(),
01118 // saving several FLOPS per iteration of this loop
01119 #if defined(__GNUC__) && !defined(__ICC)
01120 // Use David Hardy's fast spline approximation instead
01121 // Works well for GCC, but runs slower for Intel C.
01122 contracted_gto += contract_coeff * aexpfnx(-exponent*dist2);
01123 #elif defined(__ICC)
01124 // When compiling with ICC, we'll use an inlined 
01125 // single-precision expf() implementation based on the
01126 // cephes math library found on Netlib. This outruns the
01127 // standard glibc expf() by over 2x in this algorithm.
01128 contracted_gto += contract_coeff * cephesfastexpf(-exponent*dist2);
01129 #else
01130 // XXX By far the most costly operation here is exp(),
01131 // for gcc builds, exp() accounts for 90% of the runtime
01132 contracted_gto += contract_coeff * expf(-exponent*dist2);
01133 #endif
01134 prim_counter += 2;
01135 }
01136 
01137 /* multiply with the appropriate wavefunction coefficient */
01138 float tmpshell=0;
01139 switch (shelltype) {
01140 case S_SHELL:
01141 value += wave_f[ifunc++] * contracted_gto;
01142 break;
01143 
01144 case P_SHELL:
01145 tmpshell += wave_f[ifunc++] * xdist;
01146 tmpshell += wave_f[ifunc++] * ydist;
01147 tmpshell += wave_f[ifunc++] * zdist;
01148 value += tmpshell * contracted_gto;
01149 break;
01150 
01151 case D_SHELL:
01152 tmpshell += wave_f[ifunc++] * xdist2;
01153 tmpshell += wave_f[ifunc++] * xdist * ydist;
01154 tmpshell += wave_f[ifunc++] * ydist2;
01155 tmpshell += wave_f[ifunc++] * xdist * zdist;
01156 tmpshell += wave_f[ifunc++] * ydist * zdist;
01157 tmpshell += wave_f[ifunc++] * zdist2;
01158 value += tmpshell * contracted_gto;
01159 break;
01160 
01161 case F_SHELL:
01162 tmpshell += wave_f[ifunc++] * xdist3; // xxx
01163 tmpshell += wave_f[ifunc++] * xdist2 * ydist; // xxy
01164 tmpshell += wave_f[ifunc++] * ydist2 * xdist; // xyy
01165 tmpshell += wave_f[ifunc++] * ydist3; // yyy
01166 tmpshell += wave_f[ifunc++] * xdist2 * zdist; // xxz
01167 tmpshell += wave_f[ifunc++] * xdist * ydist * zdist; // xyz
01168 tmpshell += wave_f[ifunc++] * ydist2 * zdist; // yyz
01169 tmpshell += wave_f[ifunc++] * zdist2 * xdist; // xzz
01170 tmpshell += wave_f[ifunc++] * zdist2 * ydist; // yzz
01171 tmpshell += wave_f[ifunc++] * zdist3; // zzz
01172 value += tmpshell * contracted_gto;
01173 break;
01174 
01175 case G_SHELL:
01176 tmpshell += wave_f[ifunc++] * xdist2 * xdist2; // xxxx
01177 tmpshell += wave_f[ifunc++] * xdist3 * ydist; // xxxy
01178 tmpshell += wave_f[ifunc++] * xdist2 * ydist2; // xxyy
01179 tmpshell += wave_f[ifunc++] * ydist3 * xdist; // xyyy
01180 tmpshell += wave_f[ifunc++] * ydist2 * ydist2; // yyyy
01181 tmpshell += wave_f[ifunc++] * xdist3 * zdist; // xxxz
01182 tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist; // xxyz
01183 tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist; // xyyz
01184 tmpshell += wave_f[ifunc++] * ydist3 * zdist; // yyyz
01185 tmpshell += wave_f[ifunc++] * xdist2 * zdist2; // xxzz
01186 tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist; // xyzz
01187 tmpshell += wave_f[ifunc++] * ydist2 * zdist2; // yyzz
01188 tmpshell += wave_f[ifunc++] * zdist3 * xdist; // zzzx
01189 tmpshell += wave_f[ifunc++] * zdist3 * ydist; // zzzy
01190 tmpshell += wave_f[ifunc++] * zdist2 * zdist2; // zzzz
01191 value += tmpshell * contracted_gto;
01192 break;
01193 
01194 default:
01195 #if 1
01196 // avoid unnecessary branching and minimize use of pow()
01197 int i, j; 
01198 float xdp, ydp, zdp;
01199 float xdiv = 1.0f / xdist;
01200 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
01201 int imax = shelltype - j; 
01202 for (i=0, ydp=1.0f, xdp=powf(xdist, float(imax)); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
01203 tmpshell += wave_f[ifunc++] * xdp * ydp * zdp;
01204 }
01205 }
01206 value += tmpshell * contracted_gto;
01207 #else
01208 int i, j, k;
01209 for (k=0; k<=shelltype; k++) {
01210 for (j=0; j<=shelltype; j++) {
01211 for (i=0; i<=shelltype; i++) {
01212 if (i+j+k==shelltype) {
01213 value += wave_f[ifunc++] * contracted_gto
01214 * pow(xdist,i) * pow(ydist,j) * pow(zdist,k);
01215 }
01216 }
01217 }
01218 }
01219 #endif
01220 } // end switch
01221 
01222 shell_counter++;
01223 } // end shell
01224 } // end atom
01225 
01226 // return either orbital density or orbital wavefunction amplitude
01227 if (density) {
01228 float orbdensity = value * value;
01229 if (value < 0.0)
01230 orbdensity = -orbdensity;
01231 orbitalgrid[gaddrzy + nx] = orbdensity;
01232 } else {
01233 orbitalgrid[gaddrzy + nx] = value;
01234 }
01235 }
01236 }
01237 }
01238 
01239 return 0;
01240 }
01241 
01242 
01243 
01244 #if defined(VMDORBUSESSE) && defined(__SSE2__)
01245 
01246 #if 0 && !defined(_WIN64) // MSVC doesn't support old MMX intrinsics
01247 //
01248 // Adaptation of the Cephes exp() to an SSE-ized exp_ps() routine 
01249 // originally by Julien Pommier
01250 // Copyright (C) 2007 Julien Pommier, ZLIB license
01251 // http://gruntthepeon.free.fr/ssemath/
01252 // 
// Compiler-specific spelling of a 16-byte alignment request: MSVC puts the
// attribute before the declarator, gcc/icc after it.
01253 #ifdef _MSC_VER /* visual c++ */
01254 # define ALIGN16_BEG __declspec(align(16))
01255 # define ALIGN16_END
01256 #else /* gcc or icc */
01257 # define ALIGN16_BEG
01258 # define ALIGN16_END __attribute__((aligned(16)))
01259 #endif
01260 
// _PS_CONST(Name, Val) declares a 16-byte aligned array _ps_<Name> holding
// four copies of the float Val; _PI32_CONST does the same for ints as
// _pi32_<Name>.  exp_ps() reads them through __m128 / __m64 pointer casts.
01261 #define _PS_CONST(Name, Val) \
01262 static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
01263 #define _PI32_CONST(Name, Val) \
01264 static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
01265 
// Clamp limits applied to the argument of exp_ps()
01266 _PS_CONST(exp_hi, 88.3762626647949f);
01267 _PS_CONST(exp_lo, -88.3762626647949f);
01268 
// Same range-reduction constants as the scalar cephesfastexpf():
// log2(e), and ln(2) split into C1 + C2
01269 _PS_CONST(cephes_LOG2EF, 1.44269504088896341);
01270 _PS_CONST(cephes_exp_C1, 0.693359375);
01271 _PS_CONST(cephes_exp_C2, -2.12194440e-4);
01272 
// Polynomial coefficients, highest order (p0) first
01273 _PS_CONST(cephes_exp_p0, 1.9875691500E-4);
01274 _PS_CONST(cephes_exp_p1, 1.3981999507E-3);
01275 _PS_CONST(cephes_exp_p2, 8.3334519073E-3);
01276 _PS_CONST(cephes_exp_p3, 4.1665795894E-2);
01277 _PS_CONST(cephes_exp_p4, 1.6666665459E-1);
01278 _PS_CONST(cephes_exp_p5, 5.0000001201E-1);
01279 _PS_CONST(one, 1.0);
01280 _PS_CONST(half, 0.5);
01281 
// 0x7f == 127, the single precision exponent bias added when building 2^n
01282 _PI32_CONST(0x7f, 0x7f);
01283 
// Overlay of one 128-bit SSE register and two 64-bit MMX registers, used by
// the copy macros below to move data between the two register files.
01284 typedef union xmm_mm_union {
01285 __m128 xmm;
01286 __m64 mm[2];
01287 } xmm_mm_union;
01288 
// Split an __m128 into its low (mm0_) and high (mm1_) 64-bit halves.
// Not used by exp_ps() in this section.
01289 #define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
01290 xmm_mm_union u; u.xmm = xmm_; \
01291 mm0_ = u.mm[0]; \
01292 mm1_ = u.mm[1]; \
01293 }
01294 
// Combine two 64-bit halves (mm0_ low, mm1_ high) into one __m128.
01295 #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
01296 xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
01297 }
01298 
// exp_ps() - 4-wide single precision exp(x) (currently compiled out by the
// enclosing "#if 0").  Follows the same steps as the scalar
// cephesfastexpf(): clamp the argument, split it into n*ln(2) + g, evaluate
// a polynomial for e^g, then scale by 2^n.  The float<->int conversions go
// through MMX registers, hence the _mm_empty() before returning.
01299 __m128 exp_ps(__m128 x) {
01300 __m128 tmp = _mm_setzero_ps(), fx;
01301 __m64 mm0, mm1;
01302 
// clamp x to [exp_lo, exp_hi]
01303 x = _mm_min_ps(x, *(__m128*)_ps_exp_hi);
01304 x = _mm_max_ps(x, *(__m128*)_ps_exp_lo);
01305 
01306 /* express exp(x) as exp(g + n*log(2)) */
01307 fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF);
01308 fx = _mm_add_ps(fx,*(__m128*)_ps_half);
01309 
01310 /* how to perform a floorf with SSE: just below */
01311 /* step 1 : cast to int */
// the upper two lanes of fx are moved down into tmp so that each 2-wide
// conversion handles one half of the vector
01312 tmp = _mm_movehl_ps(tmp, fx);
01313 mm0 = _mm_cvttps_pi32(fx);
01314 mm1 = _mm_cvttps_pi32(tmp);
01315 /* step 2 : cast back to float */
01316 tmp = _mm_cvtpi32x2_ps(mm0, mm1);
01317 /* if greater, subtract 1 */
// truncation rounds toward zero, so negative non-integers end up one too
// high; the comparison mask selects exactly those lanes
01318 __m128 mask = _mm_cmpgt_ps(tmp, fx);
01319 mask = _mm_and_ps(mask, *(__m128*)_ps_one);
01320 fx = _mm_sub_ps(tmp, mask);
01321 
// range reduction: x -= fx*C1; x -= fx*C2  (C1 + C2 == ln(2))
01322 tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1);
01323 __m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2);
01324 x = _mm_sub_ps(x, tmp);
01325 x = _mm_sub_ps(x, z);
01326 
01327 z = _mm_mul_ps(x,x);
01328 
// Horner evaluation of the polynomial p0..p5 in x, then y = y*x^2 + x + 1
01329 __m128 y = *(__m128*)_ps_cephes_exp_p0;
01330 y = _mm_mul_ps(y, x);
01331 y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1);
01332 y = _mm_mul_ps(y, x);
01333 y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2);
01334 y = _mm_mul_ps(y, x);
01335 y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3);
01336 y = _mm_mul_ps(y, x);
01337 y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4);
01338 y = _mm_mul_ps(y, x);
01339 y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5);
01340 y = _mm_mul_ps(y, z);
01341 y = _mm_add_ps(y, x);
01342 y = _mm_add_ps(y, *(__m128*)_ps_one);
01343 
01344 /* build 2^n */
// convert fx to integers, add the exponent bias 0x7f and shift the sum into
// the exponent field (bit 23) of each lane
01345 z = _mm_movehl_ps(z, fx);
01346 mm0 = _mm_cvttps_pi32(fx);
01347 mm1 = _mm_cvttps_pi32(z);
01348 mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f);
01349 mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f);
01350 mm0 = _mm_slli_pi32(mm0, 23);
01351 mm1 = _mm_slli_pi32(mm1, 23);
01352 
01353 __m128 pow2n;
01354 COPY_MM_TO_XMM(mm0, mm1, pow2n);
01355 
01356 y = _mm_mul_ps(y, pow2n);
// leave MMX state so that subsequent x87 floating point code works
01357 _mm_empty();
01358 return y;
01359 }
01360 #endif // MSVC doesn't support old MMX intrinsics
01361 
01362 
01363 //
01364 // David J. Hardy
01365 // 12 Dec 2008
01366 //
01367 // aexpfnxsse() - SSE2 version of aexpfnx().
01368 //
01369 //
// Compiler-specific spelling of an alignment request for local variables
#if defined(__GNUC__) && ! defined(__INTEL_COMPILER)
#define __align(X)  __attribute__((aligned(X) ))
#if (__GNUC__ < 4)
#define MISSING_mm_cvtsd_f64
#endif
#else
#define __align(X) __declspec(align(X) )
#endif

#define MLOG2EF    -1.44269504088896f

/*
 * Interpolating coefficients for linear blending of the
 * 3rd degree Taylor expansion of 2^x about 0 and -1.
 */
#define SCEXP0     1.0000000000000000f
#define SCEXP1     0.6987082824680118f
#define SCEXP2     0.2633174272827404f
#define SCEXP3     0.0923611991471395f
#define SCEXP4     0.0277520543324108f

/* for single precision float */
#define EXPOBIAS   127
#define EXPOSHIFT   23

/* cutoff is optional, but can help avoid unnecessary work */
#define ACUTOFF    -10

/* Overlay giving float and integer views of the same 128-bit SSE register */
typedef union SSEreg_t {
  __m128  f;  // 4x float (SSE)
  __m128i i;  // 4x 32-bit int (SSE2)
} SSEreg;

/*
 * aexpfnxsse() - approximate exp(x) for four non-positive floats at once.
 *
 * Lanes with x < ACUTOFF (and NaN lanes, for which the comparison is false)
 * return exactly 0; if no lane is within the cutoff the function exits
 * early with all zeros.  The remaining lanes use the same scheme as the
 * scalar aexpfnx(): exp(x) = 2^(-N) * 2^(d), with the 2^d factor
 * interpolated by a polynomial and 2^(-N) built from exponent bits.
 */
__m128 aexpfnxsse(__m128 x) {
  __align(16) SSEreg scal;
  __align(16) SSEreg n;
  __align(16) SSEreg y;

  scal.f = _mm_cmpge_ps(x, _mm_set_ps1(ACUTOFF));  /* Is x within cutoff? */

  /* If all x are outside of cutoff, return 0s. */
  if (_mm_movemask_ps(scal.f) == 0) {
    return _mm_setzero_ps();
  }
  /* Otherwise, scal.f contains the mask to be ANDed with the result */

  /*
   * Convert base:  exp(x) = 2^(N-d) where N is integer and 0 <= d < 1.
   *
   * Below we calculate n=N and x=-d, with "y" for temp storage,
   * calculate floor of x*log2(e) and subtract to get -d.
   */
  y.f = _mm_mul_ps(x, _mm_set_ps1(MLOG2EF));
  n.i = _mm_cvttps_epi32(y.f);
  x = _mm_cvtepi32_ps(n.i);
  x = _mm_sub_ps(x, y.f);

  /*
   * Approximate 2^{-d}, 0 <= d < 1, by interpolation.
   * Perform Horner's method to evaluate interpolating polynomial.
   */
  y.f = _mm_mul_ps(x, _mm_set_ps1(SCEXP4));      /* for x^4 term */
  y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP3));    /* for x^3 term */
  y.f = _mm_mul_ps(y.f, x);
  y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP2));    /* for x^2 term */
  y.f = _mm_mul_ps(y.f, x);
  y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP1));    /* for x^1 term */
  y.f = _mm_mul_ps(y.f, x);
  y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP0));    /* for x^0 term */

  /*
   * Calculate 2^N exactly by directly manipulating floating point exponent
   * and use it to scale y.
   */
  n.i = _mm_sub_epi32(_mm_set1_epi32(EXPOBIAS), n.i);
  n.i = _mm_slli_epi32(n.i, EXPOSHIFT);
  y.f = _mm_mul_ps(y.f, n.f);

  /*
   * Apply the cutoff mask to the finished product rather than to the scale
   * factor.  For a lane far outside the cutoff the integer conversion above
   * saturates and the polynomial can overflow to infinity; multiplying that
   * by a zeroed scale factor would give NaN, whereas masking the product
   * always yields 0.  Lanes inside the cutoff are unaffected.
   */
  return _mm_and_ps(scal.f, y.f);
}
01452 
01453 
01454 int evaluate_grid_sse(int numatoms,
01455 const float *wave_f, const float *basis_array,
01456 const float *atompos,
01457 const int *atom_basis,
01458 const int *num_shells_per_atom,
01459 const int *num_prim_per_shell,
01460 const int *shell_types,
01461 const int *numvoxels,
01462 float voxelsize,
01463 const float *origin,
01464 int density,
01465 float * orbitalgrid) {
01466 if (!orbitalgrid)
01467 return -1;
01468 
01469 int nx, ny, nz;
01470 __align(16) float sxdelta[4]; // 16-byte aligned for SSE
01471 for (nx=0; nx<4; nx++) 
01472 sxdelta[nx] = ((float) nx) * voxelsize * ANGS_TO_BOHR;
01473 
01474 // Calculate the value of the orbital at each gridpoint and store in 
01475 // the current oribtalgrid array
01476 int numgridxy = numvoxels[0]*numvoxels[1];
01477 for (nz=0; nz<numvoxels[2]; nz++) {
01478 float grid_x, grid_y, grid_z;
01479 grid_z = origin[2] + nz * voxelsize;
01480 for (ny=0; ny<numvoxels[1]; ny++) {
01481 grid_y = origin[1] + ny * voxelsize;
01482 int gaddrzy = ny*numvoxels[0] + nz*numgridxy;
01483 for (nx=0; nx<numvoxels[0]; nx+=4) {
01484 grid_x = origin[0] + nx * voxelsize;
01485 
01486 // calculate the value of the wavefunction of the
01487 // selected orbital at the current grid point
01488 int at;
01489 int prim, shell;
01490 
01491 // initialize value of orbital at gridpoint
01492 __m128 value = _mm_setzero_ps();
01493 
01494 // initialize the wavefunction and shell counters
01495 int ifunc = 0; 
01496 int shell_counter = 0;
01497 
01498 // loop over all the QM atoms
01499 for (at=0; at<numatoms; at++) {
01500 int maxshell = num_shells_per_atom[at];
01501 int prim_counter = atom_basis[at];
01502 
01503 // calculate distance between grid point and center of atom
01504 float sxdist = (grid_x - atompos[3*at ])*ANGS_TO_BOHR;
01505 float sydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR;
01506 float szdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR;
01507 
01508 float sydist2 = sydist*sydist;
01509 float szdist2 = szdist*szdist;
01510 float yzdist2 = sydist2 + szdist2;
01511 
01512 __m128 xdelta = _mm_load_ps(&sxdelta[0]); // aligned load
01513 __m128 xdist = _mm_load_ps1(&sxdist);
01514 xdist = _mm_add_ps(xdist, xdelta);
01515 __m128 ydist = _mm_load_ps1(&sydist);
01516 __m128 zdist = _mm_load_ps1(&szdist);
01517 __m128 xdist2 = _mm_mul_ps(xdist, xdist);
01518 __m128 ydist2 = _mm_mul_ps(ydist, ydist);
01519 __m128 zdist2 = _mm_mul_ps(zdist, zdist);
01520 __m128 dist2 = _mm_load_ps1(&yzdist2); 
01521 dist2 = _mm_add_ps(dist2, xdist2);
01522 
01523 // loop over the shells belonging to this atom
01524 // XXX this is maybe a misnomer because in split valence
01525 // basis sets like 6-31G we have more than one basis
01526 // function per (valence-)shell and we are actually
01527 // looping over the individual contracted GTOs
01528 for (shell=0; shell < maxshell; shell++) {
01529 __m128 contracted_gto = _mm_setzero_ps();
01530 
01531 // Loop over the Gaussian primitives of this contracted 
01532 // basis function to build the atomic orbital
01533 // 
01534 // XXX there's a significant opportunity here for further
01535 // speedup if we replace the entire set of primitives
01536 // with the single gaussian that they are attempting 
01537 // to model. This could give us another 6x speedup in 
01538 // some of the common/simple cases.
01539 int maxprim = num_prim_per_shell[shell_counter];
01540 int shelltype = shell_types[shell_counter];
01541 for (prim=0; prim<maxprim; prim++) {
01542 // XXX pre-negate exponent value
01543 float exponent = -basis_array[prim_counter ];
01544 float contract_coeff = basis_array[prim_counter + 1];
01545 
01546 // contracted_gto += contract_coeff * exp(-exponent*dist2);
01547 __m128 expval = _mm_mul_ps(_mm_load_ps1(&exponent), dist2);
01548 // SSE expf() required here
01549 #if 1
01550 __m128 retval = aexpfnxsse(expval);
01551 #else
01552 __m128 retval = exp_ps(expval);
01553 #endif
01554 __m128 ctmp = _mm_mul_ps(_mm_load_ps1(&contract_coeff), retval);
01555 contracted_gto = _mm_add_ps(contracted_gto, ctmp);
01556 prim_counter += 2;
01557 }
01558 
01559 /* multiply with the appropriate wavefunction coefficient */
01560 __m128 tmpshell = _mm_setzero_ps();
01561 switch (shelltype) {
01562 case S_SHELL:
01563 value = _mm_add_ps(value, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), contracted_gto));
01564 break;
01565 
01566 case P_SHELL:
01567 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), xdist));
01568 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), ydist));
01569 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), zdist));
01570 value = _mm_add_ps(value, _mm_mul_ps(tmpshell, contracted_gto));
01571 break;
01572 
01573 case D_SHELL:
01574 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), xdist2));
01575 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist, ydist)));
01576 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), ydist2));
01577 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist, zdist)));
01578 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist, zdist)));
01579 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), zdist2));
01580 value = _mm_add_ps(value, _mm_mul_ps(tmpshell, contracted_gto));
01581 break;
01582 
01583 case F_SHELL:
01584 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist2, xdist)));
01585 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist2, ydist)));
01586 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist2, xdist)));
01587 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist2, ydist)));
01588 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist2, zdist)));
01589 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(_mm_mul_ps(xdist, ydist), zdist)));
01590 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist2, zdist)));
01591 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(zdist2, xdist)));
01592 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(zdist2, ydist)));
01593 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(zdist2, zdist)));
01594 value = _mm_add_ps(value, _mm_mul_ps(tmpshell, contracted_gto));
01595 break;
01596 
01597 #if 0
01598 default:
01599 // avoid unnecessary branching and minimize use of pow()
01600 int i, j; 
01601 float xdp, ydp, zdp;
01602 float xdiv = 1.0f / xdist;
01603 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
01604 int imax = shelltype - j; 
01605 for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
01606 tmpshell += wave_f[ifunc++] * xdp * ydp * zdp;
01607 }
01608 }
01609 value += tmpshell * contracted_gto;
01610 #endif
01611 } // end switch
01612 
01613 shell_counter++;
01614 } // end shell
01615 } // end atom
01616 
01617 // return either orbital density or orbital wavefunction amplitude
01618 if (density) {
01619 __m128 mask = _mm_cmplt_ps(value, _mm_setzero_ps());
01620 __m128 sqdensity = _mm_mul_ps(value, value);
01621 __m128 orbdensity = sqdensity;
01622 __m128 nsqdensity = _mm_and_ps(sqdensity, mask);
01623 orbdensity = _mm_sub_ps(orbdensity, nsqdensity);
01624 orbdensity = _mm_sub_ps(orbdensity, nsqdensity);
01625 _mm_storeu_ps(&orbitalgrid[gaddrzy + nx], orbdensity);
01626 } else {
01627 _mm_storeu_ps(&orbitalgrid[gaddrzy + nx], value);
01628 }
01629 }
01630 }
01631 }
01632 
01633 return 0;
01634 }
01635 
01636 #endif
01637 
01638 
01639 
01640 #if defined(VMDORBUSEVSX) && defined(__VSX__)
01641 //
01642 // John Stone, June 2016
01643 //
01644 // aexpfnxvsx() - VSX version of aexpfnx().
01645 //
// Compiler-specific spelling of an alignment request
01646 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER)
01647 #define __align(X) __attribute__((aligned(X) ))
01648 #else
01649 #define __align(X) __declspec(align(X) )
01650 #endif
01651 
// -log2(e): converts a non-positive natural exponent into a non-negative
// base-2 exponent
01652 #define MLOG2EF -1.44269504088896f
01653 
01654 /*
01655 * Interpolating coefficients for linear blending of the
01656 * 3rd degree Taylor expansion of 2^x about 0 and -1.
01657 */
01658 #define SCEXP0 1.0000000000000000f
01659 #define SCEXP1 0.6987082824680118f
01660 #define SCEXP2 0.2633174272827404f
01661 #define SCEXP3 0.0923611991471395f
01662 #define SCEXP4 0.0277520543324108f
01663 
01664 /* for single precision float */
// NOTE(review): EXPOBIAS, EXPOSHIFT and ACUTOFF are not referenced by
// aexpfnxvsx() below, which has its cutoff test commented out and scales
// with vec_expte() instead of exponent bit manipulation.
01665 #define EXPOBIAS 127
01666 #define EXPOSHIFT 23
01667 
01668 /* cutoff is optional, but can help avoid unnecessary work */
01669 #define ACUTOFF -10
01670 
// Disabled reference implementation: applies the C library expf() to each
// of the four vector elements in turn.
01671 #if 0
01672 vector float ref_expf(vector float x) {
01673 vector float result;
01674 
01675 int i;
01676 for (i=0; i<4; i++) {
01677 result[i] = expf(x[i]);
01678 }
01679 
01680 return result;
01681 }
01682 #endif
01683 
01684 vector float aexpfnxvsx(vector float x) {
01685 // scal.f = _mm_cmpge_ps(x, _mm_set_ps1(ACUTOFF)); /* Is x within cutoff? */
01686 // 
01687 // If all x are outside of cutoff, return 0s.
01688 // if (_mm_movemask_ps(scal.f) == 0) {
01689 // return _mm_setzero_ps();
01690 // }
01691 // Otherwise, scal.f contains mask to be ANDed with the scale factor
01692 
01693 /*
01694 * Convert base: exp(x) = 2^(N-d) where N is integer and 0 <= d < 1.
01695 *
01696 * Below we calculate n=N and x=-d, with "y" for temp storage,
01697 * calculate floor of x*log2(e) and subtract to get -d.
01698 */
01699 vector float mb = vec_mul(x, vec_splats(MLOG2EF));
01700 vector float mbflr = vec_floor(mb);
01701 vector float d = vec_sub(mbflr, mb);
01702 vector float y;
01703 
01704 // Approximate 2^{-d}, 0 <= d < 1, by interpolation.
01705 // Perform Horner's method to evaluate interpolating polynomial.
01706 y = vec_madd(d, vec_splats(SCEXP4), vec_splats(SCEXP3));
01707 y = vec_madd(y, d, vec_splats(SCEXP2));
01708 y = vec_madd(y, d, vec_splats(SCEXP1));
01709 y = vec_madd(y, d, vec_splats(SCEXP0));
01710 
01711 return vec_mul(y, vec_expte(-mbflr));
01712 }
01713 
01714 
01715 int evaluate_grid_vsx(int numatoms,
01716 const float *wave_f, const float *basis_array,
01717 const float *atompos,
01718 const int *atom_basis,
01719 const int *num_shells_per_atom,
01720 const int *num_prim_per_shell,
01721 const int *shell_types,
01722 const int *numvoxels,
01723 float voxelsize,
01724 const float *origin,
01725 int density,
01726 float * orbitalgrid) {
01727 if (!orbitalgrid)
01728 return -1;
01729 
01730 int nx, ny, nz;
01731 __attribute__((aligned(16))) float sxdelta[4]; // 16-byte aligned for VSX
01732 for (nx=0; nx<4; nx++) 
01733 sxdelta[nx] = ((float) nx) * voxelsize * ANGS_TO_BOHR;
01734 
01735 // Calculate the value of the orbital at each gridpoint and store in 
01736 // the current oribtalgrid array
01737 int numgridxy = numvoxels[0]*numvoxels[1];
01738 for (nz=0; nz<numvoxels[2]; nz++) {
01739 float grid_x, grid_y, grid_z;
01740 grid_z = origin[2] + nz * voxelsize;
01741 for (ny=0; ny<numvoxels[1]; ny++) {
01742 grid_y = origin[1] + ny * voxelsize;
01743 int gaddrzy = ny*numvoxels[0] + nz*numgridxy;
01744 for (nx=0; nx<numvoxels[0]; nx+=4) {
01745 grid_x = origin[0] + nx * voxelsize;
01746 
01747 // calculate the value of the wavefunction of the
01748 // selected orbital at the current grid point
01749 int at;
01750 int prim, shell;
01751 
01752 // initialize value of orbital at gridpoint
01753 vector float value = vec_splats(0.0f); 
01754 
01755 // initialize the wavefunction and shell counters
01756 int ifunc = 0; 
01757 int shell_counter = 0;
01758 
01759 // loop over all the QM atoms
01760 for (at=0; at<numatoms; at++) {
01761 int maxshell = num_shells_per_atom[at];
01762 int prim_counter = atom_basis[at];
01763 
01764 // calculate distance between grid point and center of atom
01765 float sxdist = (grid_x - atompos[3*at ])*ANGS_TO_BOHR;
01766 float sydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR;
01767 float szdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR;
01768 
01769 float sydist2 = sydist*sydist;
01770 float szdist2 = szdist*szdist;
01771 float yzdist2 = sydist2 + szdist2;
01772 
01773 vector float xdelta = *((__vector float *) &sxdelta[0]); // aligned load
01774 vector float xdist = vec_splats(sxdist);
01775 xdist = vec_add(xdist, xdelta);
01776 vector float ydist = vec_splats(sydist);
01777 vector float zdist = vec_splats(szdist);
01778 vector float xdist2 = vec_mul(xdist, xdist);
01779 vector float ydist2 = vec_mul(ydist, ydist);
01780 vector float zdist2 = vec_mul(zdist, zdist);
01781 vector float dist2 = vec_splats(yzdist2); 
01782 dist2 = vec_add(dist2, xdist2);
01783 
01784 // loop over the shells belonging to this atom
01785 // XXX this is maybe a misnomer because in split valence
01786 // basis sets like 6-31G we have more than one basis
01787 // function per (valence-)shell and we are actually
01788 // looping over the individual contracted GTOs
01789 for (shell=0; shell < maxshell; shell++) {
01790 vector float contracted_gto = vec_splats(0.0f);
01791 
01792 // Loop over the Gaussian primitives of this contracted 
01793 // basis function to build the atomic orbital
01794 // 
01795 // XXX there's a significant opportunity here for further
01796 // speedup if we replace the entire set of primitives
01797 // with the single gaussian that they are attempting 
01798 // to model. This could give us another 6x speedup in 
01799 // some of the common/simple cases.
01800 int maxprim = num_prim_per_shell[shell_counter];
01801 int shelltype = shell_types[shell_counter];
01802 for (prim=0; prim<maxprim; prim++) {
01803 // XXX pre-negate exponent value
01804 float exponent = -basis_array[prim_counter ];
01805 float contract_coeff = basis_array[prim_counter + 1];
01806 
01807 // contracted_gto += contract_coeff * exp(-exponent*dist2);
01808 vector float expval = vec_mul(vec_splats(exponent), dist2);
01809 
01810 // VSX expf() required here
01811 vector float retval = aexpfnxvsx(expval);
01812 
01813 vector float ctmp = vec_mul(vec_splats(contract_coeff), retval);
01814 contracted_gto = vec_add(contracted_gto, ctmp);
01815 prim_counter += 2;
01816 }
01817 
01818 /* multiply with the appropriate wavefunction coefficient */
01819 vector float tmpshell = vec_splats(0.0f);
01820 switch (shelltype) {
01821 case S_SHELL:
01822 value = vec_add(value, vec_mul(vec_splats(wave_f[ifunc++]), contracted_gto));
01823 break;
01824 
01825 case P_SHELL:
01826 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), xdist));
01827 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), ydist));
01828 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), zdist));
01829 value = vec_add(value, vec_mul(tmpshell, contracted_gto));
01830 break;
01831 
01832 case D_SHELL:
01833 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), xdist2));
01834 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist, ydist)));
01835 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), ydist2));
01836 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist, zdist)));
01837 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(ydist, zdist)));
01838 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), zdist2));
01839 value = vec_add(value, vec_mul(tmpshell, contracted_gto));
01840 break;
01841 
01842 case F_SHELL:
01843 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist2, xdist)));
01844 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist2, ydist)));
01845 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(ydist2, xdist)));
01846 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(ydist2, ydist)));
01847 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist2, zdist)));
01848 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(vec_mul(xdist, ydist), zdist)));
01849 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(ydist2, zdist)));
01850 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(zdist2, xdist)));
01851 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(zdist2, ydist)));
01852 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(zdist2, zdist)));
01853 value = vec_add(value, vec_mul(tmpshell, contracted_gto));
01854 break;
01855 
01856 #if 0
01857 default:
01858 // avoid unnecessary branching and minimize use of pow()
01859 int i, j; 
01860 float xdp, ydp, zdp;
01861 float xdiv = 1.0f / xdist;
01862 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
01863 int imax = shelltype - j; 
01864 for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
01865 tmpshell += wave_f[ifunc++] * xdp * ydp * zdp;
01866 }
01867 }
01868 value += tmpshell * contracted_gto;
01869 #endif
01870 } // end switch
01871 
01872 shell_counter++;
01873 } // end shell
01874 } // end atom
01875 
01876 // return either orbital density or orbital wavefunction amplitude
01877 if (density) {
01878 value = vec_cpsgn(value, vec_mul(value, value));
01879 
01880 float *ufptr = &orbitalgrid[gaddrzy + nx];
01881 ufptr[0] = value[0];
01882 ufptr[1] = value[1];
01883 ufptr[2] = value[2];
01884 ufptr[3] = value[3];
01885 } else {
01886 float *ufptr = &orbitalgrid[gaddrzy + nx];
01887 ufptr[0] = value[0];
01888 ufptr[1] = value[1];
01889 ufptr[2] = value[2];
01890 ufptr[3] = value[3];
01891 }
01892 }
01893 }
01894 }
01895 
01896 return 0;
01897 }
01898 
01899 #endif
01900 
01901 
01902 
01903 //
01904 // Multithreaded molecular orbital computation engine
01905 //
01906 
01907 typedef struct {
01908 wkf_cpu_caps_t *cpucaps;
01909 int numatoms;
01910 const float *wave_f;
01911 const float *basis_array;
01912 const float *atompos;
01913 const int *atom_basis;
01914 const int *num_shells_per_atom;
01915 const int *num_prim_per_shell;
01916 const int *shell_types;
01917 const int *numvoxels;
01918 float voxelsize;
01919 int density;
01920 const float *origin;
01921 float *orbitalgrid;
01922 } orbthrparms;
01923 
01924 
01925 extern "C" void * orbitalthread(void *voidparms) {
01926 int numvoxels[3];
01927 float origin[3];
01928 orbthrparms *parms = NULL;
01929 #if defined(VMDORBUSETHRPOOL)
01930 wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
01931 #else
01932 wkf_threadlaunch_getdata(voidparms, (void **) &parms);
01933 #endif
01934 
01935 #if defined(VMDCPUDISPATCH)
01936 wkf_cpu_caps_t *cpucaps = parms->cpucaps;
01937 
01938 #if defined(VMDUSEAVX512)
01939 int dispatch_AVX512ER = 0;
01940 if ((cpucaps->flags & CPU_AVX512ER) && (getenv("VMDNOAVX512ER") == NULL)) {
01941 // printf("evaluate_grid_avx512er\n");
01942 dispatch_AVX512ER = 1;
01943 }
01944 
01945 int dispatch_AVX512F = 0;
01946 if ((cpucaps->flags & CPU_AVX512F) && (getenv("VMDNOAVX512F") == NULL)) {
01947 dispatch_AVX512F = 1;
01948 // printf("evaluate_grid_avx512f\n");
01949 }
01950 #endif
01951 
01952 #if defined(VMDUSEAVX2)
01953 int dispatch_AVX2 = 0;
01954 if ((cpucaps->flags & CPU_AVX2) && (getenv("VMDNOAVX2") == NULL)) {
01955 dispatch_AVX2 = 1;
01956 // printf("evaluate_grid_avx2\n");
01957 }
01958 #endif
01959 
01960 #if defined(VMDUSESVE)
01961 int dispatch_SVE = 0;
01962 if ((cpucaps->flags & CPU_ARM64_SVE) && (getenv("VMDNOSVE") == NULL)) {
01963 dispatch_SVE = 1;
01964 // printf("evaluate_grid_sve\n");
01965 }
01966 #endif
01967 
01968 #if defined(VMDUSENEON)
01969 int dispatch_NEON = 0;
01970 if ((cpucaps->flags & CPU_ARM64_ASIMD) && (getenv("VMDNONEON") == NULL)) {
01971 dispatch_NEON = 1;
01972 // printf("evaluate_grid_neon\n");
01973 }
01974 #endif
01975 
01976 #endif // VMDCPUDISPATCH
01977 
01978 // 
01979 // Hard-coded compile-time fall-through vectorization paths
01980 //
01981 #if defined(VMDORBUSESSE) && defined(__SSE2__)
01982 int dispatch_SSE2 = 0;
01983 if ((getenv("VMDNOSSE2") == NULL)) {
01984 // printf("evaluate_grid_sse\n");
01985 dispatch_SSE2 = 1;
01986 }
01987 #endif
01988 
01989 #if defined(VMDORBUSEVSX) && defined(__VSX__)
01990 int dispatch_VSX = 0;
01991 if (getenv("VMDNOVSX") == NULL) {
01992 // printf("evaluate_grid_vsx\n");
01993 dispatch_VSX = 1;
01994 }
01995 #endif
01996 
01997 numvoxels[0] = parms->numvoxels[0];
01998 numvoxels[1] = parms->numvoxels[1];
01999 numvoxels[2] = 1; // we compute only a single plane
02000 
02001 origin[0] = parms->origin[0];
02002 origin[1] = parms->origin[1];
02003 
02004 // loop over orbital planes
02005 int planesize = numvoxels[0] * numvoxels[1];
02006 wkf_tasktile_t tile;
02007 #if defined(VMDORBUSETHRPOOL)
02008 while (wkf_threadpool_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
02009 #else
02010 while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
02011 #endif
02012 int k;
02013 for (k=tile.start; k<tile.end; k++) {
02014 origin[2] = parms->origin[2] + parms->voxelsize * k;
02015 
02016 #if defined(VMDCPUDISPATCH)
02017 //
02018 // runtime CPU dispatch
02019 // check for optional vector instructions and execute custom kernels 
02020 // for the fastest code path supported by the detected hardware
02021 // 
02022 if (cpucaps != NULL) {
02023 #if defined(VMDUSEAVX512)
02024 if (dispatch_AVX512ER) {
02025 evaluate_grid_avx512er(parms->numatoms, parms->wave_f, 
02026 parms->basis_array, parms->atompos, parms->atom_basis,
02027 parms->num_shells_per_atom, parms->num_prim_per_shell,
02028 parms->shell_types, numvoxels, parms->voxelsize,
02029 origin, parms->density, parms->orbitalgrid + planesize*k);
02030 continue;
02031 }
02032 
02033 if (dispatch_AVX512F) {
02034 evaluate_grid_avx512f(parms->numatoms, parms->wave_f, 
02035 parms->basis_array, parms->atompos, parms->atom_basis,
02036 parms->num_shells_per_atom, parms->num_prim_per_shell,
02037 parms->shell_types, numvoxels, parms->voxelsize,
02038 origin, parms->density, parms->orbitalgrid + planesize*k);
02039 continue;
02040 }
02041 #endif
02042 
02043 #if defined(VMDUSEAVX2)
02044 if (dispatch_AVX2) {
02045 evaluate_grid_avx2(parms->numatoms, parms->wave_f, 
02046 parms->basis_array, parms->atompos, parms->atom_basis,
02047 parms->num_shells_per_atom, parms->num_prim_per_shell,
02048 parms->shell_types, numvoxels, parms->voxelsize,
02049 origin, parms->density, parms->orbitalgrid + planesize*k);
02050 continue;
02051 }
02052 #endif
02053 
02054 #if defined(VMDUSESVE)
02055 if (dispatch_SVE) {
02056 evaluate_grid_sve(parms->numatoms, parms->wave_f,
02057 parms->basis_array, parms->atompos, parms->atom_basis,
02058 parms->num_shells_per_atom, parms->num_prim_per_shell,
02059 parms->shell_types, numvoxels, parms->voxelsize,
02060 origin, parms->density, parms->orbitalgrid + planesize*k);
02061 continue;
02062 }
02063 #endif
02064 
02065 #if defined(VMDUSENEON)
02066 if (dispatch_NEON) {
02067 evaluate_grid_neon(parms->numatoms, parms->wave_f, 
02068 parms->basis_array, parms->atompos, parms->atom_basis,
02069 parms->num_shells_per_atom, parms->num_prim_per_shell,
02070 parms->shell_types, numvoxels, parms->voxelsize,
02071 origin, parms->density, parms->orbitalgrid + planesize*k);
02072 continue;
02073 }
02074 #endif
02075 
02076 } // runtime cpucaps-based dispatch
02077 #endif
02078 
02079 
02080 //
02081 // hard-coded fall-through path if runtime CPU dispatch doesn't match up
02082 //
02083 #if defined(VMDORBUSESSE) && defined(__SSE2__)
02084 if (dispatch_SSE2) {
02085 evaluate_grid_sse(parms->numatoms, parms->wave_f, 
02086 parms->basis_array, parms->atompos, parms->atom_basis,
02087 parms->num_shells_per_atom, parms->num_prim_per_shell,
02088 parms->shell_types, numvoxels, parms->voxelsize,
02089 origin, parms->density, parms->orbitalgrid + planesize*k);
02090 continue;
02091 }
02092 #endif
02093 
02094 #if defined(VMDORBUSEVSX) && defined(__VSX__)
02095 if (dispatch_VSX) {
02096 evaluate_grid_vsx(parms->numatoms, parms->wave_f,
02097 parms->basis_array, parms->atompos, parms->atom_basis,
02098 parms->num_shells_per_atom, parms->num_prim_per_shell,
02099 parms->shell_types, numvoxels, parms->voxelsize,
02100 origin, parms->density, parms->orbitalgrid + planesize*k);
02101 continue;
02102 }
02103 #endif
02104 
02105 // Standard C++-based implementation that uses wither the
02106 // standard math library expf(), a faster modified Cephes expf(),
02107 // our our own fast exponential approximation routine.
02108 evaluate_grid(parms->numatoms, parms->wave_f, 
02109 parms->basis_array, parms->atompos, parms->atom_basis,
02110 parms->num_shells_per_atom, parms->num_prim_per_shell,
02111 parms->shell_types, numvoxels, parms->voxelsize,
02112 origin, parms->density, parms->orbitalgrid + planesize*k);
02113 }
02114 }
02115 
02116 return NULL;
02117 }
02118 
02119 
02120 int evaluate_grid_fast(wkf_cpu_caps_t *cpucaps,
02121 #if defined(VMDORBUSETHRPOOL) 
02122 wkf_threadpool_t *thrpool, 
02123 #else
02124 int numcputhreads,
02125 #endif
02126 int numatoms,
02127 const float *wave_f,
02128 const float *basis_array,
02129 const float *atompos,
02130 const int *atom_basis,
02131 const int *num_shells_per_atom,
02132 const int *num_prim_per_shell,
02133 const int *shell_types,
02134 const int *numvoxels,
02135 float voxelsize,
02136 const float *origin,
02137 int density,
02138 float * orbitalgrid) {
02139 int rc=0;
02140 orbthrparms parms;
02141 
02142 parms.cpucaps = cpucaps;
02143 parms.numatoms = numatoms;
02144 parms.wave_f = wave_f;
02145 parms.basis_array = basis_array;
02146 parms.atompos = atompos;
02147 parms.atom_basis = atom_basis;
02148 parms.num_shells_per_atom = num_shells_per_atom;
02149 parms.num_prim_per_shell = num_prim_per_shell;
02150 parms.shell_types = shell_types;
02151 parms.numvoxels = numvoxels;
02152 parms.voxelsize = voxelsize;
02153 parms.origin = origin;
02154 parms.density = density;
02155 parms.orbitalgrid = orbitalgrid;
02156 
02157 /* spawn child threads to do the work */
02158 wkf_tasktile_t tile;
02159 tile.start = 0;
02160 tile.end = numvoxels[2];
02161 
02162 #if defined(VMDORBUSETHRPOOL) 
02163 wkf_threadpool_sched_dynamic(thrpool, &tile);
02164 rc = wkf_threadpool_launch(thrpool, orbitalthread, &parms, 1);
02165 #else
02166 rc = wkf_threadlaunch(numcputhreads, &parms, orbitalthread, &tile);
02167 #endif
02168 
02169 return rc;
02170 }
02171 
02172 
02173 void Orbital::print_wavefunction() {
02174 // XXX Android, IRIX, and Windows don't provide log2f(), nor log2() ?!?!?!?!
02175 // for now we'll just avoid compiling this debugging code
02176 #if !(defined(_MSC_VER) || defined(ARCH_IRIX6) || defined(ARCH_IRIX6_64) || defined(ARCH_ANDROIDARMV7A))
02177 char shellname[6] = {'S', 'P', 'D', 'F', 'G', 'H'};
02178 int ifunc = 0;
02179 int at;
02180 int shell;
02181 for (at=0; at<numatoms; at++) {
02182 for (shell=0; shell < num_shells_per_atom[at]; shell++) {
02183 int shelltype = basis_set[at].shell[shell].type;
02184 
02185 // avoid unnecessary branching and minimize use of pow()
02186 int i, j, iang=0; 
02187 float xdist=2.0;
02188 float ydist=2.0;
02189 float zdist=2.0;
02190 float xdp, ydp, zdp;
02191 float xdiv = 1.0f / xdist;
02192 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
02193 int imax = shelltype - j; 
02194 for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
02195 printf("%3i %c", at, shellname[shelltype]);
02196 int k, m=0;
02197 char buf[20]; buf[0] = '0円';
02198 for (k=0; k<(int)log2f(xdp); k++, m++) sprintf(buf+m, "x");
02199 for (k=0; k<(int)log2f(ydp); k++, m++) sprintf(buf+m, "y");
02200 for (k=0; k<(int)log2f(zdp); k++, m++) sprintf(buf+m, "z");
02201 //char *ang = qmdata->get_angular_momentum(at, shell, iang);
02202 printf("%-5s (%1.0f%1.0f%1.0f) wave_f[%3i] = % 11.6f\n", buf,
02203 log2f(xdp), log2f(ydp), log2f(zdp), ifunc, wave_f[ifunc]);
02204 //delete [] ang;
02205 iang++;
02206 ifunc++;
02207 }
02208 }
02209 }
02210 }
02211 #endif
02212 
02213 }