00001 /*************************************************************************** 00002 *cr 00003 *cr (C) Copyright 1995-2019 The Board of Trustees of the 00004 *cr University of Illinois 00005 *cr All Rights Reserved 00006 *cr 00007 ***************************************************************************/ 00008 /*************************************************************************** 00009 * RCS INFORMATION: 00010 * 00011 * $RCSfile: Orbital.C,v $ 00012 * $Author: johns $ $Locker: $ $State: Exp $ 00013 * $Revision: 1.166 $ $Date: 2022年05月23日 19:10:01 $ 00014 * 00015 ***************************************************************************/ 00021 // Intel x86 hardware 00022 #if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64)) 00023 #if !defined(__SSE2__) && defined(_WIN64) 00024 #define __SSE2__ 1 /* MSVC fails to define SSE macros */ 00025 #endif 00026 #if defined(__SSE2__) 00027 #include <emmintrin.h> 00028 #define VMDORBUSESSE 1 /* build SSE code for static launch */ 00029 #endif 00030 00031 #if defined(VMDCPUDISPATCH) 00032 #define VECPADSZ 16 /* max vec size is for x86 AVX512F or AVX512ER */ 00033 #else 00034 #define VECPADSZ 4 /* fall-back to x86 SSE vector size */ 00035 #endif 00036 00037 // IBM Power 8/9/10 Altivec/VSX instructions: 00038 // https://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf 00039 #elif defined(__VSX__) 00040 #if defined(__GNUC__) && defined(__VEC__) 00041 #include <altivec.h> 00042 #endif 00043 // The OpenPOWER VSX code path runs on POWER8 and later hardware, but is 00044 // untested on older platforms that support VSX instructions. 00045 // XXX GCC 4.8.5 breaks with conflicts between vec_xxx() routines 00046 // defined in utilities.h vs. VSX intrinsics in altivec.h and similar. 00047 // For now, we disable VSX for GCC for this source file. 00048 #define VECPADSZ 4 /* IBM POWER VSX vector size */ 00049 #define VMDORBUSEVSX 1 /* build POWER VSX code for static launch */ 00050 00051 // ARM64 SVE... 00052 #elif defined(__ARM_ARCH_ISA_A64) && !defined(ARCH_MACOSXARM64) 00053 #define VMDUSESVE 1 /* build SVE code for static launch */ 00054 00055 #if defined(VMDCPUDISPATCH) 00056 #define VECPADSZ 16 /* max vec size is for Fujitsu A64fx */ 00057 #else 00058 #define VECPADSZ 4 /* fall-back to NEON vector size */ 00059 #endif 00060 00061 // generic scalar C++ code 00062 #else 00063 #define VECPADSZ 1 /* scalar code, no padding */ 00064 #endif 00065 00066 // padding mask determined from hardware-specific 00067 #define VECPADMASK (VECPADSZ - 1) 00068 00069 // #define DEBUGORBS 1 00070 00071 #include <math.h> 00072 #include <stdio.h> 00073 #include "VMDApp.h" 00074 #include "Orbital.h" 00075 #include "DrawMolecule.h" 00076 #include "utilities.h" 00077 #include "Inform.h" 00078 #include "WKFThreads.h" 00079 #include "WKFUtils.h" 00080 #if defined(VMDCUDA) 00081 #include "CUDAOrbital.h" 00082 #endif 00083 #if defined(VMDOPENCL) 00084 #include "OpenCLUtils.h" 00085 #include "OpenCLKernels.h" 00086 #endif 00087 #include "ProfileHooks.h" 00088 00089 // 00090 // fctn prototypes for CPU runtime dispatch kernels 00091 // 00092 00093 // AVX-512ER implementation for Xeon Phi w/ special fctn units 00094 extern int evaluate_grid_avx512er(int numatoms, 00095 const float *wave_f, const float *basis_array, 00096 const float *atompos, 00097 const int *atom_basis, 00098 const int *num_shells_per_atom, 00099 const int *num_prim_per_shell, 00100 const int *shell_types, 00101 const int *numvoxels, 00102 float voxelsize, 00103 const float *origin, 00104 int density, 00105 float * orbitalgrid); 00106 00107 // AVX-512F implementation for CPUs without exponent/reciprocal support 00108 extern int evaluate_grid_avx512f(int numatoms, 00109 const float *wave_f, const float *basis_array, 00110 const float *atompos, 00111 const int *atom_basis, 00112 const int *num_shells_per_atom, 00113 const int *num_prim_per_shell, 00114 const int *shell_types, 00115 const int *numvoxels, 00116 float voxelsize, 00117 const float *origin, 00118 int density, 00119 float * orbitalgrid); 00120 00121 // AVX2 implementation 00122 extern int evaluate_grid_avx2(int numatoms, 00123 const float *wave_f, const float *basis_array, 00124 const float *atompos, 00125 const int *atom_basis, 00126 const int *num_shells_per_atom, 00127 const int *num_prim_per_shell, 00128 const int *shell_types, 00129 const int *numvoxels, 00130 float voxelsize, 00131 const float *origin, 00132 int density, 00133 float * orbitalgrid); 00134 00135 // ARM NEON implementation 00136 extern int evaluate_grid_neon(int numatoms, 00137 const float *wave_f, const float *basis_array, 00138 const float *atompos, 00139 const int *atom_basis, 00140 const int *num_shells_per_atom, 00141 const int *num_prim_per_shell, 00142 const int *shell_types, 00143 const int *numvoxels, 00144 float voxelsize, 00145 const float *origin, 00146 int density, 00147 float * orbitalgrid); 00148 00149 // ARM SVE implementation 00150 extern int evaluate_grid_sve(int numatoms, 00151 const float *wave_f, const float *basis_array, 00152 const float *atompos, 00153 const int *atom_basis, 00154 const int *num_shells_per_atom, 00155 const int *num_prim_per_shell, 00156 const int *shell_types, 00157 const int *numvoxels, 00158 float voxelsize, 00159 const float *origin, 00160 int density, 00161 float * orbitalgrid); 00162 00163 00164 #define ANGS_TO_BOHR 1.88972612478289694072f 00165 00167 Orbital::Orbital(const float *pos, 00168 const float *wfn, 00169 const float *barray, 00170 const basis_atom_t *bset, 00171 const int *types, 00172 const int *asort, 00173 const int *abasis, 00174 const float **norm, 00175 const int *nshells, 00176 const int *nprimshell, 00177 const int *shelltypes, 00178 int natoms, int ntypes, int numwave, int numbasis, 00179 int orbid) : 00180 numatoms(natoms), atompos(pos), 00181 num_wave_f(numwave), 00182 wave_f(NULL), 00183 num_basis_funcs(numbasis), 00184 basis_array(barray), 00185 numtypes(ntypes), 00186 basis_set(bset), 00187 atom_types(types), 00188 atom_sort(asort), 00189 atom_basis(abasis), 00190 norm_factors(norm), 00191 num_shells_per_atom(nshells), 00192 num_prim_per_shell(nprimshell), 00193 shell_types(shelltypes), 00194 grid_data(NULL) 00195 { 00196 origin[0] = origin[1] = origin[2] = 0.0; 00197 00198 // Multiply wavefunction coefficients with the 00199 // angular momentum dependent part of the basis set 00200 // normalization factors. 00201 normalize_wavefunction(wfn + num_wave_f*orbid); 00202 00203 //print_wavefunction(); 00204 } 00205 00207 Orbital::~Orbital() { 00208 if (wave_f) delete [] wave_f; 00209 } 00210 00211 00212 // Multiply wavefunction coefficients with the 00213 // basis set normalization factors. We do this 00214 // here rather than normalizing the basisset itself 00215 // because we need different factors for the different 00216 // cartesian components of a shell and the basis set 00217 // stores data only per shell. 00218 // By doing the multiplication here we save a lot of 00219 // flops during orbital rendering. 00220 void Orbital::normalize_wavefunction(const float *wfn) { 00221 #ifdef DEBUGORBS 00222 char shellname[8] = {'S', 'P', 'D', 'F', 'G', 'H', 'I', 'K'}; 00223 #endif 00224 int i, j, k; 00225 // Get size of the symmetry-expanded wavefunction array 00226 // int wave_size = 0; 00227 // for (i=0; i<numatoms; i++) { 00228 // printf("atom[%d]: type = %d\n", i, atom_types[i]); 00229 // const basis_atom_t *basis_atom = &basis_set[atom_types[i]]; 00230 // for (j=0; j<basis_atom->numshells; j++) { 00231 // wave_size += basis_atom->shell[j].num_cart_func; 00232 // } 00233 // } 00234 // printf("num_wave_f/wave_size = %d/%d\n", num_wave_f, wave_size); 00235 00236 wave_f = new float[num_wave_f]; 00237 int ifunc = 0; 00238 for (i=0; i<numatoms; i++) { 00239 const basis_atom_t *basis_atom = &basis_set[atom_types[i]]; 00240 for (j=0; j<basis_atom->numshells; j++) { 00241 int stype = basis_atom->shell[j].type; 00242 #ifdef DEBUGORBS 00243 printf("atom %i/%i, %i/%i %c-shell\n", i+1, numatoms, j+1, basis_atom->numshells, shellname[stype]); 00244 #endif 00245 for (k=0; k<basis_atom->shell[j].num_cart_func; k++) { 00246 wave_f[ifunc] = wfn[ifunc] * norm_factors[stype][k]; 00247 00248 #ifdef DEBUGORBS 00249 printf("%3i %c %2i wave_f[%3i]=% f norm=%.3f normwave=% f\n", 00250 i, shellname[stype], k, ifunc, wfn[ifunc], 00251 norm_factors[stype][k], wave_f[ifunc]); 00252 #endif 00253 ifunc++; 00254 } 00255 } 00256 } 00257 } 00258 00259 00260 // Sets the grid dimensions to the bounding box of the given 00261 // set of atoms *pos including a padding in all dimensions. 00262 // The resulting grid dimensions will be rounded to a multiple 00263 // of the voxel size. 00264 int Orbital::set_grid_to_bbox(const float *pos, float padding, 00265 float resolution) { 00266 int i = 0; 00267 float xyzdim[3]; 00268 00269 /* set initial values of temp values to the coordinates 00270 * of the first atom. */ 00271 origin[0] = xyzdim[0] = pos[0]; 00272 origin[1] = xyzdim[1] = pos[1]; 00273 origin[2] = xyzdim[2] = pos[2]; 00274 00275 /* now loop over the rest of the atoms to check if there's 00276 * something larger/smaller for the maximum and minimum 00277 * respectively */ 00278 for(i=1; i<numatoms; i++) { 00279 if (pos[3*i ] < origin[0]) origin[0] = pos[3*i]; 00280 if (pos[3*i+1] < origin[1]) origin[1] = pos[3*i+1]; 00281 if (pos[3*i+2] < origin[2]) origin[2] = pos[3*i+2]; 00282 if (pos[3*i ] > xyzdim[0]) xyzdim[0] = pos[3*i]; 00283 if (pos[3*i+1] > xyzdim[1]) xyzdim[1] = pos[3*i+1]; 00284 if (pos[3*i+2] > xyzdim[2]) xyzdim[2] = pos[3*i+2]; 00285 } 00286 00287 // Apply padding in each direction 00288 origin[0] -= padding; 00289 origin[1] -= padding; 00290 origin[2] -= padding; 00291 gridsize[0] = xyzdim[0] + padding - origin[0]; 00292 gridsize[1] = xyzdim[1] + padding - origin[1]; 00293 gridsize[2] = xyzdim[2] + padding - origin[2]; 00294 00295 set_resolution(resolution); 00296 00297 return TRUE; 00298 } 00299 00300 00301 // Set the dimensions and resolution of the grid for which 00302 // the orbital shall be computed. 00303 // The given grid dimensions will be rounded to a multiple 00304 // of the voxel size. 00305 void Orbital::set_grid(float newori[3], float newdim[3], float newvoxelsize) { 00306 origin[0] = newori[0]; 00307 origin[1] = newori[1]; 00308 origin[2] = newori[2]; 00309 gridsize[0] = newdim[0]; 00310 gridsize[1] = newdim[1]; 00311 gridsize[2] = newdim[2]; 00312 set_resolution(newvoxelsize); 00313 } 00314 00315 // Change the resolution of the grid 00316 void Orbital::set_resolution(float resolution) { 00317 voxelsize = resolution; 00318 int i; 00319 for (i=0; i<3; i++) { 00320 numvoxels[i] = (int)(gridsize[i]/voxelsize) + 1; 00321 gridsize[i] = voxelsize*(numvoxels[i]-1); 00322 } 00323 } 00324 00325 #define XNEG 0 00326 #define YNEG 1 00327 #define ZNEG 2 00328 #define XPOS 3 00329 #define YPOS 4 00330 #define ZPOS 5 00331 00332 // Check if all values in the boundary plane given by dir 00333 // are below threshold. 00334 // If not, jump back, decrease the stepsize and test again. 00335 int Orbital::check_plane(int dir, float threshold, int minstepsize, 00336 int &stepsize) { 00337 bool repeat=0; 00338 int u, v, w, nu, nv; 00339 // w is the dimension we want to adjust, 00340 // u and v are the other two, i.e. the plane in which we test 00341 // the orbital values. 00342 u = (dir+1)%3; 00343 v = (dir+2)%3; 00344 w = dir%3; 00345 00346 // for debugging 00347 //char axis[3] = {'X', 'Y', 'Z'}; 00348 //char sign[2] = {'-', '+'}; 00349 //printf("%c%c: ", sign[dir/3], axis[w]); 00350 00351 do { 00352 int success = 0; 00353 int gridstep = stepsize; 00354 00355 if (repeat) { 00356 // We are repeating the test on the previous slate but with 00357 // twice the resolution. Hence we only have to test the new 00358 // grid points lying between the old ones. 00359 gridstep = 2*stepsize; 00360 } 00361 00362 00363 float grid[3]; 00364 grid[w] = origin[w] + (dir/3)*(numvoxels[w]-1) * voxelsize; 00365 00366 // Search for a value of the wave function larger than threshold. 00367 for (nu=0; nu<numvoxels[u]; nu+=gridstep) { 00368 grid[u] = origin[u] + nu * voxelsize; 00369 00370 for (nv=0; nv<numvoxels[v]; nv+=gridstep) { 00371 grid[v] = origin[v] + nv * voxelsize; 00372 00373 if (fabs(evaluate_grid_point(grid[0], grid[1], grid[2])) > threshold) { 00374 success = 1; 00375 break; 00376 } 00377 } 00378 if (success) break; 00379 } 00380 00381 if (success) { 00382 // Found an orbital value higher than the threshold. 00383 // We want the threshold isosurface to be completely inside the grid. 00384 // The boundary must be between the previous and this plane. 00385 if (!(dir/3)) origin[w] -= stepsize*voxelsize; 00386 numvoxels[w] += stepsize; 00387 if (stepsize<=minstepsize) { 00388 //printf("success!\n"); 00389 return 1; 00390 } 00391 stepsize /=2; 00392 repeat = 1; 00393 //printf("increase by %i, reduce stepsize to %i.\n", 2*stepsize, stepsize); 00394 00395 } else { 00396 // All values lower than threshold, we decrease the grid size. 00397 if (!(dir/3)) origin[w] += stepsize*voxelsize; 00398 numvoxels[w] -= stepsize; 00399 //printf("decrease by %i\n", stepsize); 00400 repeat = 0; 00401 if (numvoxels[w] <= 1) { 00402 // Here we ended up with a zero grid size. 00403 // We must have missed something. Let's increase grid again and 00404 // try a smaller step size. 00405 numvoxels[w] = stepsize; 00406 if (!(dir/3)) origin[w] -= stepsize*voxelsize; 00407 stepsize /=2; 00408 repeat = 1; 00409 //printf("zero grid size - increase to %i, reduce stepsize to %i.\n", 2*stepsize, stepsize); 00410 } 00411 } 00412 00413 } while (repeat); 00414 00415 return 0; 00416 } 00417 00418 00419 // Optimize position and dimension of current grid so that all orbital 00420 // values higher than threshold are contained in the grid. 00421 // 00422 // Algorithm: 00423 // Based on the idea that the wave function trails off in a distance of 00424 // a few Angstroms from the molecule. 00425 // We start from the current grid size (which could be for instance the 00426 // molecular bounding box plus a padding region) and test the values on 00427 // each of the six boundary planes. If there is no value larger than the 00428 // given threshold in a plane then we shrink the system along the plane 00429 // normal. In the distance the wave function tends to be smoother so we 00430 // start the testing on a coarser grid. A parameter maxstepsize=4 means 00431 // to begin with a grid using a four times higher voxel side length than 00432 // the original grid. When we find the first value above the threshold we 00433 // jump back one step and continue with half of the previous stepsize. 00434 // When stepsize has reached minstepsize then we consider the corresponding 00435 // boundary plane optimal. Note that starting out with a too coarse 00436 // grid one might miss some features of the wave function. 00437 // If you want to be sure not to miss anything then use the voxelsize 00438 // for both minstepsize and maxstepsize. 00439 void Orbital::find_optimal_grid(float threshold, 00440 int minstepsize, int maxstepsize) { 00441 int optimal[6] = {0, 0, 0, 0, 0, 0}; 00442 int stepsize[6]; 00443 int i; 00444 for (i=0; i<6; i++) stepsize[i] = maxstepsize; 00445 00446 #ifdef DEBUGORBS 00447 printf("origin = {%f %f %f}\n", origin[0], origin[1], origin[2]); 00448 printf("gridsize = {%f %f %f}\n", gridsize[0], gridsize[1], gridsize[2]); 00449 #endif 00450 00451 00452 // Loop until we have optimal grid boundaries in all 00453 // dimensions 00454 int iter = 0; 00455 while ( !optimal[0] || !optimal[1] || !optimal[2] || 00456 !optimal[3] || !optimal[4] || !optimal[5] ) 00457 { 00458 if (iter>100) { 00459 msgInfo << "WARNING: Could not optimize orbital grid boundaries in" 00460 << iter << "steps!" << sendmsg; 00461 break; 00462 } 00463 iter++; 00464 00465 // Examine the current grid boundaries and shrink if 00466 // all values are smaller than threshold . 00467 if (!optimal[XNEG]) 00468 optimal[XNEG] = check_plane(XNEG, threshold, minstepsize, stepsize[XNEG]); 00469 00470 if (!optimal[XPOS]) 00471 optimal[XPOS] = check_plane(XPOS, threshold, minstepsize, stepsize[XPOS]); 00472 00473 if (!optimal[YNEG]) 00474 optimal[YNEG] = check_plane(YNEG, threshold, minstepsize, stepsize[YNEG]); 00475 00476 if (!optimal[YPOS]) 00477 optimal[YPOS] = check_plane(YPOS, threshold, minstepsize, stepsize[YPOS]); 00478 00479 if (!optimal[ZNEG]) 00480 optimal[ZNEG] = check_plane(ZNEG, threshold, minstepsize, stepsize[ZNEG]); 00481 00482 if (!optimal[ZPOS]) 00483 optimal[ZPOS] = check_plane(ZPOS, threshold, minstepsize, stepsize[ZPOS]); 00484 00485 #if defined(DEBUGORBS) 00486 printf("origin {%f %f %f}\n", origin[0], origin[1], origin[2]); 00487 printf("ngrid {%i %i %i}\n", numvoxels[0], numvoxels[1], numvoxels[2]); 00488 printf("stepsize {%i %i %i %i %i %i}\n", stepsize[0], stepsize[1], stepsize[2], 00489 stepsize[3], stepsize[4], stepsize[5]); 00490 #endif 00491 } 00492 00493 00494 gridsize[0] = numvoxels[0]*voxelsize; 00495 gridsize[1] = numvoxels[1]*voxelsize; 00496 gridsize[2] = numvoxels[2]*voxelsize; 00497 } 00498 00499 00500 // this function creates the orbital grid given the system dimensions 00501 int Orbital::calculate_mo(DrawMolecule *mol, int density) { 00502 PROFILE_PUSH_RANGE("Orbital", 4); 00503 00504 wkf_timerhandle timer=wkf_timer_create(); 00505 wkf_timer_start(timer); 00506 00507 // 00508 // Force vectorized N-element padding for the X dimension to prevent 00509 // the possibility of an out-of-bounds orbital grid read/write operation 00510 // 00511 int vecpadmask = VECPADMASK; 00512 if ((mol->app->cpucaps != NULL) && (mol->app->cpucaps->flags & CPU_AVX2)) { 00513 vecpadmask = 7; // AVX2 kernels pad to multiples of 8 00514 } 00515 if ((mol->app->cpucaps != NULL) && (mol->app->cpucaps->flags & (CPU_AVX512F | CPU_AVX512ER))) { 00516 vecpadmask = 15; // AVX512 kernels pad to multiples of 16 00517 } 00518 00519 // pad the grid X dimension for vector-multiple length and memory alignment 00520 numvoxels[0] = (numvoxels[0] + vecpadmask) & ~(vecpadmask); 00521 gridsize[0] = numvoxels[0]*voxelsize; 00522 00523 // Allocate memory for the volumetric grid 00524 int numgridpoints = numvoxels[0] * numvoxels[1] * numvoxels[2]; 00525 grid_data = new float[numgridpoints]; 00526 00527 #if defined(DEBUGORBS) 00528 printf("num_wave_f=%i\n", num_wave_f); 00529 00530 int i=0; 00531 for (i=0; i<num_wave_f; i++) { 00532 printf("wave_f[%i] = %f\n", i, wave_f[i]); 00533 } 00534 00535 // perhaps give the user a warning, since the calculation 00536 // could take a while, otherwise they might think the system is borked 00537 printf("Calculating %ix%ix%i orbital grid.\n", 00538 numvoxels[0], numvoxels[1], numvoxels[2]); 00539 #endif 00540 00541 00542 int rc=-1; // initialize to sentinel value 00543 00544 // Calculate the value of the orbital at each gridpoint 00545 #if defined(VMDCUDA) 00546 // The CUDA kernel currently only handles up to "G" shells, 00547 // and up to 32 primitives per basis function 00548 if ((max_shell_type() <= G_SHELL) && 00549 (max_primitives() <= 32) && 00550 (!getenv("VMDNOCUDA"))) { 00551 rc = vmd_cuda_evaluate_orbital_grid(mol->cuda_devpool(), 00552 numatoms, wave_f, num_wave_f, 00553 basis_array, num_basis_funcs, 00554 atompos, atom_basis, 00555 num_shells_per_atom, 00556 num_prim_per_shell, 00557 shell_types, total_shells(), 00558 numvoxels, voxelsize, 00559 origin, density, grid_data); 00560 } 00561 #endif 00562 #if defined(VMDOPENCL) 00563 // The OpenCL kernel currently only handles up to "G" shells, 00564 // and up to 32 primitives per basis function 00565 if (rc!=0 && 00566 (max_shell_type() <= G_SHELL) && 00567 (max_primitives() <= 32) && 00568 (!getenv("VMDNOOPENCL"))) { 00569 00570 #if 1 00571 // XXX this would be done during app startup normally... 00572 static vmd_opencl_orbital_handle *orbh = NULL; 00573 static cl_context clctx = NULL; 00574 static cl_command_queue clcmdq = NULL; 00575 static cl_device_id *cldevs = NULL; 00576 if (orbh == NULL) { 00577 printf("Attaching OpenCL device:\n"); 00578 wkf_timer_start(timer); 00579 cl_int clerr = CL_SUCCESS; 00580 00581 cl_platform_id clplatid = vmd_cl_get_platform_index(0); 00582 cl_context_properties clctxprops[] = {(cl_context_properties) CL_CONTEXT_PLATFORM, (cl_context_properties) clplatid, (cl_context_properties) 0}; 00583 clctx = clCreateContextFromType(clctxprops, CL_DEVICE_TYPE_GPU, NULL, NULL, &clerr); 00584 00585 size_t parmsz; 00586 clerr |= clGetContextInfo(clctx, CL_CONTEXT_DEVICES, 0, NULL, &parmsz); 00587 if (clerr != CL_SUCCESS) return -1; 00588 cldevs = (cl_device_id *) malloc(parmsz); 00589 if (clerr != CL_SUCCESS) return -1; 00590 clerr |= clGetContextInfo(clctx, CL_CONTEXT_DEVICES, parmsz, cldevs, NULL); 00591 if (clerr != CL_SUCCESS) return -1; 00592 clcmdq = clCreateCommandQueue(clctx, cldevs[0], 0, &clerr); 00593 if (clerr != CL_SUCCESS) return -1; 00594 wkf_timer_stop(timer); 00595 printf(" OpenCL context creation time: %.3f sec\n", wkf_timer_time(timer)); 00596 00597 wkf_timer_start(timer); 00598 orbh = vmd_opencl_create_orbital_handle(clctx, clcmdq, cldevs); 00599 wkf_timer_stop(timer); 00600 printf(" OpenCL kernel compilation time: %.3f sec\n", wkf_timer_time(timer)); 00601 00602 wkf_timer_start(timer); 00603 } 00604 #endif 00605 00606 rc = vmd_opencl_evaluate_orbital_grid(mol->cuda_devpool(), orbh, 00607 numatoms, wave_f, num_wave_f, 00608 basis_array, num_basis_funcs, 00609 atompos, atom_basis, 00610 num_shells_per_atom, 00611 num_prim_per_shell, 00612 shell_types, total_shells(), 00613 numvoxels, voxelsize, 00614 origin, density, grid_data); 00615 00616 #if 0 00617 // XXX this would normally be done at program shutdown 00618 vmd_opencl_destroy_orbital_handle(parms.orbh); 00619 clReleaseCommandQueue(clcmdq); 00620 clReleaseContext(clctx); 00621 free(cldevs); 00622 #endif 00623 } 00624 #endif 00625 #if 0 00626 int numprocs = 1; 00627 if (getenv("VMDDUMPORBITALS")) { 00628 write_orbital_data(getenv("VMDDUMPORBITALS"), numatoms, 00629 wave_f, num_wave_f, basis_array, num_basis, 00630 atompos, atom_basis, num_shells_per_atom, 00631 num_prim_per_shell, shell_types, 00632 num_shells, numvoxels, voxelsize, origin); 00633 00634 read_calc_orbitals(devpool, getenv("VMDDUMPORBITALS")); 00635 } 00636 #endif 00637 00638 00639 #if !defined(VMDORBUSETHRPOOL) 00640 #if defined(VMDTHREADS) 00641 int numcputhreads = wkf_thread_numprocessors(); 00642 #else 00643 int numcputhreads = 1; 00644 #endif 00645 #endif 00646 if (rc!=0) rc = evaluate_grid_fast(mol->app->cpucaps, 00647 #if defined(VMDORBUSETHRPOOL) 00648 mol->cpu_threadpool(), 00649 #else 00650 numcputhreads, 00651 #endif 00652 numatoms, wave_f, basis_array, 00653 atompos, atom_basis, 00654 num_shells_per_atom, num_prim_per_shell, 00655 shell_types, numvoxels, voxelsize, 00656 origin, density, grid_data); 00657 00658 if (rc!=0) { 00659 msgErr << "Error computing orbital grid" << sendmsg; 00660 delete [] grid_data; 00661 grid_data=NULL; 00662 00663 PROFILE_POP_RANGE(); // first return point 00664 00665 return FALSE; 00666 } 00667 00668 wkf_timer_stop(timer); 00669 00670 #if 1 00671 if (getenv("VMDORBTIMING") != NULL) { 00672 double gflops = (numgridpoints * flops_per_gridpoint()) / (wkf_timer_time(timer) * 1000000000.0); 00673 00674 char strbuf[1024]; 00675 sprintf(strbuf, "Orbital calc. time %.3f secs, %.2f gridpoints/sec, %.2f GFLOPS", 00676 wkf_timer_time(timer), 00677 (((double) numgridpoints) / wkf_timer_time(timer)), 00678 gflops); 00679 msgInfo << strbuf << sendmsg; 00680 } 00681 #endif 00682 00683 wkf_timer_destroy(timer); 00684 00685 PROFILE_POP_RANGE(); // second return point 00686 00687 return TRUE; 00688 } 00689 00690 00691 /********************************************************* 00692 * 00693 * This function calculates the value of the wavefunction 00694 * corresponding to a particular orbital at grid point 00695 * grid_x, grid_y, grid_z. 00696 00697 00698 Here's an example of a basis set definition for one atom: 00699 00700 SHELL TYPE PRIMITIVE EXPONENT CONTRACTION COEFFICIENT(S) 00701 00702 Oxygen 00703 00704 1 S 1 5484.6716600 0.001831074430 00705 1 S 2 825.2349460 0.013950172200 00706 1 S 3 188.0469580 0.068445078098 00707 1 S 4 52.9645000 0.232714335992 00708 1 S 5 16.8975704 0.470192897984 00709 1 S 6 5.7996353 0.358520852987 00710 00711 2 L 7 15.5396162 -0.110777549525 0.070874268231 00712 2 L 8 3.5999336 -0.148026262701 0.339752839147 00713 2 L 9 1.0137618 1.130767015354 0.727158577316 00714 00715 3 L 10 0.2700058 1.000000000000 1.000000000000 00716 00717 *********************************************************/ 00718 float Orbital::evaluate_grid_point(float grid_x, float grid_y, float grid_z) { 00719 int at; 00720 int prim, shell; 00721 00722 // initialize value of orbital at gridpoint 00723 float value = 0.0; 00724 00725 // initialize the wavefunction and shell counters 00726 int ifunc = 0; 00727 int shell_counter = 0; 00728 00729 // loop over all the QM atoms 00730 for (at=0; at<numatoms; at++) { 00731 int maxshell = num_shells_per_atom[at]; 00732 int prim_counter = atom_basis[at]; 00733 00734 // calculate distance between grid point and center of atom 00735 float xdist = (grid_x - atompos[3*at ])*ANGS_TO_BOHR; 00736 float ydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR; 00737 float zdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR; 00738 float dist2 = xdist*xdist + ydist*ydist + zdist*zdist; 00739 00740 // loop over the shells belonging to this atom 00741 // XXX this is maybe a misnomer because in split valence 00742 // basis sets like 6-31G we have more than one basis 00743 // function per (valence-)shell and we are actually 00744 // looping over the individual contracted GTOs 00745 for (shell=0; shell < maxshell; shell++) { 00746 float contracted_gto = 0.0f; 00747 00748 // Loop over the Gaussian primitives of this contracted 00749 // basis function to build the atomic orbital 00750 int maxprim = num_prim_per_shell[shell_counter]; 00751 int shelltype = shell_types[shell_counter]; 00752 for (prim=0; prim < maxprim; prim++) { 00753 float exponent = basis_array[prim_counter ]; 00754 float contract_coeff = basis_array[prim_counter + 1]; 00755 contracted_gto += contract_coeff * expf(-exponent*dist2); 00756 prim_counter += 2; 00757 } 00758 00759 /* multiply with the appropriate wavefunction coefficient */ 00760 float tmpshell=0; 00761 // Loop over the cartesian angular momenta of the shell. 00762 // avoid unnecessary branching and minimize use of pow() 00763 int i, j; 00764 float xdp, ydp, zdp; 00765 float xdiv = 1.0f / xdist; 00766 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) { 00767 int imax = shelltype - j; 00768 for (i=0, ydp=1.0f, xdp=powf(xdist, float(imax)); i<=imax; i++, ydp*=ydist, xdp*=xdiv) { 00769 tmpshell += wave_f[ifunc++] * xdp * ydp * zdp; 00770 } 00771 } 00772 value += tmpshell * contracted_gto; 00773 00774 shell_counter++; 00775 } 00776 } 00777 00778 /* return the final value at grid point */ 00779 return value; 00780 } 00781 00782 00783 // 00784 // Return the max number of primitives that occur in a basis function 00785 // 00786 int Orbital::max_primitives(void) { 00787 int maxprim=-1; 00788 00789 int shell_counter = 0; 00790 for (int at=0; at<numatoms; at++) { 00791 for (int shell=0; shell < num_shells_per_atom[at]; shell++) { 00792 int numprim = num_prim_per_shell[shell_counter]; 00793 if (numprim > maxprim) 00794 maxprim = numprim; 00795 } 00796 } 00797 00798 return maxprim; 00799 } 00800 00801 00802 // 00803 // Return the maximum shell type used 00804 // 00805 int Orbital::max_shell_type(void) { 00806 int maxshell=-1; 00807 00808 int shell_counter = 0; 00809 for (int at=0; at<numatoms; at++) { 00810 for (int shell=0; shell < num_shells_per_atom[at]; shell++) { 00811 int shelltype = shell_types[shell_counter]; 00812 shell_counter++; 00813 if (shelltype > maxshell) 00814 maxshell=shelltype; 00815 } 00816 } 00817 00818 return maxshell; 00819 } 00820 00821 00822 // 00823 // count the maximum number of wavefunction coefficient accesses 00824 // required for the highest shell types contained in this orbital 00825 // 00826 int Orbital::max_wave_f_count(void) { 00827 int maxcount=0; 00828 00829 int shell_counter = 0; 00830 for (int at=0; at<numatoms; at++) { 00831 for (int shell=0; shell < num_shells_per_atom[at]; shell++) { 00832 int shelltype = shell_types[shell_counter]; 00833 int i, j; 00834 int count=0; 00835 for (i=0; i<=shelltype; i++) { 00836 int jmax = shelltype - i; 00837 for (j=0; j<=jmax; j++) { 00838 count++; 00839 } 00840 } 00841 shell_counter++; 00842 if (count > maxcount) 00843 maxcount=count; 00844 } 00845 } 00846 00847 return maxcount; 00848 } 00849 00850 00851 // 00852 // compute the FLOPS per grid point for performance measurement purposes 00853 // 00854 double Orbital::flops_per_gridpoint() { 00855 double flops=0.0; 00856 00857 int shell_counter = 0; 00858 for (int at=0; at<numatoms; at++) { 00859 flops += 7; 00860 00861 for (int shell=0; shell < num_shells_per_atom[at]; shell++) { 00862 for (int prim=0; prim < num_prim_per_shell[shell_counter]; prim++) 00863 flops += 4; // expf() costs far more, but we count as one. 00864 00865 int shelltype = shell_types[shell_counter]; 00866 00867 switch (shelltype) { 00868 // separately count for the hand-optimized cases 00869 case S_SHELL: flops += 2; break; 00870 case P_SHELL: flops += 8; break; 00871 case D_SHELL: flops += 17; break; 00872 case F_SHELL: flops += 30; break; 00873 case G_SHELL: flops += 50; break; 00874 00875 // count up for catch-all loop 00876 default: 00877 int i, j; 00878 for (i=0; i<=shelltype; i++) { 00879 int jmax = shelltype - i; 00880 flops += 1; 00881 for (j=0; j<=jmax; j++) { 00882 flops += 6; 00883 } 00884 } 00885 break; 00886 } 00887 00888 shell_counter++; 00889 } 00890 } 00891 00892 return flops; 00893 } 00894 00895 00896 // 00897 // Fast single-precision expf() implementation 00898 // Adapted from the free cephes math library on Netlib 00899 // http://www.netlib.org/cephes/ 00900 // 00901 // Cephes Math Library Release 2.2: June, 1992 00902 // Copyright 1984, 1987, 1989 by Stephen L. Moshier 00903 // Direct inquiries to 30 Frost Street, Cambridge, MA 02140 00904 // 00905 static const float MAXNUMF = 3.4028234663852885981170418348451692544e38f; 00906 static const float MAXLOGF = 88.72283905206835f; 00907 static const float MINLOGF = -103.278929903431851103f; /* log(2^-149) */ 00908 static const float LOG2EF = 1.44269504088896341f; 00909 static const float C1 = 0.693359375f; 00910 static const float C2 = -2.12194440e-4f; 00911 00912 static inline float cephesfastexpf(float x) { 00913 float z; 00914 int n; 00915 00916 if(x > MAXLOGF) 00917 return MAXNUMF; 00918 00919 if(x < MINLOGF) 00920 return 0.0; 00921 00922 // Express e^x = e^g 2^n = e^g e^(n loge(2)) = e^(g + n loge(2)) 00923 z = floorf( LOG2EF * x + 0.5f ); // floor() truncates toward -infinity. 00924 x -= z * C1; 00925 x -= z * C2; 00926 n = (int) z; 00927 00928 z = x * x; 00929 // Theoretical peak relative error in [-0.5, +0.5] is 4.2e-9. 00930 z = ((((( 1.9875691500E-4f * x + 1.3981999507E-3f) * x 00931 + 8.3334519073E-3f) * x + 4.1665795894E-2f) * x 00932 + 1.6666665459E-1f) * x + 5.0000001201E-1f) * z + x + 1.0f; 00933 00934 x = ldexpf(z, n); // multiply by power of 2 00935 return x; 00936 } 00937 00938 00939 00940 00941 /* 00942 * David J. Hardy 00943 * 12 Dec 2008 00944 * 00945 * aexpfnx() - Approximate expf() for negative x. 00946 * 00947 * Assumes that x <= 0. 00948 * 00949 * Assumes IEEE format for single precision float, specifically: 00950 * 1 sign bit, 8 exponent bits biased by 127, and 23 mantissa bits. 00951 * 00952 * Interpolates exp() on interval (-1/log2(e), 0], then shifts it by 00953 * multiplication of a fast calculation for 2^(-N). The interpolation 00954 * uses a linear blending of 3rd degree Taylor polynomials at the end 00955 * points, so the approximation is once differentiable. 00956 * 00957 * The error is small (max relative error per interval is calculated 00958 * to be 0.131%, with a max absolute error of -0.000716). 00959 * 00960 * The cutoff is chosen so as to speed up the computation by early 00961 * exit from function, with the value chosen to give less than the 00962 * the max absolute error. Use of a cutoff is unnecessary, except 00963 * for needing to shift smallest floating point numbers to zero, 00964 * i.e. you could remove cutoff and replace by: 00965 * 00966 * #define MINXNZ -88.0296919311130 // -127 * log(2) 00967 * 00968 * if (x < MINXNZ) return 0.f; 00969 * 00970 * Use of a cutoff causes a discontinuity which can be eliminated 00971 * through the use of a switching function. 00972 * 00973 * We can obtain arbitrarily smooth approximation by taking k+1 nodes on 00974 * the interval and weighting their respective Taylor polynomials by the 00975 * kth order Lagrange interpolant through those nodes. The wiggle in the 00976 * polynomial interpolation due to equidistant nodes (Runge's phenomenon) 00977 * can be reduced by using Chebyshev nodes. 00978 */ 00979 00980 #define MLOG2EF -1.44269504088896f 00981 00982 /* 00983 * Interpolating coefficients for linear blending of the 00984 * 3rd degree Taylor expansion of 2^x about 0 and -1. 00985 */ 00986 #define SCEXP0 1.0000000000000000f 00987 #define SCEXP1 0.6987082824680118f 00988 #define SCEXP2 0.2633174272827404f 00989 #define SCEXP3 0.0923611991471395f 00990 #define SCEXP4 0.0277520543324108f 00991 00992 /* for single precision float */ 00993 #define EXPOBIAS 127 00994 #define EXPOSHIFT 23 00995 00996 /* cutoff is optional, but can help avoid unnecessary work */ 00997 #define ACUTOFF -10 00998 00999 typedef union flint_t { 01000 float f; 01001 int n; 01002 } flint; 01003 01004 float aexpfnx(float x) { 01005 /* assume x <= 0 */ 01006 float mb; 01007 int mbflr; 01008 float d; 01009 float sy; 01010 flint scalfac; 01011 01012 if (x < ACUTOFF) return 0.f; 01013 01014 mb = x * MLOG2EF; /* change base to 2, mb >= 0 */ 01015 mbflr = (int) mb; /* get int part, floor() */ 01016 d = mbflr - mb; /* remaining exponent, -1 < d <= 0 */ 01017 sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4))); 01018 /* approx with linear blend of Taylor polys */ 01019 scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT; /* 2^(-mbflr) */ 01020 return (sy * scalfac.f); /* scaled approx */ 01021 } 01022 01023 01024 01025 // 01026 // Optimized molecular orbital grid evaluation code 01027 // 01028 #define S_SHELL 0 01029 #define P_SHELL 1 01030 #define D_SHELL 2 01031 #define F_SHELL 3 01032 #define G_SHELL 4 01033 #define H_SHELL 5 01034 01035 int evaluate_grid(int numatoms, 01036 const float *wave_f, const float *basis_array, 01037 const float *atompos, 01038 const int *atom_basis, 01039 const int *num_shells_per_atom, 01040 const int *num_prim_per_shell, 01041 const int *shell_types, 01042 const int *numvoxels, 01043 float voxelsize, 01044 const float *origin, 01045 int density, 01046 float * orbitalgrid) { 01047 if (!orbitalgrid) 01048 return -1; 01049 01050 int nx, ny, nz; 01051 // Calculate the value of the orbital at each gridpoint and store in 01052 // the current oribtalgrid array 01053 int numgridxy = numvoxels[0]*numvoxels[1]; 01054 for (nz=0; nz<numvoxels[2]; nz++) { 01055 float grid_x, grid_y, grid_z; 01056 grid_z = origin[2] + nz * voxelsize; 01057 for (ny=0; ny<numvoxels[1]; ny++) { 01058 grid_y = origin[1] + ny * voxelsize; 01059 int gaddrzy = ny*numvoxels[0] + nz*numgridxy; 01060 for (nx=0; nx<numvoxels[0]; nx++) { 01061 grid_x = origin[0] + nx * voxelsize; 01062 01063 // calculate the value of the wavefunction of the 01064 // selected orbital at the current grid point 01065 int at; 01066 int prim, shell; 01067 01068 // initialize value of orbital at gridpoint 01069 float value = 0.0; 01070 01071 // initialize the wavefunction and shell counters 01072 int ifunc = 0; 01073 int shell_counter = 0; 01074 01075 // loop over all the QM atoms 01076 for (at=0; at<numatoms; at++) { 01077 int maxshell = num_shells_per_atom[at]; 01078 int prim_counter = atom_basis[at]; 01079 01080 // calculate distance between grid point and center of atom 01081 float xdist = (grid_x - atompos[3*at ])*ANGS_TO_BOHR; 01082 float ydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR; 01083 float zdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR; 01084 01085 float xdist2 = xdist*xdist; 01086 float ydist2 = ydist*ydist; 01087 float zdist2 = zdist*zdist; 01088 float xdist3 = xdist2*xdist; 01089 float ydist3 = ydist2*ydist; 01090 float zdist3 = zdist2*zdist; 01091 01092 float dist2 = xdist2 + ydist2 + zdist2; 01093 01094 // loop over the shells belonging to this atom 01095 // XXX this is maybe a misnomer because in split valence 01096 // basis sets like 6-31G we have more than one basis 01097 // function per (valence-)shell and we are actually 01098 // looping over the individual contracted GTOs 01099 for (shell=0; shell < maxshell; shell++) { 01100 float contracted_gto = 0.0f; 01101 01102 // Loop over the Gaussian primitives of this contracted 01103 // basis function to build the atomic orbital 01104 // 01105 // XXX there's a significant opportunity here for further 01106 // speedup if we replace the entire set of primitives 01107 // with the single gaussian that they are attempting 01108 // to model. This could give us another 6x speedup in 01109 // some of the common/simple cases. 01110 int maxprim = num_prim_per_shell[shell_counter]; 01111 int shelltype = shell_types[shell_counter]; 01112 for (prim=0; prim<maxprim; prim++) { 01113 float exponent = basis_array[prim_counter ]; 01114 float contract_coeff = basis_array[prim_counter + 1]; 01115 01116 // XXX By premultiplying the stored exponent factors etc, 01117 // we should be able to use exp2f() rather than exp(), 01118 // saving several FLOPS per iteration of this loop 01119 #if defined(__GNUC__) && !defined(__ICC) 01120 // Use David Hardy's fast spline approximation instead 01121 // Works well for GCC, but runs slower for Intel C. 01122 contracted_gto += contract_coeff * aexpfnx(-exponent*dist2); 01123 #elif defined(__ICC) 01124 // When compiling with ICC, we'll use an inlined 01125 // single-precision expf() implementation based on the 01126 // cephes math library found on Netlib. This outruns the 01127 // standard glibc expf() by over 2x in this algorithm. 01128 contracted_gto += contract_coeff * cephesfastexpf(-exponent*dist2); 01129 #else 01130 // XXX By far the most costly operation here is exp(), 01131 // for gcc builds, exp() accounts for 90% of the runtime 01132 contracted_gto += contract_coeff * expf(-exponent*dist2); 01133 #endif 01134 prim_counter += 2; 01135 } 01136 01137 /* multiply with the appropriate wavefunction coefficient */ 01138 float tmpshell=0; 01139 switch (shelltype) { 01140 case S_SHELL: 01141 value += wave_f[ifunc++] * contracted_gto; 01142 break; 01143 01144 case P_SHELL: 01145 tmpshell += wave_f[ifunc++] * xdist; 01146 tmpshell += wave_f[ifunc++] * ydist; 01147 tmpshell += wave_f[ifunc++] * zdist; 01148 value += tmpshell * contracted_gto; 01149 break; 01150 01151 case D_SHELL: 01152 tmpshell += wave_f[ifunc++] * xdist2; 01153 tmpshell += wave_f[ifunc++] * xdist * ydist; 01154 tmpshell += wave_f[ifunc++] * ydist2; 01155 tmpshell += wave_f[ifunc++] * xdist * zdist; 01156 tmpshell += wave_f[ifunc++] * ydist * zdist; 01157 tmpshell += wave_f[ifunc++] * zdist2; 01158 value += tmpshell * contracted_gto; 01159 break; 01160 01161 case F_SHELL: 01162 tmpshell += wave_f[ifunc++] * xdist3; // xxx 01163 tmpshell += wave_f[ifunc++] * xdist2 * ydist; // xxy 01164 tmpshell += wave_f[ifunc++] * ydist2 * xdist; // xyy 01165 tmpshell += wave_f[ifunc++] * ydist3; // yyy 01166 tmpshell += wave_f[ifunc++] * xdist2 * zdist; // xxz 01167 tmpshell += wave_f[ifunc++] * xdist * ydist * zdist; // xyz 01168 tmpshell += wave_f[ifunc++] * ydist2 * zdist; // yyz 01169 tmpshell += wave_f[ifunc++] * zdist2 * xdist; // xzz 01170 tmpshell += wave_f[ifunc++] * zdist2 * ydist; // yzz 01171 tmpshell += wave_f[ifunc++] * zdist3; // zzz 01172 value += tmpshell * contracted_gto; 01173 break; 01174 01175 case G_SHELL: 01176 tmpshell += wave_f[ifunc++] * xdist2 * xdist2; // xxxx 01177 tmpshell += wave_f[ifunc++] * xdist3 * ydist; // xxxy 01178 tmpshell += wave_f[ifunc++] * xdist2 * ydist2; // xxyy 01179 tmpshell += wave_f[ifunc++] * ydist3 * xdist; // xyyy 01180 tmpshell += wave_f[ifunc++] * ydist2 * ydist2; // yyyy 01181 tmpshell += wave_f[ifunc++] * xdist3 * zdist; // xxxz 01182 tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist; // xxyz 01183 tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist; // xyyz 01184 tmpshell += wave_f[ifunc++] * ydist3 * zdist; // yyyz 01185 tmpshell += wave_f[ifunc++] * xdist2 * zdist2; // xxzz 01186 tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist; // xyzz 01187 tmpshell += wave_f[ifunc++] * ydist2 * zdist2; // yyzz 01188 tmpshell += wave_f[ifunc++] * zdist3 * xdist; // zzzx 01189 tmpshell += wave_f[ifunc++] * zdist3 * ydist; // zzzy 01190 tmpshell += wave_f[ifunc++] * zdist2 * zdist2; // zzzz 01191 value += tmpshell * contracted_gto; 01192 break; 01193 01194 default: 01195 #if 1 01196 // avoid unnecessary branching and minimize use of pow() 01197 int i, j; 01198 float xdp, ydp, zdp; 01199 float xdiv = 1.0f / xdist; 01200 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) { 01201 int imax = shelltype - j; 01202 for (i=0, ydp=1.0f, xdp=powf(xdist, float(imax)); i<=imax; i++, ydp*=ydist, xdp*=xdiv) { 01203 tmpshell += wave_f[ifunc++] * xdp * ydp * zdp; 01204 } 01205 } 01206 value += tmpshell * contracted_gto; 01207 #else 01208 int i, j, k; 01209 for (k=0; k<=shelltype; k++) { 01210 for (j=0; j<=shelltype; j++) { 01211 for (i=0; i<=shelltype; i++) { 01212 if (i+j+k==shelltype) { 01213 value += wave_f[ifunc++] * contracted_gto 01214 * pow(xdist,i) * pow(ydist,j) * pow(zdist,k); 01215 } 01216 } 01217 } 01218 } 01219 #endif 01220 } // end switch 01221 01222 shell_counter++; 01223 } // end shell 01224 } // end atom 01225 01226 // return either orbital density or orbital wavefunction amplitude 01227 if (density) { 01228 float orbdensity = value * value; 01229 if (value < 0.0) 01230 orbdensity = -orbdensity; 01231 orbitalgrid[gaddrzy + nx] = orbdensity; 01232 } else { 01233 orbitalgrid[gaddrzy + nx] = value; 01234 } 01235 } 01236 } 01237 } 01238 01239 return 0; 01240 } 01241 01242 01243 01244 #if defined(VMDORBUSESSE) && defined(__SSE2__) 01245 01246 #if 0 && !defined(_WIN64) // MSVC doesn't support old MMX intrinsics 01247 // 01248 // Adaptation of the Cephes exp() to an SSE-ized exp_ps() routine 01249 // originally by Julien Pommier 01250 // Copyright (C) 2007 Julien Pommier, ZLIB license 01251 // http://gruntthepeon.free.fr/ssemath/ 01252 // 01253 #ifdef _MSC_VER /* visual c++ */ 01254 # define ALIGN16_BEG __declspec(align(16)) 01255 # define ALIGN16_END 01256 #else /* gcc or icc */ 01257 # define ALIGN16_BEG 01258 # define ALIGN16_END __attribute__((aligned(16))) 01259 #endif 01260 01261 #define _PS_CONST(Name, Val) \ 01262 static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val } 01263 #define _PI32_CONST(Name, Val) \ 01264 static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val } 01265 01266 _PS_CONST(exp_hi, 88.3762626647949f); 01267 _PS_CONST(exp_lo, -88.3762626647949f); 01268 01269 _PS_CONST(cephes_LOG2EF, 1.44269504088896341); 01270 _PS_CONST(cephes_exp_C1, 0.693359375); 01271 _PS_CONST(cephes_exp_C2, -2.12194440e-4); 01272 01273 _PS_CONST(cephes_exp_p0, 1.9875691500E-4); 01274 _PS_CONST(cephes_exp_p1, 1.3981999507E-3); 01275 _PS_CONST(cephes_exp_p2, 8.3334519073E-3); 01276 _PS_CONST(cephes_exp_p3, 4.1665795894E-2); 01277 _PS_CONST(cephes_exp_p4, 1.6666665459E-1); 01278 _PS_CONST(cephes_exp_p5, 5.0000001201E-1); 01279 _PS_CONST(one, 1.0); 01280 _PS_CONST(half, 0.5); 01281 01282 _PI32_CONST(0x7f, 0x7f); 01283 01284 typedef union xmm_mm_union { 01285 __m128 xmm; 01286 __m64 mm[2]; 01287 } xmm_mm_union; 01288 01289 #define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \ 01290 xmm_mm_union u; u.xmm = xmm_; \ 01291 mm0_ = u.mm[0]; \ 01292 mm1_ = u.mm[1]; \ 01293 } 01294 01295 #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \ 01296 xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \ 01297 } 01298 01299 __m128 exp_ps(__m128 x) { 01300 __m128 tmp = _mm_setzero_ps(), fx; 01301 __m64 mm0, mm1; 01302 01303 x = _mm_min_ps(x, *(__m128*)_ps_exp_hi); 01304 x = _mm_max_ps(x, *(__m128*)_ps_exp_lo); 01305 01306 /* express exp(x) as exp(g + n*log(2)) */ 01307 fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF); 01308 fx = _mm_add_ps(fx,*(__m128*)_ps_half); 01309 01310 /* how to perform a floorf with SSE: just below */ 01311 /* step 1 : cast to int */ 01312 tmp = _mm_movehl_ps(tmp, fx); 01313 mm0 = _mm_cvttps_pi32(fx); 01314 mm1 = _mm_cvttps_pi32(tmp); 01315 /* step 2 : cast back to float */ 01316 tmp = _mm_cvtpi32x2_ps(mm0, mm1); 01317 /* if greater, substract 1 */ 01318 __m128 mask = _mm_cmpgt_ps(tmp, fx); 01319 mask = _mm_and_ps(mask, *(__m128*)_ps_one); 01320 fx = _mm_sub_ps(tmp, mask); 01321 01322 tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1); 01323 __m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2); 01324 x = _mm_sub_ps(x, tmp); 01325 x = _mm_sub_ps(x, z); 01326 01327 z = _mm_mul_ps(x,x); 01328 01329 __m128 y = *(__m128*)_ps_cephes_exp_p0; 01330 y = _mm_mul_ps(y, x); 01331 y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1); 01332 y = _mm_mul_ps(y, x); 01333 y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2); 01334 y = _mm_mul_ps(y, x); 01335 y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3); 01336 y = _mm_mul_ps(y, x); 01337 y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4); 01338 y = _mm_mul_ps(y, x); 01339 y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5); 01340 y = _mm_mul_ps(y, z); 01341 y = _mm_add_ps(y, x); 01342 y = _mm_add_ps(y, *(__m128*)_ps_one); 01343 01344 /* build 2^n */ 01345 z = _mm_movehl_ps(z, fx); 01346 mm0 = _mm_cvttps_pi32(fx); 01347 mm1 = _mm_cvttps_pi32(z); 01348 mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f); 01349 mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f); 01350 mm0 = _mm_slli_pi32(mm0, 23); 01351 mm1 = _mm_slli_pi32(mm1, 23); 01352 01353 __m128 pow2n; 01354 COPY_MM_TO_XMM(mm0, mm1, pow2n); 01355 01356 y = _mm_mul_ps(y, pow2n); 01357 _mm_empty(); 01358 return y; 01359 } 01360 #endif // MSVC doesn't support old MMX intrinsics 01361 01362 01363 // 01364 // David J. Hardy 01365 // 12 Dec 2008 01366 // 01367 // aexpfnxsse() - SSE2 version of aexpfnx(). 01368 // 01369 // 01370 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER) 01371 #define __align(X) __attribute__((aligned(X) )) 01372 #if (__GNUC__ < 4) 01373 #define MISSING_mm_cvtsd_f64 01374 #endif 01375 #else 01376 #define __align(X) __declspec(align(X) ) 01377 #endif 01378 01379 #define MLOG2EF -1.44269504088896f 01380 01381 /* 01382 * Interpolating coefficients for linear blending of the 01383 * 3rd degree Taylor expansion of 2^x about 0 and -1. 01384 */ 01385 #define SCEXP0 1.0000000000000000f 01386 #define SCEXP1 0.6987082824680118f 01387 #define SCEXP2 0.2633174272827404f 01388 #define SCEXP3 0.0923611991471395f 01389 #define SCEXP4 0.0277520543324108f 01390 01391 /* for single precision float */ 01392 #define EXPOBIAS 127 01393 #define EXPOSHIFT 23 01394 01395 /* cutoff is optional, but can help avoid unnecessary work */ 01396 #define ACUTOFF -10 01397 01398 typedef union SSEreg_t { 01399 __m128 f; // 4x float (SSE) 01400 __m128i i; // 4x 32-bit int (SSE2) 01401 } SSEreg; 01402 01403 __m128 aexpfnxsse(__m128 x) { 01404 __align(16) SSEreg scal; 01405 __align(16) SSEreg n; 01406 __align(16) SSEreg y; 01407 01408 scal.f = _mm_cmpge_ps(x, _mm_set_ps1(ACUTOFF)); /* Is x within cutoff? */ 01409 01410 /* If all x are outside of cutoff, return 0s. */ 01411 if (_mm_movemask_ps(scal.f) == 0) { 01412 return _mm_setzero_ps(); 01413 } 01414 /* Otherwise, scal.f contains mask to be ANDed with the scale factor */ 01415 01416 /* 01417 * Convert base: exp(x) = 2^(N-d) where N is integer and 0 <= d < 1. 01418 * 01419 * Below we calculate n=N and x=-d, with "y" for temp storage, 01420 * calculate floor of x*log2(e) and subtract to get -d. 01421 */ 01422 y.f = _mm_mul_ps(x, _mm_set_ps1(MLOG2EF)); 01423 n.i = _mm_cvttps_epi32(y.f); 01424 x = _mm_cvtepi32_ps(n.i); 01425 x = _mm_sub_ps(x, y.f); 01426 01427 /* 01428 * Approximate 2^{-d}, 0 <= d < 1, by interpolation. 01429 * Perform Horner's method to evaluate interpolating polynomial. 01430 */ 01431 y.f = _mm_mul_ps(x, _mm_set_ps1(SCEXP4)); /* for x^4 term */ 01432 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP3)); /* for x^3 term */ 01433 y.f = _mm_mul_ps(y.f, x); 01434 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP2)); /* for x^2 term */ 01435 y.f = _mm_mul_ps(y.f, x); 01436 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP1)); /* for x^1 term */ 01437 y.f = _mm_mul_ps(y.f, x); 01438 y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP0)); /* for x^0 term */ 01439 01440 /* 01441 * Calculate 2^N exactly by directly manipulating floating point exponent. 01442 * Bitwise AND the result with scal.f mask to create the scale factor, 01443 * then use it to scale y for the final result. 01444 */ 01445 n.i = _mm_sub_epi32(_mm_set1_epi32(EXPOBIAS), n.i); 01446 n.i = _mm_slli_epi32(n.i, EXPOSHIFT); 01447 scal.f = _mm_and_ps(scal.f, n.f); 01448 y.f = _mm_mul_ps(y.f, scal.f); 01449 01450 return y.f; 01451 } 01452 01453 01454 int evaluate_grid_sse(int numatoms, 01455 const float *wave_f, const float *basis_array, 01456 const float *atompos, 01457 const int *atom_basis, 01458 const int *num_shells_per_atom, 01459 const int *num_prim_per_shell, 01460 const int *shell_types, 01461 const int *numvoxels, 01462 float voxelsize, 01463 const float *origin, 01464 int density, 01465 float * orbitalgrid) { 01466 if (!orbitalgrid) 01467 return -1; 01468 01469 int nx, ny, nz; 01470 __align(16) float sxdelta[4]; // 16-byte aligned for SSE 01471 for (nx=0; nx<4; nx++) 01472 sxdelta[nx] = ((float) nx) * voxelsize * ANGS_TO_BOHR; 01473 01474 // Calculate the value of the orbital at each gridpoint and store in 01475 // the current oribtalgrid array 01476 int numgridxy = numvoxels[0]*numvoxels[1]; 01477 for (nz=0; nz<numvoxels[2]; nz++) { 01478 float grid_x, grid_y, grid_z; 01479 grid_z = origin[2] + nz * voxelsize; 01480 for (ny=0; ny<numvoxels[1]; ny++) { 01481 grid_y = origin[1] + ny * voxelsize; 01482 int gaddrzy = ny*numvoxels[0] + nz*numgridxy; 01483 for (nx=0; nx<numvoxels[0]; nx+=4) { 01484 grid_x = origin[0] + nx * voxelsize; 01485 01486 // calculate the value of the wavefunction of the 01487 // selected orbital at the current grid point 01488 int at; 01489 int prim, shell; 01490 01491 // initialize value of orbital at gridpoint 01492 __m128 value = _mm_setzero_ps(); 01493 01494 // initialize the wavefunction and shell counters 01495 int ifunc = 0; 01496 int shell_counter = 0; 01497 01498 // loop over all the QM atoms 01499 for (at=0; at<numatoms; at++) { 01500 int maxshell = num_shells_per_atom[at]; 01501 int prim_counter = atom_basis[at]; 01502 01503 // calculate distance between grid point and center of atom 01504 float sxdist = (grid_x - atompos[3*at ])*ANGS_TO_BOHR; 01505 float sydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR; 01506 float szdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR; 01507 01508 float sydist2 = sydist*sydist; 01509 float szdist2 = szdist*szdist; 01510 float yzdist2 = sydist2 + szdist2; 01511 01512 __m128 xdelta = _mm_load_ps(&sxdelta[0]); // aligned load 01513 __m128 xdist = _mm_load_ps1(&sxdist); 01514 xdist = _mm_add_ps(xdist, xdelta); 01515 __m128 ydist = _mm_load_ps1(&sydist); 01516 __m128 zdist = _mm_load_ps1(&szdist); 01517 __m128 xdist2 = _mm_mul_ps(xdist, xdist); 01518 __m128 ydist2 = _mm_mul_ps(ydist, ydist); 01519 __m128 zdist2 = _mm_mul_ps(zdist, zdist); 01520 __m128 dist2 = _mm_load_ps1(&yzdist2); 01521 dist2 = _mm_add_ps(dist2, xdist2); 01522 01523 // loop over the shells belonging to this atom 01524 // XXX this is maybe a misnomer because in split valence 01525 // basis sets like 6-31G we have more than one basis 01526 // function per (valence-)shell and we are actually 01527 // looping over the individual contracted GTOs 01528 for (shell=0; shell < maxshell; shell++) { 01529 __m128 contracted_gto = _mm_setzero_ps(); 01530 01531 // Loop over the Gaussian primitives of this contracted 01532 // basis function to build the atomic orbital 01533 // 01534 // XXX there's a significant opportunity here for further 01535 // speedup if we replace the entire set of primitives 01536 // with the single gaussian that they are attempting 01537 // to model. This could give us another 6x speedup in 01538 // some of the common/simple cases. 01539 int maxprim = num_prim_per_shell[shell_counter]; 01540 int shelltype = shell_types[shell_counter]; 01541 for (prim=0; prim<maxprim; prim++) { 01542 // XXX pre-negate exponent value 01543 float exponent = -basis_array[prim_counter ]; 01544 float contract_coeff = basis_array[prim_counter + 1]; 01545 01546 // contracted_gto += contract_coeff * exp(-exponent*dist2); 01547 __m128 expval = _mm_mul_ps(_mm_load_ps1(&exponent), dist2); 01548 // SSE expf() required here 01549 #if 1 01550 __m128 retval = aexpfnxsse(expval); 01551 #else 01552 __m128 retval = exp_ps(expval); 01553 #endif 01554 __m128 ctmp = _mm_mul_ps(_mm_load_ps1(&contract_coeff), retval); 01555 contracted_gto = _mm_add_ps(contracted_gto, ctmp); 01556 prim_counter += 2; 01557 } 01558 01559 /* multiply with the appropriate wavefunction coefficient */ 01560 __m128 tmpshell = _mm_setzero_ps(); 01561 switch (shelltype) { 01562 case S_SHELL: 01563 value = _mm_add_ps(value, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), contracted_gto)); 01564 break; 01565 01566 case P_SHELL: 01567 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), xdist)); 01568 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), ydist)); 01569 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), zdist)); 01570 value = _mm_add_ps(value, _mm_mul_ps(tmpshell, contracted_gto)); 01571 break; 01572 01573 case D_SHELL: 01574 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), xdist2)); 01575 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist, ydist))); 01576 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), ydist2)); 01577 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist, zdist))); 01578 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist, zdist))); 01579 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), zdist2)); 01580 value = _mm_add_ps(value, _mm_mul_ps(tmpshell, contracted_gto)); 01581 break; 01582 01583 case F_SHELL: 01584 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist2, xdist))); 01585 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist2, ydist))); 01586 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist2, xdist))); 01587 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist2, ydist))); 01588 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist2, zdist))); 01589 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(_mm_mul_ps(xdist, ydist), zdist))); 01590 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist2, zdist))); 01591 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(zdist2, xdist))); 01592 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(zdist2, ydist))); 01593 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(zdist2, zdist))); 01594 value = _mm_add_ps(value, _mm_mul_ps(tmpshell, contracted_gto)); 01595 break; 01596 01597 #if 0 01598 default: 01599 // avoid unnecessary branching and minimize use of pow() 01600 int i, j; 01601 float xdp, ydp, zdp; 01602 float xdiv = 1.0f / xdist; 01603 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) { 01604 int imax = shelltype - j; 01605 for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) { 01606 tmpshell += wave_f[ifunc++] * xdp * ydp * zdp; 01607 } 01608 } 01609 value += tmpshell * contracted_gto; 01610 #endif 01611 } // end switch 01612 01613 shell_counter++; 01614 } // end shell 01615 } // end atom 01616 01617 // return either orbital density or orbital wavefunction amplitude 01618 if (density) { 01619 __m128 mask = _mm_cmplt_ps(value, _mm_setzero_ps()); 01620 __m128 sqdensity = _mm_mul_ps(value, value); 01621 __m128 orbdensity = sqdensity; 01622 __m128 nsqdensity = _mm_and_ps(sqdensity, mask); 01623 orbdensity = _mm_sub_ps(orbdensity, nsqdensity); 01624 orbdensity = _mm_sub_ps(orbdensity, nsqdensity); 01625 _mm_storeu_ps(&orbitalgrid[gaddrzy + nx], orbdensity); 01626 } else { 01627 _mm_storeu_ps(&orbitalgrid[gaddrzy + nx], value); 01628 } 01629 } 01630 } 01631 } 01632 01633 return 0; 01634 } 01635 01636 #endif 01637 01638 01639 01640 #if defined(VMDORBUSEVSX) && defined(__VSX__) 01641 // 01642 // John Stone, June 2016 01643 // 01644 // aexpfnxsse() - VSX version of aexpfnx(). 01645 // 01646 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER) 01647 #define __align(X) __attribute__((aligned(X) )) 01648 #else 01649 #define __align(X) __declspec(align(X) ) 01650 #endif 01651 01652 #define MLOG2EF -1.44269504088896f 01653 01654 /* 01655 * Interpolating coefficients for linear blending of the 01656 * 3rd degree Taylor expansion of 2^x about 0 and -1. 01657 */ 01658 #define SCEXP0 1.0000000000000000f 01659 #define SCEXP1 0.6987082824680118f 01660 #define SCEXP2 0.2633174272827404f 01661 #define SCEXP3 0.0923611991471395f 01662 #define SCEXP4 0.0277520543324108f 01663 01664 /* for single precision float */ 01665 #define EXPOBIAS 127 01666 #define EXPOSHIFT 23 01667 01668 /* cutoff is optional, but can help avoid unnecessary work */ 01669 #define ACUTOFF -10 01670 01671 #if 0 01672 vector float ref_expf(vector float x) { 01673 vector float result; 01674 01675 int i; 01676 for (i=0; i<4; i++) { 01677 result[i] = expf(x[i]); 01678 } 01679 01680 return result; 01681 } 01682 #endif 01683 01684 vector float aexpfnxvsx(vector float x) { 01685 // scal.f = _mm_cmpge_ps(x, _mm_set_ps1(ACUTOFF)); /* Is x within cutoff? */ 01686 // 01687 // If all x are outside of cutoff, return 0s. 01688 // if (_mm_movemask_ps(scal.f) == 0) { 01689 // return _mm_setzero_ps(); 01690 // } 01691 // Otherwise, scal.f contains mask to be ANDed with the scale factor 01692 01693 /* 01694 * Convert base: exp(x) = 2^(N-d) where N is integer and 0 <= d < 1. 01695 * 01696 * Below we calculate n=N and x=-d, with "y" for temp storage, 01697 * calculate floor of x*log2(e) and subtract to get -d. 01698 */ 01699 vector float mb = vec_mul(x, vec_splats(MLOG2EF)); 01700 vector float mbflr = vec_floor(mb); 01701 vector float d = vec_sub(mbflr, mb); 01702 vector float y; 01703 01704 // Approximate 2^{-d}, 0 <= d < 1, by interpolation. 01705 // Perform Horner's method to evaluate interpolating polynomial. 01706 y = vec_madd(d, vec_splats(SCEXP4), vec_splats(SCEXP3)); 01707 y = vec_madd(y, d, vec_splats(SCEXP2)); 01708 y = vec_madd(y, d, vec_splats(SCEXP1)); 01709 y = vec_madd(y, d, vec_splats(SCEXP0)); 01710 01711 return vec_mul(y, vec_expte(-mbflr)); 01712 } 01713 01714 01715 int evaluate_grid_vsx(int numatoms, 01716 const float *wave_f, const float *basis_array, 01717 const float *atompos, 01718 const int *atom_basis, 01719 const int *num_shells_per_atom, 01720 const int *num_prim_per_shell, 01721 const int *shell_types, 01722 const int *numvoxels, 01723 float voxelsize, 01724 const float *origin, 01725 int density, 01726 float * orbitalgrid) { 01727 if (!orbitalgrid) 01728 return -1; 01729 01730 int nx, ny, nz; 01731 __attribute__((aligned(16))) float sxdelta[4]; // 16-byte aligned for VSX 01732 for (nx=0; nx<4; nx++) 01733 sxdelta[nx] = ((float) nx) * voxelsize * ANGS_TO_BOHR; 01734 01735 // Calculate the value of the orbital at each gridpoint and store in 01736 // the current oribtalgrid array 01737 int numgridxy = numvoxels[0]*numvoxels[1]; 01738 for (nz=0; nz<numvoxels[2]; nz++) { 01739 float grid_x, grid_y, grid_z; 01740 grid_z = origin[2] + nz * voxelsize; 01741 for (ny=0; ny<numvoxels[1]; ny++) { 01742 grid_y = origin[1] + ny * voxelsize; 01743 int gaddrzy = ny*numvoxels[0] + nz*numgridxy; 01744 for (nx=0; nx<numvoxels[0]; nx+=4) { 01745 grid_x = origin[0] + nx * voxelsize; 01746 01747 // calculate the value of the wavefunction of the 01748 // selected orbital at the current grid point 01749 int at; 01750 int prim, shell; 01751 01752 // initialize value of orbital at gridpoint 01753 vector float value = vec_splats(0.0f); 01754 01755 // initialize the wavefunction and shell counters 01756 int ifunc = 0; 01757 int shell_counter = 0; 01758 01759 // loop over all the QM atoms 01760 for (at=0; at<numatoms; at++) { 01761 int maxshell = num_shells_per_atom[at]; 01762 int prim_counter = atom_basis[at]; 01763 01764 // calculate distance between grid point and center of atom 01765 float sxdist = (grid_x - atompos[3*at ])*ANGS_TO_BOHR; 01766 float sydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR; 01767 float szdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR; 01768 01769 float sydist2 = sydist*sydist; 01770 float szdist2 = szdist*szdist; 01771 float yzdist2 = sydist2 + szdist2; 01772 01773 vector float xdelta = *((__vector float *) &sxdelta[0]); // aligned load 01774 vector float xdist = vec_splats(sxdist); 01775 xdist = vec_add(xdist, xdelta); 01776 vector float ydist = vec_splats(sydist); 01777 vector float zdist = vec_splats(szdist); 01778 vector float xdist2 = vec_mul(xdist, xdist); 01779 vector float ydist2 = vec_mul(ydist, ydist); 01780 vector float zdist2 = vec_mul(zdist, zdist); 01781 vector float dist2 = vec_splats(yzdist2); 01782 dist2 = vec_add(dist2, xdist2); 01783 01784 // loop over the shells belonging to this atom 01785 // XXX this is maybe a misnomer because in split valence 01786 // basis sets like 6-31G we have more than one basis 01787 // function per (valence-)shell and we are actually 01788 // looping over the individual contracted GTOs 01789 for (shell=0; shell < maxshell; shell++) { 01790 vector float contracted_gto = vec_splats(0.0f); 01791 01792 // Loop over the Gaussian primitives of this contracted 01793 // basis function to build the atomic orbital 01794 // 01795 // XXX there's a significant opportunity here for further 01796 // speedup if we replace the entire set of primitives 01797 // with the single gaussian that they are attempting 01798 // to model. This could give us another 6x speedup in 01799 // some of the common/simple cases. 01800 int maxprim = num_prim_per_shell[shell_counter]; 01801 int shelltype = shell_types[shell_counter]; 01802 for (prim=0; prim<maxprim; prim++) { 01803 // XXX pre-negate exponent value 01804 float exponent = -basis_array[prim_counter ]; 01805 float contract_coeff = basis_array[prim_counter + 1]; 01806 01807 // contracted_gto += contract_coeff * exp(-exponent*dist2); 01808 vector float expval = vec_mul(vec_splats(exponent), dist2); 01809 01810 // VSX expf() required here 01811 vector float retval = aexpfnxvsx(expval); 01812 01813 vector float ctmp = vec_mul(vec_splats(contract_coeff), retval); 01814 contracted_gto = vec_add(contracted_gto, ctmp); 01815 prim_counter += 2; 01816 } 01817 01818 /* multiply with the appropriate wavefunction coefficient */ 01819 vector float tmpshell = vec_splats(0.0f); 01820 switch (shelltype) { 01821 case S_SHELL: 01822 value = vec_add(value, vec_mul(vec_splats(wave_f[ifunc++]), contracted_gto)); 01823 break; 01824 01825 case P_SHELL: 01826 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), xdist)); 01827 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), ydist)); 01828 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), zdist)); 01829 value = vec_add(value, vec_mul(tmpshell, contracted_gto)); 01830 break; 01831 01832 case D_SHELL: 01833 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), xdist2)); 01834 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist, ydist))); 01835 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), ydist2)); 01836 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist, zdist))); 01837 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(ydist, zdist))); 01838 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), zdist2)); 01839 value = vec_add(value, vec_mul(tmpshell, contracted_gto)); 01840 break; 01841 01842 case F_SHELL: 01843 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist2, xdist))); 01844 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist2, ydist))); 01845 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(ydist2, xdist))); 01846 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(ydist2, ydist))); 01847 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist2, zdist))); 01848 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(vec_mul(xdist, ydist), zdist))); 01849 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(ydist2, zdist))); 01850 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(zdist2, xdist))); 01851 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(zdist2, ydist))); 01852 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(zdist2, zdist))); 01853 value = vec_add(value, vec_mul(tmpshell, contracted_gto)); 01854 break; 01855 01856 #if 0 01857 default: 01858 // avoid unnecessary branching and minimize use of pow() 01859 int i, j; 01860 float xdp, ydp, zdp; 01861 float xdiv = 1.0f / xdist; 01862 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) { 01863 int imax = shelltype - j; 01864 for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) { 01865 tmpshell += wave_f[ifunc++] * xdp * ydp * zdp; 01866 } 01867 } 01868 value += tmpshell * contracted_gto; 01869 #endif 01870 } // end switch 01871 01872 shell_counter++; 01873 } // end shell 01874 } // end atom 01875 01876 // return either orbital density or orbital wavefunction amplitude 01877 if (density) { 01878 value = vec_cpsgn(value, vec_mul(value, value)); 01879 01880 float *ufptr = &orbitalgrid[gaddrzy + nx]; 01881 ufptr[0] = value[0]; 01882 ufptr[1] = value[1]; 01883 ufptr[2] = value[2]; 01884 ufptr[3] = value[3]; 01885 } else { 01886 float *ufptr = &orbitalgrid[gaddrzy + nx]; 01887 ufptr[0] = value[0]; 01888 ufptr[1] = value[1]; 01889 ufptr[2] = value[2]; 01890 ufptr[3] = value[3]; 01891 } 01892 } 01893 } 01894 } 01895 01896 return 0; 01897 } 01898 01899 #endif 01900 01901 01902 01903 // 01904 // Multithreaded molecular orbital computation engine 01905 // 01906 01907 typedef struct { 01908 wkf_cpu_caps_t *cpucaps; 01909 int numatoms; 01910 const float *wave_f; 01911 const float *basis_array; 01912 const float *atompos; 01913 const int *atom_basis; 01914 const int *num_shells_per_atom; 01915 const int *num_prim_per_shell; 01916 const int *shell_types; 01917 const int *numvoxels; 01918 float voxelsize; 01919 int density; 01920 const float *origin; 01921 float *orbitalgrid; 01922 } orbthrparms; 01923 01924 01925 extern "C" void * orbitalthread(void *voidparms) { 01926 int numvoxels[3]; 01927 float origin[3]; 01928 orbthrparms *parms = NULL; 01929 #if defined(VMDORBUSETHRPOOL) 01930 wkf_threadpool_worker_getdata(voidparms, (void **) &parms); 01931 #else 01932 wkf_threadlaunch_getdata(voidparms, (void **) &parms); 01933 #endif 01934 01935 #if defined(VMDCPUDISPATCH) 01936 wkf_cpu_caps_t *cpucaps = parms->cpucaps; 01937 01938 #if defined(VMDUSEAVX512) 01939 int dispatch_AVX512ER = 0; 01940 if ((cpucaps->flags & CPU_AVX512ER) && (getenv("VMDNOAVX512ER") == NULL)) { 01941 // printf("evaluate_grid_avx512er\n"); 01942 dispatch_AVX512ER = 1; 01943 } 01944 01945 int dispatch_AVX512F = 0; 01946 if ((cpucaps->flags & CPU_AVX512F) && (getenv("VMDNOAVX512F") == NULL)) { 01947 dispatch_AVX512F = 1; 01948 // printf("evaluate_grid_avx512f\n"); 01949 } 01950 #endif 01951 01952 #if defined(VMDUSEAVX2) 01953 int dispatch_AVX2 = 0; 01954 if ((cpucaps->flags & CPU_AVX2) && (getenv("VMDNOAVX2") == NULL)) { 01955 dispatch_AVX2 = 1; 01956 // printf("evaluate_grid_avx2\n"); 01957 } 01958 #endif 01959 01960 #if defined(VMDUSESVE) 01961 int dispatch_SVE = 0; 01962 if ((cpucaps->flags & CPU_ARM64_SVE) && (getenv("VMDNOSVE") == NULL)) { 01963 dispatch_SVE = 1; 01964 // printf("evaluate_grid_sve\n"); 01965 } 01966 #endif 01967 01968 #if defined(VMDUSENEON) 01969 int dispatch_NEON = 0; 01970 if ((cpucaps->flags & CPU_ARM64_ASIMD) && (getenv("VMDNONEON") == NULL)) { 01971 dispatch_NEON = 1; 01972 // printf("evaluate_grid_neon\n"); 01973 } 01974 #endif 01975 01976 #endif // VMDCPUDISPATCH 01977 01978 // 01979 // Hard-coded compile-time fall-through vectorization paths 01980 // 01981 #if defined(VMDORBUSESSE) && defined(__SSE2__) 01982 int dispatch_SSE2 = 0; 01983 if ((getenv("VMDNOSSE2") == NULL)) { 01984 // printf("evaluate_grid_sse\n"); 01985 dispatch_SSE2 = 1; 01986 } 01987 #endif 01988 01989 #if defined(VMDORBUSEVSX) && defined(__VSX__) 01990 int dispatch_VSX = 0; 01991 if (getenv("VMDNOVSX") == NULL) { 01992 // printf("evaluate_grid_vsx\n"); 01993 dispatch_VSX = 1; 01994 } 01995 #endif 01996 01997 numvoxels[0] = parms->numvoxels[0]; 01998 numvoxels[1] = parms->numvoxels[1]; 01999 numvoxels[2] = 1; // we compute only a single plane 02000 02001 origin[0] = parms->origin[0]; 02002 origin[1] = parms->origin[1]; 02003 02004 // loop over orbital planes 02005 int planesize = numvoxels[0] * numvoxels[1]; 02006 wkf_tasktile_t tile; 02007 #if defined(VMDORBUSETHRPOOL) 02008 while (wkf_threadpool_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) { 02009 #else 02010 while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) { 02011 #endif 02012 int k; 02013 for (k=tile.start; k<tile.end; k++) { 02014 origin[2] = parms->origin[2] + parms->voxelsize * k; 02015 02016 #if defined(VMDCPUDISPATCH) 02017 // 02018 // runtime CPU dispatch 02019 // check for optional vector instructions and execute custom kernels 02020 // for the fastest code path supported by the detected hardware 02021 // 02022 if (cpucaps != NULL) { 02023 #if defined(VMDUSEAVX512) 02024 if (dispatch_AVX512ER) { 02025 evaluate_grid_avx512er(parms->numatoms, parms->wave_f, 02026 parms->basis_array, parms->atompos, parms->atom_basis, 02027 parms->num_shells_per_atom, parms->num_prim_per_shell, 02028 parms->shell_types, numvoxels, parms->voxelsize, 02029 origin, parms->density, parms->orbitalgrid + planesize*k); 02030 continue; 02031 } 02032 02033 if (dispatch_AVX512F) { 02034 evaluate_grid_avx512f(parms->numatoms, parms->wave_f, 02035 parms->basis_array, parms->atompos, parms->atom_basis, 02036 parms->num_shells_per_atom, parms->num_prim_per_shell, 02037 parms->shell_types, numvoxels, parms->voxelsize, 02038 origin, parms->density, parms->orbitalgrid + planesize*k); 02039 continue; 02040 } 02041 #endif 02042 02043 #if defined(VMDUSEAVX2) 02044 if (dispatch_AVX2) { 02045 evaluate_grid_avx2(parms->numatoms, parms->wave_f, 02046 parms->basis_array, parms->atompos, parms->atom_basis, 02047 parms->num_shells_per_atom, parms->num_prim_per_shell, 02048 parms->shell_types, numvoxels, parms->voxelsize, 02049 origin, parms->density, parms->orbitalgrid + planesize*k); 02050 continue; 02051 } 02052 #endif 02053 02054 #if defined(VMDUSESVE) 02055 if (dispatch_SVE) { 02056 evaluate_grid_sve(parms->numatoms, parms->wave_f, 02057 parms->basis_array, parms->atompos, parms->atom_basis, 02058 parms->num_shells_per_atom, parms->num_prim_per_shell, 02059 parms->shell_types, numvoxels, parms->voxelsize, 02060 origin, parms->density, parms->orbitalgrid + planesize*k); 02061 continue; 02062 } 02063 #endif 02064 02065 #if defined(VMDUSENEON) 02066 if (dispatch_NEON) { 02067 evaluate_grid_neon(parms->numatoms, parms->wave_f, 02068 parms->basis_array, parms->atompos, parms->atom_basis, 02069 parms->num_shells_per_atom, parms->num_prim_per_shell, 02070 parms->shell_types, numvoxels, parms->voxelsize, 02071 origin, parms->density, parms->orbitalgrid + planesize*k); 02072 continue; 02073 } 02074 #endif 02075 02076 } // runtime cpucaps-based dispatch 02077 #endif 02078 02079 02080 // 02081 // hard-coded fall-through path if runtime CPU dispatch doesn't match up 02082 // 02083 #if defined(VMDORBUSESSE) && defined(__SSE2__) 02084 if (dispatch_SSE2) { 02085 evaluate_grid_sse(parms->numatoms, parms->wave_f, 02086 parms->basis_array, parms->atompos, parms->atom_basis, 02087 parms->num_shells_per_atom, parms->num_prim_per_shell, 02088 parms->shell_types, numvoxels, parms->voxelsize, 02089 origin, parms->density, parms->orbitalgrid + planesize*k); 02090 continue; 02091 } 02092 #endif 02093 02094 #if defined(VMDORBUSEVSX) && defined(__VSX__) 02095 if (dispatch_VSX) { 02096 evaluate_grid_vsx(parms->numatoms, parms->wave_f, 02097 parms->basis_array, parms->atompos, parms->atom_basis, 02098 parms->num_shells_per_atom, parms->num_prim_per_shell, 02099 parms->shell_types, numvoxels, parms->voxelsize, 02100 origin, parms->density, parms->orbitalgrid + planesize*k); 02101 continue; 02102 } 02103 #endif 02104 02105 // Standard C++-based implementation that uses wither the 02106 // standard math library expf(), a faster modified Cephes expf(), 02107 // our our own fast exponential approximation routine. 02108 evaluate_grid(parms->numatoms, parms->wave_f, 02109 parms->basis_array, parms->atompos, parms->atom_basis, 02110 parms->num_shells_per_atom, parms->num_prim_per_shell, 02111 parms->shell_types, numvoxels, parms->voxelsize, 02112 origin, parms->density, parms->orbitalgrid + planesize*k); 02113 } 02114 } 02115 02116 return NULL; 02117 } 02118 02119 02120 int evaluate_grid_fast(wkf_cpu_caps_t *cpucaps, 02121 #if defined(VMDORBUSETHRPOOL) 02122 wkf_threadpool_t *thrpool, 02123 #else 02124 int numcputhreads, 02125 #endif 02126 int numatoms, 02127 const float *wave_f, 02128 const float *basis_array, 02129 const float *atompos, 02130 const int *atom_basis, 02131 const int *num_shells_per_atom, 02132 const int *num_prim_per_shell, 02133 const int *shell_types, 02134 const int *numvoxels, 02135 float voxelsize, 02136 const float *origin, 02137 int density, 02138 float * orbitalgrid) { 02139 int rc=0; 02140 orbthrparms parms; 02141 02142 parms.cpucaps = cpucaps; 02143 parms.numatoms = numatoms; 02144 parms.wave_f = wave_f; 02145 parms.basis_array = basis_array; 02146 parms.atompos = atompos; 02147 parms.atom_basis = atom_basis; 02148 parms.num_shells_per_atom = num_shells_per_atom; 02149 parms.num_prim_per_shell = num_prim_per_shell; 02150 parms.shell_types = shell_types; 02151 parms.numvoxels = numvoxels; 02152 parms.voxelsize = voxelsize; 02153 parms.origin = origin; 02154 parms.density = density; 02155 parms.orbitalgrid = orbitalgrid; 02156 02157 /* spawn child threads to do the work */ 02158 wkf_tasktile_t tile; 02159 tile.start = 0; 02160 tile.end = numvoxels[2]; 02161 02162 #if defined(VMDORBUSETHRPOOL) 02163 wkf_threadpool_sched_dynamic(thrpool, &tile); 02164 rc = wkf_threadpool_launch(thrpool, orbitalthread, &parms, 1); 02165 #else 02166 rc = wkf_threadlaunch(numcputhreads, &parms, orbitalthread, &tile); 02167 #endif 02168 02169 return rc; 02170 } 02171 02172 02173 void Orbital::print_wavefunction() { 02174 // XXX Android, IRIX, and Windows don't provide log2f(), nor log2() ?!?!?!?! 02175 // for now we'll just avoid compiling this debugging code 02176 #if !(defined(_MSC_VER) || defined(ARCH_IRIX6) || defined(ARCH_IRIX6_64) || defined(ARCH_ANDROIDARMV7A)) 02177 char shellname[6] = {'S', 'P', 'D', 'F', 'G', 'H'}; 02178 int ifunc = 0; 02179 int at; 02180 int shell; 02181 for (at=0; at<numatoms; at++) { 02182 for (shell=0; shell < num_shells_per_atom[at]; shell++) { 02183 int shelltype = basis_set[at].shell[shell].type; 02184 02185 // avoid unnecessary branching and minimize use of pow() 02186 int i, j, iang=0; 02187 float xdist=2.0; 02188 float ydist=2.0; 02189 float zdist=2.0; 02190 float xdp, ydp, zdp; 02191 float xdiv = 1.0f / xdist; 02192 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) { 02193 int imax = shelltype - j; 02194 for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) { 02195 printf("%3i %c", at, shellname[shelltype]); 02196 int k, m=0; 02197 char buf[20]; buf[0] = '0円'; 02198 for (k=0; k<(int)log2f(xdp); k++, m++) sprintf(buf+m, "x"); 02199 for (k=0; k<(int)log2f(ydp); k++, m++) sprintf(buf+m, "y"); 02200 for (k=0; k<(int)log2f(zdp); k++, m++) sprintf(buf+m, "z"); 02201 //char *ang = qmdata->get_angular_momentum(at, shell, iang); 02202 printf("%-5s (%1.0f%1.0f%1.0f) wave_f[%3i] = % 11.6f\n", buf, 02203 log2f(xdp), log2f(ydp), log2f(zdp), ifunc, wave_f[ifunc]); 02204 //delete [] ang; 02205 iang++; 02206 ifunc++; 02207 } 02208 } 02209 } 02210 } 02211 #endif 02212 02213 }