00001 /*************************************************************************** 00002 *cr 00003 *cr (C) Copyright 1995-2019 The Board of Trustees of the 00004 *cr University of Illinois 00005 *cr All Rights Reserved 00006 *cr 00007 ***************************************************************************/ 00008 /*************************************************************************** 00009 * RCS INFORMATION: 00010 * 00011 * $RCSfile: Orbital_NEON.C,v $ 00012 * $Author: johns $ $Locker: $ $State: Exp $ 00013 * $Revision: 1.3 $ $Date: 2022年05月06日 03:43:05 $ 00014 * 00015 ***************************************************************************/ 00021 // 00022 // Notes: 00023 // ARM intrinsics documentation: 00024 // https://developer.arm.com/architectures/instruction-sets/intrinsics 00025 // 00026 00027 00028 // Due to differences in code generation between gcc/intelc/clang/msvc, we 00029 // don't have to check for a defined(__NEON__) 00030 #if defined(VMDCPUDISPATCH) && defined(VMDUSENEON) 00031 #include <arm_neon.h> 00032 00033 #include <math.h> 00034 #include <stdio.h> 00035 #include "Orbital.h" 00036 #include "DrawMolecule.h" 00037 #include "utilities.h" 00038 #include "Inform.h" 00039 #include "WKFThreads.h" 00040 #include "WKFUtils.h" 00041 #include "ProfileHooks.h" 00042 00043 #define ANGS_TO_BOHR 1.88972612478289694072f 00044 00045 // #if defined(__GNUC__) 00046 #define __align(X) __attribute__((aligned(X) )) 00047 // #endif 00048 00049 #define MLOG2EF -1.44269504088896f 00050 00051 #if 0 00052 static void print_float32x4_t(float32x4_t v) { 00053 __attribute__((aligned(16))) float tmp[4]; // 16-byte aligned for NEON 00054 vst1q_f32(&tmp[0], v); 00055 00056 printf("print_float32x4_t: "); 00057 int i; 00058 for (i=0; i<4; i++) 00059 printf("%g ", tmp[i]); 00060 printf("\n"); 00061 } 00062 00063 static void print_int32x4_t(int32x4_t v) { 00064 __attribute__((aligned(16))) int tmp[4]; // 16-byte aligned for NEON 00065 vst1q_s32(&tmp[0], v); 00066 00067 printf("print_int32x4_t: "); 00068 int i; 00069 for (i=0; i<4; i++) 00070 printf("%d ", tmp[i]); 00071 printf("\n"); 00072 } 00073 00074 static void print_hex32x4_t(int32x4_t v) { 00075 __attribute__((aligned(16))) int tmp[4]; // 16-byte aligned for NEON 00076 vst1q_s32(&tmp[0], v); 00077 00078 printf("print_hex32x4_t: "); 00079 int i; 00080 for (i=0; i<4; i++) 00081 printf("%08x ", tmp[i]); 00082 printf("\n"); 00083 } 00084 #endif 00085 00086 00087 // 00088 // John Stone, February 2021 00089 // 00090 // aexpfnxneon() - NEON version of aexpfnx(). 00091 // 00092 00093 /* 00094 * Interpolating coefficients for linear blending of the 00095 * 3rd degree Taylor expansion of 2^x about 0 and -1. 00096 */ 00097 #define SCEXP0 1.0000000000000000f 00098 #define SCEXP1 0.6987082824680118f 00099 #define SCEXP2 0.2633174272827404f 00100 #define SCEXP3 0.0923611991471395f 00101 #define SCEXP4 0.0277520543324108f 00102 00103 /* for single precision float */ 00104 #define EXPOBIAS 127 00105 #define EXPOSHIFT 23 00106 00107 /* cutoff is optional, but can help avoid unnecessary work */ 00108 #define ACUTOFF -10 00109 00110 typedef union NEONreg_t { 00111 float32x4_t f; // 4x float (NEON) 00112 int32x4_t i; // 4x 32-bit int (NEON) 00113 } NEONreg; 00114 00115 float32x4_t aexpfnxneon(float32x4_t x) { 00116 __align(16) NEONreg scal; 00117 00118 #if 1 00119 // NEON seems to lack a convenient way to test if any lane was true, so 00120 // we use a different approach, and perform a horizontal maximum, comparing 00121 // the result of that against the cutoff to see if all four values are 00122 // below the the cutoff, and to early exit returning zeros in that case. 00123 // If all x are outside of cutoff, return 0s. There may be a better 00124 // early exit scheme here if we dig and find some more useful NEON 00125 // instructions. This block of code is reverse-ordered from the x86 00126 // variants as a result of the different scheme used here. 00127 float32x2_t tmp; 00128 tmp = vpmax_f32(vget_low_f32(x), vget_high_f32(x)); 00129 tmp = vpmax_f32(tmp, tmp); 00130 float vmax = vget_lane_f32(tmp, 0); 00131 if (vmax < ACUTOFF) { 00132 return vdupq_n_f32(0.0f); 00133 } 00134 #endif 00135 // Otherwise, scal.f contains mask to be ANDed with the scale factor 00136 scal.f = vcvtq_f32_u32(vcgeq_f32(x, vdupq_n_f32(ACUTOFF))); // Is x within cutoff? 00137 00138 /* 00139 * Convert base: exp(x) = 2^(N-d) where N is integer and 0 <= d < 1. 00140 * 00141 * Below we calculate n=N and x=-d, with "y" for temp storage, 00142 * calculate floor of x*log2(e) and subtract to get -d. 00143 */ 00144 __align(16) NEONreg n; 00145 float32x4_t mb = vmulq_f32(x, vdupq_n_f32(MLOG2EF)); 00146 n.i = vcvtq_s32_f32(mb); 00147 float32x4_t mbflr = vcvtq_f32_s32(n.i); 00148 float32x4_t d = vsubq_f32(mbflr, mb); 00149 00150 // Approximate 2^{-d}, 0 <= d < 1, by interpolation. 00151 // Perform Horner's method to evaluate interpolating polynomial. 00152 float32x4_t y; 00153 #if __ARM_FEATURE_FMA 00154 y = vfmaq_f32(vdupq_n_f32(SCEXP3), vdupq_n_f32(SCEXP4), d); 00155 y = vfmaq_f32(vdupq_n_f32(SCEXP2), d, y); 00156 y = vfmaq_f32(vdupq_n_f32(SCEXP1), d, y); 00157 y = vfmaq_f32(vdupq_n_f32(SCEXP0), d, y); 00158 #else 00159 y = vmulq_f32(d, vdupq_n_f32(SCEXP4)); /* for x^4 term */ 00160 y = vaddq_f32(y, vdupq_n_f32(SCEXP3)); /* for x^3 term */ 00161 y = vmulq_f32(y, d); 00162 y = vaddq_f32(y, vdupq_n_f32(SCEXP2)); /* for x^2 term */ 00163 y = vmulq_f32(y, d); 00164 y = vaddq_f32(y, vdupq_n_f32(SCEXP1)); /* for x^1 term */ 00165 y = vmulq_f32(y, d); 00166 y = vaddq_f32(y, vdupq_n_f32(SCEXP0)); /* for x^0 term */ 00167 #endif 00168 00169 // Calculate 2^N exactly by directly manipulating floating point exponent, 00170 // then use it to scale y for the final result. 00171 n.i = vsubq_s32(vdupq_n_s32(EXPOBIAS), n.i); 00172 n.i = vshlq_s32(n.i, vdupq_n_s32(EXPOSHIFT)); 00173 scal.i = vandq_s32(scal.i, n.i); 00174 y = vmulq_f32(y, scal.f); 00175 00176 return y; 00177 } 00178 00179 00180 // 00181 // NEON implementation for Xeons that don't have special fctn units 00182 // 00183 int evaluate_grid_neon(int numatoms, 00184 const float *wave_f, const float *basis_array, 00185 const float *atompos, 00186 const int *atom_basis, 00187 const int *num_shells_per_atom, 00188 const int *num_prim_per_shell, 00189 const int *shell_types, 00190 const int *numvoxels, 00191 float voxelsize, 00192 const float *origin, 00193 int density, 00194 float * orbitalgrid) { 00195 if (!orbitalgrid) 00196 return -1; 00197 00198 int nx, ny, nz; 00199 __attribute__((aligned(16))) float sxdelta[4]; // 16-byte aligned for NEON 00200 for (nx=0; nx<4; nx++) 00201 sxdelta[nx] = ((float) nx) * voxelsize * ANGS_TO_BOHR; 00202 00203 // Calculate the value of the orbital at each gridpoint and store in 00204 // the current oribtalgrid array 00205 int numgridxy = numvoxels[0]*numvoxels[1]; 00206 for (nz=0; nz<numvoxels[2]; nz++) { 00207 float grid_x, grid_y, grid_z; 00208 grid_z = origin[2] + nz * voxelsize; 00209 for (ny=0; ny<numvoxels[1]; ny++) { 00210 grid_y = origin[1] + ny * voxelsize; 00211 int gaddrzy = ny*numvoxels[0] + nz*numgridxy; 00212 for (nx=0; nx<numvoxels[0]; nx+=4) { 00213 grid_x = origin[0] + nx * voxelsize; 00214 00215 // calculate the value of the wavefunction of the 00216 // selected orbital at the current grid point 00217 int at; 00218 int prim, shell; 00219 00220 // initialize value of orbital at gridpoint 00221 float32x4_t value = vdupq_n_f32(0.0f); 00222 00223 // initialize the wavefunction and shell counters 00224 int ifunc = 0; 00225 int shell_counter = 0; 00226 00227 // loop over all the QM atoms 00228 for (at=0; at<numatoms; at++) { 00229 int maxshell = num_shells_per_atom[at]; 00230 int prim_counter = atom_basis[at]; 00231 00232 // calculate distance between grid point and center of atom 00233 float sxdist = (grid_x - atompos[3*at ])*ANGS_TO_BOHR; 00234 float sydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR; 00235 float szdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR; 00236 00237 float sydist2 = sydist*sydist; 00238 float szdist2 = szdist*szdist; 00239 float yzdist2 = sydist2 + szdist2; 00240 00241 float32x4_t xdelta = vld1q_f32(&sxdelta[0]); // aligned load 00242 float32x4_t xdist = vdupq_n_f32(sxdist); 00243 xdist = vaddq_f32(xdist, xdelta); 00244 float32x4_t ydist = vdupq_n_f32(sydist); 00245 float32x4_t zdist = vdupq_n_f32(szdist); 00246 float32x4_t xdist2 = vmulq_f32(xdist, xdist); 00247 float32x4_t ydist2 = vmulq_f32(ydist, ydist); 00248 float32x4_t zdist2 = vmulq_f32(zdist, zdist); 00249 float32x4_t dist2 = vdupq_n_f32(yzdist2); 00250 dist2 = vaddq_f32(dist2, xdist2); 00251 00252 // loop over the shells belonging to this atom 00253 // XXX this is maybe a misnomer because in split valence 00254 // basis sets like 6-31G we have more than one basis 00255 // function per (valence-)shell and we are actually 00256 // looping over the individual contracted GTOs 00257 for (shell=0; shell < maxshell; shell++) { 00258 float32x4_t contracted_gto = vdupq_n_f32(0.0f); 00259 00260 // Loop over the Gaussian primitives of this contracted 00261 // basis function to build the atomic orbital 00262 // 00263 // XXX there's a significant opportunity here for further 00264 // speedup if we replace the entire set of primitives 00265 // with the single gaussian that they are attempting 00266 // to model. This could give us another 6x speedup in 00267 // some of the common/simple cases. 00268 int maxprim = num_prim_per_shell[shell_counter]; 00269 int shelltype = shell_types[shell_counter]; 00270 for (prim=0; prim<maxprim; prim++) { 00271 // XXX pre-negate exponent value 00272 float exponent = -basis_array[prim_counter ]; 00273 float contract_coeff = basis_array[prim_counter + 1]; 00274 00275 // contracted_gto += contract_coeff * exp(-exponent*dist2); 00276 float32x4_t expval = vmulq_f32(vdupq_n_f32(exponent), dist2); 00277 // exp2f() equivalent required, use base-2 approximation 00278 float32x4_t retval = aexpfnxneon(expval); 00279 contracted_gto = vfmaq_f32(contracted_gto, retval, vdupq_n_f32(contract_coeff)); 00280 00281 prim_counter += 2; 00282 } 00283 00284 /* multiply with the appropriate wavefunction coefficient */ 00285 float32x4_t tmpshell = vdupq_n_f32(0.0f); 00286 switch (shelltype) { 00287 // use FMADD instructions 00288 case S_SHELL: 00289 value = vfmaq_f32(value, contracted_gto, vdupq_n_f32(wave_f[ifunc++])); 00290 break; 00291 00292 case P_SHELL: 00293 tmpshell = vfmaq_f32(tmpshell, xdist, vdupq_n_f32(wave_f[ifunc++])); 00294 tmpshell = vfmaq_f32(tmpshell, ydist, vdupq_n_f32(wave_f[ifunc++])); 00295 tmpshell = vfmaq_f32(tmpshell, zdist, vdupq_n_f32(wave_f[ifunc++])); 00296 value = vfmaq_f32(value, contracted_gto, tmpshell); 00297 break; 00298 00299 case D_SHELL: 00300 tmpshell = vfmaq_f32(tmpshell, xdist2, vdupq_n_f32(wave_f[ifunc++])); 00301 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(xdist, ydist), vdupq_n_f32(wave_f[ifunc++])); 00302 tmpshell = vfmaq_f32(tmpshell, ydist2, vdupq_n_f32(wave_f[ifunc++])); 00303 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(xdist, zdist), vdupq_n_f32(wave_f[ifunc++])); 00304 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(ydist, zdist), vdupq_n_f32(wave_f[ifunc++])); 00305 tmpshell = vfmaq_f32(tmpshell, zdist2, vdupq_n_f32(wave_f[ifunc++])); 00306 value = vfmaq_f32(value, contracted_gto, tmpshell); 00307 break; 00308 00309 case F_SHELL: 00310 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(xdist2, xdist), vdupq_n_f32(wave_f[ifunc++])); 00311 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(xdist2, ydist), vdupq_n_f32(wave_f[ifunc++])); 00312 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(ydist2, xdist), vdupq_n_f32(wave_f[ifunc++])); 00313 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(ydist2, ydist), vdupq_n_f32(wave_f[ifunc++])); 00314 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(xdist2, zdist), vdupq_n_f32(wave_f[ifunc++])); 00315 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(vmulq_f32(xdist, ydist), zdist), vdupq_n_f32(wave_f[ifunc++])); 00316 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(ydist2, zdist), vdupq_n_f32(wave_f[ifunc++])); 00317 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(zdist2, xdist), vdupq_n_f32(wave_f[ifunc++])); 00318 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(zdist2, ydist), vdupq_n_f32(wave_f[ifunc++])); 00319 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(zdist2, zdist), vdupq_n_f32(wave_f[ifunc++])); 00320 value = vfmaq_f32(value, contracted_gto, tmpshell); 00321 break; 00322 00323 #if 0 00324 default: 00325 // avoid unnecessary branching and minimize use of pow() 00326 int i, j; 00327 float xdp, ydp, zdp; 00328 float xdiv = 1.0f / xdist; 00329 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) { 00330 int imax = shelltype - j; 00331 for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) { 00332 tmpshell += wave_f[ifunc++] * xdp * ydp * zdp; 00333 } 00334 } 00335 value += tmpshell * contracted_gto; 00336 #endif 00337 } // end switch 00338 00339 shell_counter++; 00340 } // end shell 00341 } // end atom 00342 00343 // return either orbital density or orbital wavefunction amplitude 00344 if (density) { 00345 float32x4_t mask = vcvtq_f32_u32(vcltq_f32(value, vdupq_n_f32(0.0f))); 00346 float32x4_t sqdensity = vmulq_f32(value, value); 00347 float32x4_t orbdensity = sqdensity; 00348 float32x4_t nsqdensity = vmulq_f32(sqdensity, mask); 00349 orbdensity = vsubq_f32(orbdensity, nsqdensity); 00350 orbdensity = vsubq_f32(orbdensity, nsqdensity); 00351 vst1q_f32(&orbitalgrid[gaddrzy + nx], orbdensity); 00352 } else { 00353 vst1q_f32(&orbitalgrid[gaddrzy + nx], value); 00354 } 00355 } 00356 } 00357 } 00358 00359 return 0; 00360 } 00361 00362 #endif 00363 00364