/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/

/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: MeasureQCP.C,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.35 $       $Date: 2020/10/15 16:07:31 $
 *
 ***************************************************************************
 * DESCRIPTION:
 *   Code to compute RMSD values for unaligned structures without
 *   actually performing the alignment, particularly useful for
 *   computing large dissimilarity matrices required for
 *   trajectory clustering analysis
 *
 ***************************************************************************/
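//
// Outline of the QCP approach used throughout this file (see the Theobald
// citations preceding FastCalcRMSDAndRotation() near the end of this file):
// for two structures that have been translated so their centers of mass sit
// at the origin, the inner product kernels below accumulate the 3x3
// cross-correlation matrix A between the coordinate sets along with the
// self inner products G1 and G2, returning E0 = (G1 + G2) / 2.
// FastCalcRMSDAndRotation() then obtains the largest eigenvalue of the QCP
// key matrix by Newton-Raphson iteration on its quartic characteristic
// polynomial, which yields
//   RMSD = sqrt(fabs(2.0 * (E0 - maxEigenvalue) / numAtoms))
// without ever forming the aligned coordinates.  The optimal rotation is
// only computed when a non-NULL rotation matrix pointer is supplied.
//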
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define VMDQCPUSESSE   1
// #define VMDQCPUSEAVX2  1
#if defined(VMDUSEAVX512)
#define VMDQCPUSEAVX512 1
#endif

#define VMDQCPUSETHRPOOL 1

#if VMDQCPUSESSE && defined(__SSE2__)
#include <emmintrin.h>
#endif
#if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
#include <immintrin.h>
#endif
#if VMDQCPUSEAVX512 && defined(__AVX512F__)
#include <immintrin.h>
#endif
#if (defined(VMDQCPUSEVSX) && defined(__VSX__))
#if defined(__GNUC__) && defined(__VEC__)
#include <altivec.h>
#endif
#endif

#include <math.h>
#include "Measure.h"
#include "AtomSel.h"
#include "utilities.h"
#include "ResizeArray.h"
#include "MoleculeList.h"
#include "Inform.h"
#include "Timestep.h"
#include "CUDAAccel.h"
#include "CUDAMeasureQCP.h"
#include "VMDApp.h"
#include "WKFThreads.h"
#include "WKFUtils.h"

#if VMDQCPUSEAVX512 && defined(__AVX512F__)

// horizontal sum of an 8-wide double precision vector; the AVX-512F
// reduction intrinsic expands to the required shuffle/add sequence
static double hadd8_m512d(__m512d sum8) {
  return _mm512_reduce_add_pd(sum8);
}


// AVX-512 + FMA + 32-byte-aligned SOA-format memory buffers
static double InnerProductSOA_avx512(double *A,
                                     float *crdx1, float *crdy1, float *crdz1,
                                     float *crdx2, float *crdy2, float *crdz2,
                                     const int cnt, const float *weight) {
  __m512d va0 = _mm512_set1_pd(0.0);
  __m512d va1 = _mm512_set1_pd(0.0);
  __m512d va2 = _mm512_set1_pd(0.0);
  __m512d va3 = _mm512_set1_pd(0.0);
  __m512d va4 = _mm512_set1_pd(0.0);
  __m512d va5 = _mm512_set1_pd(0.0);
  __m512d va6 = _mm512_set1_pd(0.0);
  __m512d va7 = _mm512_set1_pd(0.0);
  __m512d va8 = _mm512_set1_pd(0.0);
  __m512d vG1 = _mm512_set1_pd(0.0);
  __m512d vG2 = _mm512_set1_pd(0.0);

  if (weight != NULL) {
    for (int i=0; i<cnt; i+=8) {
      __m256 xa8f = _mm256_load_ps(crdx1 + i); // load 8-float vectors
      __m256 ya8f = _mm256_load_ps(crdy1 + i);
      __m256 za8f = _mm256_load_ps(crdz1 + i);

      __m512d xa8 = _mm512_cvtps_pd(xa8f); // convert from floats to doubles
      __m512d ya8 = _mm512_cvtps_pd(ya8f);
      __m512d za8 = _mm512_cvtps_pd(za8f);

      __m512d gatmp = _mm512_mul_pd(xa8, xa8);
      gatmp = _mm512_fmadd_pd(ya8, ya8, gatmp);
      gatmp = _mm512_fmadd_pd(za8, za8, gatmp);

      __m256 xb8f = _mm256_load_ps(crdx2 + i); // load 8-float vectors
      __m256 yb8f = _mm256_load_ps(crdy2 + i);
      __m256 zb8f = _mm256_load_ps(crdz2 + i);

      __m512d xb8 = _mm512_cvtps_pd(xb8f); // convert from floats to doubles
      __m512d yb8 = _mm512_cvtps_pd(yb8f);
      __m512d zb8 = _mm512_cvtps_pd(zb8f);

      __m512d gbtmp = _mm512_mul_pd(xb8, xb8);
      gbtmp = _mm512_fmadd_pd(yb8, yb8, gbtmp);
      gbtmp = _mm512_fmadd_pd(zb8, zb8, gbtmp);

      __m256 w8f = _mm256_load_ps(weight + i); // load 8-float vector
      __m512d w8 = _mm512_cvtps_pd(w8f); // convert from floats to doubles

      vG1 = _mm512_fmadd_pd(w8, gatmp, vG1);
      vG2 = _mm512_fmadd_pd(w8, gbtmp, vG2);

      va0 = _mm512_fmadd_pd(xa8, xb8, va0);
      va1 = _mm512_fmadd_pd(xa8, yb8, va1);
      va2 = _mm512_fmadd_pd(xa8, zb8, va2);

      va3 = _mm512_fmadd_pd(ya8, xb8, va3);
      va4 = _mm512_fmadd_pd(ya8, yb8, va4);
      va5 = _mm512_fmadd_pd(ya8, zb8, va5);

      va6 = _mm512_fmadd_pd(za8, xb8, va6);
      va7 = _mm512_fmadd_pd(za8, yb8, va7);
      va8 = _mm512_fmadd_pd(za8, zb8, va8);
    }
  } else {
    for (int i=0; i<cnt; i+=8) {
      __m256 xa8f = _mm256_load_ps(crdx1 + i); // load 8-float vectors
      __m256 ya8f = _mm256_load_ps(crdy1 + i);
      __m256 za8f = _mm256_load_ps(crdz1 + i);

      __m512d xa8 = _mm512_cvtps_pd(xa8f); // convert from floats to doubles
      __m512d ya8 = _mm512_cvtps_pd(ya8f);
      __m512d za8 = _mm512_cvtps_pd(za8f);

      vG1 = _mm512_fmadd_pd(xa8, xa8, vG1);
      vG1 = _mm512_fmadd_pd(ya8, ya8, vG1);
      vG1 = _mm512_fmadd_pd(za8, za8, vG1);

      __m256 xb8f = _mm256_load_ps(crdx2 + i); // load 8-float vectors
      __m256 yb8f = _mm256_load_ps(crdy2 + i);
      __m256 zb8f = _mm256_load_ps(crdz2 + i);

      __m512d xb8 = _mm512_cvtps_pd(xb8f); // convert from floats to doubles
      __m512d yb8 = _mm512_cvtps_pd(yb8f);
      __m512d zb8 = _mm512_cvtps_pd(zb8f);

      vG2 = _mm512_fmadd_pd(xb8, xb8, vG2);
      vG2 = _mm512_fmadd_pd(yb8, yb8, vG2);
      vG2 = _mm512_fmadd_pd(zb8, zb8, vG2);

      va0 = _mm512_fmadd_pd(xa8, xb8, va0);
      va1 = _mm512_fmadd_pd(xa8, yb8, va1);
      va2 = _mm512_fmadd_pd(xa8, zb8, va2);

      va3 = _mm512_fmadd_pd(ya8, xb8, va3);
      va4 = _mm512_fmadd_pd(ya8, yb8, va4);
      va5 = _mm512_fmadd_pd(ya8, zb8, va5);

      va6 = _mm512_fmadd_pd(za8, xb8, va6);
      va7 = _mm512_fmadd_pd(za8, yb8, va7);
      va8 = _mm512_fmadd_pd(za8, zb8, va8);
    }
  }

  A[0] = hadd8_m512d(va0);
  A[1] = hadd8_m512d(va1);
  A[2] = hadd8_m512d(va2);
  A[3] = hadd8_m512d(va3);
  A[4] = hadd8_m512d(va4);
  A[5] = hadd8_m512d(va5);
  A[6] = hadd8_m512d(va6);
  A[7] = hadd8_m512d(va7);
  A[8] = hadd8_m512d(va8);

  double G1 = hadd8_m512d(vG1);
  double G2 = hadd8_m512d(vG2);

  return (G1 + G2) * 0.5;
}

#endif
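//
// Design note: each SIMD kernel up-converts the single precision
// coordinates to double precision before the FMA accumulation, so the
// reduction accuracy matches the plain C++ path; only the loads and the
// memory footprint stay in single precision.
//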
#if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)

// horizontal sum of a 4-wide double precision vector
static double hadd4_m256d(__m256d sum4) {
  __m256d tmp = sum4;
  // add the upper and lower 128-bit halves of the vector
  __m256d hsum4 = _mm256_add_pd(tmp, _mm256_permute2f128_pd(tmp, tmp, 0x1));
  __m128d hsum2 = _mm256_castpd256_pd128(hsum4);
  // horizontal add of the remaining pair of doubles
  __m128d sum2 = _mm_hadd_pd(hsum2, hsum2);
  return _mm_cvtsd_f64(sum2);
}


// AVX2 + FMA + 32-byte-aligned SOA-format memory buffers
static double InnerProductSOA_avx2(double *A,
                                   float *crdx1, float *crdy1, float *crdz1,
                                   float *crdx2, float *crdy2, float *crdz2,
                                   const int cnt, const float *weight) {
  __m256d va0 = _mm256_set1_pd(0.0);
  __m256d va1 = _mm256_set1_pd(0.0);
  __m256d va2 = _mm256_set1_pd(0.0);
  __m256d va3 = _mm256_set1_pd(0.0);
  __m256d va4 = _mm256_set1_pd(0.0);
  __m256d va5 = _mm256_set1_pd(0.0);
  __m256d va6 = _mm256_set1_pd(0.0);
  __m256d va7 = _mm256_set1_pd(0.0);
  __m256d va8 = _mm256_set1_pd(0.0);
  __m256d vG1 = _mm256_set1_pd(0.0);
  __m256d vG2 = _mm256_set1_pd(0.0);

  if (weight != NULL) {
    for (int i=0; i<cnt; i+=4) {
      __m128 xa4f = _mm_load_ps(crdx1 + i); // load 4-float vectors
      __m128 ya4f = _mm_load_ps(crdy1 + i);
      __m128 za4f = _mm_load_ps(crdz1 + i);

      __m256d xa4 = _mm256_cvtps_pd(xa4f); // convert from floats to doubles
      __m256d ya4 = _mm256_cvtps_pd(ya4f);
      __m256d za4 = _mm256_cvtps_pd(za4f);

      __m256d gatmp = _mm256_mul_pd(xa4, xa4);
      gatmp = _mm256_fmadd_pd(ya4, ya4, gatmp);
      gatmp = _mm256_fmadd_pd(za4, za4, gatmp);

      __m128 xb4f = _mm_load_ps(crdx2 + i); // load 4-float vectors
      __m128 yb4f = _mm_load_ps(crdy2 + i);
      __m128 zb4f = _mm_load_ps(crdz2 + i);

      __m256d xb4 = _mm256_cvtps_pd(xb4f); // convert from floats to doubles
      __m256d yb4 = _mm256_cvtps_pd(yb4f);
      __m256d zb4 = _mm256_cvtps_pd(zb4f);

      __m256d gbtmp = _mm256_mul_pd(xb4, xb4);
      gbtmp = _mm256_fmadd_pd(yb4, yb4, gbtmp);
      gbtmp = _mm256_fmadd_pd(zb4, zb4, gbtmp);

      __m128 w4f = _mm_load_ps(weight + i); // load 4-float vector
      __m256d w4 = _mm256_cvtps_pd(w4f); // convert from floats to doubles

      vG1 = _mm256_fmadd_pd(w4, gatmp, vG1);
      vG2 = _mm256_fmadd_pd(w4, gbtmp, vG2);

      va0 = _mm256_fmadd_pd(xa4, xb4, va0);
      va1 = _mm256_fmadd_pd(xa4, yb4, va1);
      va2 = _mm256_fmadd_pd(xa4, zb4, va2);

      va3 = _mm256_fmadd_pd(ya4, xb4, va3);
      va4 = _mm256_fmadd_pd(ya4, yb4, va4);
      va5 = _mm256_fmadd_pd(ya4, zb4, va5);

      va6 = _mm256_fmadd_pd(za4, xb4, va6);
      va7 = _mm256_fmadd_pd(za4, yb4, va7);
      va8 = _mm256_fmadd_pd(za4, zb4, va8);
    }
  } else {
    for (int i=0; i<cnt; i+=4) {
      __m128 xa4f = _mm_load_ps(crdx1 + i); // load 4-float vectors
      __m128 ya4f = _mm_load_ps(crdy1 + i);
      __m128 za4f = _mm_load_ps(crdz1 + i);

      __m256d xa4 = _mm256_cvtps_pd(xa4f); // convert from floats to doubles
      __m256d ya4 = _mm256_cvtps_pd(ya4f);
      __m256d za4 = _mm256_cvtps_pd(za4f);

      vG1 = _mm256_fmadd_pd(xa4, xa4, vG1);
      vG1 = _mm256_fmadd_pd(ya4, ya4, vG1);
      vG1 = _mm256_fmadd_pd(za4, za4, vG1);

      __m128 xb4f = _mm_load_ps(crdx2 + i); // load 4-float vectors
      __m128 yb4f = _mm_load_ps(crdy2 + i);
      __m128 zb4f = _mm_load_ps(crdz2 + i);

      __m256d xb4 = _mm256_cvtps_pd(xb4f); // convert from floats to doubles
      __m256d yb4 = _mm256_cvtps_pd(yb4f);
      __m256d zb4 = _mm256_cvtps_pd(zb4f);

      vG2 = _mm256_fmadd_pd(xb4, xb4, vG2);
      vG2 = _mm256_fmadd_pd(yb4, yb4, vG2);
      vG2 = _mm256_fmadd_pd(zb4, zb4, vG2);

      va0 = _mm256_fmadd_pd(xa4, xb4, va0);
      va1 = _mm256_fmadd_pd(xa4, yb4, va1);
      va2 = _mm256_fmadd_pd(xa4, zb4, va2);

      va3 = _mm256_fmadd_pd(ya4, xb4, va3);
      va4 = _mm256_fmadd_pd(ya4, yb4, va4);
      va5 = _mm256_fmadd_pd(ya4, zb4, va5);

      va6 = _mm256_fmadd_pd(za4, xb4, va6);
      va7 = _mm256_fmadd_pd(za4, yb4, va7);
      va8 = _mm256_fmadd_pd(za4, zb4, va8);
    }
  }

  A[0] = hadd4_m256d(va0);
  A[1] = hadd4_m256d(va1);
  A[2] = hadd4_m256d(va2);
  A[3] = hadd4_m256d(va3);
  A[4] = hadd4_m256d(va4);
  A[5] = hadd4_m256d(va5);
  A[6] = hadd4_m256d(va6);
  A[7] = hadd4_m256d(va7);
  A[8] = hadd4_m256d(va8);

  double G1 = hadd4_m256d(vG1);
  double G2 = hadd4_m256d(vG2);

  return (G1 + G2) * 0.5;
}

#endif
// plain C++ version of inner product for SOA coordinate storage
static double InnerProductSOA(double *A,
                              float *crdx1, float *crdy1, float *crdz1,
                              float *crdx2, float *crdy2, float *crdz2,
                              const int cnt, const float *weight) {
  double G1=0.0, G2=0.0;
  memset(A, 0, sizeof(double) * 9);

  double x1, x2, y1, y2, z1, z2;
  double a0, a1, a2, a3, a4, a5, a6, a7, a8;
  a0=a1=a2=a3=a4=a5=a6=a7=a8=0.0;
  if (weight != NULL) {
    for (int i=0; i<cnt; i++) {
      double w = weight[i];
      x1 = crdx1[i];
      y1 = crdy1[i];
      z1 = crdz1[i];

      G1 += w * (x1*x1 + y1*y1 + z1*z1);

      x2 = crdx2[i];
      y2 = crdy2[i];
      z2 = crdz2[i];

      G2 += w * (x2*x2 + y2*y2 + z2*z2);

      a0 += x1 * x2;
      a1 += x1 * y2;
      a2 += x1 * z2;

      a3 += y1 * x2;
      a4 += y1 * y2;
      a5 += y1 * z2;

      a6 += z1 * x2;
      a7 += z1 * y2;
      a8 += z1 * z2;
    }
  } else {
    for (int i=0; i<cnt; i++) {
      x1 = crdx1[i];
      y1 = crdy1[i];
      z1 = crdz1[i];

      G1 += x1*x1 + y1*y1 + z1*z1;

      x2 = crdx2[i];
      y2 = crdy2[i];
      z2 = crdz2[i];

      G2 += x2*x2 + y2*y2 + z2*z2;

      a0 += x1 * x2;
      a1 += x1 * y2;
      a2 += x1 * z2;

      a3 += y1 * x2;
      a4 += y1 * y2;
      a5 += y1 * z2;

      a6 += z1 * x2;
      a7 += z1 * y2;
      a8 += z1 * z2;
    }
  }

  A[0] = a0;
  A[1] = a1;
  A[2] = a2;

  A[3] = a3;
  A[4] = a4;
  A[5] = a5;

  A[6] = a6;
  A[7] = a7;
  A[8] = a8;

  return (G1 + G2) * 0.5;
}

//
// OpenACC version of inner product for SOA coordinate storage
//
// use pgc++ -m64 -Minfo=accel -ta=nvidia -O -acc
#if defined(__PGIC__) && defined(_OPENACC)

#if 0
static void vecadd_acc(void) {
  printf("****** OpenACC test vecadd_acc()...\n");

  // Size of vectors
  int n = 10000;

  // Input vectors
  double *restrict a;
  double *restrict b;

  // Output vector
  double *restrict c;

  // Size, in bytes, of each vector
  size_t bytes = n*sizeof(double);

  // Allocate memory for each vector
  a = (double*)malloc(bytes);
  b = (double*)malloc(bytes);
  c = (double*)malloc(bytes);

  // Initialize input vectors: a[i] = sin(i)^2, b[i] = cos(i)^2
  int i;
  for (i=0; i<n; i++) {
    a[i] = sin(i)*sin(i);
    b[i] = cos(i)*cos(i);
  }

  // sum component-wise and save the result into vector c
#pragma acc kernels copyin(a[0:n],b[0:n]), copyout(c[0:n])
  for (i=0; i<n; i++) {
    c[i] = a[i] + b[i];
  }

  // Sum up vector c and print the result divided by n;
  // this should equal 1 within error
  double sum = 0.0;
  for (i=0; i<n; i++) {
    sum += c[i];
  }
  sum = sum/n;
  printf("****** final result: %f *******\n", sum);

  // Release memory
  free(a);
  free(b);
  free(c);

  printf("****** OpenACC test vecadd_acc() done.\n");
}
#endif

//
// Use a 1-D loop rather than a 2-D loop to please PGI OpenACC so that it
// doesn't complain about loop-carried dependencies; this appears to be the
// only successful method of achieving good quality parallelization.
//
#define LOOP1D 1

#if defined(LOOP1D)
#if defined(__PGIC__) && defined(_OPENACC)
#pragma acc routine seq
#endif
static void acc_idx2sub_tril(long N, long ind, long *J, long *I) {
  long i, j;
  i = long(floor((2*N+1 - sqrt((2*N+1)*(2*N+1) - 8*ind)) / 2));
  j = ind - N*i + i*(i-1)/2 + i;

  *I = i;
  *J = j+1; // offset by one so only strictly off-diagonal pairs are produced,
            // matching the host-side idx2sub_tril() below
}
#endif
static void rmsdmat_qcp_acc(int cnt, int padcnt, int framecrdsz,
                            int framecount,
#if defined(LOOP1D)
                            const float * restrict crds,
#else
                            const float * crds,
#endif
                            // const float *weight,
                            float * rmsdmat) {
  printf("OpenACC rmsdmat_qcp_acc()...\n");
  printf("ACC cnt: %d  padcnt: %d\n", cnt, padcnt);

  printf("Copying input arrays to accelerators...\n");
  long totalsz = 3L * framecrdsz * framecount;
  printf("ACC copysz: %ld  (3 * %d * %d)\n", totalsz, framecrdsz, framecount);

  long matcnt = framecount * framecount;
  printf("ACC matcnt: %ld\n", matcnt);

  printf("Running OpenACC kernels...\n");
#if defined(LOOP1D)
  long i, j, k;
#pragma acc kernels copyin(crds[0:totalsz]), copy(rmsdmat[0:matcnt])
  for (k=0; k<(framecount*(framecount-1))/2; k++) {
    acc_idx2sub_tril(long(framecount-1), k, &i, &j);
    long x1addr = j * 3L * framecrdsz;
    {
#else
  long i, j;
#pragma acc kernels copyin(crds[0:totalsz]), copy(rmsdmat[0:matcnt])
  for (j=0; j<framecount; j++) {
    long x1addr = j * 3L * framecrdsz;

    for (i=0; i<j; i++) {
#endif
      // calculate the (weighted) inner product of two structures
      long x2addr = i * 3L * framecrdsz;

      double G1=0.0, G2=0.0;

      double a0, a1, a2, a3, a4, a5, a6, a7, a8;
      a0=a1=a2=a3=a4=a5=a6=a7=a8=0.0;
#if 0
      if (weight != NULL) {
        double x1, x2, y1, y2, z1, z2;
#pragma acc loop
        for (long l=0; l<cnt; l++) {
          double w = weight[l];
          x1 = crds[l + x1addr];
          y1 = crds[l + x1addr + framecrdsz];
          z1 = crds[l + x1addr + framecrdsz*2];

          G1 += w * (x1*x1 + y1*y1 + z1*z1);

          x2 = crds[l + x2addr];
          y2 = crds[l + x2addr + framecrdsz];
          z2 = crds[l + x2addr + framecrdsz*2];

          G2 += w * (x2*x2 + y2*y2 + z2*z2);

          a0 += x1 * x2;
          a1 += x1 * y2;
          a2 += x1 * z2;

          a3 += y1 * x2;
          a4 += y1 * y2;
          a5 += y1 * z2;

          a6 += z1 * x2;
          a7 += z1 * y2;
          a8 += z1 * z2;
        }
      } else {
#endif
        double x1, x2, y1, y2, z1, z2;
#pragma acc loop vector(256)
//#pragma acc loop vector(256) reduction(+:a0),reduction(+:a1),reduction(+:a2),reduction(+:a3),reduction(+:a4),reduction(+:a5),reduction(+:a6),reduction(+:a7),reduction(+:a8),reduction(+:G1),reduction(+:G2)
        for (long l=0; l<cnt; l++) {
          x1 = crds[l + x1addr];
          y1 = crds[l + x1addr + framecrdsz];
          z1 = crds[l + x1addr + framecrdsz*2];

          G1 += x1*x1 + y1*y1 + z1*z1;

          x2 = crds[l + x2addr];
          y2 = crds[l + x2addr + framecrdsz];
          z2 = crds[l + x2addr + framecrdsz*2];

          G2 += x2*x2 + y2*y2 + z2*z2;

          a0 += x1 * x2;
          a1 += x1 * y2;
          a2 += x1 * z2;

          a3 += y1 * x2;
          a4 += y1 * y2;
          a5 += y1 * z2;

          a6 += z1 * x2;
          a7 += z1 * y2;
          a8 += z1 * z2;
        }
#if 0
      }
#endif

      double A[9];
      A[0] = a0;
      A[1] = a1;
      A[2] = a2;

      A[3] = a3;
      A[4] = a4;
      A[5] = a5;

      A[6] = a6;
      A[7] = a7;
      A[8] = a8;

      double E0 = (G1 + G2) * 0.5;

      // calculate the RMSD & rotational matrix
      float rmsd;
      FastCalcRMSDAndRotation(NULL, A, &rmsd, E0, cnt, -1);
#if defined(LOOP1D)
      rmsdmat[k]=rmsd; // store linearized triangle
#else
      rmsdmat[j*framecount + i]=rmsd;
#endif
    }
  }

  printf("ACC done.\n");
}

#endif
#if 0
static double InnerProductAOS(double *A, double *coords1, double *coords2,
                              const int cnt, const double *weight) {
  double G1=0.0, G2=0.0;
  memset(A, 0, sizeof(double) * 9);

  long i;
  double x1, x2, y1, y2, z1, z2;
  if (weight != NULL) {
    for (i=0; i<cnt; i++) {
      double w = weight[i];
      long idx = i*3;
      x1 = coords1[idx  ];
      y1 = coords1[idx+1];
      z1 = coords1[idx+2];

      G1 += w * (x1*x1 + y1*y1 + z1*z1);

      x2 = coords2[idx  ];
      y2 = coords2[idx+1];
      z2 = coords2[idx+2];

      G2 += w * (x2*x2 + y2*y2 + z2*z2);

      A[0] += (x1 * x2);
      A[1] += (x1 * y2);
      A[2] += (x1 * z2);

      A[3] += (y1 * x2);
      A[4] += (y1 * y2);
      A[5] += (y1 * z2);

      A[6] += (z1 * x2);
      A[7] += (z1 * y2);
      A[8] += (z1 * z2);
    }
  } else {
    for (i=0; i<cnt; i++) {
      long idx = i*3;
      x1 = coords1[idx  ];
      y1 = coords1[idx+1];
      z1 = coords1[idx+2];

      G1 += x1*x1 + y1*y1 + z1*z1;

      x2 = coords2[idx  ];
      y2 = coords2[idx+1];
      z2 = coords2[idx+2];

      G2 += x2*x2 + y2*y2 + z2*z2;

      A[0] += (x1 * x2);
      A[1] += (x1 * y2);
      A[2] += (x1 * z2);

      A[3] += (y1 * x2);
      A[4] += (y1 * y2);
      A[5] += (y1 * z2);

      A[6] += (z1 * x2);
      A[7] += (z1 * y2);
      A[8] += (z1 * z2);
    }
  }

  return (G1 + G2) * 0.5;
}
#endif


void com_soa(int cnt,
             float *&soax, float *&soay, float *&soaz,
             double &comx, double &comy, double &comz,
             const float *weight) {
  comx=comy=comz=0.0;

  if (weight != NULL) {
    double wsum = 0.0;

    // compute weighted center of mass
    int i;
    for (i=0; i<cnt; i++) {
      double w = weight[i];
      wsum += w;

      comx += soax[i] * w;
      comy += soay[i] * w;
      comz += soaz[i] * w;
    }
    double wsumnorm = 1.0 / wsum;
    comx *= wsumnorm;
    comy *= wsumnorm;
    comz *= wsumnorm;
  } else {
    // compute unweighted center of mass
    int i;
    for (i=0; i<cnt; i++) {
      comx += soax[i];
      comy += soay[i];
      comz += soaz[i];
    }
    double avenorm = 1.0 / ((double) cnt);
    comx *= avenorm;
    comy *= avenorm;
    comz *= avenorm;
  }
}
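//
// Note on storage layout: VMD timestep coordinates are stored AOS-style as
// { x0 y0 z0  x1 y1 z1 ... }, while the QCP kernels above consume separate
// x/y/z arrays (SOA) so that SIMD lanes can load contiguous, aligned runs
// of a single component.  The two helpers below compact an atom selection,
// perform the AOS-to-SOA conversion, and shift each structure's center of
// mass to the origin, as the QCP inner product formulation assumes
// pre-centered coordinates.
//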
int center_convert_soa(const AtomSel *sel, int num, const float *framepos,
                       const float *weight,
                       float *&soax, float *&soay, float *&soaz) {
  // allocate temporary working arrays, plus required SIMD padding
  int cnt = sel->selected;
  soax = (float *) calloc(1, (cnt + 16)*sizeof(float));
  soay = (float *) calloc(1, (cnt + 16)*sizeof(float));
  soaz = (float *) calloc(1, (cnt + 16)*sizeof(float));

  int selind = sel->firstsel; // start from the first selected atom
  double comx=0.0, comy=0.0, comz=0.0;

  int i;
  for (i=0; i<cnt; i++) {
    // find next 'on' atom in selection
    // loop is safe since the callers verify that cnt > 0 beforehand
    while (!sel->on[selind])
      selind++;

    // compact selection and convert AOS to SOA storage on-the-fly
    long addr = 3*selind;
    float tx = framepos[addr    ];
    float ty = framepos[addr + 1];
    float tz = framepos[addr + 2];

    comx += tx;
    comy += ty;
    comz += tz;

    soax[i] = tx;
    soay[i] = ty;
    soaz[i] = tz;

    selind++; // advance to next atom
  }

  double avenorm = 1.0 / ((double) cnt);
  comx *= avenorm; // compute unweighted center of mass
  comy *= avenorm;
  comz *= avenorm;

#if 0
  printf("center_convert_soa(): structure com: %g %g %g\n", comx, comy, comz);
#endif

  // translate center of mass to the origin
  for (i=0; i<cnt; i++) {
    soax[i] -= float(comx);
    soay[i] -= float(comy);
    soaz[i] -= float(comz);
  }

#if 0
  // check post-translation com
  com_soa(cnt, soax, soay, soaz, comx, comy, comz, weight);
  printf("center_convert_soa(): centered com: %lg %lg %lg\n", comx, comy, comz);
#endif

  return 0;
}


int center_convert_single_soa(const AtomSel *sel, int num,
                              const float *framepos,
                              const float *weight,
                              float *soax, float *soay, float *soaz) {
  // convert into caller-provided working arrays, which must already
  // include the required SIMD padding
  int cnt = sel->selected;
  int selind = sel->firstsel; // start from the first selected atom
  double comx=0.0, comy=0.0, comz=0.0;

  int i;
  for (i=0; i<cnt; i++) {
    // find next 'on' atom in selection
    // loop is safe since the callers verify that cnt > 0 beforehand
    while (!sel->on[selind])
      selind++;

    // compact selection and convert AOS to SOA storage on-the-fly
    long addr = 3*selind;
    float tx = framepos[addr    ];
    float ty = framepos[addr + 1];
    float tz = framepos[addr + 2];

    comx += tx;
    comy += ty;
    comz += tz;

    soax[i] = tx;
    soay[i] = ty;
    soaz[i] = tz;

    selind++; // advance to next atom
  }

  double avenorm = 1.0 / ((double) cnt);
  comx *= avenorm; // compute unweighted center of mass
  comy *= avenorm;
  comz *= avenorm;

  // translate center of mass to the origin
  for (i=0; i<cnt; i++) {
    soax[i] -= float(comx);
    soay[i] -= float(comy);
    soaz[i] -= float(comz);
  }

  return 0;
}
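//
// A minimal usage sketch for the entry point below (hypothetical caller;
// sel1/sel2 are assumed to be valid AtomSel objects of equal size with
// matching weight and coordinate arrays):
//
//   float rmsd = 0.0f;
//   int rc = measure_rmsd_qcp(app, sel1, sel2, sel1->num_atoms,
//                             framepos1, framepos2, weight, &rmsd);
//   if (rc == MEASURE_NOERR)
//     printf("QCP RMSD: %f\n", rmsd);
//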
int measure_rmsd_qcp(VMDApp *app,
                     const AtomSel *sel1, const AtomSel *sel2,
                     int num, const float *framepos1, const float *framepos2,
                     float *weight, float *rmsd) {
  if (!sel1 || !sel2) return MEASURE_ERR_NOSEL;
  if (sel1->selected < 1 || sel2->selected < 1) return MEASURE_ERR_NOSEL;
  if (!weight || !rmsd) return MEASURE_ERR_NOWEIGHT;

  // the number of selected atoms must be the same
  if (sel1->selected != sel2->selected) return MEASURE_ERR_MISMATCHEDCNT;

#if 0
  // need to know how to traverse the list of weights
  // there could be 1 weight per atom (sel_flg == 1) or
  // 1 weight per selected atom (sel_flg == 0)
  int sel_flg;
  if (num == sel1->num_atoms) {
    sel_flg = 1; // using all elements
  } else {
    sel_flg = 0; // using elements from selection
  }
#endif

  //
  // compute CoM for each selection while copying them into target bufs
  //
  float *sel1x, *sel1y, *sel1z, *sel2x, *sel2y, *sel2z;
  center_convert_soa(sel1, num, framepos1, weight, sel1x, sel1y, sel1z);
  center_convert_soa(sel2, num, framepos2, weight, sel2x, sel2y, sel2z);

  // calculate the (weighted) inner product of two structures
  double E0 = 0;
  double A[9];
  E0 = InnerProductSOA(A,
                       sel1x, sel1y, sel1z,
                       sel2x, sel2y, sel2z,
                       sel1->selected, NULL /* weight */);

#if 0
  printf("QCP inner product results:\n");
  printf("  E0: %g\n", E0);
  int i;
  for (i=0; i<9; i+=3)
    printf("A[%d-%d]: %g %g %g\n", i, i+2, A[i], A[i+1], A[i+2]);
  printf("\n");
#endif

  // calculate the RMSD & rotational matrix
  FastCalcRMSDAndRotation(NULL, A, rmsd, E0, sel1->selected, -1);

  free(sel1x);
  free(sel1y);
  free(sel1z);

  free(sel2x);
  free(sel2y);
  free(sel2z);

  return MEASURE_NOERR; // and say rmsd is OK
}


#if 0
// compute linear array index from lower-triangular indices i,j
static int sub2idx_tril(long N, long i, long j, long *ind) {
  // *ind = i + (j-1)*N - j*(j-1)/2;
  *ind = j + N*i - i*(i-1)/2;
  return 0;
}
#endif

// compute lower-triangular indices i,j from linear array index
static int idx2sub_tril(long N, long ind, long *J, long *I) {
  long i, j;

  if (ind >= (N*(N+1)/2)) {
    return -1; // out of bounds
  }

  // XXX deal with ambiguous types for sqrt() on Solaris
  double tmp2np1 = 2*N+1;
  i = long(floor((tmp2np1 - sqrt(tmp2np1*tmp2np1 - 8.0*ind)) / 2));
  // i = long(floor((2*N+1 - sqrt((2*N+1)*(2*N+1) - 8*ind)) / 2));
  j = ind - N*i + i*(i-1)/2 + i;

  *I = i;
  *J = j+1;

  return 0;
}
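//
// Multithreaded QCP RMSD matrix computation: each work unit is one index
// into the linearized strict lower triangle of the frame-by-frame matrix,
// handed out to worker threads in tiles, so the work balances evenly
// regardless of how the frame pairs are distributed.
//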
typedef struct {
  const AtomSel *sel;
  int first;
  int last;
  int step;
  float *rmsdmat;
  int padcnt;
  int framecrdsz;
  float *crds;
#if (VMDQCPUSEAVX512 && defined(__AVX512F__))
  int useavx512;
#endif
#if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
  int useavx2;
#endif
#if (VMDQCPUSEVSX && defined(__VEC__))
  int usevsx;
#endif
} qcprmsdthreadparms;


static void * measure_rmsdmat_qcp_thread(void *voidparms) {
  int threadid;
  qcprmsdthreadparms *parms = NULL;
#if defined(VMDQCPUSETHRPOOL)
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
#else
  wkf_threadlaunch_getdata(voidparms, (void **) &parms);
  wkf_threadlaunch_getid(voidparms, &threadid, NULL);
#endif

  //
  // copy in per-thread parameters
  //
  const AtomSel *sel = parms->sel;
  float *rmsdmat = parms->rmsdmat;

  // XXX array padding not universally honored yet...
  // int padcnt = parms->padcnt;

  int framecrdsz = parms->framecrdsz;
  float *crds = parms->crds;
  int first = parms->first;
  int last = parms->last;
  int step = parms->step;
#if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
  int useavx2 = parms->useavx2;
#endif
#if (VMDQCPUSEAVX512 && defined(__AVX512F__))
  int useavx512 = parms->useavx512;
#endif
#if (VMDQCPUSEVSX && defined(__VEC__))
  int usevsx = parms->usevsx;
#endif

#if 0
  printf("qcpthread[%d] running... %s\n", threadid,
#if (VMDQCPUSEAVX512 && defined(__AVX512F__))
         (useavx512) ? "(AVX512)" : "(C++)");
#elif (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
         (useavx2) ? "(AVX2)" : "(C++)");
#elif (VMDQCPUSEVSX && defined(__VEC__))
         (usevsx) ? "(VSX)" : "(C++)");
#else
         "(C++)");
#endif
#endif

  int framecount = (last - first + 1) / step;

  wkf_tasktile_t tile;
#if defined(VMDQCPUSETHRPOOL)
  while (wkf_threadpool_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
#else
  while (wkf_threadlaunch_next_tile(voidparms, 8, &tile) != WKF_SCHED_DONE) {
#endif
    long idx;

    for (idx=tile.start; idx<tile.end; idx++) {
      long i, j;

      // compute i,j from idx...
      // only compute off-diagonal elements, so we use (framecount-1)
      if (idx2sub_tril(framecount-1, idx, &i, &j)) {
        printf("qcpthread[%d]: work idx %ld out of triangle!\n", threadid, idx);
        break;
      }

      // calculate the (weighted) inner product of two structures
      double A[9];
      double E0 = 0;

      float *xj = crds + (j * 3 * framecrdsz);
      float *yj = xj + framecrdsz;
      float *zj = xj + framecrdsz*2;

      float *xi = crds + (i * 3 * framecrdsz);
      float *yi = xi + framecrdsz;
      float *zi = xi + framecrdsz*2;

#if VMDQCPUSEAVX512 && defined(__AVX512F__)
      if (useavx512) {
        E0 = InnerProductSOA_avx512(A, xj, yj, zj, xi, yi, zi,
                                    sel->selected, NULL /* weight */);
      } else
#endif
#if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
      if (useavx2) {
        E0 = InnerProductSOA_avx2(A, xj, yj, zj, xi, yi, zi,
                                  sel->selected, NULL /* weight */);
      } else
#endif
      E0 = InnerProductSOA(A, xj, yj, zj, xi, yi, zi,
                           sel->selected, NULL /* weight */);

      // calculate the RMSD & rotational matrix
      FastCalcRMSDAndRotation(NULL, A, &rmsdmat[j*framecount + i],
                              E0, sel->selected, -1);

      // reflect the outcome of the lower triangle into the upper triangle
      rmsdmat[i*framecount + j] = rmsdmat[j*framecount + i];
    }
  }

  return NULL;
}
int measure_rmsdmat_qcp(VMDApp *app,
                        const AtomSel *sel, MoleculeList *mlist,
                        int num, float *weight,
                        int first, int last, int step,
                        float *rmsdmat) {
  if (!sel) return MEASURE_ERR_NOSEL;
  if (sel->selected < 1) return MEASURE_ERR_NOSEL;
  // if (!weight || !rmsd) return MEASURE_ERR_NOWEIGHT;

  Molecule *mymol = mlist->mol_from_id(sel->molid());
  int maxframes = mymol->numframes();

  // accept value of -1 meaning "all" frames
  if (last == -1)
    last = maxframes-1;

  if (maxframes == 0 || first < 0 || first > last ||
      last >= maxframes || step <= 0)
    return MEASURE_ERR_BADFRAMERANGE;

  // XXX replace with calls to centralized control system
#if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
  // XXX there's no SSE-specific code path
  // int usesse=1;
  // if (getenv("VMDNOSSE")) {
  //   usesse=0;
  // }
#endif
#if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
  int useavx2=1;
  if (getenv("VMDNOAVX2")) {
    useavx2=0;
  }
#endif
#if (VMDQCPUSEAVX512 && defined(__AVX512F__))
  int useavx512=1;
  if (getenv("VMDNOAVX512")) {
    useavx512=0;
  }
#endif
#if (VMDQCPUSEVSX && defined(__VEC__))
  int usevsx=1;
  if (getenv("VMDNOVSX")) {
    usevsx=0;
  }
#endif

#if 0
  // need to know how to traverse the list of weights
  // there could be 1 weight per atom (sel_flg == 1) or
  // 1 weight per selected atom (sel_flg == 0)
  int sel_flg;
  if (num == sel->num_atoms) {
    sel_flg = 1; // using all elements
  } else {
    sel_flg = 0; // using elements from selection
  }
#endif

  // start timers
  wkf_timerhandle timer;
  timer=wkf_timer_create();
  wkf_timer_start(timer);


  //
  // compute CoM for frame/selection while copying them into SOA target bufs
  //
  int framecount = (last - first + 1) / step;

  // round the atom count up to the next multiple of 256 for SIMD padding
  int padcnt = (num + 255) & ~255;
  int framecrdsz = padcnt + 256;
  float *crds = (float *) calloc(1, (framecount * 3L * framecrdsz + 256) * sizeof(float));

  int frame;
  for (frame=first; frame<=last; frame+=step) {
    const float *framepos = (mymol->get_frame(frame))->pos;
    // index frames relative to the start of the range so the packed buffer
    // is addressed correctly when first != 0 or step != 1
    long frameidx = (frame - first) / step;
    float *xc = crds + (frameidx * 3L * framecrdsz);
    float *yc = xc + framecrdsz;
    float *zc = xc + framecrdsz*2;

    center_convert_single_soa(sel, num, framepos, weight, xc, yc, zc);
  }

  double converttime = wkf_timer_timenow(timer);

#if !(defined(__PGIC__) && defined(_OPENACC))
#if defined(VMDTHREADS)
  int numprocs = wkf_thread_numprocessors();
#else
  int numprocs = 1;
#endif

  //
  // copy in per-thread parameters
  //
  qcprmsdthreadparms parms;
  memset(&parms, 0, sizeof(parms));
  parms.sel = sel;
  parms.rmsdmat = rmsdmat;
  parms.padcnt = padcnt;
  parms.framecrdsz = framecrdsz;
  parms.crds = crds;
  parms.first = first;
  parms.last = last;
  parms.step = step;
#if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
  parms.useavx2 = useavx2;
#endif
#if (VMDQCPUSEAVX512 && defined(__AVX512F__))
  parms.useavx512 = useavx512;
#endif
#if (VMDQCPUSEVSX && defined(__VEC__))
  parms.usevsx = usevsx;
#endif

  // spawn child threads to do the work
  wkf_tasktile_t tile;
  tile.start=0;
  tile.end=framecount*(framecount-1)/2;   // only compute off-diag elements

#if defined(VMDQCPUSETHRPOOL)
  wkf_threadpool_sched_dynamic(app->thrpool, &tile);
  wkf_threadpool_launch(app->thrpool, measure_rmsdmat_qcp_thread, &parms, 1);
#else
  wkf_threadlaunch(numprocs, &parms, measure_rmsdmat_qcp_thread, &tile);
#endif
#elif defined(__PGIC__) && defined(_OPENACC)
  // OpenACC variant
  rmsdmat_qcp_acc(sel->selected, padcnt, framecrdsz, framecount, crds,
                  // NULL /* weight */,
                  rmsdmat);
#else
  int i, j;
  for (j=0; j<framecount; j++) {
    float *xj = crds + (j * 3 * framecrdsz);
    float *yj = xj + framecrdsz;
    float *zj = xj + framecrdsz*2;
    for (i=0; i<j; i++) {
      // calculate the (weighted) inner product of two structures
      double A[9];

      float *xi = crds + (i * 3 * framecrdsz);
      float *yi = xi + framecrdsz;
      float *zi = xi + framecrdsz*2;

      double E0 = InnerProductSOA(A, xj, yj, zj, xi, yi, zi,
                                  sel->selected, NULL /* weight */);

      // calculate the RMSD & rotational matrix
      FastCalcRMSDAndRotation(NULL, A, &rmsdmat[j*framecount + i],
                              E0, sel->selected, -1);

      // reflect the outcome of the lower triangle into the upper triangle
      rmsdmat[i*framecount + j] = rmsdmat[j*framecount + i];
    }
  }
#endif

  // mark all self-RMSDs with a value of 0.0
  for (long l=0; l<framecount; l++) {
    rmsdmat[l*framecount + l] = 0.0;
  }

  double rmsdtime = wkf_timer_timenow(timer) - converttime;

  // free all temporary buffers
  free(crds);

#if 1
  double totaltime = wkf_timer_timenow(timer);
  printf("QCP RMSD Matrix calculation time: SOA selection: %.3f  RMSD solve: %.3f  total: %.3f\n", converttime, rmsdtime, totaltime);
#endif

  wkf_timer_destroy(timer);

  return MEASURE_NOERR; // and say rmsd is OK
}
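//
// Note: the resulting matrix is stored as a dense framecount x framecount
// array with both triangles filled in (the computed lower triangle is
// mirrored into the upper one) and an explicitly initialized diagonal,
// which keeps downstream clustering code simple at the cost of twice the
// minimal storage.
//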
int measure_rmsdmat_qcp_ooc(VMDApp *app,
                            const AtomSel *sel, MoleculeList *mlist,
                            int nfiles, const char **trjfileset,
                            int num, float *weight,
                            int first, int last, int step,
                            int &framecount, float *&rmsdmat) {
  if (!sel) return MEASURE_ERR_NOSEL;
  if (sel->selected < 1) return MEASURE_ERR_NOSEL;
  // if (!weight || !rmsd) return MEASURE_ERR_NOWEIGHT;


  //
  // XXX really need to compute per-file frame counts etc
  //
  framecount = (last - first + 1) / step;

  printf("** measure_rmsdmat_qcp_ooc(): \n");
  printf("** first: %d  last: %d  step: %d  nfiles: %d  count: %d\n",
         first, last, step, nfiles, framecount);

  rmsdmat = (float *) calloc(1, framecount * framecount * sizeof(float));

  // XXX this needs to be properly implemented with fallback functionality
#if defined(VMDCUDA)
  qcp_soa_gpu_ooc(app->cuda->get_cuda_devpool(),
                  nfiles, trjfileset, sel, first, last, step, rmsdmat);
#endif

#if 0
  Molecule *mymol = mlist->mol_from_id(sel->molid());

  int maxframes = mymol->numframes();

  // accept value of -1 meaning "all" frames
  if (last == -1)
    last = maxframes-1;

  if (maxframes == 0 || first < 0 || first > last ||
      last >= maxframes || step <= 0)
    return MEASURE_ERR_BADFRAMERANGE;

  // XXX hacked out lots of stuff we don't need yet...


#if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
  int useavx2=1;
  if (getenv("VMDNOAVX2")) {
    useavx2=0;
  }
#endif

  // start timers
  wkf_timerhandle timer;
  timer=wkf_timer_create();
  wkf_timer_start(timer);

  //
  // compute CoM for frame/selection while copying them into SOA target bufs
  //
  int framecount = (last - first + 1) / step;

  int padcnt = (num + 255) & ~255;
  int framecrdsz = padcnt + 256;
  float *crds = (float *) calloc(1, (framecount * 3L * framecrdsz + 256) * sizeof(float));

  int frame;
  for (frame=first; frame<=last; frame+=step) {
    const float *framepos = (mymol->get_frame(frame))->pos;
    // index frames relative to the start of the range, as above
    long frameidx = (frame - first) / step;
    float *xc = crds + (frameidx * 3L * framecrdsz);
    float *yc = xc + framecrdsz;
    float *zc = xc + framecrdsz*2;

    center_convert_single_soa(sel, num, framepos, weight, xc, yc, zc);
  }

  double converttime = wkf_timer_timenow(timer);

#if defined(VMDTHREADS)
  int numprocs = wkf_thread_numprocessors();
#else
  int numprocs = 1;
#endif

  //
  // copy in per-thread parameters
  //
  qcprmsdthreadparms parms;
  memset(&parms, 0, sizeof(parms));
  parms.sel = sel;
  parms.rmsdmat = rmsdmat;
  parms.padcnt = padcnt;
  parms.framecrdsz = framecrdsz;
  parms.crds = crds;
  parms.first = first;
  parms.last = last;
  parms.step = step;
#if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
  parms.useavx2 = useavx2;
#endif

  // spawn child threads to do the work
  wkf_tasktile_t tile;
  tile.start=0;
  tile.end=framecount*(framecount-1)/2;   // only compute off-diag elements

#if defined(VMDQCPUSETHRPOOL)
  wkf_threadpool_sched_dynamic(app->thrpool, &tile);
  wkf_threadpool_launch(app->thrpool, measure_rmsdmat_qcp_thread, &parms, 1);
#else
  wkf_threadlaunch(numprocs, &parms, measure_rmsdmat_qcp_thread, &tile);
#endif

  // mark all self-RMSDs with a value of 1.0
  for (long l=0; l<framecount; l++) {
    rmsdmat[l*framecount + l] = 1.0;
  }

  double rmsdtime = wkf_timer_timenow(timer) - converttime;

  // free all temporary buffers
  free(crds);

#if 1
  double totaltime = wkf_timer_timenow(timer);
  printf("QCP RMSD Matrix calculation time: SOA selection: %.3f  RMSD solve: %.3f  total: %.3f\n", converttime, rmsdtime, totaltime);
#endif

  wkf_timer_destroy(timer);
#endif

  return MEASURE_NOERR; // and say rmsd is OK
}
//
// Copyright notice for original QCP FastCalcRMSDAndRotation() routine
//
// If you use this QCP rotation calculation method in a publication, please
// reference:
//   Douglas L. Theobald (2005)
//   "Rapid calculation of RMSD using a quaternion-based characteristic
//   polynomial."
//   Acta Crystallographica A 61(4):478-480.
//
//   Pu Liu, Dmitris K. Agrafiotis, and Douglas L. Theobald (2009)
//   "Fast determination of the optimal rotational matrix for macromolecular
//   superpositions."
//   Journal of Computational Chemistry 31(7):1561-1563.
//
// Copyright (c) 2009-2013 Pu Liu and Douglas L. Theobald
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
// * Neither the name of the <ORGANIZATION> nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
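//
// The solver below follows the published QCP scheme: the largest eigenvalue
// of the 4x4 key matrix built from A is the largest root of the quartic
// characteristic polynomial
//   P(lambda) = lambda^4 + C[2]*lambda^2 + C[1]*lambda + C[0]
// (the cubic term vanishes because the key matrix is traceless), which
// Newton-Raphson locates starting from the upper bound lambda = E0;
// convergence typically takes only a handful of iterations.
//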
#if defined(__PGIC__)
#pragma acc routine seq
#endif
int FastCalcRMSDAndRotation(double *rot, double *A, float *rmsd,
                            double E0, int len, double minScore) {
  double Sxx, Sxy, Sxz, Syx, Syy, Syz, Szx, Szy, Szz;
  double Szz2, Syy2, Sxx2, Sxy2, Syz2, Sxz2, Syx2, Szy2, Szx2,
         SyzSzymSyySzz2, Sxx2Syy2Szz2Syz2Szy2, Sxy2Sxz2Syx2Szx2,
         SxzpSzx, SyzpSzy, SxypSyx, SyzmSzy,
         SxzmSzx, SxymSyx, SxxpSyy, SxxmSyy;
  double C[4];
  int i;
  double mxEigenV;
  double oldg = 0.0;
  double b, a, delta, rms, qsqr;
  double q1, q2, q3, q4, normq;
  double a11, a12, a13, a14, a21, a22, a23, a24;
  double a31, a32, a33, a34, a41, a42, a43, a44;
  double a2, x2, y2, z2;
  double xy, az, zx, ay, yz, ax;
  double a3344_4334, a3244_4234, a3243_4233, a3143_4133, a3144_4134, a3142_4132;
  double evecprec = 1e-6;
  double evalprec = 1e-11;

  Sxx = A[0]; Sxy = A[1]; Sxz = A[2];
  Syx = A[3]; Syy = A[4]; Syz = A[5];
  Szx = A[6]; Szy = A[7]; Szz = A[8];

  Sxx2 = Sxx * Sxx;
  Syy2 = Syy * Syy;
  Szz2 = Szz * Szz;

  Sxy2 = Sxy * Sxy;
  Syz2 = Syz * Syz;
  Sxz2 = Sxz * Sxz;

  Syx2 = Syx * Syx;
  Szy2 = Szy * Szy;
  Szx2 = Szx * Szx;

  SyzSzymSyySzz2 = 2.0*(Syz*Szy - Syy*Szz);
  Sxx2Syy2Szz2Syz2Szy2 = Syy2 + Szz2 - Sxx2 + Syz2 + Szy2;

  C[2] = -2.0 * (Sxx2 + Syy2 + Szz2 + Sxy2 + Syx2 + Sxz2 + Szx2 + Syz2 + Szy2);
  C[1] = 8.0 * (Sxx*Syz*Szy + Syy*Szx*Sxz + Szz*Sxy*Syx - Sxx*Syy*Szz - Syz*Szx*Sxy - Szy*Syx*Sxz);

  SxzpSzx = Sxz + Szx;
  SyzpSzy = Syz + Szy;
  SxypSyx = Sxy + Syx;
  SyzmSzy = Syz - Szy;
  SxzmSzx = Sxz - Szx;
  SxymSyx = Sxy - Syx;
  SxxpSyy = Sxx + Syy;
  SxxmSyy = Sxx - Syy;
  Sxy2Sxz2Syx2Szx2 = Sxy2 + Sxz2 - Syx2 - Szx2;

  C[0] = Sxy2Sxz2Syx2Szx2 * Sxy2Sxz2Syx2Szx2
       + (Sxx2Syy2Szz2Syz2Szy2 + SyzSzymSyySzz2) * (Sxx2Syy2Szz2Syz2Szy2 - SyzSzymSyySzz2)
       + (-(SxzpSzx)*(SyzmSzy)+(SxymSyx)*(SxxmSyy-Szz)) * (-(SxzmSzx)*(SyzpSzy)+(SxymSyx)*(SxxmSyy+Szz))
       + (-(SxzpSzx)*(SyzpSzy)-(SxypSyx)*(SxxpSyy-Szz)) * (-(SxzmSzx)*(SyzmSzy)-(SxypSyx)*(SxxpSyy+Szz))
       + (+(SxypSyx)*(SyzpSzy)+(SxzpSzx)*(SxxmSyy+Szz)) * (-(SxymSyx)*(SyzmSzy)+(SxzpSzx)*(SxxpSyy+Szz))
       + (+(SxypSyx)*(SyzmSzy)+(SxzmSzx)*(SxxmSyy-Szz)) * (-(SxymSyx)*(SyzpSzy)+(SxzmSzx)*(SxxpSyy-Szz));

  /* Newton-Raphson */
  mxEigenV = E0;
  for (i = 0; i < 50; ++i) {
    oldg = mxEigenV;
    x2 = mxEigenV*mxEigenV;
    b = (x2 + C[2])*mxEigenV;
    a = b + C[1];
    delta = ((a*mxEigenV + C[0])/(2.0*x2*mxEigenV + b + a));
    mxEigenV -= delta;
#if 0
    printf("QCP diff[%3d]: %16g %16g %16g\n", i, mxEigenV - oldg, evalprec*mxEigenV, mxEigenV);
#endif
    if (fabs(mxEigenV - oldg) < fabs(evalprec*mxEigenV))
      break;
  }

#if !defined(__PGIC__)
  if (i == 50)
    printf("MeasureQCP: More than %d iterations needed!\n", i);
#endif

  // the fabs() is to guard against extremely small,
  // but *negative* numbers due to floating point error
  rms = sqrt(fabs(2.0 * (E0 - mxEigenV)/len));
  (*rmsd) = float(rms);
  /* printf("\n\n %16g %16g %16g \n", rms, E0, 2.0 * (E0 - mxEigenV)/len); */

  if (minScore > 0)
    if (rms < minScore)
      return (-1); // Don't bother with rotation.

  // only perform rotation related calculations if we have a non-NULL
  // pointer for the output rotation matrix
  if (rot != NULL) {
    a11 = SxxpSyy + Szz - mxEigenV; a12 = SyzmSzy; a13 = -SxzmSzx; a14 = SxymSyx;
    a21 = SyzmSzy; a22 = SxxmSyy - Szz - mxEigenV; a23 = SxypSyx; a24 = SxzpSzx;
    a31 = a13; a32 = a23; a33 = Syy - Sxx - Szz - mxEigenV; a34 = SyzpSzy;
    a41 = a14; a42 = a24; a43 = a34; a44 = Szz - SxxpSyy - mxEigenV;
    a3344_4334 = a33 * a44 - a43 * a34; a3244_4234 = a32 * a44 - a42 * a34;
    a3243_4233 = a32 * a43 - a42 * a33; a3143_4133 = a31 * a43 - a41 * a33;
    a3144_4134 = a31 * a44 - a41 * a34; a3142_4132 = a31 * a42 - a41 * a32;
    q1 =  a22*a3344_4334 - a23*a3244_4234 + a24*a3243_4233;
    q2 = -a21*a3344_4334 + a23*a3144_4134 - a24*a3143_4133;
    q3 =  a21*a3244_4234 - a22*a3144_4134 + a24*a3142_4132;
    q4 = -a21*a3243_4233 + a22*a3143_4133 - a23*a3142_4132;

    qsqr = q1 * q1 + q2 * q2 + q3 * q3 + q4 * q4;

    // The following code tries to calculate another column in the
    // adjoint matrix when the norm of the current column is too small.
    // Usually this block will never be activated.
    // To be absolutely safe this should be
    // uncommented, but it is most likely unnecessary.
    if (qsqr < evecprec) {
      q1 =  a12*a3344_4334 - a13*a3244_4234 + a14*a3243_4233;
      q2 = -a11*a3344_4334 + a13*a3144_4134 - a14*a3143_4133;
      q3 =  a11*a3244_4234 - a12*a3144_4134 + a14*a3142_4132;
      q4 = -a11*a3243_4233 + a12*a3143_4133 - a13*a3142_4132;
      qsqr = q1*q1 + q2*q2 + q3*q3 + q4*q4;

      if (qsqr < evecprec) {
        double a1324_1423 = a13*a24 - a14*a23, a1224_1422 = a12*a24 - a14*a22;
        double a1223_1322 = a12*a23 - a13*a22, a1124_1421 = a11*a24 - a14*a21;
        double a1123_1321 = a11*a23 - a13*a21, a1122_1221 = a11*a22 - a12*a21;

        q1 =  a42 * a1324_1423 - a43 * a1224_1422 + a44 * a1223_1322;
        q2 = -a41 * a1324_1423 + a43 * a1124_1421 - a44 * a1123_1321;
        q3 =  a41 * a1224_1422 - a42 * a1124_1421 + a44 * a1122_1221;
        q4 = -a41 * a1223_1322 + a42 * a1123_1321 - a43 * a1122_1221;
        qsqr = q1*q1 + q2*q2 + q3*q3 + q4*q4;

        if (qsqr < evecprec) {
          q1 =  a32 * a1324_1423 - a33 * a1224_1422 + a34 * a1223_1322;
          q2 = -a31 * a1324_1423 + a33 * a1124_1421 - a34 * a1123_1321;
          q3 =  a31 * a1224_1422 - a32 * a1124_1421 + a34 * a1122_1221;
          q4 = -a31 * a1223_1322 + a32 * a1123_1321 - a33 * a1122_1221;
          qsqr = q1*q1 + q2*q2 + q3*q3 + q4*q4;

          if (qsqr < evecprec) {
            // if qsqr is still too small, return the identity matrix.
            rot[0] = rot[4] = rot[8] = 1.0;
            rot[1] = rot[2] = rot[3] = rot[5] = rot[6] = rot[7] = 0.0;

            return(0);
          }
        }
      }
    }

    normq = sqrt(qsqr);
    q1 /= normq;
    q2 /= normq;
    q3 /= normq;
    q4 /= normq;

    a2 = q1 * q1;
    x2 = q2 * q2;
    y2 = q3 * q3;
    z2 = q4 * q4;

    xy = q2 * q3;
    az = q1 * q4;
    zx = q4 * q2;
    ay = q1 * q3;
    yz = q3 * q4;
    ax = q1 * q2;

    rot[0] = a2 + x2 - y2 - z2;
    rot[1] = 2 * (xy + az);
    rot[2] = 2 * (zx - ay);
    rot[3] = 2 * (xy - az);
    rot[4] = a2 - x2 + y2 - z2;
    rot[5] = 2 * (yz + ax);
    rot[6] = 2 * (zx + ay);
    rot[7] = 2 * (yz - ax);
    rot[8] = a2 - x2 - y2 + z2;
  }

  return 1;
}