Main Page Namespace List Class Hierarchy Alphabetical List Compound List File List Namespace Members Compound Members File Members Related Pages

MeasureQCP.C

Go to the documentation of this file.
00001 /***************************************************************************
00002 *cr
00003 *cr (C) Copyright 1995-2019 The Board of Trustees of the
00004 *cr University of Illinois
00005 *cr All Rights Reserved
00006 *cr
00007 ***************************************************************************/
00008 
00009 /***************************************************************************
00010 * RCS INFORMATION:
00011 *
00012 * $RCSfile: MeasureQCP.C,v $
00013 * $Author: johns $ $Locker: $ $State: Exp $
00014 * $Revision: 1.35 $ $Date: 2020/10/15 16:07:31 $
00015 *
00016 ***************************************************************************
00017 * DESCRIPTION:
00018 * Code to compute RMSD values for unaligned structures without 
00019 * actually performing the alignment, particularly useful for 
00020 * computing large dissimilarity matrices required for 
00021 * trajectory clustering analysis
00022 *
00023 ***************************************************************************/
00024 
00025 #include <stdio.h>
00026 #include <stdlib.h>
00027 
00028 #define VMDQCPUSESSE 1
00029 // #define VMDQCPUSEAVX2 1
00030 #if defined(VMDUSEAVX512)
00031 #define VMDQCPUSEAVX512 1
00032 #endif
00033 
00034 #define VMDQCPUSETHRPOOL 1
00035 
00036 #if VMDQCPUSESSE && defined(__SSE2__)
00037 #include <emmintrin.h>
00038 #endif
00039 #if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00040 #include <immintrin.h>
00041 #endif
00042 #if VMDQCPUSEAVX512 && defined(__AVX512F__)
00043 #include <immintrin.h>
00044 #endif
00045 #if (defined(VMDQCPUSEVSX) && defined(__VSX__))
00046 #if defined(__GNUC__) && defined(__VEC__)
00047 #include <altivec.h>
00048 #endif
00049 #endif
00050 
00051 #include <math.h>
00052 #include "Measure.h"
00053 #include "AtomSel.h"
00054 #include "utilities.h"
00055 #include "ResizeArray.h"
00056 #include "MoleculeList.h"
00057 #include "Inform.h"
00058 #include "Timestep.h"
00059 #include "CUDAAccel.h"
00060 #include "CUDAMeasureQCP.h"
00061 #include "VMDApp.h"
00062 #include "WKFThreads.h"
00063 #include "WKFUtils.h"
00064 
00065 #if VMDQCPUSEAVX512 && defined(__AVX512F__)
00066 
// Horizontal add: sum all 8 double-precision lanes of an AVX-512 register
// into a scalar.
// Fix: the previous implementation was a stub that always returned 0.0
// (the real reduction was commented out), which made the AVX-512 inner
// product path, InnerProductSOA_avx512(), report an all-zero matrix and E0.
static double hadd8_m512d(__m512d sum8) {
  // _mm512_reduce_add_pd() is an AVX-512F sequence synthesized by the
  // compiler that folds all eight lanes into one double.
  return _mm512_reduce_add_pd(sum8);
}
00075 
00076 
// AVX-512F + FMA inner product kernel for SOA-format coordinate buffers.
// Accumulates the 3x3 cross-correlation matrix into A[0..8] and returns
// E0 = (G1 + G2) * 0.5, the (optionally weighted) sum of squared coordinate
// magnitudes of both structures, as required by the QCP RMSD method.
// NOTE(review): the original header comment said "AVX2"; this routine uses
// 512-bit double arithmetic with 256-bit (32-byte-aligned) float loads.
// Assumes cnt is padded to a multiple of 8 and that all coordinate/weight
// arrays are 32-byte aligned — TODO confirm the allocators guarantee this.
// NOTE(review): the final reductions go through hadd8_m512d(); verify that
// helper performs a real horizontal sum (a stubbed version zeroes all output).
static double InnerProductSOA_avx512(double *A,
                                     float *crdx1, float *crdy1, float *crdz1,
                                     float *crdx2, float *crdy2, float *crdz2,
                                     const int cnt, const float *weight) {
  // nine accumulators for the 3x3 matrix, plus the two G sums
  __m512d va0 = _mm512_set1_pd(0.0);
  __m512d va1 = _mm512_set1_pd(0.0);
  __m512d va2 = _mm512_set1_pd(0.0);
  __m512d va3 = _mm512_set1_pd(0.0);
  __m512d va4 = _mm512_set1_pd(0.0);
  __m512d va5 = _mm512_set1_pd(0.0);
  __m512d va6 = _mm512_set1_pd(0.0);
  __m512d va7 = _mm512_set1_pd(0.0);
  __m512d va8 = _mm512_set1_pd(0.0);
  __m512d vG1 = _mm512_set1_pd(0.0);
  __m512d vG2 = _mm512_set1_pd(0.0);

  if (weight != NULL) {
    // weighted variant: weights scale only the G1/G2 magnitude sums,
    // not the cross terms (matches the scalar InnerProductSOA())
    for (int i=0; i<cnt; i+=8) {
      __m256 xa8f = _mm256_load_ps(crdx1 + i); // load 8-float vectors
      __m256 ya8f = _mm256_load_ps(crdy1 + i);
      __m256 za8f = _mm256_load_ps(crdz1 + i);

      __m512d xa8 = _mm512_cvtps_pd(xa8f); // convert from float to doubles
      __m512d ya8 = _mm512_cvtps_pd(ya8f);
      __m512d za8 = _mm512_cvtps_pd(za8f);

      // |r1|^2 partial sums for structure 1
      __m512d gatmp = _mm512_mul_pd(xa8, xa8);
      gatmp = _mm512_fmadd_pd(ya8, ya8, gatmp);
      gatmp = _mm512_fmadd_pd(za8, za8, gatmp);

      __m256 xb8f = _mm256_load_ps(crdx2 + i); // load 8-float vectors
      __m256 yb8f = _mm256_load_ps(crdy2 + i);
      __m256 zb8f = _mm256_load_ps(crdz2 + i);

      __m512d xb8 = _mm512_cvtps_pd(xb8f); // convert from float to doubles
      __m512d yb8 = _mm512_cvtps_pd(yb8f);
      __m512d zb8 = _mm512_cvtps_pd(zb8f);

      // |r2|^2 partial sums for structure 2
      __m512d gbtmp = _mm512_mul_pd(xb8, xb8);
      gbtmp = _mm512_fmadd_pd(yb8, yb8, gbtmp);
      gbtmp = _mm512_fmadd_pd(zb8, zb8, gbtmp);

      __m256 w8f = _mm256_load_ps(weight + i); // load 8-float vector
      __m512d w8 = _mm512_cvtps_pd(w8f);       // convert from float to double

      vG1 = _mm512_fmadd_pd(w8, gatmp, vG1);
      vG2 = _mm512_fmadd_pd(w8, gbtmp, vG2);

      // 3x3 cross-correlation accumulation (row-major: x1, y1, z1 rows)
      va0 = _mm512_fmadd_pd(xa8, xb8, va0);
      va1 = _mm512_fmadd_pd(xa8, yb8, va1);
      va2 = _mm512_fmadd_pd(xa8, zb8, va2);

      va3 = _mm512_fmadd_pd(ya8, xb8, va3);
      va4 = _mm512_fmadd_pd(ya8, yb8, va4);
      va5 = _mm512_fmadd_pd(ya8, zb8, va5);

      va6 = _mm512_fmadd_pd(za8, xb8, va6);
      va7 = _mm512_fmadd_pd(za8, yb8, va7);
      va8 = _mm512_fmadd_pd(za8, zb8, va8);
    }
  } else {
    // unweighted variant: G sums accumulate directly into vG1/vG2
    for (int i=0; i<cnt; i+=8) {
      __m256 xa8f = _mm256_load_ps(crdx1 + i); // load 8-float vectors
      __m256 ya8f = _mm256_load_ps(crdy1 + i);
      __m256 za8f = _mm256_load_ps(crdz1 + i);

      __m512d xa8 = _mm512_cvtps_pd(xa8f); // convert from float to doubles
      __m512d ya8 = _mm512_cvtps_pd(ya8f);
      __m512d za8 = _mm512_cvtps_pd(za8f);

      vG1 = _mm512_fmadd_pd(xa8, xa8, vG1);
      vG1 = _mm512_fmadd_pd(ya8, ya8, vG1);
      vG1 = _mm512_fmadd_pd(za8, za8, vG1);

      __m256 xb8f = _mm256_load_ps(crdx2 + i); // load 8-float vectors
      __m256 yb8f = _mm256_load_ps(crdy2 + i);
      __m256 zb8f = _mm256_load_ps(crdz2 + i);

      __m512d xb8 = _mm512_cvtps_pd(xb8f); // convert from float to doubles
      __m512d yb8 = _mm512_cvtps_pd(yb8f);
      __m512d zb8 = _mm512_cvtps_pd(zb8f);

      vG2 = _mm512_fmadd_pd(xb8, xb8, vG2);
      vG2 = _mm512_fmadd_pd(yb8, yb8, vG2);
      vG2 = _mm512_fmadd_pd(zb8, zb8, vG2);

      // 3x3 cross-correlation accumulation (row-major: x1, y1, z1 rows)
      va0 = _mm512_fmadd_pd(xa8, xb8, va0);
      va1 = _mm512_fmadd_pd(xa8, yb8, va1);
      va2 = _mm512_fmadd_pd(xa8, zb8, va2);

      va3 = _mm512_fmadd_pd(ya8, xb8, va3);
      va4 = _mm512_fmadd_pd(ya8, yb8, va4);
      va5 = _mm512_fmadd_pd(ya8, zb8, va5);

      va6 = _mm512_fmadd_pd(za8, xb8, va6);
      va7 = _mm512_fmadd_pd(za8, yb8, va7);
      va8 = _mm512_fmadd_pd(za8, zb8, va8);
    }
  }

  // horizontal reductions of all accumulators into the output matrix
  A[0] = hadd8_m512d(va0);
  A[1] = hadd8_m512d(va1);
  A[2] = hadd8_m512d(va2);
  A[3] = hadd8_m512d(va3);
  A[4] = hadd8_m512d(va4);
  A[5] = hadd8_m512d(va5);
  A[6] = hadd8_m512d(va6);
  A[7] = hadd8_m512d(va7);
  A[8] = hadd8_m512d(va8);

  double G1 = hadd8_m512d(vG1);
  double G2 = hadd8_m512d(vG2);

  return (G1 + G2) * 0.5;
}
00193 
00194 #endif
00195 
00196 #if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00197 
// Horizontal add: collapse the 4 double lanes of a 256-bit register into
// one scalar by summing the low/high 128-bit halves, then the pair.
static double hadd4_m256d(__m256d sum4) {
  __m128d lo   = _mm256_castpd256_pd128(sum4);
  __m128d hi   = _mm256_extractf128_pd(sum4, 1);
  __m128d pair = _mm_add_pd(lo, hi);
  __m128d tot  = _mm_hadd_pd(pair, pair);
  return _mm_cvtsd_f64(tot);
}
00205 
00206 
// AVX2 + FMA + 32-byte-aligned SOA-format memory buffers
// Accumulates the 3x3 cross-correlation matrix into A[0..8] and returns
// E0 = (G1 + G2) * 0.5, the (optionally weighted) sum of squared coordinate
// magnitudes of both structures, as required by the QCP RMSD method.
// Assumes cnt is padded to a multiple of 4 and that all coordinate/weight
// arrays are 16-byte aligned for the _mm_load_ps loads — TODO confirm the
// allocators guarantee this.
static double InnerProductSOA_avx2(double *A,
                                   float *crdx1, float *crdy1, float *crdz1,
                                   float *crdx2, float *crdy2, float *crdz2,
                                   const int cnt, const float *weight) {
  // nine accumulators for the 3x3 matrix, plus the two G sums
  __m256d va0 = _mm256_set1_pd(0.0);
  __m256d va1 = _mm256_set1_pd(0.0);
  __m256d va2 = _mm256_set1_pd(0.0);
  __m256d va3 = _mm256_set1_pd(0.0);
  __m256d va4 = _mm256_set1_pd(0.0);
  __m256d va5 = _mm256_set1_pd(0.0);
  __m256d va6 = _mm256_set1_pd(0.0);
  __m256d va7 = _mm256_set1_pd(0.0);
  __m256d va8 = _mm256_set1_pd(0.0);
  __m256d vG1 = _mm256_set1_pd(0.0);
  __m256d vG2 = _mm256_set1_pd(0.0);

  if (weight != NULL) {
    // weighted variant: weights scale only the G1/G2 magnitude sums,
    // not the cross terms (matches the scalar InnerProductSOA())
    for (int i=0; i<cnt; i+=4) {
      __m128 xa4f = _mm_load_ps(crdx1 + i); // load 4-float vectors
      __m128 ya4f = _mm_load_ps(crdy1 + i);
      __m128 za4f = _mm_load_ps(crdz1 + i);

      __m256d xa4 = _mm256_cvtps_pd(xa4f); // convert from float to doubles
      __m256d ya4 = _mm256_cvtps_pd(ya4f);
      __m256d za4 = _mm256_cvtps_pd(za4f);

      // |r1|^2 partial sums for structure 1
      __m256d gatmp = _mm256_mul_pd(xa4, xa4);
      gatmp = _mm256_fmadd_pd(ya4, ya4, gatmp);
      gatmp = _mm256_fmadd_pd(za4, za4, gatmp);

      __m128 xb4f = _mm_load_ps(crdx2 + i); // load 4-float vectors
      __m128 yb4f = _mm_load_ps(crdy2 + i);
      __m128 zb4f = _mm_load_ps(crdz2 + i);

      __m256d xb4 = _mm256_cvtps_pd(xb4f); // convert from float to doubles
      __m256d yb4 = _mm256_cvtps_pd(yb4f);
      __m256d zb4 = _mm256_cvtps_pd(zb4f);

      // |r2|^2 partial sums for structure 2
      __m256d gbtmp = _mm256_mul_pd(xb4, xb4);
      gbtmp = _mm256_fmadd_pd(yb4, yb4, gbtmp);
      gbtmp = _mm256_fmadd_pd(zb4, zb4, gbtmp);

      __m128 w4f = _mm_load_ps(weight + i); // load 4-float vector
      __m256d w4 = _mm256_cvtps_pd(w4f);    // convert from float to double

      vG1 = _mm256_fmadd_pd(w4, gatmp, vG1);
      vG2 = _mm256_fmadd_pd(w4, gbtmp, vG2);

      // 3x3 cross-correlation accumulation (row-major: x1, y1, z1 rows)
      va0 = _mm256_fmadd_pd(xa4, xb4, va0);
      va1 = _mm256_fmadd_pd(xa4, yb4, va1);
      va2 = _mm256_fmadd_pd(xa4, zb4, va2);

      va3 = _mm256_fmadd_pd(ya4, xb4, va3);
      va4 = _mm256_fmadd_pd(ya4, yb4, va4);
      va5 = _mm256_fmadd_pd(ya4, zb4, va5);

      va6 = _mm256_fmadd_pd(za4, xb4, va6);
      va7 = _mm256_fmadd_pd(za4, yb4, va7);
      va8 = _mm256_fmadd_pd(za4, zb4, va8);
    }
  } else {
    // unweighted variant: G sums accumulate directly into vG1/vG2
    for (int i=0; i<cnt; i+=4) {
      __m128 xa4f = _mm_load_ps(crdx1 + i); // load 4-float vectors
      __m128 ya4f = _mm_load_ps(crdy1 + i);
      __m128 za4f = _mm_load_ps(crdz1 + i);

      __m256d xa4 = _mm256_cvtps_pd(xa4f); // convert from float to doubles
      __m256d ya4 = _mm256_cvtps_pd(ya4f);
      __m256d za4 = _mm256_cvtps_pd(za4f);

      vG1 = _mm256_fmadd_pd(xa4, xa4, vG1);
      vG1 = _mm256_fmadd_pd(ya4, ya4, vG1);
      vG1 = _mm256_fmadd_pd(za4, za4, vG1);

      __m128 xb4f = _mm_load_ps(crdx2 + i); // load 4-float vectors
      __m128 yb4f = _mm_load_ps(crdy2 + i);
      __m128 zb4f = _mm_load_ps(crdz2 + i);

      __m256d xb4 = _mm256_cvtps_pd(xb4f); // convert from float to doubles
      __m256d yb4 = _mm256_cvtps_pd(yb4f);
      __m256d zb4 = _mm256_cvtps_pd(zb4f);

      vG2 = _mm256_fmadd_pd(xb4, xb4, vG2);
      vG2 = _mm256_fmadd_pd(yb4, yb4, vG2);
      vG2 = _mm256_fmadd_pd(zb4, zb4, vG2);

      // 3x3 cross-correlation accumulation (row-major: x1, y1, z1 rows)
      va0 = _mm256_fmadd_pd(xa4, xb4, va0);
      va1 = _mm256_fmadd_pd(xa4, yb4, va1);
      va2 = _mm256_fmadd_pd(xa4, zb4, va2);

      va3 = _mm256_fmadd_pd(ya4, xb4, va3);
      va4 = _mm256_fmadd_pd(ya4, yb4, va4);
      va5 = _mm256_fmadd_pd(ya4, zb4, va5);

      va6 = _mm256_fmadd_pd(za4, xb4, va6);
      va7 = _mm256_fmadd_pd(za4, yb4, va7);
      va8 = _mm256_fmadd_pd(za4, zb4, va8);
    }
  }

  // horizontal reductions of all accumulators into the output matrix
  A[0] = hadd4_m256d(va0);
  A[1] = hadd4_m256d(va1);
  A[2] = hadd4_m256d(va2);
  A[3] = hadd4_m256d(va3);
  A[4] = hadd4_m256d(va4);
  A[5] = hadd4_m256d(va5);
  A[6] = hadd4_m256d(va6);
  A[7] = hadd4_m256d(va7);
  A[8] = hadd4_m256d(va8);

  double G1 = hadd4_m256d(vG1);
  double G2 = hadd4_m256d(vG2);

  return (G1 + G2) * 0.5;
}
00323 
00324 #endif
00325 
00326 
// Plain (non-SIMD) inner product for SOA coordinate storage.
// Fills A[0..8] with the 3x3 cross-correlation matrix of the two structures
// and returns E0 = (G1 + G2) * 0.5; when weight is non-NULL, the weights
// scale only the G1/G2 magnitude sums (cross terms remain unweighted).
static double InnerProductSOA(double *A,
                              float *crdx1, float *crdy1, float *crdz1,
                              float *crdx2, float *crdy2, float *crdz2,
                              const int cnt, const float *weight) {
  double sumsq1 = 0.0, sumsq2 = 0.0;
  double acc[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
  memset(A, 0, sizeof(double) * 9);

  for (int idx = 0; idx < cnt; idx++) {
    // promote both points to double before accumulating
    double px = crdx1[idx];
    double py = crdy1[idx];
    double pz = crdz1[idx];
    double qx = crdx2[idx];
    double qy = crdy2[idx];
    double qz = crdz2[idx];

    if (weight != NULL) {
      double w = weight[idx];
      sumsq1 += w * (px*px + py*py + pz*pz);
      sumsq2 += w * (qx*qx + qy*qy + qz*qz);
    } else {
      sumsq1 += px*px + py*py + pz*pz;
      sumsq2 += qx*qx + qy*qy + qz*qz;
    }

    // row-major 3x3 cross-correlation matrix
    acc[0] += px * qx;
    acc[1] += px * qy;
    acc[2] += px * qz;

    acc[3] += py * qx;
    acc[4] += py * qy;
    acc[5] += py * qz;

    acc[6] += pz * qx;
    acc[7] += pz * qy;
    acc[8] += pz * qz;
  }

  for (int k = 0; k < 9; k++)
    A[k] = acc[k];

  return (sumsq1 + sumsq2) * 0.5;
}
00407 
00408 //
00409 // OpenACC version of inner product for SOA coordinate storage
00410 //
00411 // use pgc++ -m64 -Minfo=accel -ta=nvidia -O -acc
00412 #if defined(__PGIC__) && defined(_OPENACC)
00413 
00414 #if 0
// Disabled OpenACC smoke test (enclosing #if 0): adds sin^2(i) + cos^2(i)
// component-wise on the accelerator and prints the mean, which should be
// 1.0 within floating-point error if the offload works.
static void vecadd_acc(void) {
  printf("****** OpenACC test vecadd_acc()...\n");

  // Size of vectors
  int n = 10000;

  // Input vectors
  double *restrict a;
  double *restrict b;

  // Output vector
  double *restrict c;

  // Size, in bytes, of each vector
  size_t bytes = n*sizeof(double);

  // Allocate memory for each vector
  a = (double*)malloc(bytes);
  b = (double*)malloc(bytes);
  c = (double*)malloc(bytes);

  // Initialize content of input vectors, vector a[i] = sin(i)^2 vector b[i] = cos(i)^2
  int i;
  for (i=0; i<n; i++) {
    a[i] = sin(i)*sin(i);
    b[i] = cos(i)*cos(i);
  } 

  // sum component wise and save result into vector c
  #pragma acc kernels copyin(a[0:n],b[0:n]), copyout(c[0:n])
  for (i=0; i<n; i++) {
    c[i] = a[i] + b[i];
  }

  // Sum up vector c and print result divided by n, this should equal 1 within error
  double sum = 0.0;
  for(i=0; i<n; i++) {
    sum += c[i];
  }
  sum = sum/n;
  printf("****** final result: %f *******\n", sum);

  // Release memory
  free(a);
  free(b);
  free(c);

  printf("****** OpenACC test vecadd_acc() done.\n");
}
00464 #endif
00465 
00466 //
00467 // Use 1-D loop rather than 2-D to please PGI OpenACC so it doesn't
00468 // complain about loop-carried dependencies, this appears to be the only
00469 // successful method to achieve a good quality parallelization.
00470 //
00471 #define LOOP1D 1
00472 
00473 #if defined(LOOP1D)
00474 #if defined(__PGIC__) && defined(_OPENACC)
00475 #pragma acc routine seq
00476 #endif
// Map a linearized lower-triangle index 'ind' back to (row, col) subscripts
// for an N-wide triangle; device-callable companion of idx2sub_tril().
static void acc_idx2sub_tril(long N, long ind, long *J, long *I) {
  long t = 2*N + 1;
  long row = (long) floor((t - sqrt((double)(t*t - 8*ind))) / 2);
  long col = ind - N*row + row*(row-1)/2 + row;

  *I = row;
  *J = col;
}
00485 #endif
00486 
// OpenACC all-pairs QCP RMSD matrix kernel (PGI-only build path).
// Computes the RMSD for every unordered frame pair and stores results either
// as a linearized lower triangle (LOOP1D) or into a square matrix.
// The LOOP1D form flattens the pair loop into one index so the PGI compiler
// can parallelize it without seeing loop-carried dependencies.
static void rmsdmat_qcp_acc(int cnt, int padcnt, int framecrdsz, 
                            int framecount, 
#if defined(LOOP1D)
                            const float * restrict crds, 
#else
                            const float * crds, 
#endif
                            // const float *weight,
                            float * rmsdmat) {
  printf("OpenACC rmsdmat_qcp_acc()...\n");
  printf("ACC cnt: %d padcnt: %d\n", cnt, padcnt);

  printf("Copying input arrays to accelerators...\n");
  // total floats across all frames: x/y/z planes of framecrdsz each
  long totalsz = 3L * framecrdsz * framecount;
  printf("ACC copysz: %ld (3 * %d * %d)\n", totalsz, framecrdsz, framecount);

  long matcnt = framecount * framecount;
  printf("ACC matcnt: %ld\n", matcnt);

  printf("Running OpenACC kernels...\n");
#if defined(LOOP1D)
  long i, j, k;
  #pragma acc kernels copyin(crds[0:totalsz]), copy(rmsdmat[0:matcnt])
  for (k=0; k<(framecount*(framecount-1))/2; k++) {
    // recover (i,j) frame pair from the flattened triangle index
    acc_idx2sub_tril(long(framecount-1), k, &i, &j);
    long x1addr = j * 3L * framecrdsz;
    {
#else
  long i, j;
  #pragma acc kernels copyin(crds[0:totalsz]), copy(rmsdmat[0:matcnt])
  for (j=0; j<framecount; j++) {
    long x1addr = j * 3L * framecrdsz;

    for (i=0; i<j; i++) {
#endif
      // calculate the (weighted) inner product of two structures
      long x2addr = i * 3L * framecrdsz;

      double G1=0.0, G2=0.0;

      // scalar accumulators for the 3x3 cross-correlation matrix
      double a0, a1, a2, a3, a4, a5, a6, a7, a8;
      a0=a1=a2=a3=a4=a5=a6=a7=a8=0.0;
#if 0
      // disabled weighted variant (weight parameter is commented out above)
      if (weight != NULL) {
        double x1, x2, y1, y2, z1, z2;
        #pragma acc loop
        for (long l=0; l<cnt; l++) {
          double w = weight[l];
          x1 = crds[l + x1addr];
          y1 = crds[l + x1addr + framecrdsz];
          z1 = crds[l + x1addr + framecrdsz*2];

          G1 += w * (x1*x1 + y1*y1 + z1*z1);

          x2 = crds[l + x2addr];
          y2 = crds[l + x2addr + framecrdsz];
          z2 = crds[l + x2addr + framecrdsz*2];

          G2 += w * (x2*x2 + y2*y2 + z2*z2);

          a0 += x1 * x2;
          a1 += x1 * y2;
          a2 += x1 * z2;

          a3 += y1 * x2;
          a4 += y1 * y2;
          a5 += y1 * z2;

          a6 += z1 * x2;
          a7 += z1 * y2;
          a8 += z1 * z2;
        }
      } else {
#endif
      double x1, x2, y1, y2, z1, z2;
      #pragma acc loop vector(256)
      //#pragma acc loop vector(256) reduction(+:a0),reduction(+:a1),reduction(+:a2),reduction(+:a3),reduction(+:a4),reduction(+:a5),reduction(+:a6),reduction(+:a7),reduction(+:a8),reduction(+:G1),reduction(+:G2)
      for (long l=0; l<cnt; l++) {
        x1 = crds[l + x1addr];
        y1 = crds[l + x1addr + framecrdsz];
        z1 = crds[l + x1addr + framecrdsz*2];

        G1 += x1*x1 + y1*y1 + z1*z1;

        x2 = crds[l + x2addr];
        y2 = crds[l + x2addr + framecrdsz];
        z2 = crds[l + x2addr + framecrdsz*2];

        G2 += x2*x2 + y2*y2 + z2*z2;

        a0 += x1 * x2;
        a1 += x1 * y2;
        a2 += x1 * z2;

        a3 += y1 * x2;
        a4 += y1 * y2;
        a5 += y1 * z2;

        a6 += z1 * x2;
        a7 += z1 * y2;
        a8 += z1 * z2;
      }
#if 0
      }
#endif

      // package accumulators for the QCP solver
      double A[9];
      A[0] = a0;
      A[1] = a1;
      A[2] = a2;

      A[3] = a3;
      A[4] = a4;
      A[5] = a5;

      A[6] = a6;
      A[7] = a7;
      A[8] = a8;

      double E0 = (G1 + G2) * 0.5;

      // calculate the RMSD & rotational matrix
      float rmsd;
      FastCalcRMSDAndRotation(NULL, A, &rmsd, E0, cnt, -1);
#if defined(LOOP1D)
      rmsdmat[k]=rmsd; // store linearized triangle
#else
      rmsdmat[j*framecount + i]=rmsd;
#endif
    }
  }

  printf("ACC done.\n");
}
00621 
00622 #endif
00623 
00624 
00625 #if 0
// AOS-layout inner product (currently disabled by the enclosing #if 0).
// Fills A[0..8] with the 3x3 cross-correlation matrix of the two structures
// and returns E0 = (G1 + G2) * 0.5; non-NULL weights scale the G sums only.
static double InnerProductAOS(double *A, double *coords1, double *coords2,
                              const int cnt, const double *weight) {
  double g1 = 0.0, g2 = 0.0;
  memset(A, 0, sizeof(double) * 9);

  for (long n = 0; n < cnt; n++) {
    long base = n*3;  // xyz triplet for atom n
    double ax = coords1[base  ];
    double ay = coords1[base+1];
    double az = coords1[base+2];
    double bx = coords2[base  ];
    double by = coords2[base+1];
    double bz = coords2[base+2];

    if (weight != NULL) {
      double w = weight[n];
      g1 += w * (ax*ax + ay*ay + az*az);
      g2 += w * (bx*bx + by*by + bz*bz);
    } else {
      g1 += ax*ax + ay*ay + az*az;
      g2 += bx*bx + by*by + bz*bz;
    }

    // row-major 3x3 cross-correlation matrix
    A[0] += (ax * bx);
    A[1] += (ax * by);
    A[2] += (ax * bz);

    A[3] += (ay * bx);
    A[4] += (ay * by);
    A[5] += (ay * bz);

    A[6] += (az * bx);
    A[7] += (az * by);
    A[8] += (az * bz);
  }

  return (g1 + g2) * 0.5;
}
00692 #endif
00693 
00694 
// Compute the (optionally weighted) center of mass of SOA coordinate arrays,
// returning the result through the comx/comy/comz reference parameters.
void com_soa(int cnt, 
             float *&soax, float *&soay, float *&soaz,
             double &comx, double &comy, double &comz,
             const float *weight) {
  comx = comy = comz = 0.0;

  if (weight == NULL) {
    // unweighted centroid: plain average of the coordinates
    for (int k = 0; k < cnt; k++) {
      comx += soax[k];
      comy += soay[k];
      comz += soaz[k];
    }
    double inv = 1.0 / ((double) cnt);
    comx *= inv;
    comy *= inv;
    comz *= inv;
  } else {
    // mass-weighted centroid: normalize by the total weight
    double wtotal = 0.0;
    for (int k = 0; k < cnt; k++) {
      double w = weight[k];
      wtotal += w;
      comx += soax[k] * w;
      comy += soay[k] * w;
      comz += soaz[k] * w;
    }
    double inv = 1.0 / wtotal;
    comx *= inv;
    comy *= inv;
    comz *= inv;
  }
}
00732 
00733 
00734 
00735 int center_convert_soa(const AtomSel *sel, int num, const float *framepos,
00736 const float *weight, 
00737 float *&soax, float *&soay, float *&soaz) {
00738 // allocate temporary working arrays, plus required SIMD padding
00739 int cnt = sel->selected;
00740 soax = (float *) calloc(1, (cnt + 16)*sizeof(float));
00741 soay = (float *) calloc(1, (cnt + 16)*sizeof(float));
00742 soaz = (float *) calloc(1, (cnt + 16)*sizeof(float));
00743 
00744 int selind = sel->firstsel; // start from the first selected atom
00745 double comx=0.0, comy=0.0, comz=0.0;
00746 
00747 int i;
00748 for (i=0; i<cnt; i++) {
00749 // find next 'on' atom in selection
00750 // loop is safe since we already stop the on cnt > 0 above
00751 while (!sel->on[selind])
00752 selind++;
00753 
00754 // compact selection and convert AOS to SOA storage on-the-fly
00755 long addr = 3*selind;
00756 float tx = framepos[addr ];
00757 float ty = framepos[addr + 1];
00758 float tz = framepos[addr + 2];
00759 
00760 comx += tx;
00761 comy += ty;
00762 comz += tz;
00763 
00764 soax[i] = tx;
00765 soay[i] = ty; 
00766 soaz[i] = tz;
00767 
00768 selind++; // advance to next atom
00769 }
00770 
00771 double avenorm = 1.0 / ((double) cnt);
00772 comx *= avenorm; // compute unweighted center of mass
00773 comy *= avenorm;
00774 comz *= avenorm;
00775 
00776 #if 0
00777 printf("center_convert_soa(): structure com: %g %g %g\n", comx, comy, comz);
00778 #endif
00779 
00780 // translate center of mass to the origin
00781 for (i=0; i<cnt; i++) {
00782 soax[i] -= float(comx);
00783 soay[i] -= float(comy);
00784 soaz[i] -= float(comz);
00785 }
00786 
00787 #if 0
00788 // check post-translation com 
00789 com_soa(cnt, soax, soay, soaz, comx, comy, comz, weight); 
00790 printf("center_convert_soa(): centered com: %lg %lg %lg\n", comx, comy, comz);
00791 #endif 
00792 
00793 return 0;
00794 }
00795 
00796 
00797 int center_convert_single_soa(const AtomSel *sel, int num, 
00798 const float *framepos,
00799 const float *weight, 
00800 float *soax, float *soay, float *soaz) {
00801 // allocate temporary working arrays, plus required SIMD padding
00802 int cnt = sel->selected;
00803 int selind = sel->firstsel; // start from the first selected atom
00804 double comx=0.0, comy=0.0, comz=0.0;
00805 
00806 int i;
00807 for (i=0; i<cnt; i++) {
00808 // find next 'on' atom in selection
00809 // loop is safe since we already stop the on cnt > 0 above
00810 while (!sel->on[selind])
00811 selind++;
00812 
00813 // compact selection and convert AOS to SOA storage on-the-fly
00814 long addr = 3*selind;
00815 float tx = framepos[addr ];
00816 float ty = framepos[addr + 1];
00817 float tz = framepos[addr + 2];
00818 
00819 comx += tx;
00820 comy += ty;
00821 comz += tz;
00822 
00823 soax[i] = tx;
00824 soay[i] = ty; 
00825 soaz[i] = tz;
00826 
00827 selind++; // advance to next atom
00828 }
00829 
00830 double avenorm = 1.0 / ((double) cnt);
00831 comx *= avenorm; // compute unweighted center of mass
00832 comy *= avenorm;
00833 comz *= avenorm;
00834 
00835 // translate center of mass to the origin
00836 for (i=0; i<cnt; i++) {
00837 soax[i] -= float(comx);
00838 soay[i] -= float(comy);
00839 soaz[i] -= float(comz);
00840 }
00841 
00842 return 0;
00843 }
00844 
00845 
// Compute the RMSD between two equal-size atom selections using the QCP
// (quaternion characteristic polynomial) method, without performing the
// actual structural alignment.
// Returns MEASURE_NOERR on success and stores the result in *rmsd, or a
// MEASURE_ERR_* code for missing/empty/mismatched selections or arguments.
int measure_rmsd_qcp(VMDApp *app,
                     const AtomSel *sel1, const AtomSel *sel2,
                     int num, const float *framepos1, const float *framepos2,
                     float *weight, float *rmsd) {
  if (!sel1 || !sel2) return MEASURE_ERR_NOSEL;
  if (sel1->selected < 1 || sel2->selected < 1) return MEASURE_ERR_NOSEL;
  if (!weight || !rmsd) return MEASURE_ERR_NOWEIGHT;

  // the number of selected atoms must be the same
  if (sel1->selected != sel2->selected) return MEASURE_ERR_MISMATCHEDCNT;

#if 0
  // need to know how to traverse the list of weights
  // there could be 1 weight per atom (sel_flg == 1) or
  // 1 weight per selected atom (sel_flg == 0)
  int sel_flg;
  if (num == sel1->num_atoms) {
    sel_flg = 1; // using all elements
  } else {
    sel_flg = 0; // using elements from selection
  }
#endif

  //
  // compute CoM for each selection while copying them into target bufs 
  //
  float *sel1x, *sel1y, *sel1z, *sel2x, *sel2y, *sel2z;
  center_convert_soa(sel1, num, framepos1, weight, sel1x, sel1y, sel1z);
  center_convert_soa(sel2, num, framepos2, weight, sel2x, sel2y, sel2z);

  // calculate the (weighted) inner product of two structures
  // NOTE(review): 'weight' is validated above but deliberately NOT applied
  // here (NULL is passed) — confirm whether weighted QCP RMSD is intended.
  double E0 = 0;
  double A[9];
  E0 = InnerProductSOA(A, 
                       sel1x, sel1y, sel1z,
                       sel2x, sel2y, sel2z,
                       sel1->selected, NULL /* weight */);

#if 0
  printf("QCP inner product results:\n");
  printf("  E0: %g\n", E0);
  int i;
  for (i=0; i<9; i+=3) 
    printf("A[%d-%d]: %g %g %g\n", i, i+2, A[i], A[i+1], A[i+2]);
  printf("\n");
#endif

  // calculate the RMSD & rotational matrix
  FastCalcRMSDAndRotation(NULL, A, rmsd, E0, sel1->selected, -1);

  // release the temporary SOA buffers allocated by center_convert_soa()
  free(sel1x);
  free(sel1y);
  free(sel1z);

  free(sel2x);
  free(sel2y);
  free(sel2z);

  return MEASURE_NOERR; // and say rmsd is OK
}
00906 
00907 
00908 #if 0
// compute linear array index from lower-triangular indices i,j
// (inverse of idx2sub_tril(); currently disabled by the enclosing #if 0)
static int sub2idx_tril(long N, long i, long j, long *ind) {
  // *ind = i + (j-1)*N - j*(j-1)/2;
  *ind = j + N*i - i*(i-1)/2;
  return 0;
}
00915 #endif
00916 
00917 // compute lower-triangular indices i,j from linear array index
// Map a linearized lower-triangle index 'ind' back to subscripts for an
// N-wide triangle; *I is the row, *J the column plus one (off-diagonal).
// Returns 0 on success, -1 when 'ind' lies outside the triangle.
static int idx2sub_tril(long N, long ind, long *J, long *I) {
  if (ind > (N*(N+1)/2))
    return -1; // out of bounds

  // compute in double to sidestep ambiguous sqrt() overloads (e.g. Solaris)
  double t = 2*N + 1;
  long row = (long) floor((t - sqrt(t*t - 8.0*ind)) / 2);
  long col = ind - N*row + row*(row-1)/2 + row;

  *I = row;
  *J = col + 1;

  return 0;
}
00936 
00937 
// Per-thread parameter bundle handed to measure_rmsdmat_qcp_thread().
typedef struct {
  const AtomSel *sel;  // atom selection shared by all frames
  int first;           // first trajectory frame index
  int last;            // last trajectory frame index
  int step;            // frame stride
  float *rmsdmat;      // output framecount x framecount RMSD matrix
  int padcnt;          // SIMD-padded atom count
  int framecrdsz;      // per-coordinate-plane stride within a frame
  float *crds;         // SOA coordinate buffers for all frames
#if (VMDQCPUSEAVX512 && defined(__AVX512F__))
  int useavx512;       // runtime toggle for the AVX-512 kernel
#endif
#if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
  int useavx2;         // runtime toggle for the AVX2 kernel
#endif
#if (VMDQCPUSEVSX && defined(__VEC__))
  int usevsx;          // runtime toggle for the POWER VSX kernel
#endif
} qcprmsdthreadparms;
00957 
00958 
// Worker-thread body for the all-pairs QCP RMSD matrix computation.
// Each scheduler tile indexes into the linearized lower triangle of the
// frame-pair matrix; every pair (i,j) is computed once and the result is
// mirrored into the upper triangle of the square output matrix.
static void * measure_rmsdmat_qcp_thread(void *voidparms) {
  int threadid;
  qcprmsdthreadparms *parms = NULL;
#if defined(VMDQCPUSETHRPOOL)
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
#else
  wkf_threadlaunch_getdata(voidparms, (void **) &parms);
  wkf_threadlaunch_getid(voidparms, &threadid, NULL);
#endif

  //
  // copy in per-thread parameters
  //
  const AtomSel *sel = parms->sel;
  float *rmsdmat = parms->rmsdmat;

  // XXX array padding not universally honored yet...
  // int padcnt = parms->padcnt;

  int framecrdsz = parms->framecrdsz;
  float *crds = parms->crds;
  int first = parms->first;
  int last = parms->last;
  int step = parms->step;
#if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
  int useavx2 = parms->useavx2;
#endif
#if (VMDQCPUSEAVX512 && defined(__AVX512F__))
  int useavx512 = parms->useavx512;
#endif
#if (VMDQCPUSEVSX && defined(__VEC__))
  int usevsx = parms->usevsx;
#endif

  // NOTE(review): disabled debug banner below; its VSX branch references
  // "useavsx", a typo for "usevsx" that would fail to compile if re-enabled.
#if 0
  printf("qcpthread[%d] running... %s\n", threadid, 
#if (VMDQCPUSEAVX512 && defined(__AVX512F__))
         (useavx512) ? "(AVX512)" : "(C++)");
#elif (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
         (useavx2) ? "(AVX2)" : "(C++)");
#elif (VMDQCPUSEVSX && defined(__VEC__))
         (useavsx) ? "(VSX)" : "(C++)");
#else 
         "(C++)");
#endif
#endif

  int framecount = (last - first + 1) / step;

  wkf_tasktile_t tile;
#if defined(VMDQCPUSETHRPOOL)
  while (wkf_threadpool_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
#else
  while (wkf_threadlaunch_next_tile(voidparms, 8, &tile) != WKF_SCHED_DONE) {
#endif
    long idx;

    for (idx=tile.start; idx<tile.end; idx++) {
      long i, j;

      // compute i,j from idx...
      // only compute off-diagonal elements, so we use (framecount-1)
      if (idx2sub_tril(framecount-1, idx, &i, &j)) {
        printf("qcpthread[%d]: work idx %ld out of triangle!\n", threadid, idx);
        break;
      }

      // calculate the (weighted) inner product of two structures
      double A[9];
      double E0 = 0;

      // SOA coordinate planes for frames j and i within the packed buffer
      float *xj = crds + (j * 3 * framecrdsz);
      float *yj = xj + framecrdsz;
      float *zj = xj + framecrdsz*2;

      float *xi = crds + (i * 3 * framecrdsz);
      float *yi = xi + framecrdsz;
      float *zi = xi + framecrdsz*2;

      // dispatch to the best available SIMD kernel, falling back to scalar
#if VMDQCPUSEAVX512 && defined(__AVX512F__)
      if (useavx512) {
        E0 = InnerProductSOA_avx512(A, xj, yj, zj, xi, yi, zi,
                                    sel->selected, NULL /* weight */);
      } else 
#endif
#if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
      if (useavx2) {
        E0 = InnerProductSOA_avx2(A, xj, yj, zj, xi, yi, zi,
                                  sel->selected, NULL /* weight */);
      } else 
#endif
      E0 = InnerProductSOA(A, xj, yj, zj, xi, yi, zi, 
                           sel->selected, NULL /* weight */);

      // calculate the RMSD & rotational matrix
      FastCalcRMSDAndRotation(NULL, A, &rmsdmat[j*framecount + i], 
                              E0, sel->selected, -1);

      // reflect the outcome of the lower triangle into the upper triangle
      rmsdmat[i*framecount + j] = rmsdmat[j*framecount + i];
    } 
  }

  return NULL;
}
01065 
01066 
01067 
01068 int measure_rmsdmat_qcp(VMDApp *app,
01069 const AtomSel *sel, MoleculeList *mlist,
01070 int num, float *weight, 
01071 int first, int last, int step,
01072 float *rmsdmat) {
01073 if (!sel) return MEASURE_ERR_NOSEL;
01074 if (sel->selected < 1) return MEASURE_ERR_NOSEL;
01075 // if (!weight || !rmsd) return MEASURE_ERR_NOWEIGHT;
01076 
01077 Molecule *mymol = mlist->mol_from_id(sel->molid());
01078 int maxframes = mymol->numframes();
01079 
01080 // accept value of -1 meaning "all" frames
01081 if (last == -1)
01082 last = maxframes-1;
01083 
01084 if (maxframes == 0 || first < 0 || first > last ||
01085 last >= maxframes || step <= 0)
01086 return MEASURE_ERR_BADFRAMERANGE;
01087 
01088 // XXX replace with calls to centralized control system
01089 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01090 // XXX there's no SSE-specific code path 
01091 // int usesse=1;
01092 // if (getenv("VMDNOSSE")) {
01093 // usesse=0;
01094 // }
01095 #endif
01096 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01097 int useavx2=1;
01098 if (getenv("VMDNOAVX2")) {
01099 useavx2=0;
01100 }
01101 #endif
01102 #if (VMDQCPUSEAVX512 && defined(__AVX512F__))
01103 int useavx512=1;
01104 if (getenv("VMDNOAVX512")) {
01105 useavx512=0;
01106 }
01107 #endif
01108 #if (VMDQCPUSEVSX && defined(__VEC__))
01109 int usevsx=1;
01110 if (getenv("VMDNOVSX")) {
01111 usevsx=0;
01112 }
01113 #endif
01114 
01115 #if 0
01116 // need to know how to traverse the list of weights
01117 // there could be 1 weight per atom (sel_flg == 1) or
01118 // 1 weight per selected atom (sel_flg == 0)
01119 int sel_flg;
01120 if (num == sel->num_atoms) {
01121 sel_flg = 1; // using all elements
01122 } else {
01123 sel_flg = 0; // using elements from selection
01124 }
01125 #endif
01126 
01127 // start timers
01128 wkf_timerhandle timer;
01129 timer=wkf_timer_create();
01130 wkf_timer_start(timer);
01131 
01132 
01133 //
01134 // compute CoM for frame/selection while copying them into SOA target bufs 
01135 //
01136 int framecount = (last - first + 1) / step;
01137 
01138 int padcnt = (num + 255) & ~255;
01139 int framecrdsz = padcnt + 256;
01140 float *crds = (float *) calloc(1, (framecount * 3L * framecrdsz + 256) * sizeof(float));
01141 
01142 int frame;
01143 for (frame=first; frame<=last; frame+=step) {
01144 const float *framepos = (mymol->get_frame(frame))->pos;
01145 float *xc = crds + (frame * 3L * framecrdsz);
01146 float *yc = xc + framecrdsz;
01147 float *zc = xc + framecrdsz*2;
01148 
01149 center_convert_single_soa(sel, num, framepos, weight, xc, yc, zc);
01150 }
01151 
01152 double converttime = wkf_timer_timenow(timer);
01153 
01154 #if !(defined(__PGIC__) && defined(_OPENACC))
01155 #if defined(VMDTHREADS)
01156 int numprocs = wkf_thread_numprocessors();
01157 #else
01158 int numprocs = 1;
01159 #endif
01160 
01161 //
01162 // copy in per-thread parameters
01163 //
01164 qcprmsdthreadparms parms;
01165 memset(&parms, 0, sizeof(parms));
01166 parms.sel = sel;
01167 parms.rmsdmat = rmsdmat;
01168 parms.padcnt = padcnt;
01169 parms.framecrdsz = framecrdsz;
01170 parms.crds = crds;
01171 parms.first = first;
01172 parms.last = last;
01173 parms.step = step;
01174 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01175 parms.useavx2 = useavx2;
01176 #endif
01177 #if (VMDQCPUSEAVX512 && defined(__AVX512F__))
01178 parms.useavx512 = useavx512;
01179 #endif
01180 #if (VMDQCPUSEVSX && defined(__VEC__))
01181 parms.usevsx = usevsx;
01182 #endif
01183 
01184 // spawn child threads to do the work
01185 wkf_tasktile_t tile;
01186 tile.start=0;
01187 tile.end=(framecount-1)*(framecount-1)/2; // only compute off-diag elements
01188 
01189 #if defined(VMDORBUSETHRPOOL)
01190 wkf_threadpool_sched_dynamic(app->thrpool, &tile);
01191 rc = wkf_threadpool_launch(app->thrpool, measure_rmsdmat_qcp_thread, &parms, 1);
01192 #else
01193 wkf_threadlaunch(numprocs, &parms, measure_rmsdmat_qcp_thread, &tile);
01194 #endif
01195 #elif defined(__PGIC__) && defined(_OPENACC)
01196 // OpenACC variant
01197 rmsdmat_qcp_acc(sel->selected, padcnt, framecrdsz, framecount, crds, 
01198 // NULL /* weight */, 
01199 rmsdmat);
01200 #else
01201 int i, j;
01202 for (j=0; j<framecount; j++) {
01203 float *xj = crds + (j * 3 * framecrdsz);
01204 float *yj = xj + framecrdsz;
01205 float *zj = xj + framecrdsz*2;
01206 for (i=0; i<j; i++) {
01207 // calculate the (weighted) inner product of two structures
01208 double A[9];
01209 
01210 float *xi = crds + (i * 3 * framecrdsz);
01211 float *yi = xi + framecrdsz;
01212 float *zi = xi + framecrdsz*2;
01213 
01214 double E0 = InnerProductSOA(A, xj, yj, zj, xi, yi, zi, 
01215 sel->selected, NULL /* weight */);
01216 
01217 // calculate the RMSD & rotational matrix
01218 FastCalcRMSDAndRotation(NULL, A, &rmsdmat[j*framecount + i], 
01219 E0, sel->selected, -1);
01220 
01221 // reflect the outcome of the lower triangle into the upper triangle
01222 rmsdmat[i*framecount + j] = rmsdmat[j*framecount + i];
01223 }
01224 }
01225 #endif
01226 
01227 // mark all self-RMSDs with a value of 0.0
01228 for (long l=0; l<framecount; l++) {
01229 rmsdmat[l*framecount + l] = 0.0;
01230 }
01231 
01232 double rmsdtime = wkf_timer_timenow(timer) - converttime;
01233 
01234 // free all temporary buffers
01235 free(crds);
01236 
01237 #if 1
01238 double totaltime = wkf_timer_timenow(timer);
01239 printf("QCP RMSD Matrix calculation time: SOA selection: %.3f RMSD solve: %.3f total: %.3f\n", converttime, rmsdtime, totaltime); 
01240 #endif
01241 
01242 wkf_timer_destroy(timer);
01243 
01244 return MEASURE_NOERR; // and say rmsd is OK
01245 }
01246 
01247 
01248 
01249 int measure_rmsdmat_qcp_ooc(VMDApp *app,
01250 const AtomSel *sel, MoleculeList *mlist,
01251 int nfiles, const char **trjfileset,
01252 int num, float *weight, 
01253 int first, int last, int step,
01254 int &framecount, float *&rmsdmat) {
01255 if (!sel) return MEASURE_ERR_NOSEL;
01256 if (sel->selected < 1) return MEASURE_ERR_NOSEL;
01257 // if (!weight || !rmsd) return MEASURE_ERR_NOWEIGHT;
01258 
01259 
01260 //
01261 // XXX really need to compute per-file frame counts etc
01262 //
01263 framecount = (last - first + 1) / step;
01264 
01265 printf("** measure_rmsdmat_qcp_ooc(): \n");
01266 printf("** first: %d last: %d step: %d nfiles: %d count: %d\n",
01267 first, last, step, nfiles, framecount);
01268 
01269 rmsdmat = (float *) calloc(1, framecount * framecount * sizeof(float));
01270 
01271 // XXX this needs to be properly implemented with fallback functionality
01272 #if defined(VMDCUDA)
01273 qcp_soa_gpu_ooc(app->cuda->get_cuda_devpool(), 
01274 nfiles, trjfileset, sel, first, last, step, rmsdmat);
01275 #endif
01276 
01277 #if 0
01278 Molecule *mymol = mlist->mol_from_id(sel->molid());
01279 
01280 int maxframes = mymol->numframes();
01281 
01282 // accept value of -1 meaning "all" frames
01283 if (last == -1)
01284 last = maxframes-1;
01285 
01286 if (maxframes == 0 || first < 0 || first > last ||
01287 last >= maxframes || step <= 0)
01288 return MEASURE_ERR_BADFRAMERANGE;
01289 
01290 // XXX hacked out lots of stuff we don't need yet...
01291 
01292 
01293 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01294 int useavx2=1;
01295 if (getenv("VMDNOAVX2")) {
01296 useavx2=0;
01297 }
01298 #endif
01299 
01300 // start timers
01301 wkf_timerhandle timer;
01302 timer=wkf_timer_create();
01303 wkf_timer_start(timer);
01304 
01305 //
01306 // compute CoM for frame/selection while copying them into SOA target bufs 
01307 //
01308 int framecount = (last - first + 1) / step;
01309 
01310 int padcnt = (num + 255) & ~255;
01311 int framecrdsz = padcnt + 256;
01312 float *crds = (float *) calloc(1, (framecount * 3L * framecrdsz + 256) * sizeof(float));
01313 
01314 int frame;
01315 for (frame=first; frame<=last; frame+=step) {
01316 const float *framepos = (mymol->get_frame(frame))->pos;
01317 float *xc = crds + (frame * 3L * framecrdsz);
01318 float *yc = xc + framecrdsz;
01319 float *zc = xc + framecrdsz*2;
01320 
01321 center_convert_single_soa(sel, num, framepos, weight, xc, yc, zc);
01322 }
01323 
01324 double converttime = wkf_timer_timenow(timer);
01325 
01326 #if defined(VMDTHREADS)
01327 int numprocs = wkf_thread_numprocessors();
01328 #else
01329 int numprocs = 1;
01330 #endif
01331 
01332 //
01333 // copy in per-thread parameters
01334 //
01335 qcprmsdthreadparms parms;
01336 memset(&parms, 0, sizeof(parms));
01337 parms.sel = sel;
01338 parms.rmsdmat = rmsdmat;
01339 parms.padcnt = padcnt;
01340 parms.framecrdsz = framecrdsz;
01341 parms.crds = crds;
01342 parms.first = first;
01343 parms.last = last;
01344 parms.step = step;
01345 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01346 parms.useavx2 = useavx2;
01347 #endif
01348 
01349 // spawn child threads to do the work
01350 wkf_tasktile_t tile;
01351 tile.start=0;
01352 tile.end=(framecount-1)*(framecount-1)/2; // only compute off-diag elements
01353 
01354 #if defined(VMDORBUSETHRPOOL)
01355 wkf_threadpool_sched_dynamic(app->thrpool, &tile);
01356 rc = wkf_threadpool_launch(app->thrpool, measure_rmsdmat_qcp_thread, &parms, 1);
01357 #else
01358 wkf_threadlaunch(numprocs, &parms, measure_rmsdmat_qcp_thread, &tile);
01359 #endif
01360 
01361 // mark all self-RMSDs with a value of 1.0
01362 for (long l=0; l<framecount; l++) {
01363 rmsdmat[l*framecount + l] = 1.0;
01364 }
01365 
01366 double rmsdtime = wkf_timer_timenow(timer) - converttime;
01367 
01368 // free all temporary buffers
01369 free(crds);
01370 
01371 #if 1
01372 double totaltime = wkf_timer_timenow(timer);
01373 printf("QCP RMSD Matrix calculation time: SOA selection: %.3f RMSD solve: %.3f total: %.3f\n", converttime, rmsdtime, totaltime); 
01374 #endif
01375 
01376 wkf_timer_destroy(timer);
01377 #endif
01378 
01379 return MEASURE_NOERR; // and say rmsd is OK
01380 }
01381 
01382 
01383 
01384 
01385 
01386 
01387 //
01388 // Copyright notice for original QCP FastCalcRMSDAndRotation() routine
01389 //
01390 // If you use this QCP rotation calculation method in a publication, please
01391 // reference:
01392 // Douglas L. Theobald (2005)
01393 // "Rapid calculation of RMSD using a quaternion-based characteristic
01394 // polynomial."
01395 // Acta Crystallographica A 61(4):478-480.
01396 //
01398 // Pu Liu, Dimitris K. Agrafiotis, and Douglas L. Theobald (2010)
01398 // "Fast determination of the optimal rotational matrix for macromolecular
01399 // superpositions."
01400 // Journal of Computational Chemistry 31(7):1561-1563.
01401 //
01402 // Copyright (c) 2009-2013 Pu Liu and Douglas L. Theobald
01403 // All rights reserved.
01404 //
01405 // Redistribution and use in source and binary forms, with or without modification, are permitted
01406 // provided that the following conditions are met:
01407 //
01408 // * Redistributions of source code must retain the above copyright notice, 
01409 // this list of conditions and the following disclaimer.
01410 // * Redistributions in binary form must reproduce the above copyright notice,
01411 // this list of conditions and the following disclaimer in the documentation
01412 // and/or other materials provided with the distribution.
01413 // * Neither the name of the <ORGANIZATION> nor the names of its 
01414 // contributors may be used to endorse or promote products derived from 
01415 // this software without specific prior written permission.
01416 //
01417 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
01418 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
01419 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
01420 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
01421 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
01422 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
01423 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
01424 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
01425 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
01426 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
01427 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
01428 //
#if defined(__PGIC__)
#pragma acc routine seq
#endif
// Compute the minimal least-squares RMSD between two (pre-centered)
// structures from their 3x3 inner-product matrix using the Theobald QCP
// (quaternion characteristic polynomial) method, and optionally the
// optimal rotation matrix.
//   rot:      output 3x3 row-major rotation (9 doubles), or NULL to skip
//             the rotation/eigenvector computation entirely.
//   A:        input 3x3 inner-product matrix of the two structures
//             (row-major: Sxx Sxy Sxz / Syx Syy Syz / Szx Szy Szz).
//   rmsd:     output, receives the minimal RMSD.
//   E0:       initial guess for the largest eigenvalue; also enters the
//             RMSD formula (produced by the InnerProductSOA routines).
//   len:      number of atoms contributing to A (divisor in the RMSD).
//   minScore: if > 0 and the RMSD is below it, return -1 immediately
//             without computing the rotation.
// Returns 1 on success, 0 if the quaternion was too degenerate to
// normalize (rot is set to the identity), or -1 for the minScore
// early-out.
int FastCalcRMSDAndRotation(double *rot, double *A, float *rmsd, 
                            double E0, int len, double minScore) {
  double Sxx, Sxy, Sxz, Syx, Syy, Syz, Szx, Szy, Szz;
  double Szz2, Syy2, Sxx2, Sxy2, Syz2, Sxz2, Syx2, Szy2, Szx2,
         SyzSzymSyySzz2, Sxx2Syy2Szz2Syz2Szy2, Sxy2Sxz2Syx2Szx2,
         SxzpSzx, SyzpSzy, SxypSyx, SyzmSzy,
         SxzmSzx, SxymSyx, SxxpSyy, SxxmSyy;
  double C[4];               // characteristic polynomial coeffs; C[3] unused (quartic is monic)
  int i;
  double mxEigenV; 
  double oldg = 0.0;
  double b, a, delta, rms, qsqr;
  double q1, q2, q3, q4, normq;
  double a11, a12, a13, a14, a21, a22, a23, a24;
  double a31, a32, a33, a34, a41, a42, a43, a44;
  double a2, x2, y2, z2; 
  double xy, az, zx, ay, yz, ax; 
  double a3344_4334, a3244_4234, a3243_4233, a3143_4133,a3144_4134, a3142_4132; 
  double evecprec = 1e-6;    // threshold below which an adjoint column is considered degenerate
  double evalprec = 1e-11;   // relative convergence tolerance for Newton-Raphson

  // unpack the 3x3 inner-product matrix
  Sxx = A[0]; Sxy = A[1]; Sxz = A[2];
  Syx = A[3]; Syy = A[4]; Syz = A[5];
  Szx = A[6]; Szy = A[7]; Szz = A[8];

  Sxx2 = Sxx * Sxx;
  Syy2 = Syy * Syy;
  Szz2 = Szz * Szz;

  Sxy2 = Sxy * Sxy;
  Syz2 = Syz * Syz;
  Sxz2 = Sxz * Sxz;

  Syx2 = Syx * Syx;
  Szy2 = Szy * Szy;
  Szx2 = Szx * Szx;

  SyzSzymSyySzz2 = 2.0*(Syz*Szy - Syy*Szz);
  Sxx2Syy2Szz2Syz2Szy2 = Syy2 + Szz2 - Sxx2 + Syz2 + Szy2;

  // coefficients of the quartic characteristic polynomial
  // P(x) = x^4 + C[2]*x^2 + C[1]*x + C[0]  (x^3 term vanishes)
  C[2] = -2.0 * (Sxx2 + Syy2 + Szz2 + Sxy2 + Syx2 + Sxz2 + Szx2 + Syz2 + Szy2);
  C[1] = 8.0 * (Sxx*Syz*Szy + Syy*Szx*Sxz + Szz*Sxy*Syx - Sxx*Syy*Szz - Syz*Szx*Sxy - Szy*Syx*Sxz);

  SxzpSzx = Sxz + Szx;
  SyzpSzy = Syz + Szy;
  SxypSyx = Sxy + Syx;
  SyzmSzy = Syz - Szy;
  SxzmSzx = Sxz - Szx;
  SxymSyx = Sxy - Syx;
  SxxpSyy = Sxx + Syy;
  SxxmSyy = Sxx - Syy;
  Sxy2Sxz2Syx2Szx2 = Sxy2 + Sxz2 - Syx2 - Szx2;

  C[0] = Sxy2Sxz2Syx2Szx2 * Sxy2Sxz2Syx2Szx2
       + (Sxx2Syy2Szz2Syz2Szy2 + SyzSzymSyySzz2) * (Sxx2Syy2Szz2Syz2Szy2 - SyzSzymSyySzz2)
       + (-(SxzpSzx)*(SyzmSzy)+(SxymSyx)*(SxxmSyy-Szz)) * (-(SxzmSzx)*(SyzpSzy)+(SxymSyx)*(SxxmSyy+Szz))
       + (-(SxzpSzx)*(SyzpSzy)-(SxypSyx)*(SxxpSyy-Szz)) * (-(SxzmSzx)*(SyzmSzy)-(SxypSyx)*(SxxpSyy+Szz))
       + (+(SxypSyx)*(SyzpSzy)+(SxzpSzx)*(SxxmSyy+Szz)) * (-(SxymSyx)*(SyzmSzy)+(SxzpSzx)*(SxxpSyy+Szz))
       + (+(SxypSyx)*(SyzmSzy)+(SxzmSzx)*(SxxmSyy-Szz)) * (-(SxymSyx)*(SyzpSzy)+(SxzmSzx)*(SxxpSyy-Szz));

  /* Newton-Raphson */
  // iterate x <- x - P(x)/P'(x) starting from E0 to find the largest
  // eigenvalue of the 4x4 key matrix; 50 iterations is a hard cap
  mxEigenV = E0;
  for (i = 0; i < 50; ++i) {
    oldg = mxEigenV;
    x2 = mxEigenV*mxEigenV;
    b = (x2 + C[2])*mxEigenV;
    a = b + C[1];
    delta = ((a*mxEigenV + C[0])/(2.0*x2*mxEigenV + b + a));  // P(x)/P'(x)
    mxEigenV -= delta;
#if 0
    printf("QCP diff[%3d]: %16g %16g %16g\n", i, mxEigenV - oldg, evalprec*mxEigenV, mxEigenV);
#endif
    // stop once the relative change drops below evalprec
    if (fabs(mxEigenV - oldg) < fabs(evalprec*mxEigenV))
      break;
  }

#if !defined(__PGIC__)
  if (i == 50) 
    printf("MeasureQCP: More than %d iterations needed!\n", i);
#endif

  // the fabs() is to guard against extremely small, 
  // but *negative* numbers due to floating point error 
  rms = sqrt(fabs(2.0 * (E0 - mxEigenV)/len));
  (*rmsd) = float(rms);
  /* printf("\n\n %16g %16g %16g \n", rms, E0, 2.0 * (E0 - mxEigenV)/len); */

  if (minScore > 0) 
    if (rms < minScore)
      return (-1); // Don't bother with rotation. 

  // only perform rotation related calculations if we have a non-NULL
  // pointer for the output rotation matrix
  if (rot != NULL) {
    // build the 4x4 key matrix K - mxEigenV*I (symmetric; only the
    // independent entries are formed) whose null space holds the
    // optimal-rotation quaternion
    a11 = SxxpSyy + Szz-mxEigenV; a12 = SyzmSzy; a13 = - SxzmSzx; a14 = SxymSyx;
    a21 = SyzmSzy; a22 = SxxmSyy - Szz-mxEigenV; a23 = SxypSyx; a24= SxzpSzx;
    a31 = a13; a32 = a23; a33 = Syy-Sxx-Szz - mxEigenV; a34 = SyzpSzy;
    a41 = a14; a42 = a24; a43 = a34; a44 = Szz - SxxpSyy - mxEigenV;
    // 2x2 cofactors shared by the adjoint-column expressions below
    a3344_4334 = a33 * a44 - a43 * a34; a3244_4234 = a32 * a44-a42*a34;
    a3243_4233 = a32 * a43 - a42 * a33; a3143_4133 = a31 * a43-a41*a33;
    a3144_4134 = a31 * a44 - a41 * a34; a3142_4132 = a31 * a42-a41*a32;
    // first adjoint column: candidate eigenvector (quaternion q1..q4)
    q1 = a22*a3344_4334-a23*a3244_4234+a24*a3243_4233;
    q2 = -a21*a3344_4334+a23*a3144_4134-a24*a3143_4133;
    q3 = a21*a3244_4234-a22*a3144_4134+a24*a3142_4132;
    q4 = -a21*a3243_4233+a22*a3143_4133-a23*a3142_4132;

    qsqr = q1 * q1 + q2 * q2 + q3 * q3 + q4 * q4;

    // The following code tries to calculate another column in the 
    // adjoint matrix when the norm of the current column is too small.
    // Usually this block will never be activated. 
    // To be absolutely safe this should be
    // uncommented, but it is most likely unnecessary.
    if (qsqr < evecprec) {
      q1 = a12*a3344_4334 - a13*a3244_4234 + a14*a3243_4233;
      q2 = -a11*a3344_4334 + a13*a3144_4134 - a14*a3143_4133;
      q3 = a11*a3244_4234 - a12*a3144_4134 + a14*a3142_4132;
      q4 = -a11*a3243_4233 + a12*a3143_4133 - a13*a3142_4132;
      qsqr = q1*q1 + q2 *q2 + q3*q3+q4*q4;

      if (qsqr < evecprec) {
        double a1324_1423 = a13*a24 - a14*a23, a1224_1422 = a12*a24 - a14*a22;
        double a1223_1322 = a12*a23 - a13*a22, a1124_1421 = a11*a24 - a14*a21;
        double a1123_1321 = a11*a23 - a13*a21, a1122_1221 = a11*a22 - a12*a21;

        q1 = a42 * a1324_1423 - a43 * a1224_1422 + a44 * a1223_1322;
        q2 = -a41 * a1324_1423 + a43 * a1124_1421 - a44 * a1123_1321;
        q3 = a41 * a1224_1422 - a42 * a1124_1421 + a44 * a1122_1221;
        q4 = -a41 * a1223_1322 + a42 * a1123_1321 - a43 * a1122_1221;
        qsqr = q1*q1 + q2 *q2 + q3*q3+q4*q4;

        if (qsqr < evecprec) {
          q1 = a32 * a1324_1423 - a33 * a1224_1422 + a34 * a1223_1322;
          q2 = -a31 * a1324_1423 + a33 * a1124_1421 - a34 * a1123_1321;
          q3 = a31 * a1224_1422 - a32 * a1124_1421 + a34 * a1122_1221;
          q4 = -a31 * a1223_1322 + a32 * a1123_1321 - a33 * a1122_1221;
          qsqr = q1*q1 + q2 *q2 + q3*q3 + q4*q4;

          if (qsqr < evecprec) {
            // if qsqr is still too small, return the identity matrix.
            rot[0] = rot[4] = rot[8] = 1.0;
            rot[1] = rot[2] = rot[3] = rot[5] = rot[6] = rot[7] = 0.0;

            return(0);
          }
        }
      }
    }

    // normalize the quaternion
    normq = sqrt(qsqr);
    q1 /= normq;
    q2 /= normq;
    q3 /= normq;
    q4 /= normq;

    // expand the unit quaternion into the 3x3 rotation matrix
    a2 = q1 * q1;
    x2 = q2 * q2;
    y2 = q3 * q3;
    z2 = q4 * q4;

    xy = q2 * q3;
    az = q1 * q4;
    zx = q4 * q2;
    ay = q1 * q3;
    yz = q3 * q4;
    ax = q1 * q2;

    rot[0] = a2 + x2 - y2 - z2;
    rot[1] = 2 * (xy + az);
    rot[2] = 2 * (zx - ay);
    rot[3] = 2 * (xy - az);
    rot[4] = a2 - x2 + y2 - z2;
    rot[5] = 2 * (yz + ax);
    rot[6] = 2 * (zx + ay);
    rot[7] = 2 * (yz - ax);
    rot[8] = a2 - x2 - y2 + z2;
  }

  return 1;
}
01612 

Generated on Mon Nov 17 02:46:24 2025 for VMD (current) by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002

AltStyle によって変換されたページ (->オリジナル) /