/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/

/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: util_simd_AVX2.C,v $
 *      $Author: johns $        $Locker:  $        $State: Exp $
 *      $Revision: 1.1 $        $Date: 2020/11/01 05:00:11 $
 *
 ***************************************************************************
 * DESCRIPTION:
 *
 * Hand-coded SIMD loops using compiler-provided intrinsics, or inline
 * assembly code, to generate highly optimized machine code for time-critical
 * loops that crop up in commonly used features of VMD.
 *
 ***************************************************************************/

#include "WKFThreads.h" // CPU capability flags

#if defined(__SSE2__) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
#define VMDUSESSE 1     // enable SSE in combination with target macros
#endif

#if defined(__AVX__) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
#define VMDUSEAVX 1     // enable AVX with runtime dispatch
#endif

#if defined(__AVX2__) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
#define VMDUSEAVX2 1    // enable AVX2 with runtime dispatch
#endif

#if defined(VMDUSESSE) && defined(VMDUSEAVX)

#if defined(VMDUSESSE)
#include <emmintrin.h>
#endif
#if defined(VMDUSEAVX)
#include <immintrin.h>
#endif

// #include <string.h>
// #include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#if defined(_MSC_VER)
#include <windows.h>
#include <conio.h>
#else
#include <unistd.h>
#endif // _MSC_VER


//
// Helper routine for use when coping with unaligned
// buffers returned by malloc() on many GNU systems:
//   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24261
//   http://www.sourceware.org/bugzilla/show_bug.cgi?id=206
//
// XXX until all compilers support uintptr_t, we have to do
//     dangerous and ugly things with pointer casting here...
//
#if defined(_WIN64)  /* sizeof(size_t) == sizeof(void*) */
#define myintptrtype size_t
#elif 1              /* sizeof(unsigned long) == sizeof(void*) */
#define myintptrtype unsigned long
#else                /* C99 */
#define myintptrtype uintptr_t
#endif


//
// Alignment test routines for vector instructions
//
static int test_alignment_Nbyte_powertwo(const void *ptr, unsigned int alignsz) {
  unsigned int alignmask = alignsz - 1;
  return (((myintptrtype) ptr) == (((myintptrtype) ptr) & (~alignmask)));
}


//
// Small inlinable AVX helper routines to make code easier to read
//
static int hadd_m256i(__m256i sum8) {
  int tmp[sizeof(__m256i)/sizeof(int)];
  // unaligned store: a plain stack array carries no 32-byte alignment
  // guarantee, so an aligned _mm256_store_si256() here could fault
  _mm256_storeu_si256((__m256i *)tmp, sum8);
  int sum = tmp[0]+tmp[1]+tmp[2]+tmp[3]+tmp[4]+tmp[5]+tmp[6]+tmp[7];
  return sum;
}
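
// A register-only reduction is an equivalent alternative (a sketch, not part
// of the original file, hence kept disabled; hadd_m256i_registers is a
// hypothetical name, and _mm256_extracti128_si256() requires AVX2): split
// the 256-bit vector into two 128-bit halves, add them, then fold the four
// remaining lanes with shuffles.
#if 0
static int hadd_m256i_registers(__m256i sum8) {
  __m128i lo = _mm256_castsi256_si128(sum8);       // lanes 0-3
  __m128i hi = _mm256_extracti128_si256(sum8, 1);  // lanes 4-7
  __m128i s  = _mm_add_epi32(lo, hi);              // 8 -> 4 partial sums
  s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(1, 0, 3, 2))); // 4 -> 2
  s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 3, 0, 1))); // 2 -> 1
  return _mm_cvtsi128_si32(s);                     // lane 0 holds the total
}
#endif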

#if defined(VMDUSEAVX)
// Find the first selected atom, the last selected atom,
// and the total number of selected atoms.
int analyze_selection_aligned_avx2(int n, const int *on,
                                   int *firstsel, int *lastsel,
                                   int *selected) {
  int sel   = 0;   // set count to zero in case of early-exit
  int first = 0;   // if we early-exit, firstsel is 0
  int last  = -1;  // and lastsel is -1
  int i;

  // assign potential early-exit outcomes
  if (selected != NULL)
    *selected = sel;

  if (firstsel != NULL)
    *firstsel = first;

  if (lastsel != NULL)
    *lastsel = last;

  // find the first selected atom, if any
  if ((firstsel != NULL) || (selected != NULL)) {
    // roll up to the first 32-byte-aligned array index
    for (i=0; ((i<n) && !test_alignment_Nbyte_powertwo(&on[i], 32)); i++) {
      if (on[i]) {
        first = i; // found first selected atom
        goto foundfirstsel;
      }
    }

    // AVX vectorized search loop
    for (; i<(n-7); i+=8) {
      // aligned load of 8 selection flags
      __m256i on8 = _mm256_load_si256((__m256i*) &on[i]);
      if (!_mm256_testz_si256(on8, on8))
        break; // found a block containing the first selected atom
    }

    for (; i<n; i++) {
      if (on[i]) {
        first = i; // found first selected atom
        goto foundfirstsel;
      }
    }

    // prevent x86 AVX-SSE transition performance loss due to CPU state
    // transition penalties or false dependence on upper register state
    _mm256_zeroupper();
    return -1; // indicate that no selection was found
  }
foundfirstsel:

  // find the last selected atom, if any
  if ((lastsel != NULL) || (selected != NULL)) {
    // roll down to the next 32-byte boundary
    for (i=n-1; i>=0; i--) {
      if (on[i]) {
        last = i; // found last selected atom
        goto foundlastsel;
      }

      // drop out of the alignment loop once we hit a 32-byte boundary
      if (test_alignment_Nbyte_powertwo(&on[i], 32))
        break;
    }

    // AVX vectorized search loop
    for (i-=8; i>=0; i-=8) {
      // aligned load of 8 selection flags
      __m256i on8 = _mm256_load_si256((__m256i*) &on[i]);
      if (!_mm256_testz_si256(on8, on8))
        break; // found a block containing the last selected atom
    }

    int last8=i;
    for (i=last8+7; i>=last8; i--) {
      if (on[i]) {
        last = i; // found last selected atom
        goto foundlastsel;
      }
    }

    // prevent x86 AVX-SSE transition performance loss due to CPU state
    // transition penalties or false dependence on upper register state
    _mm256_zeroupper();
    return -1; // indicate that no selection was found
  }
foundlastsel:

  // count the number of selected atoms (the flags are only 0s and 1s)
  if (selected != NULL) {
    // roll up to the next 32-byte boundary
    for (i=first; ((i<=last) && (!test_alignment_Nbyte_powertwo(&on[i], 32))); i++) {
      sel += on[i];
    }

    // AVX2 vectorized loop: process groups of 8 flags at a time
    __m256i sum8 = _mm256_setzero_si256();
    for (; i<=(last-7); i+=8) {
      // aligned load of 8 selection flags
      __m256i on8 = _mm256_load_si256((__m256i*) &on[i]);

      // sum selected atoms vertically
      sum8 = _mm256_add_epi32(sum8, on8);
    }
    sel += hadd_m256i(sum8); // sum horizontally to finalize count

    // check the very end of the array (non-divisible by eight)
    for (; i<=last; i++) {
      sel += on[i];
    }
  }

  if (selected != NULL)
    *selected = sel;

  if (firstsel != NULL)
    *firstsel = first;

  if (lastsel != NULL)
    *lastsel = last;

  // prevent x86 AVX-SSE transition performance loss due to CPU state
  // transition penalties or false dependence on upper register state
  _mm256_zeroupper();
  return 0;
}
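
// The scalar tail loop above could instead be handled with an AVX2 masked
// load (a sketch, not part of the original file; 'count_tail_avx2' is a
// hypothetical helper name): build a per-lane mask whose sign bit is set
// for in-range lanes, so out-of-range flags load as zero and the usual
// horizontal add finishes the count.
#if 0
static int count_tail_avx2(const int *on, int i, int last) {
  int remain = last - i + 1;  // 0..7 flags left in the final partial group
  __m256i lane = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
  __m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(remain), lane);
  __m256i on8  = _mm256_maskload_epi32(&on[i], mask); // masked-off lanes read as 0
  return hadd_m256i(on8); // sum the in-range flags
}
#endif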
#endif

#endif // SSE+AVX
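
// Usage sketch (an illustrative example, not part of the original file):
// how a caller might exercise analyze_selection_aligned_avx2(). The scalar
// prologue/epilogue loops handle arbitrary base alignment, but allocating
// the flag array 32-byte aligned keeps the vector loops on the fast path;
// posix_memalign() is one POSIX way to do that.
#if 0
static void example_selection_scan(void) {
  int n = 1000;
  int *on = NULL;
  if (posix_memalign((void **) &on, 32, n * sizeof(int)) != 0)
    return; // allocation failed

  for (int i=0; i<n; i++)
    on[i] = ((i >= 100) && (i < 900)); // flags are only 0s and 1s

  int first, last, count;
  if (analyze_selection_aligned_avx2(n, on, &first, &last, &count) == 0)
    printf("first=%d last=%d selected=%d\n", first, last, count);

  free(on);
}
#endif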