/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/

/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: util_simd_AVX2.C,v $
 *      $Author: johns $        $Locker:  $        $State: Exp $
 *      $Revision: 1.1 $        $Date: 2020/11/01 05:00:11 $
 *
 ***************************************************************************
 * DESCRIPTION:
 *
 * Hand-coded SIMD loops using compiler-provided intrinsics, or inline
 * assembly code, to generate highly optimized machine code for time-critical
 * loops that crop up in commonly used features of VMD.
 *
 ***************************************************************************/

#include "WKFThreads.h" // CPU capability flags

#if defined(__SSE2__) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
#define VMDUSESSE 1     // enable SSE in combination with target macros
#endif

#if defined(__AVX__) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
#define VMDUSEAVX 1     // enable AVX with runtime dispatch
#endif

#if defined(__AVX2__) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
#define VMDUSEAVX2 1    // enable AVX2 with runtime dispatch
#endif

#if defined(VMDUSESSE) && defined(VMDUSEAVX)

#if defined(VMDUSESSE)
#include <emmintrin.h>
#endif
#if defined(VMDUSEAVX)
#include <immintrin.h>
#endif

// #include <string.h>
// #include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#if defined(_MSC_VER)
#include <windows.h>
#include <conio.h>
#else
#include <unistd.h>
#endif // _MSC_VER


//
// Helper routine for use when coping with unaligned
// buffers returned by malloc() on many GNU systems:
//   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24261
//   http://www.sourceware.org/bugzilla/show_bug.cgi?id=206
//
// XXX until all compilers support uintptr_t, we have to do
//     dangerous and ugly things with pointer casting here...
//
#if defined(_WIN64)  /* sizeof(size_t) == sizeof(void*) */
#define myintptrtype size_t
#elif 1              /* sizeof(unsigned long) == sizeof(void*) */
#define myintptrtype unsigned long
#else                /* C99 */
#define myintptrtype uintptr_t
#endif


//
// Alignment test routines for vector instructions
//
static int test_alignment_Nbyte_powertwo(const void *ptr, unsigned int alignsz) {
  unsigned int alignmask = alignsz - 1;
  return (((myintptrtype) ptr) == (((myintptrtype) ptr) & (~alignmask)));
}


//
// Small inlinable AVX helper routines to make code easier to read
//
static int hadd_m256i(__m256i sum8) {
  int tmp[sizeof(__m256i)/sizeof(int)];
  // unaligned store: a plain stack array carries no 32-byte alignment
  // guarantee, so an aligned _mm256_store_si256() here could fault
  _mm256_storeu_si256((__m256i *)tmp, sum8);
  int sum = tmp[0]+tmp[1]+tmp[2]+tmp[3]+tmp[4]+tmp[5]+tmp[6]+tmp[7];
  return sum;
}
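
// A register-only reduction is an equivalent alternative (a sketch, not part
// of the original file, hence kept disabled; hadd_m256i_registers is a
// hypothetical name, and _mm256_extracti128_si256() requires AVX2): split
// the 256-bit vector into two 128-bit halves, add them, then fold the four
// remaining lanes with shuffles.
#if 0
static int hadd_m256i_registers(__m256i sum8) {
  __m128i lo = _mm256_castsi256_si128(sum8);       // lanes 0-3
  __m128i hi = _mm256_extracti128_si256(sum8, 1);  // lanes 4-7
  __m128i s  = _mm_add_epi32(lo, hi);              // 8 -> 4 partial sums
  s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(1, 0, 3, 2))); // 4 -> 2
  s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 3, 0, 1))); // 2 -> 1
  return _mm_cvtsi128_si32(s);                     // lane 0 holds the total
}
#endif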

#if defined(VMDUSEAVX)
// Find the first selected atom, the last selected atom,
// and the total number of selected atoms.
int analyze_selection_aligned_avx2(int n, const int *on,
                                   int *firstsel, int *lastsel,
                                   int *selected) {
  int sel   = 0;   // set count to zero in case of early-exit
  int first = 0;   // if we early-exit, firstsel is 0
  int last  = -1;  // and lastsel is -1
  int i;

  // assign potential early-exit outcomes
  if (selected != NULL)
    *selected = sel;

  if (firstsel != NULL)
    *firstsel = first;

  if (lastsel != NULL)
    *lastsel = last;

  // find the first selected atom, if any
  if ((firstsel != NULL) || (selected != NULL)) {
    // roll up to the first 32-byte-aligned array index
    for (i=0; ((i<n) && !test_alignment_Nbyte_powertwo(&on[i], 32)); i++) {
      if (on[i]) {
        first = i; // found first selected atom
        goto foundfirstsel;
      }
    }

    // AVX vectorized search loop
    for (; i<(n-7); i+=8) {
      // aligned load of 8 selection flags
      __m256i on8 = _mm256_load_si256((__m256i*) &on[i]);
      if (!_mm256_testz_si256(on8, on8))
        break; // found a block containing the first selected atom
    }

    for (; i<n; i++) {
      if (on[i]) {
        first = i; // found first selected atom
        goto foundfirstsel;
      }
    }

    // prevent x86 AVX-SSE transition performance loss due to CPU state
    // transition penalties or false dependence on upper register state
    _mm256_zeroupper();
    return -1; // indicate that no selection was found
  }
foundfirstsel:

  // find the last selected atom, if any
  if ((lastsel != NULL) || (selected != NULL)) {
    // roll down to the next 32-byte boundary
    for (i=n-1; i>=0; i--) {
      if (on[i]) {
        last = i; // found last selected atom
        goto foundlastsel;
      }

      // drop out of the alignment loop once we hit a 32-byte boundary
      if (test_alignment_Nbyte_powertwo(&on[i], 32))
        break;
    }

    // AVX vectorized search loop
    for (i-=8; i>=0; i-=8) {
      // aligned load of 8 selection flags
      __m256i on8 = _mm256_load_si256((__m256i*) &on[i]);
      if (!_mm256_testz_si256(on8, on8))
        break; // found a block containing the last selected atom
    }

    int last8=i;
    for (i=last8+7; i>=last8; i--) {
      if (on[i]) {
        last = i; // found last selected atom
        goto foundlastsel;
      }
    }

    // prevent x86 AVX-SSE transition performance loss due to CPU state
    // transition penalties or false dependence on upper register state
    _mm256_zeroupper();
    return -1; // indicate that no selection was found
  }
foundlastsel:

  // count the number of selected atoms (the flags are only 0s and 1s)
  if (selected != NULL) {
    // roll up to the next 32-byte boundary
    for (i=first; ((i<=last) && (!test_alignment_Nbyte_powertwo(&on[i], 32))); i++) {
      sel += on[i];
    }

    // AVX2 vectorized loop: process groups of 8 flags at a time
    __m256i sum8 = _mm256_setzero_si256();
    for (; i<=(last-7); i+=8) {
      // aligned load of 8 selection flags
      __m256i on8 = _mm256_load_si256((__m256i*) &on[i]);

      // sum selected atoms vertically
      sum8 = _mm256_add_epi32(sum8, on8);
    }
    sel += hadd_m256i(sum8); // sum horizontally to finalize count

    // check the very end of the array (non-divisible by eight)
    for (; i<=last; i++) {
      sel += on[i];
    }
  }

  if (selected != NULL)
    *selected = sel;

  if (firstsel != NULL)
    *firstsel = first;

  if (lastsel != NULL)
    *lastsel = last;

  // prevent x86 AVX-SSE transition performance loss due to CPU state
  // transition penalties or false dependence on upper register state
  _mm256_zeroupper();
  return 0;
}
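
// The scalar tail loop above could instead be handled with an AVX2 masked
// load (a sketch, not part of the original file; 'count_tail_avx2' is a
// hypothetical helper name): build a per-lane mask whose sign bit is set
// for in-range lanes, so out-of-range flags load as zero and the usual
// horizontal add finishes the count.
#if 0
static int count_tail_avx2(const int *on, int i, int last) {
  int remain = last - i + 1;  // 0..7 flags left in the final partial group
  __m256i lane = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
  __m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(remain), lane);
  __m256i on8  = _mm256_maskload_epi32(&on[i], mask); // masked-off lanes read as 0
  return hadd_m256i(on8); // sum the in-range flags
}
#endif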
#endif

#endif // SSE+AVX
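
// Usage sketch (an illustrative example, not part of the original file):
// how a caller might exercise analyze_selection_aligned_avx2(). The scalar
// prologue/epilogue loops handle arbitrary base alignment, but allocating
// the flag array 32-byte aligned keeps the vector loops on the fast path;
// posix_memalign() is one POSIX way to do that.
#if 0
static void example_selection_scan(void) {
  int n = 1000;
  int *on = NULL;
  if (posix_memalign((void **) &on, 32, n * sizeof(int)) != 0)
    return; // allocation failed

  for (int i=0; i<n; i++)
    on[i] = ((i >= 100) && (i < 900)); // flags are only 0s and 1s

  int first, last, count;
  if (analyze_selection_aligned_avx2(n, on, &first, &last, &count) == 0)
    printf("first=%d last=%d selected=%d\n", first, last, count);

  free(on);
}
#endif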