154{
155#ifndef USE_NO_SIMD
157
158 /*
159 * For better instruction-level parallelism, each loop iteration operates
160 * on a block of four registers.
161 */
162 const Vector32 keys = vector32_broadcast(
key);
/* load copies of key */
163 const uint32 nelem_per_vector =
sizeof(Vector32) /
sizeof(
uint32);
164 const uint32 nelem_per_iteration = 4 * nelem_per_vector;
165
166 /* round down to multiple of elements per iteration */
167 const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
168
169#if defined(USE_ASSERT_CHECKING)
171#endif
172
173 /*
174 * If there aren't enough elements for the SIMD code, use the standard
175 * one-by-one linear search code.
176 */
177 if (nelem < nelem_per_iteration)
179
180 /*
181 * Process as many elements as possible with a block of 4 registers.
182 */
183 do
184 {
185 if (pg_lfind32_simd_helper(keys, &base[
i]))
186 {
187 Assert(assert_result ==
true);
188 return true;
189 }
190
191 i += nelem_per_iteration;
192
193 }
while (
i < tail_idx);
194
195 /*
196 * Process the last 'nelem_per_iteration' elements in the array with a
197 * 4-register block. This will cause us to check a subset of the elements
198 * more than once, but that won't affect correctness, and testing has
199 * demonstrated that this helps more cases than it harms.
200 */
201 Assert(assert_result == pg_lfind32_simd_helper(keys, &base[nelem - nelem_per_iteration]));
202 return pg_lfind32_simd_helper(keys, &base[nelem - nelem_per_iteration]);
203#else
204 /* Process the elements one at a time. */
206#endif
207}
Assert(PointerIsAligned(start, uint64))
static bool pg_lfind32_one_by_one_helper(uint32 key, const uint32 *base, uint32 nelem)