I am trying to write fast/optimal code to vectorize the product of an array of complex numbers. In simple C this would be:
#include <complex.h>
complex float f(complex float x[], int n ) {
complex float p = 1.0;
for (int i = 0; i < n; i++)
p *= x[i];
return p;
}
However, gcc can't vectorize this and my target CPU is the AMD FX-8350 which supports AVX. To speed it up I have tried:
typedef float v4sf __attribute__ ((vector_size (16)));
typedef union {
v4sf v;
float e[4];
} float4;
typedef struct {
float4 x;
float4 y;
} complex4;
static complex4 complex4_mul(complex4 a, complex4 b) {
return (complex4){a.x.v*b.x.v -a.y.v*b.y.v, a.y.v*b.x.v + a.x.v*b.y.v};
}
complex4 f4(complex4 x[], int n) {
v4sf one = {1,1,1,1};
complex4 p = {one,one};
for (int i = 0; i < n; i++) p = complex4_mul(p, x[i]);
return p;
}
Can this code be improved for AVX and is it as fast as it can get?
1 Answer 1
I was wondering if the compiler would more easily be able to vectorize it without the union
-within-struct
and the potential type punning of the e
member (which doesn't seem to be used).
How about this?
typedef float v4sf __attribute__ ((vector_size (16)));
typedef struct {
v4sf x;
v4sf y;
} complex4;
static inline v4sf complex4_mul_r(v4sf a_r, v4sf a_i, v4sf b_r, v4sf b_i) {
return a_r*b_r -a_i*b_i;
}
static inline v4sf complex4_mul_i(v4sf a_r, v4sf a_i, v4sf b_r, v4sf b_i) {
return a_r*b_i + a_i*b_r;
}
complex4 f4(v4sf x_r[], v4sf x_i[], int n) {
v4sf one = {1,1,1,1};
v4sf p_r = one;
v4sf p_i = one;
v4sf p_r_temp;
for (int i = 0; i < n; i++)
{
p_r_temp = complex4_mul_r(p_r, p_i, x_r[i], x_i[i]);
p_i = complex4_mul_i(p_r, p_i, x_r[i], x_i[i]);
p_r = p_r_temp;
}
return (complex4){p_r, p_i};
}
Looking at the assembly on https://godbolt.org/ it seems it gets fully vectorized. I can't get the godbolt sharing links to work.
I'm thinking about whether it might be possible to stick both the real and imaginary parts in the same vector, and use __builtin_shuffle()
to permute them as necessary. Can't quite work it out.
-
1\$\begingroup\$ Do you know what the assembly looks like from this? \$\endgroup\$Simd– Simd2017年12月22日 09:36:45 +00:00Commented Dec 22, 2017 at 9:36
-
\$\begingroup\$ Thanks for reminding me, I had forgotten to check. Looks good. \$\endgroup\$Snowbody– Snowbody2017年12月22日 15:18:55 +00:00Commented Dec 22, 2017 at 15:18
Explore related questions
See similar questions with these tags.
p
s (aka manual partial unroll and splitting of accumulators) and doing a post multiply of thep
s? and seeing if gcc can vectorize that? \$\endgroup\$