10
\$\begingroup\$

I am trying to write fast/optimal code to vectorize the product of an array of complex numbers. In simple C this would be:

#include <complex.h>
complex float f(complex float x[], int n ) {
 complex float p = 1.0;
 for (int i = 0; i < n; i++)
 p *= x[i];
 return p;
}

However, gcc can't vectorize this and my target CPU is the AMD FX-8350 which supports AVX. To speed it up I have tried:

typedef float v4sf __attribute__ ((vector_size (16)));
typedef union {
 v4sf v;
 float e[4];
} float4;
typedef struct {
 float4 x;
 float4 y;
} complex4;
static complex4 complex4_mul(complex4 a, complex4 b) {
 return (complex4){a.x.v*b.x.v -a.y.v*b.y.v, a.y.v*b.x.v + a.x.v*b.y.v};
}
complex4 f4(complex4 x[], int n) {
 v4sf one = {1,1,1,1};
 complex4 p = {one,one};
 for (int i = 0; i < n; i++) p = complex4_mul(p, x[i]);
 return p;
}

Can this code be improved for AVX and is it as fast as it can get?

200_success
145k22 gold badges190 silver badges478 bronze badges
asked Jan 16, 2017 at 11:39
\$\endgroup\$
3
  • 2
    \$\begingroup\$ have you tried using 4 ps (aka manual partial unroll and splitting of accumulators) and doing a post multiply of the ps? and seeing if gcc can vectorize that? \$\endgroup\$ Commented Jan 16, 2017 at 11:56
  • 1
    \$\begingroup\$ @ratchetfreak Like this godbolt.org/g/ndqZyN ? \$\endgroup\$ Commented Jan 16, 2017 at 11:57
  • 1
    \$\begingroup\$ As a guess, it seems like part of the issue is the complexity of complex multiplication. I don't know where you are getting your complex vectors from, but if you can store them in polar form without losing too much performance on that end, the multiplication will be much simpler here. \$\endgroup\$ Commented Jan 16, 2017 at 19:52

1 Answer 1

2
\$\begingroup\$

I was wondering if the compiler would more easily be able to vectorize it without the union-within-struct and the potential type punning of the e member (which doesn't seem to be used).

How about this?

typedef float v4sf __attribute__ ((vector_size (16)));
typedef struct {
 v4sf x;
 v4sf y;
} complex4;
static inline v4sf complex4_mul_r(v4sf a_r, v4sf a_i, v4sf b_r, v4sf b_i) {
 return a_r*b_r -a_i*b_i;
}
static inline v4sf complex4_mul_i(v4sf a_r, v4sf a_i, v4sf b_r, v4sf b_i) {
 return a_r*b_i + a_i*b_r;
}
complex4 f4(v4sf x_r[], v4sf x_i[], int n) {
 v4sf one = {1,1,1,1};
 v4sf p_r = one;
 v4sf p_i = one;
 v4sf p_r_temp;
 for (int i = 0; i < n; i++)
 {
 p_r_temp = complex4_mul_r(p_r, p_i, x_r[i], x_i[i]);
 p_i = complex4_mul_i(p_r, p_i, x_r[i], x_i[i]);
 p_r = p_r_temp;
 }
 return (complex4){p_r, p_i};
}

Looking at the assembly on https://godbolt.org/ it seems it gets fully vectorized. I can't get the godbolt sharing links to work.

I'm thinking about whether it might be possible to stick both the real and imaginary parts in the same vector, and use __builtin_shuffle() to permute them as necessary. Can't quite work it out.

answered Dec 21, 2017 at 23:19
\$\endgroup\$
2
  • 1
    \$\begingroup\$ Do you know what the assembly looks like from this? \$\endgroup\$ Commented Dec 22, 2017 at 9:36
  • \$\begingroup\$ Thanks for reminding me, I had forgotten to check. Looks good. \$\endgroup\$ Commented Dec 22, 2017 at 15:18

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.