4
\$\begingroup\$

I was inspired by this and this to write a C function that takes an array of 16 `__m128i` values, treats it as a 16x16 matrix of `uint8_t`, and transposes it in place using SSE2 intrinsics.

File simd-transpose.h

#include <immintrin.h>
/// In-place transpose of \a x, viewed as a 16x16 matrix of \c uint8_t with one
/// matrix row per \c __m128i element.
///
/// The work is done in four rounds of SSE2 unpack intrinsics. Each round
/// interleaves elements twice as wide as the previous one (8, 16, 32, then
/// 64 bits), so after the last round every register holds one full column of
/// the input, i.e. one row of the transposed matrix.
static void
transpose_m128ix16(__m128i x[static restrict 16])
{
    __m128i bytes[16]; // round 1 results: bytes of two input rows interleaved
    __m128i words[16]; // round 2 results: four input rows interleaved
    __m128i upper[8];  // round 3 results built from input rows 0..7
    __m128i lower[8];  // round 3 results built from input rows 8..15

    // Round 1: interleave the bytes of each adjacent row pair (2p, 2p+1).
    // The "lo" result covers columns 0..7, the "hi" result columns 8..15.
    for (int pair = 0; pair < 8; ++pair)
    {
        const __m128i even_row = x[2 * pair];
        const __m128i odd_row = x[2 * pair + 1];
        bytes[2 * pair] = _mm_unpacklo_epi8(even_row, odd_row);
        bytes[2 * pair + 1] = _mm_unpackhi_epi8(even_row, odd_row);
    }

    // Round 2: within each group of four input rows, interleave 16-bit units.
    // dst[k] ends up covering columns 4k..4k+3 of the group's four rows.
    for (int quad = 0; quad < 4; ++quad)
    {
        const __m128i *src = &bytes[4 * quad];
        __m128i *dst = &words[4 * quad];
        dst[0] = _mm_unpacklo_epi16(src[0], src[2]);
        dst[1] = _mm_unpackhi_epi16(src[0], src[2]);
        dst[2] = _mm_unpacklo_epi16(src[1], src[3]);
        dst[3] = _mm_unpackhi_epi16(src[1], src[3]);
    }

    // Round 3: interleave 32-bit units of matching column quarters.
    // upper[c] / lower[c] cover columns 2c and 2c+1 of rows 0..7 / rows 8..15.
    for (int quarter = 0; quarter < 4; ++quarter)
    {
        upper[2 * quarter] = _mm_unpacklo_epi32(words[quarter], words[quarter + 4]);
        upper[2 * quarter + 1] = _mm_unpackhi_epi32(words[quarter], words[quarter + 4]);
        lower[2 * quarter] = _mm_unpacklo_epi32(words[quarter + 8], words[quarter + 12]);
        lower[2 * quarter + 1] = _mm_unpackhi_epi32(words[quarter + 8], words[quarter + 12]);
    }

    // Round 4: join the 64-bit halves from the top and bottom eight rows.
    // Every input value was consumed in round 1, so x can be overwritten here.
    for (int col_pair = 0; col_pair < 8; ++col_pair)
    {
        x[2 * col_pair] = _mm_unpacklo_epi64(upper[col_pair], lower[col_pair]);
        x[2 * col_pair + 1] = _mm_unpackhi_epi64(upper[col_pair], lower[col_pair]);
    }
}

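I know some lines are long, but I wanted the variable names to be descriptive: each name lists the rows (A-P) and the hexadecimal range of columns (0-f) whose bytes the register holds. For example, `A07_B07` holds columns 0-7 of rows A and B, interleaved.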

File test-simd-transpose.c

This test code demonstrates the correctness of the transpose function.

#include "simd-transpose.h"
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
/// Storage for the test matrix: the same 256 bytes viewed either as the 16
/// \c __m128i rows handed to the transpose, or as a 16x16 grid of \c uint8_t.
typedef union
{
    __m128i xmm[16];
    uint8_t u8[16][16];
} data_16_t;

/// Print \a m as 16 lines of 16 two-digit hex bytes, each followed by a comma.
static void
print_data_16(const data_16_t *m)
{
    for (int i = 0; i < 16; ++i)
    {
        for (int j = 0; j < 16; ++j)
        {
            // uint8_t promotes to int; %x expects unsigned, so convert explicitly
            printf(" %02x,", (unsigned)m->u8[i][j]);
        }
        printf("\n");
    }
}

/// Transpose a known 16x16 byte matrix, print both versions, and verify that
/// original[i][j] == transposed[j][i] for every element.
///
/// \return 0 when every element matches; 1 after reporting the first
///         mismatch on stderr.
int main(void)
{
    // Element (i, j) holds 16*i + j (0xIJ in hex), so every byte is unique
    // and encodes its own position in the matrix.
    data_16_t original_data_16;
    for (int i = 0; i < 16; ++i)
    {
        for (int j = 0; j < 16; ++j)
        {
            original_data_16.u8[i][j] = (uint8_t)(16 * i + j);
        }
    }

    data_16_t transposed_data_16 = original_data_16;
    transpose_m128ix16(transposed_data_16.xmm);

    printf("original_data_16 = {\n");
    print_data_16(&original_data_16);
    printf("}\n\n");
    printf("transposed_data_16 = {\n");
    print_data_16(&transposed_data_16);
    printf("}\n\n");

    // Checked explicitly instead of with assert(): assert() expands to nothing
    // when NDEBUG is defined, which would let a broken transpose pass silently.
    for (int i = 0; i < 16; ++i)
    {
        for (int j = 0; j < 16; ++j)
        {
            if (original_data_16.u8[i][j] != transposed_data_16.u8[j][i])
            {
                fprintf(stderr,
                        "mismatch: original[%d][%d] = %02x but transposed[%d][%d] = %02x\n",
                        i, j, (unsigned)original_data_16.u8[i][j],
                        j, i, (unsigned)transposed_data_16.u8[j][i]);
                return 1;
            }
        }
    }
    return 0;
}

Environment

I used GCC on Linux.

gcc -dumpversion

14.2.1

uname -srm

Linux 6.14.3-arch1-1 x86_64

Output

gcc -msse2 -Wall test-simd-transpose.c && ./a.out

original_data_16 = {
 00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f,
 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f,
 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 2a, 2b, 2c, 2d, 2e, 2f,
 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 3a, 3b, 3c, 3d, 3e, 3f,
 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 4a, 4b, 4c, 4d, 4e, 4f,
 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 5a, 5b, 5c, 5d, 5e, 5f,
 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 6a, 6b, 6c, 6d, 6e, 6f,
 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 7a, 7b, 7c, 7d, 7e, 7f,
 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 8a, 8b, 8c, 8d, 8e, 8f,
 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 9a, 9b, 9c, 9d, 9e, 9f,
 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af,
 b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf,
 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ca, cb, cc, cd, ce, cf,
 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, da, db, dc, dd, de, df,
 e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, ea, eb, ec, ed, ee, ef,
 f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff,
}
transposed_data_16 = {
 00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0,
 01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1,
 02, 12, 22, 32, 42, 52, 62, 72, 82, 92, a2, b2, c2, d2, e2, f2,
 03, 13, 23, 33, 43, 53, 63, 73, 83, 93, a3, b3, c3, d3, e3, f3,
 04, 14, 24, 34, 44, 54, 64, 74, 84, 94, a4, b4, c4, d4, e4, f4,
 05, 15, 25, 35, 45, 55, 65, 75, 85, 95, a5, b5, c5, d5, e5, f5,
 06, 16, 26, 36, 46, 56, 66, 76, 86, 96, a6, b6, c6, d6, e6, f6,
 07, 17, 27, 37, 47, 57, 67, 77, 87, 97, a7, b7, c7, d7, e7, f7,
 08, 18, 28, 38, 48, 58, 68, 78, 88, 98, a8, b8, c8, d8, e8, f8,
 09, 19, 29, 39, 49, 59, 69, 79, 89, 99, a9, b9, c9, d9, e9, f9,
 0a, 1a, 2a, 3a, 4a, 5a, 6a, 7a, 8a, 9a, aa, ba, ca, da, ea, fa,
 0b, 1b, 2b, 3b, 4b, 5b, 6b, 7b, 8b, 9b, ab, bb, cb, db, eb, fb,
 0c, 1c, 2c, 3c, 4c, 5c, 6c, 7c, 8c, 9c, ac, bc, cc, dc, ec, fc,
 0d, 1d, 2d, 3d, 4d, 5d, 6d, 7d, 8d, 9d, ad, bd, cd, dd, ed, fd,
 0e, 1e, 2e, 3e, 4e, 5e, 6e, 7e, 8e, 9e, ae, be, ce, de, ee, fe,
 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff,
}

Questions

Is there a better or faster way to do this transpose? Perhaps using AVX or AVX2 intrinsics? I don't have access to AVX-512 hardware.

asked Apr 21 at 21:13
\$\endgroup\$
1
  • 1
    \$\begingroup\$ Useful to provide link about uncommon word: " ... wanted the variable names to be elucidative". \$\endgroup\$ Commented Apr 23 at 8:40

0

Know someone who can answer? Share a link to this question via email, Twitter, or Facebook.

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.