-rw-r--r-- | lib/jit_fallback.c | 201 |
diff --git a/lib/jit_fallback.c b/lib/jit_fallback.c
index cb593fa..21e2f42 100644
--- a/lib/jit_fallback.c
+++ b/lib/jit_fallback.c
@@ -1,5 +1,7 @@
 #if PROTO
 #define USE_BIT_TABLES 1
+#define USE_BITSWAP_UNROLLED 0
+#define USE_BITSWAP_LOOP 0
 #define fallback_save(r0) _fallback_save(_jit, r0)
 static void _fallback_save(jit_state_t*, jit_int32_t);
 #define fallback_load(r0) _fallback_load(_jit, r0)
@@ -21,6 +23,8 @@ static void _fallback_clz(jit_state_t*, jit_int32_t, jit_int32_t);
 static void _fallback_cto(jit_state_t*, jit_int32_t, jit_int32_t);
 #define fallback_ctz(r0,r1) _fallback_ctz(_jit,r0,r1)
 static void _fallback_ctz(jit_state_t*, jit_int32_t, jit_int32_t);
+#define fallback_bitswap(r0,r1) _fallback_bitswap(_jit, r0, r1)
+static void _fallback_bitswap(jit_state_t*, jit_int32_t, jit_int32_t);
 # if defined(__ia64__)
 # define fallback_flush() sync()
 # elif defined(__mips__)
@@ -487,4 +491,201 @@ _fallback_ctz(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
     jit_unget_reg(r1_reg);
 # endif
 }
+
+/* Emit code that sets r0 to the value of r1 with the order of all
+ * __WORDSIZE bits reversed.  The implementation is picked at compile
+ * time by USE_BIT_TABLES, USE_BITSWAP_UNROLLED and USE_BITSWAP_LOOP;
+ * every variant accepts r0 == r1.
+ */
+static void
+_fallback_bitswap(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+{
+# if USE_BIT_TABLES
+    /* t0 = r1;
+     * t1 = t0 & 0xff;
+     * t2 = swap_tab;
+     * r0 = t2[t1];
+     * t3 = 8;
+     * loop:
+     * t1 = t0 >> t3;
+     * t1 &= 0xff;
+     * r0 <<= 8;
+     * r0 |= t2[t1];
+     * t3 += 8;
+     * if (t3 < __WORDSIZE)
+     *     goto loop;
+     */
+    jit_word_t loop;
+    jit_int32_t t0, r1_reg, t1, t2, t3;
+    static const unsigned char swap_tab[256] = {
+        0, 128, 64, 192, 32, 160, 96, 224,
+        16, 144, 80, 208, 48, 176, 112, 240,
+        8, 136, 72, 200, 40, 168, 104, 232,
+        24, 152, 88, 216, 56, 184, 120, 248,
+        4, 132, 68, 196, 36, 164, 100, 228,
+        20, 148, 84, 212, 52, 180, 116, 244,
+        12, 140, 76, 204, 44, 172, 108, 236,
+        28, 156, 92, 220, 60, 188, 124, 252,
+        2, 130, 66, 194, 34, 162, 98, 226,
+        18, 146, 82, 210, 50, 178, 114, 242,
+        10, 138, 74, 202, 42, 170, 106, 234,
+        26, 154, 90, 218, 58, 186, 122, 250,
+        6, 134, 70, 198, 38, 166, 102, 230,
+        22, 150, 86, 214, 54, 182, 118, 246,
+        14, 142, 78, 206, 46, 174, 110, 238,
+        30, 158, 94, 222, 62, 190, 126, 254,
+        1, 129, 65, 193, 33, 161, 97, 225,
+        17, 145, 81, 209, 49, 177, 113, 241,
+        9, 137, 73, 201, 41, 169, 105, 233,
+        25, 153, 89, 217, 57, 185, 121, 249,
+        5, 133, 69, 197, 37, 165, 101, 229,
+        21, 149, 85, 213, 53, 181, 117, 245,
+        13, 141, 77, 205, 45, 173, 109, 237,
+        29, 157, 93, 221, 61, 189, 125, 253,
+        3, 131, 67, 195, 35, 163, 99, 227,
+        19, 147, 83, 211, 51, 179, 115, 243,
+        11, 139, 75, 203, 43, 171, 107, 235,
+        27, 155, 91, 219, 59, 187, 123, 251,
+        7, 135, 71, 199, 39, 167, 103, 231,
+        23, 151, 87, 215, 55, 183, 119, 247,
+        15, 143, 79, 207, 47, 175, 111, 239,
+        31, 159, 95, 223, 63, 191, 127, 255
+    };
+    /* r0 is written while r1 is still being read, so when both name
+     * the same register the source is first copied to a scratch one */
+    if (r0 == r1) {
+        t0 = jit_get_reg(jit_class_gpr);
+        r1_reg = rn(t0);
+    }
+    else {
+        t0 = JIT_NOREG;
+        r1_reg = r1;
+    }
+    t1 = jit_get_reg(jit_class_gpr);
+    t2 = jit_get_reg(jit_class_gpr);
+    t3 = jit_get_reg(jit_class_gpr);
+    if (r0 == r1)
+        movr(rn(t0), r1);
+    extr_uc(rn(t1), r1_reg);
+    movi(rn(t2), (jit_word_t)swap_tab);
+    ldxr_uc(r0, rn(t2), rn(t1));
+    movi(rn(t3), 8);
+    fallback_flush();
+    loop = _jit->pc.w;
+    rshr(rn(t1), r1_reg, rn(t3));
+    extr_uc(rn(t1), rn(t1));
+    lshi(r0, r0, 8);
+    ldxr_uc(rn(t1), rn(t2), rn(t1));
+    orr(r0, r0, rn(t1));
+    addi(rn(t3), rn(t3), 8);
+    blti(loop, rn(t3), __WORDSIZE);
+    jit_unget_reg(t3);
+    jit_unget_reg(t2);
+    jit_unget_reg(t1);
+    if (t0 != JIT_NOREG)
+        jit_unget_reg(t0);
+# elif USE_BITSWAP_UNROLLED
+/* http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel */
+/*
+unsigned int v; // 32-bit word to reverse bit order
+
+// swap odd and even bits
+v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+// swap consecutive pairs
+v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+// swap nibbles ...
+v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+// swap bytes
+v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+// swap 2-byte long pairs
+v = ( v >> 16 ) | ( v << 16);
+ */
+    jit_int32_t t0, t1, t2;
+    movr(r0, r1);
+    t0 = jit_get_reg(jit_class_gpr);
+    t1 = jit_get_reg(jit_class_gpr);
+    t2 = jit_get_reg(jit_class_gpr);
+    movi(rn(t0), __WORDSIZE == 32 ? 0x55555555L : 0x5555555555555555L);
+    rshi_u(rn(t1), r0, 1); /* t1 = v >> 1 */
+    andr(rn(t1), rn(t1), rn(t0)); /* t1 &= t0 */
+    andr(rn(t2), r0, rn(t0)); /* t2 = v & t0*/
+    lshi(rn(t2), rn(t2), 1); /* t2 <<= 1 */
+    orr(r0, rn(t1), rn(t2)); /* v = t1 | t2 */
+    movi(rn(t0), __WORDSIZE == 32 ? 0x33333333L : 0x3333333333333333L);
+    rshi_u(rn(t1), r0, 2); /* t1 = v >> 2 */
+    andr(rn(t1), rn(t1), rn(t0)); /* t1 &= t0 */
+    andr(rn(t2), r0, rn(t0)); /* t2 = v & t0*/
+    lshi(rn(t2), rn(t2), 2); /* t2 <<= 2 */
+    orr(r0, rn(t1), rn(t2)); /* v = t1 | t2 */
+    movi(rn(t0), __WORDSIZE == 32 ? 0x0f0f0f0fL : 0x0f0f0f0f0f0f0f0fL);
+    rshi_u(rn(t1), r0, 4); /* t1 = v >> 4 */
+    andr(rn(t1), rn(t1), rn(t0)); /* t1 &= t0 */
+    andr(rn(t2), r0, rn(t0)); /* t2 = v & t0*/
+    lshi(rn(t2), rn(t2), 4); /* t2 <<= 4 */
+    orr(r0, rn(t1), rn(t2)); /* v = t1 | t2 */
+    movi(rn(t0), __WORDSIZE == 32 ? 0x00ff00ffL : 0x00ff00ff00ff00ffL);
+    rshi_u(rn(t1), r0, 8); /* t1 = v >> 8 */
+    andr(rn(t1), rn(t1), rn(t0)); /* t1 &= t0 */
+    andr(rn(t2), r0, rn(t0)); /* t2 = v & t0*/
+    lshi(rn(t2), rn(t2), 8); /* t2 <<= 8 */
+    orr(r0, rn(t1), rn(t2)); /* v = t1 | t2 */
+# if __WORDSIZE == 32
+    rshi_u(rn(t1), r0, 16); /* t1 = v >> 16 */
+    lshi(rn(t2), r0, 16); /* t2 = v << 16 */
+    orr(r0, rn(t1), rn(t2)); /* v = t1 | t2 */
+# else
+    movi(rn(t0), 0x0000ffff0000ffffL);
+    rshi_u(rn(t1), r0, 16); /* t1 = v >> 16 */
+    andr(rn(t1), rn(t1), rn(t0)); /* t1 &= t0 */
+    andr(rn(t2), r0, rn(t0)); /* t2 = v & t0*/
+    lshi(rn(t2), rn(t2), 16); /* t2 <<= 16 */
+    orr(r0, rn(t1), rn(t2)); /* v = t1 | t2 */
+    rshi_u(rn(t1), r0, 32); /* t1 = v >> 32 */
+    lshi(rn(t2), r0, 32); /* t2 = v << 32 */
+    orr(r0, rn(t1), rn(t2)); /* v = t1 | t2 */
+# endif
+    jit_unget_reg(t2);
+    jit_unget_reg(t1);
+    jit_unget_reg(t0);
+# elif USE_BITSWAP_LOOP
+/* http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel */
+/*
+unsigned int s = sizeof(v) * CHAR_BIT; // bit size; must be power of 2
+unsigned int mask = ~0;
+while ((s >>= 1) > 0)
+{
+    mask ^= (mask << s);
+    v = ((v >> s) & mask) | ((v << s) & ~mask);
+}
+*/
+    jit_int32_t s, mask, t0, t1;
+    jit_word_t loop, done;
+    movr(r0, r1); /* "v" in the comments below is r0 */
+    s = jit_get_reg(jit_class_gpr);
+    movi(rn(s), __WORDSIZE); /* s = sizeof(v) * CHAR_BIT; */
+    mask = jit_get_reg(jit_class_gpr);
+    movi(rn(mask), ~0L); /* mask = ~0; */
+    fallback_flush();
+    loop = _jit->pc.w; /* while ((s >>= 1) > 0) */
+    rshi(rn(s), rn(s), 1); /* (s >>= 1) */
+    done = blei(_jit->pc.w, rn(s), 0); /* no loop if s <= 0 */
+    t0 = jit_get_reg(jit_class_gpr);
+    lshr(rn(t0), rn(mask), rn(s)); /* t0 = (mask << s) */
+    xorr(rn(mask), rn(mask), rn(t0)); /* mask ^= t0 */
+    rshr(rn(t0), r0, rn(s)); /* t0 = v >> s */
+    andr(rn(t0), rn(t0), rn(mask)); /* t0 = t0 & mask */
+    t1 = jit_get_reg(jit_class_gpr);
+    lshr(rn(t1), r0, rn(s)); /* t1 = v << s */
+    comr(r0, rn(mask)); /* v = ~mask */
+    andr(rn(t1), r0, rn(t1)); /* t1 = t1 & v */
+    orr(r0, rn(t0), rn(t1)); /* v = t0 | t1 */
+    jmpi(loop, 0);
+    fallback_flush();
+    patch_at(done, _jit->pc.w);
+    jit_unget_reg(t1);
+    jit_unget_reg(t0);
+    jit_unget_reg(mask);
+    jit_unget_reg(s);
+# endif
+}
 #endif
|