author | pcpa <paulo.cesar.pereira.de.andrade@gmail.com> | 2023-02-28 17:47:20 -0300 |
---|---|---|
committer | pcpa <paulo.cesar.pereira.de.andrade@gmail.com> | 2023-02-28 17:47:20 -0300 |
commit | b71ab4f1b8fe533d943de7ea5ce4aa724cf10e7b (patch) | |
tree | cb9909113687e780528d8dc7036e66850fe65dd8 | |
parent | 9ddb79b50e5cd401ee35c994a47986148e5dfb15 (diff) | |
download | lightning-b71ab4f1b8fe533d943de7ea5ce4aa724cf10e7b.tar.gz |
-rw-r--r-- | check/Makefile.am | 3 | ||||
-rw-r--r-- | check/lightning.c | 3 | ||||
-rw-r--r-- | check/rbit.tst | 184 | ||||
-rw-r--r-- | doc/body.texi | 9 | ||||
-rw-r--r-- | include/lightning.h.in | 3 | ||||
-rw-r--r-- | lib/jit_aarch64-cpu.c | 1 | ||||
-rw-r--r-- | lib/jit_aarch64.c | 1 | ||||
-rw-r--r-- | lib/jit_alpha.c | 4 | ||||
-rw-r--r-- | lib/jit_arm-cpu.c | 15 | ||||
-rw-r--r-- | lib/jit_fallback.c | 2 | ||||
-rw-r--r-- | lib/jit_hppa.c | 2 | ||||
-rw-r--r-- | lib/jit_ia64.c | 2 | ||||
-rw-r--r-- | lib/jit_loongarch-cpu.c | 1 | ||||
-rw-r--r-- | lib/jit_mips-cpu.c | 18 | ||||
-rw-r--r-- | lib/jit_mips.c | 1 | ||||
-rw-r--r-- | lib/jit_names.c | 1 | ||||
-rw-r--r-- | lib/jit_ppc.c | 2 | ||||
-rw-r--r-- | lib/jit_riscv.c | 2 | ||||
-rw-r--r-- | lib/jit_s390.c | 2 | ||||
-rw-r--r-- | lib/jit_sparc.c | 2 | ||||
-rw-r--r-- | lib/jit_x86.c | 4 | ||||
-rw-r--r-- | lib/lightning.c | 2 |
diff --git a/check/Makefile.am b/check/Makefile.am index c77f5cd..2a96f8a 100644 --- a/check/Makefile.am +++ b/check/Makefile.am @@ -124,7 +124,8 @@ EXTRA_DIST = \ check.nodata.sh \ check.x87.nodata.sh \ run-test all.tst \ - collatz.tst factorial.tst + collatz.tst factorial.tst \ + rbit.tst base_TESTS = \ 3to2 add align allocai \ diff --git a/check/lightning.c b/check/lightning.c index 80ea081..52a313b 100644 --- a/check/lightning.c +++ b/check/lightning.c @@ -323,6 +323,7 @@ static void rshr_u(void); static void rshi_u(void); static void negr(void); static void comr(void); static void clor(void); static void clzr(void); static void ctor(void); static void ctzr(void); +static void rbitr(void); static void ltr(void); static void lti(void); static void ltr_u(void); static void lti_u(void); static void ler(void); static void lei(void); @@ -683,6 +684,7 @@ static instr_t instr_vector[] = { entry(negr), entry(comr), entry(clor), entry(clzr), entry(ctor), entry(ctzr), + entry(rbitr), entry(ltr), entry(lti), entry(ltr_u), entry(lti_u), entry(ler), entry(lei), @@ -1536,6 +1538,7 @@ entry_ir_ir_ir(rshr_u) entry_ir_ir_im(rshi_u) entry_ir_ir(negr) entry_ir_ir(comr) entry_ir_ir(clor) entry_ir_ir(clzr) entry_ir_ir(ctor) entry_ir_ir(ctzr) +entry_ir_ir(rbitr) entry_ir_ir_ir(ltr) entry_ir_ir_im(lti) entry_ir_ir_ir(ltr_u) entry_ir_ir_im(lti_u) entry_ir_ir_ir(ler) entry_ir_ir_im(lei) diff --git a/check/rbit.tst b/check/rbit.tst new file mode 100644 index 0000000..bc8bce3 --- /dev/null +++ b/check/rbit.tst @@ -0,0 +1,184 @@ +.data 4096 +swap_tab: +.cfmt: +#if __WORDSIZE == 32 +.c "0x%08lx = 0x%08lx\n" +#else +.c "0x%016lx = 0x%016lx\n" +#endif + +.code + jmpi main + name rbit_table +rbit_table: + prolog + arg $in + getarg %r1 $in + extr_uc %r2 %r1 + movi %v0 swap_tab + ldxr_uc %r0 %v0 %r2 + movi %v1 8 +rbit_table_loop: + rshr %r2 %r1 %v1 + extr_uc %r2 %r2 + lshi %r0 %r0 8 + ldxr_uc %r2 %v0 %r2 + orr %r0 %r0 %r2 + addi %v1 %v1 8 + blti rbit_table_loop %v1 __WORDSIZE + retr %r0 + 
epilog + + name rbit_unrolled +rbit_unrolled: + prolog + arg $in + getarg %r0 $in +#if __WORDSIZE == 32 + movi %r1 0x55555555 +#else + movi %r1 0x5555555555555555 +#endif + rshi_u %r2 %r0 1 // r2 = r0 >> 1 + andr %r2 %r2 %r1 // r2 &= r1 + andr %v0 %r0 %r1 // v0 = r0 & r1 + lshi %v0 %v0 1 // v0 <<= 1 + orr %r0 %r2 %v0 // r0 = r2 | v0 +#if __WORDSIZE == 32 + movi %r1 0x33333333 +#else + movi %r1 0x3333333333333333 +#endif + rshi_u %r2 %r0 2 // r2 = r0 >> 2 + andr %r2 %r2 %r1 // r2 &= r1 + andr %v0 %r0 %r1 // v0 = r0 & r1 + lshi %v0 %v0 2 // v0 <<= 2 + orr %r0 %r2 %v0 // r0 = r2 | v0 +#if __WORDSIZE == 32 + movi %r1 0x0f0f0f0f +#else + movi %r1 0x0f0f0f0f0f0f0f0f +#endif + rshi_u %r2 %r0 4 // r2 = r0 >> 4 + andr %r2 %r2 %r1 // r2 &= r1 + andr %v0 %r0 %r1 // v0 = r0 & r1 + lshi %v0 %v0 4 // v0 <<= 4 + orr %r0 %r2 %v0 // r0 = r2 | v0 +#if __WORDSIZE == 32 + movi %r1 0x00ff00ff +#else + movi %r1 0x00ff00ff00ff00ff +#endif + rshi_u %r2 %r0 8 // r2 = r0 >> 8 + andr %r2 %r2 %r1 // r2 &= r1 + andr %v0 %r0 %r1 // v0 = r0 & r1 + lshi %v0 %v0 8 // v0 <<= 8 + orr %r0 %r2 %v0 // r0 = r2 | v0 +#if __WORDSIZE == 32 + rshi_u %r2 %r0 16 // r2 = r0 >> 16 + lshi %v0 %r0 16 // v0 = r0 << 16 + orr %r0 %r2 %v0 // r0 = r2 | v0 +#else + movi %r1 0x0000ffff0000ffff + rshi_u %r2 %r0 16 // r2 = r0 >> 16 + andr %r2 %r2 %r1 // r2 &= r1 + andr %v0 %r0 %r1 // v0 = r0 & r1 + lshi %v0 %v0 16 // v0 <<= 16 + orr %r0 %r2 %v0 // r0 = r2 | v0 + rshi_u %r2 %r0 32 // r2 = r0 >> 32 + lshi %v0 %r0 32 // v0 = r0 << 32 + orr %r0 %r2 %v0 // r0 = r2 | v0 +#endif + retr %r0 + epilog + + name rbit_loop +rbit_loop: + prolog + arg $in + getarg %r0 $in + movi %r1 __WORDSIZE + movi %r2 $(~0) +rbit_loop_loop: // while (%r1 >>= 1) > 0 + rshi %r1 %r1 1 // %r1 >>= 1 + blei rbit_loop_done %r1 0 // no loop if %r1 <= 0 + lshr %v0 %r2 %r1 // %v0 = %r2 << %r1 + xorr %r2 %r2 %v0 // %r2 ^= %v0 + rshr %v0 %r0 %r1 // %v0 = %r0 >> %r1 + andr %v0 %v0 %r2 // %r2 = %v0 & %r2 + lshr %v1 %r0 %r1 // %v1 = %r0 << %r1 + comr %r0 %r2 // %r0 
= ~%r2 + andr %v1 %r0 %v1 // %v1 &= %r0 + orr %r0 %v0 %v1 // %r0 = %v0 | %v1 + jmpi rbit_loop_loop +rbit_loop_done: + retr %r0 + epilog + + name main +main: + prolog + arg $argc + arg $argv + getarg %r0 $argc + bnei default %r0 2 + getarg %v0 $argv + ldxi %r0 %v0 $(__WORDSIZE >> 3) + prepare + pushargr %r0 + pushargi 0 + pushargi 0 + finishi @strtoul + retval %v0 + jmpi main_do +default: +#if __WORDSIZE == 32 + movi %v0 0x8a13c851 +#else + movi %v0 0x984a137ffec85219 +#endif +main_do: + prepare + pushargr %v0 + finishi rbit_table + retval %r0 + prepare + pushargi fmt + ellipsis + pushargr %v0 + pushargr %r0 + finishi @printf + + prepare + pushargr %v0 + finishi rbit_unrolled + retval %r0 + prepare + pushargi fmt + ellipsis + pushargr %v0 + pushargr %r0 + finishi @printf + + prepare + pushargr %v0 + finishi rbit_loop + retval %r0 + prepare + pushargi fmt + ellipsis + pushargr %v0 + pushargr %r0 + finishi @printf + + rbitr %r0 %v0 + prepare + pushargi fmt + ellipsis + pushargr %v0 + pushargr %r0 + finishi @printf + + ret + epilog diff --git a/doc/body.texi b/doc/body.texi index 1bd3f67..e7b090d 100644 --- a/doc/body.texi +++ b/doc/body.texi @@ -291,10 +291,11 @@ These accept two operands, both of which must be registers. 
@example negr _f _d O1 = -O2 comr O1 = ~O2 -clor O1 = number of leading one bits -clzr O1 = number of leading zero bits -ctor O1 = number of trailing one bits -ctzr O1 = number of trailing zero bits +clor O1 = number of leading one bits in O2 +clzr O1 = number of leading zero bits in O2 +ctor O1 = number of trailing one bits in O2 +ctzr O1 = number of trailing zero bits in O2 +rbitr O1 = bits of O2 reversed @end example Note that @code{ctzr} is basically equivalent of a @code{C} call diff --git a/include/lightning.h.in b/include/lightning.h.in index 7aa654c..66e3068 100644 --- a/include/lightning.h.in +++ b/include/lightning.h.in @@ -1056,6 +1056,9 @@ typedef enum { #define jit_ctzr(u,v) jit_new_node_ww(jit_code_ctzr,u,v) jit_code_ctor, jit_code_ctzr, +#define jit_rbitr(u,v) jit_new_node_ww(jit_code_rbitr,u,v) + jit_code_rbitr, + jit_code_last_code } jit_code_t; diff --git a/lib/jit_aarch64-cpu.c b/lib/jit_aarch64-cpu.c index d5e64ad..99d8756 100644 --- a/lib/jit_aarch64-cpu.c +++ b/lib/jit_aarch64-cpu.c @@ -598,6 +598,7 @@ static void _clzr(jit_state_t*, jit_int32_t, jit_int32_t); static void _ctor(jit_state_t*, jit_int32_t, jit_int32_t); # define ctzr(r0, r1) _ctzr(_jit, r0, r1) static void _ctzr(jit_state_t*, jit_int32_t, jit_int32_t); +# define rbitr(r0, r1) RBIT(r0, r1) # define andr(r0,r1,r2) AND(r0,r1,r2) # define andi(r0,r1,i0) _andi(_jit,r0,r1,i0) static void _andi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t); diff --git a/lib/jit_aarch64.c b/lib/jit_aarch64.c index 243e677..8106a31 100644 --- a/lib/jit_aarch64.c +++ b/lib/jit_aarch64.c @@ -1448,6 +1448,7 @@ _emit_code(jit_state_t *_jit) case_rr(clz,); case_rr(cto,); case_rr(ctz,); + case_rr(rbit,); case_rrr(and,); case_rrw(and,); case_rrr(or,); diff --git a/lib/jit_alpha.c b/lib/jit_alpha.c index 25566f4..ef41c37 100644 --- a/lib/jit_alpha.c +++ b/lib/jit_alpha.c @@ -64,6 +64,7 @@ static void _patch(jit_state_t*,jit_word_t,jit_node_t*); #define PROTO 1 # include "jit_alpha-cpu.c" # include 
"jit_alpha-fpu.c" +# include "jit_fallback.c" #undef PROTO /* @@ -1134,6 +1135,8 @@ _emit_code(jit_state_t *_jit) case_rr(clz,); case_rr(cto,); case_rr(ctz,); +#define rbitr(r0, r1) fallback_rbit(r0, r1) + case_rr(rbit,); case_rrr(lt,); case_rrw(lt,); case_rrr(lt, _u); @@ -1555,6 +1558,7 @@ _emit_code(jit_state_t *_jit) #define CODE 1 # include "jit_alpha-cpu.c" # include "jit_alpha-fpu.c" +# include "jit_fallback.c" #undef CODE void diff --git a/lib/jit_arm-cpu.c b/lib/jit_arm-cpu.c index a0852a2..78c5814 100644 --- a/lib/jit_arm-cpu.c +++ b/lib/jit_arm-cpu.c @@ -914,6 +914,8 @@ static void _clzr(jit_state_t*, jit_int32_t, jit_int32_t); static void _ctor(jit_state_t*, jit_int32_t, jit_int32_t); # define ctzr(r0, r1) _ctzr(_jit, r0, r1) static void _ctzr(jit_state_t*, jit_int32_t, jit_int32_t); +# define rbitr(r0, r1) _rbitr(_jit, r0, r1) +static void _rbitr(jit_state_t*, jit_int32_t, jit_int32_t); # define addr(r0,r1,r2) _addr(_jit,r0,r1,r2) static void _addr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t); # define addi(r0,r1,i0) _addi(_jit,r0,r1,i0) @@ -1814,6 +1816,19 @@ _ctzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) } static void +_rbitr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + if (jit_armv7_p()) { /* armv6t2 actually */ + if (jit_thumb_p()) + T2_RBIT(r0, r1); + else + RBIT(r0, r1); + } + else + fallback_bitswap(r0, r1); +} + +static void _addr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) { if (jit_thumb_p()) { diff --git a/lib/jit_fallback.c b/lib/jit_fallback.c index 21e2f42..935f3e4 100644 --- a/lib/jit_fallback.c +++ b/lib/jit_fallback.c @@ -672,7 +672,7 @@ while ((s >>= 1) > 0) comr(v, rn(mask)); /* v = ~mask */ andr(rn(t1), v, rn(t1)); /* t1 = t1 & v */ orr(v, rn(t0), rn(t1)); /* v = t0 | t1 */ - jmpi(loop, 0); + jmpi(loop); flush(); patch_at(done, _jit->pc.w); jit_unget_reg(t1); diff --git a/lib/jit_hppa.c b/lib/jit_hppa.c index d3c5ef7..d683dc1 100644 --- a/lib/jit_hppa.c +++ b/lib/jit_hppa.c @@ -1072,10 
+1072,12 @@ _emit_code(jit_state_t *_jit) #define clzr(r0, r1) fallback_clz(r0, r1) #define ctor(r0, r1) fallback_cto(r0, r1) #define ctzr(r0, r1) fallback_ctz(r0, r1) +#define rbitr(r0, r1) fallback_bitswap(r0, r1) case_rr(clo,); case_rr(clz,); case_rr(cto,); case_rr(ctz,); + case_rr(rbit,); case_rr(ext, _c); case_rr(ext, _uc); case_rr(ext, _s); diff --git a/lib/jit_ia64.c b/lib/jit_ia64.c index 2968278..c45f784 100644 --- a/lib/jit_ia64.c +++ b/lib/jit_ia64.c @@ -1197,6 +1197,8 @@ _emit_code(jit_state_t *_jit) case_rr(clz,); case_rr(cto,); case_rr(ctz,); +#define rbitr(r0, r1) fallback_bitswap(r0, r1) + case_rr(rbit,); case jit_code_casr: casr(rn(node->u.w), rn(node->v.w), rn(node->w.q.l), rn(node->w.q.h)); diff --git a/lib/jit_loongarch-cpu.c b/lib/jit_loongarch-cpu.c index ab05852..2de50c4 100644 --- a/lib/jit_loongarch-cpu.c +++ b/lib/jit_loongarch-cpu.c @@ -338,6 +338,7 @@ static void _oj26(jit_state_t*, jit_int32_t,jit_int32_t); # define clzr(r0, r1) CLZ_D(r0, r1) # define ctor(r0, r1) CTO_D(r0, r1) # define ctzr(r0, r1) CTZ_D(r0, r1) +# define rbitr(r0, r1) BITREV_D(r0, r1) static void _nop(jit_state_t*,jit_int32_t); # define movr(r0, r1) _movr(_jit, r0, r1) static void _movr(jit_state_t*, jit_int32_t, jit_int32_t); diff --git a/lib/jit_mips-cpu.c b/lib/jit_mips-cpu.c index 5d4137c..19d34a2 100644 --- a/lib/jit_mips-cpu.c +++ b/lib/jit_mips-cpu.c @@ -501,6 +501,8 @@ static void _clzr(jit_state_t*, jit_int32_t, jit_int32_t); static void _ctor(jit_state_t*, jit_int32_t, jit_int32_t); # define ctzr(r0, r1) _ctzr(_jit, r0, r1) static void _ctzr(jit_state_t*, jit_int32_t, jit_int32_t); +# define rbitr(r0, r1) _rbitr(_jit, r0, r1) +static void _rbitr(jit_state_t*, jit_int32_t, jit_int32_t); # if __WORDSIZE == 32 # define addr(rd,rs,rt) ADDU(rd,rs,rt) # define addiu(r0,r1,i0) ADDIU(r0,r1,i0) @@ -1710,6 +1712,22 @@ _ctzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) } static void +_rbitr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + if 
(jit_mips6_p()) { +#if __WORDSIZE == 32 + BITSWAP(r0, r1); + bswapr_ui(r0, r0); +#else + DBITSWAP(r0, r1); + bswapr_ul(r0, r0); +#endif + } + else + fallback_bitswap(r0, r1); +} + +static void _addi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) { jit_int32_t reg; diff --git a/lib/jit_mips.c b/lib/jit_mips.c index b3a8078..07d4867 100644 --- a/lib/jit_mips.c +++ b/lib/jit_mips.c @@ -1676,6 +1676,7 @@ _emit_code(jit_state_t *_jit) case_rr(clz,); case_rr(cto,); case_rr(ctz,); + case_rr(rbit,); case_rrr(lt,); case_rrw(lt,); case_rrr(lt, _u); diff --git a/lib/jit_names.c b/lib/jit_names.c index e5985a3..29f5906 100644 --- a/lib/jit_names.c +++ b/lib/jit_names.c @@ -255,4 +255,5 @@ static char *code_name[] = { "movr_d_w", "movi_d_w", "clo", "clz", "cto", "ctz", + "rbit", }; diff --git a/lib/jit_ppc.c b/lib/jit_ppc.c index 0ad4ae8..2410deb 100644 --- a/lib/jit_ppc.c +++ b/lib/jit_ppc.c @@ -1379,6 +1379,8 @@ _emit_code(jit_state_t *_jit) case_rr(clz,); case_rr(cto,); case_rr(ctz,); +#define rbitr(r0, r1) fallback_bitswap(r0, r1) + case_rr(rbit,); case jit_code_casr: casr(rn(node->u.w), rn(node->v.w), rn(node->w.q.l), rn(node->w.q.h)); diff --git a/lib/jit_riscv.c b/lib/jit_riscv.c index 63a5cd9..8afe32e 100644 --- a/lib/jit_riscv.c +++ b/lib/jit_riscv.c @@ -1168,10 +1168,12 @@ _emit_code(jit_state_t *_jit) #define clzr(r0, r1) fallback_clz(r0, r1) #define ctor(r0, r1) fallback_cto(r0, r1) #define ctzr(r0, r1) fallback_ctz(r0, r1) +#define rbitr(r0, r1) fallback_bitswap(r0, r1) case_rr(clo,); case_rr(clz,); case_rr(cto,); case_rr(ctz,); + case_rr(rbit,); case_rrr(and,); case_rrw(and,); case_rrr(or,); diff --git a/lib/jit_s390.c b/lib/jit_s390.c index 25c6421..24869a7 100644 --- a/lib/jit_s390.c +++ b/lib/jit_s390.c @@ -1137,6 +1137,8 @@ _emit_code(jit_state_t *_jit) case_rr(clz,); case_rr(cto,); case_rr(ctz,); +#define rbitr(r0, r1) fallback_bitswap(r0, r1) + case_rr(rbit,); case_rrr(and,); case_rrw(and,); case_rrr(or,); diff --git a/lib/jit_sparc.c 
b/lib/jit_sparc.c index 9e837d8..34cfd68 100644 --- a/lib/jit_sparc.c +++ b/lib/jit_sparc.c @@ -1561,6 +1561,8 @@ _emit_code(jit_state_t *_jit) case_rr(clz,); case_rr(cto,); case_rr(ctz,); +#define rbitr(r0, r1) fallback_bitswap(r0, r1) + case_rr(rbit,); case_brr(blt,); case_brw(blt,); case_brr(blt, _u); diff --git a/lib/jit_x86.c b/lib/jit_x86.c index b409457..10a6ec9 100644 --- a/lib/jit_x86.c +++ b/lib/jit_x86.c @@ -140,6 +140,7 @@ static void _x87_from_sse_d(jit_state_t*,jit_int32_t,jit_int32_t); # include "jit_x86-cpu.c" # include "jit_x86-sse.c" # include "jit_x86-x87.c" +# include "jit_fallback.c" #undef PROTO /* @@ -1809,6 +1810,8 @@ _emit_code(jit_state_t *_jit) case_rr(clz,); case_rr(cto,); case_rr(ctz,); +#define rbitr(r0, r1) fallback_bitswap(r0, r1) + case_rr(rbit,); case_rrr(lt,); case_rrw(lt,); case_rrr(lt, _u); @@ -2426,6 +2429,7 @@ _emit_code(jit_state_t *_jit) # include "jit_x86-cpu.c" # include "jit_x86-sse.c" # include "jit_x86-x87.c" +# include "jit_fallback.c" #undef CODE void diff --git a/lib/lightning.c b/lib/lightning.c index b0b0ef7..1376773 100644 --- a/lib/lightning.c +++ b/lib/lightning.c @@ -1469,7 +1469,7 @@ _jit_classify(jit_state_t *_jit, jit_code_t code) case jit_code_extr_f: case jit_code_extr_d_f: case jit_code_ldr_f: case jit_code_negr_d: case jit_code_absr_d: case jit_code_sqrtr_d: case jit_code_movr_d: case jit_code_extr_d: case jit_code_extr_f_d: - case jit_code_ldr_d: + case jit_code_ldr_d: case jit_code_rbitr: case jit_code_clor: case jit_code_clzr: case jit_code_ctor: case jit_code_ctzr: case jit_code_movr_w_f: case jit_code_movr_f_w: |