/*
- * Copyright (C) 2013-2019 Free Software Foundation, Inc.
+ * Copyright (C) 2013-2023 Free Software Foundation, Inc.
*
* This file is part of GNU lightning.
*
jit_int32_t w;
# undef ui
} instr_t;
-# define stack_framesize 160
+# define s26_p(d) ((d) >= -33554432 && (d) <= 33554431) /* non-zero when d lies in [-2^25, 2^25-1], i.e. fits a signed 26-bit field; note that d is evaluated twice */
# define ii(i) *_jit->pc.ui++ = i
# define ldr(r0,r1) ldr_l(r0,r1)
# define ldxr(r0,r1,r2) ldxr_l(r0,r1,r2)
# define A64_CBNZ 0x35000000
# define A64_B_C 0x54000000
# define A64_CSINC 0x1a800400
+# define A64_CSSEL 0x1a800000
# define A64_REV 0xdac00c00
# define A64_UDIV 0x1ac00800
# define A64_SDIV 0x1ac00c00
# define A64_LDRSB 0x38e06800
# define A64_STR 0xf8206800
# define A64_LDR 0xf8606800
+# define A64_LDAXR 0xc85ffc00
+# define A64_STLXR 0xc800fc00
# define A64_STRH 0x78206800
# define A64_LDRH 0x78606800
# define A64_LDRSH 0x78a06800
# define A64_ORR 0x2a000000
# define A64_MOV 0x2a0003e0 /* AKA orr Rd,xzr,Rm */
# define A64_MVN 0x2a2003e0
+# define A64_CLS 0x5ac01400
+# define A64_CLZ 0x5ac01000
+# define A64_RBIT 0x5ac00000
# define A64_UXTW 0x2a0003e0 /* AKA MOV */
# define A64_EOR 0x4a000000
# define A64_ANDS 0x6a000000
# define MOV(Rd,Rm) ox_x(A64_MOV|XS,Rd,Rm)
# define MVN(Rd,Rm) ox_x(A64_MVN|XS,Rd,Rm)
# define NEG(Rd,Rm) ox_x(A64_NEG|XS,Rd,Rm)
+# define CLS(Rd,Rm) o_xx(A64_CLS|XS,Rd,Rm)
+# define CLZ(Rd,Rm) o_xx(A64_CLZ|XS,Rd,Rm)
+# define RBIT(Rd,Rm) o_xx(A64_RBIT|XS,Rd,Rm)
# define MOVN(Rd,Imm16) ox_h(A64_MOVN|XS,Rd,Imm16)
# define MOVN_16(Rd,Imm16) ox_h(A64_MOVN|XS|MOVI_LSL_16,Rd,Imm16)
# define MOVN_32(Rd,Imm16) ox_h(A64_MOVN|XS|MOVI_LSL_32,Rd,Imm16)
# define LDR(Rt,Rn,Rm) oxxx(A64_LDR,Rt,Rn,Rm)
# define LDRI(Rt,Rn,Imm12) oxxi(A64_LDRI,Rt,Rn,Imm12)
# define LDUR(Rt,Rn,Imm9) oxx9(A64_LDUR,Rt,Rn,Imm9)
+# define LDAXR(Rt,Rn) o_xx(A64_LDAXR,Rt,Rn)
+# define STLXR(Rs,Rt,Rn) oxxx(A64_STLXR,Rs,Rn,Rt)
# define STRB(Rt,Rn,Rm) oxxx(A64_STRB,Rt,Rn,Rm)
# define STRBI(Rt,Rn,Imm12) oxxi(A64_STRBI,Rt,Rn,Imm12)
# define STURB(Rt,Rn,Imm9) oxx9(A64_STURB,Rt,Rn,Imm9)
# define LDPI_PRE(Rt,Rt2,Rn,Simm7) oxxx7(A64_LDP_PRE|XS,Rt,Rt2,Rn,Simm7)
# define STPI_POS(Rt,Rt2,Rn,Simm7) oxxx7(A64_STP_POS|XS,Rt,Rt2,Rn,Simm7)
# define CSET(Rd,Cc) CSINC(Rd,XZR_REGNO,XZR_REGNO,Cc)
+# define CSEL(Rd,Rn,Rm,Cc) oxxxc(A64_CSSEL|XS,Rd,Rn,Rm,Cc)
# define B(Simm26) o26(A64_B,Simm26)
# define BL(Simm26) o26(A64_BL,Simm26)
# define BR(Rn) o_x_(A64_BR,Rn)
# define rshr_u(r0,r1,r2) LSR(r0,r1,r2)
# define rshi_u(r0,r1,i0) _rshi_u(_jit,r0,r1,i0)
static void _rshi_u(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
+# define movnr(r0,r1,r2) _movnr(_jit,r0,r1,r2)
+static void _movnr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
+# define movzr(r0,r1,r2) _movzr(_jit,r0,r1,r2)
+static void _movzr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
# define negr(r0,r1) NEG(r0,r1)
# define comr(r0,r1) MVN(r0,r1)
+# define clor(r0, r1) _clor(_jit, r0, r1) /* count leading ones, emitted by the _clor helper */
+static void _clor(jit_state_t*, jit_int32_t, jit_int32_t);
+# define clzr(r0, r1) CLZ(r0,r1) /* count leading zeros maps directly onto the CLZ instruction macro */
+static void _clzr(jit_state_t*, jit_int32_t, jit_int32_t); /* NOTE(review): clzr expands straight to CLZ, so nothing visible here calls or defines _clzr - confirm whether this prototype is needed */
+# define ctor(r0, r1) _ctor(_jit, r0, r1) /* count trailing ones, emitted by the _ctor helper */
+static void _ctor(jit_state_t*, jit_int32_t, jit_int32_t);
+# define ctzr(r0, r1) _ctzr(_jit, r0, r1) /* count trailing zeros, emitted by the _ctzr helper */
+static void _ctzr(jit_state_t*, jit_int32_t, jit_int32_t);
# define andr(r0,r1,r2) AND(r0,r1,r2)
# define andi(r0,r1,i0) _andi(_jit,r0,r1,i0)
static void _andi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
# define stxr_l(r0,r1,r2) STR(r2,r1,r0)
# define stxi_l(i0,r0,r1) _stxi_l(_jit,i0,r0,r1)
static void _stxi_l(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t);
-# if __BYTE_ORDER == __LITTLE_ENDIAN
-# define htonr_us(r0,r1) _htonr_us(_jit,r0,r1)
-static void _htonr_us(jit_state_t*,jit_int32_t,jit_int32_t);
-# define htonr_ui(r0,r1) _htonr_ui(_jit,r0,r1)
-static void _htonr_ui(jit_state_t*,jit_int32_t,jit_int32_t);
-# define htonr_ul(r0,r1) REV(r0,r1)
-# else
-# define htonr_us(r0,r1) extr_us(r0,r1)
-# define htonr_ui(r0,r1) extr_ui(r0,r1)
-# define htonr_ul(r0,r1) movr(r0,r1)
-# endif
+# define bswapr_us(r0,r1) _bswapr_us(_jit,r0,r1)
+static void _bswapr_us(jit_state_t*,jit_int32_t,jit_int32_t);
+# define bswapr_ui(r0,r1) _bswapr_ui(_jit,r0,r1)
+static void _bswapr_ui(jit_state_t*,jit_int32_t,jit_int32_t);
+# define bswapr_ul(r0,r1) REV(r0,r1)
# define extr_c(r0,r1) SXTB(r0,r1)
# define extr_uc(r0,r1) UXTB(r0,r1)
# define extr_s(r0,r1) SXTH(r0,r1)
# define extr_us(r0,r1) UXTH(r0,r1)
# define extr_i(r0,r1) SXTW(r0,r1)
# define extr_ui(r0,r1) UXTW(r0,r1)
+# define casx(r0, r1, r2, r3, i0) _casx(_jit, r0, r1, r2, r3, i0) /* common entry point for both compare-and-swap forms below */
+static void _casx(jit_state_t *_jit,jit_int32_t,jit_int32_t,
+ jit_int32_t,jit_int32_t,jit_word_t);
+#define casr(r0, r1, r2, r3) casx(r0, r1, r2, r3, 0) /* register form: r1 is passed through and the immediate argument is 0 */
+#define casi(r0, i0, r1, r2) casx(r0, _NOREG, r1, r2, i0) /* immediate form: _NOREG in the r1 slot makes _casx load i0 into a scratch register first */
# define movr(r0,r1) _movr(_jit,r0,r1)
static void _movr(jit_state_t*,jit_int32_t,jit_int32_t);
# define movi(r0,i0) _movi(_jit,r0,i0)
# define bmci(i0,r0,i1) bmxi(BCC_EQ,i0,r0,i1)
# define jmpr(r0) BR(r0)
# define jmpi(i0) _jmpi(_jit,i0)
-static void _jmpi(jit_state_t*,jit_word_t);
+static jit_word_t _jmpi(jit_state_t*,jit_word_t);
# define jmpi_p(i0) _jmpi_p(_jit,i0)
static jit_word_t _jmpi_p(jit_state_t*,jit_word_t);
# define callr(r0) BLR(r0)
# define calli(i0) _calli(_jit,i0)
-static void _calli(jit_state_t*,jit_word_t);
+static jit_word_t _calli(jit_state_t*,jit_word_t);
# define calli_p(i0) _calli_p(_jit,i0)
static jit_word_t _calli_p(jit_state_t*,jit_word_t);
# define prolog(i0) _prolog(_jit,i0)
#endif
#if CODE
+/* https://dougallj.wordpress.com/2021/10/30/bit-twiddling-optimising-aarch64-logical-immediate-encoding-and-decoding/ */
+#include "aarch64-logical-immediates.c"
static jit_int32_t
logical_immediate(jit_word_t imm)
{
- /* There are 5334 possible immediate values, but to avoid the
- * need of either too complex code or large lookup tables,
- * only check for (simply) encodable common/small values */
- switch (imm) {
- case -16: return (0xf3b);
- case -15: return (0xf3c);
- case -13: return (0xf3d);
- case -9: return (0xf3e);
- case -8: return (0xf7c);
- case -7: return (0xf7d);
- case -5: return (0xf7e);
- case -4: return (0xfbd);
- case -3: return (0xfbe);
- case -2: return (0xffe);
- case 1: return (0x000);
- case 2: return (0xfc0);
- case 3: return (0x001);
- case 4: return (0xf80);
- case 6: return (0xfc1);
- case 7: return (0x002);
- case 8: return (0xf40);
- case 12: return (0xf81);
- case 14: return (0xfc2);
- case 15: return (0x003);
- case 16: return (0xf00);
- default: return (-1);
+ jit_int32_t result = encodeLogicalImmediate64(imm); /* replaces the removed lookup switch; NOTE(review): the encoder is presumably supplied by the aarch64-logical-immediates.c include - confirm */
+ if (result != ENCODE_FAILED) { /* on ENCODE_FAILED control falls through to the -1 return below */
+ assert(isValidLogicalImmediate64(result)); /* sanity check only; assert() compiles out under NDEBUG */
+ return (result & 0xfff); /* callers receive only the low 12 bits of the encoder result */
}
+ return (-1); /* same failure value the removed switch returned from its default case */
}
static void
_o26(jit_state_t *_jit, jit_int32_t Op, jit_int32_t Simm26)
{
instr_t i;
- assert(Simm26 >= -33554432 && Simm26 <= 33554431);
+ assert(s26_p(Simm26));
assert(!(Op & ~0xfc000000));
i.w = Op;
i.imm26.b = Simm26;
}
}
+static void
+_movnr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) /* conditional move keyed on r2: emits a compare of r2 with 0 followed by a CSEL that writes r0 */
+{
+ CMPI(r2, 0); /* set the condition flags from r2 compared with zero */
+ CSEL(r0, r0, r1, CC_NE); /* r0 becomes r0 or r1 depending on CC_NE; NOTE(review): presumably "r0 = r1 when r2 != 0" - confirm against how the CC_* values are encoded */
+}
+
+static void
+_movzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) /* conditional move keyed on r2: emits a compare of r2 with 0 followed by a CSEL that writes r0 */
+{
+ CMPI(r2, 0); /* set the condition flags from r2 compared with zero */
+ CSEL(r0, r0, r1, CC_EQ); /* r0 becomes r0 or r1 depending on CC_EQ; NOTE(review): presumably "r0 = r1 when r2 == 0" - confirm against how the CC_* values are encoded */
+}
+
+static void
+_clor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) /* count leading ones of r1 into r0 */
+{
+ comr(r0, r1); /* MVN: r0 = bitwise complement of r1, turning leading ones into leading zeros */
+ clzr(r0, r0); /* CLZ: count the leading zeros of the complemented value */
+}
+
+static void
+_ctor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) /* count trailing ones of r1 into r0 */
+{
+ RBIT(r0, r1); /* reverse the bit order so trailing bits become leading bits */
+ clor(r0, r0); /* then count leading ones of the reversed value (expands to _clor) */
+}
+
+static void
+_ctzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) /* count trailing zeros of r1 into r0 */
+{
+ RBIT(r0, r1); /* reverse the bit order so trailing bits become leading bits */
+ clzr(r0, r0); /* then count leading zeros of the reversed value (expands to CLZ) */
+}
+
static void
_andi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
{
}
}
-#if __BYTE_ORDER == __LITTLE_ENDIAN
static void
-_htonr_us(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+_bswapr_us(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) /* byte-swap the low 16 bits of r1 into r0; now emitted unconditionally instead of only under the removed little-endian guard */
{
- htonr_ul(r0, r1);
+ bswapr_ul(r0, r1); /* REV reverses the whole register; the unsigned shift right by 48 that follows moves the swapped halfword down to the low bits */
rshi_u(r0, r0, 48);
}
static void
-_htonr_ui(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
+_bswapr_ui(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) /* byte-swap the low 32 bits of r1 into r0; now emitted unconditionally instead of only under the removed little-endian guard */
{
- htonr_ul(r0, r1);
+ bswapr_ul(r0, r1); /* REV reverses the whole register; the unsigned shift right by 32 that follows moves the swapped word down to the low bits */
rshi_u(r0, r0, 32);
}
-#endif
static void
_ldi_c(jit_state_t *_jit, jit_int32_t r0, jit_word_t i0)
}
}
+static void
+_casx(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+ jit_int32_t r2, jit_int32_t r3, jit_word_t i0) /* emits a compare-and-swap loop built from LDAXR/STLXR: r1 (or i0 when r1 is _NOREG) is the address operand, r2 is compared with the loaded value, r3 is handed to STLXR, r0 receives the result */
+{
+ jit_int32_t r1_reg, iscasi; /* r1_reg is only assigned and used when iscasi is set */
+ jit_word_t retry, done, jump0, jump1;
+ if ((iscasi = (r1 == _NOREG))) { /* casi form: no address register was supplied */
+ r1_reg = jit_get_reg(jit_class_gpr);
+ r1 = rn(r1_reg);
+ movi(r1, i0); /* materialize the immediate address in the scratch register */
+ }
+ /* retry: */
+ retry = _jit->pc.w; /* address of the LDAXR, target of the backward branch */
+ LDAXR(r0, r1);
+ eqr(r0, r0, r2); /* r0 = (loaded value == r2) */
+ jump0 = beqi(_jit->pc.w, r0, 0); /* beqi done r0 0 */
+ STLXR(r3, r0, r1); /* NOTE(review): the macro's parameter names (Rs,Rt,Rn) are reordered when forwarded to oxxx; the following branch tests r0, so r0 presumably receives the store status - confirm */
+ jump1 = bnei(_jit->pc.w, r0, 0); /* bnei retry r0 0 */
+ /* done: */
+ CSET(r0, CC_EQ); /* only reached by falling through the retry branch, i.e. with r0 == 0 */
+ done = _jit->pc.w; /* the "done" label is taken here, after the CSET above */
+ patch_at(jump0, done); /* the compare-failed path skips both the store and the CSET, leaving r0 == 0 */
+ patch_at(jump1, retry); /* a non-zero r0 after STLXR loops back to the LDAXR */
+ if (iscasi)
+ jit_unget_reg(r1_reg); /* release the scratch address register taken for the casi form */
+}
+
static void
_movr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
{
return (w);
}
-static void
+static jit_word_t /* now returns the code address at which the jump sequence starts */
_jmpi(jit_state_t *_jit, jit_word_t i0)
{
- jit_word_t w;
jit_int32_t reg;
- w = (i0 - _jit->pc.w) >> 2;
- if (w >= -33554432 && w <= 33554431)
- B(w);
+ jit_word_t d, w; /* d: displacement passed to B; w: pc at entry */
+ w = _jit->pc.w; /* remember where the first emitted instruction lands */
+ d = (i0 - w) >> 2; /* byte distance to the target divided by 4 */
+ if (s26_p(d)) /* displacement fits the signed 26-bit field of B */
+ B(d);
else {
reg = jit_get_reg(jit_class_gpr|jit_class_nospill);
movi(rn(reg), i0);
jmpr(rn(reg));
jit_unget_reg(reg);
}
+ return (w); /* same value for both the direct and the register-indirect form */
}
static jit_word_t
return (w);
}
-static void
+static jit_word_t /* now returns the code address at which the call sequence starts */
_calli(jit_state_t *_jit, jit_word_t i0)
{
- jit_word_t w;
jit_int32_t reg;
- w = (i0 - _jit->pc.w) >> 2;
- if (w >= -33554432 && w <= 33554431)
- BL(w);
+ jit_word_t d, w; /* d: displacement passed to BL; w: pc at entry */
+ w = _jit->pc.w; /* remember where the first emitted instruction lands */
+ d = (i0 - w) >> 2; /* byte distance to the target divided by 4 */
+ if (s26_p(d)) /* displacement fits the signed 26-bit field of BL */
+ BL(d);
else {
reg = jit_get_reg(jit_class_gpr);
movi(rn(reg), i0);
callr(rn(reg));
jit_unget_reg(reg);
}
+ return (w); /* same value for both the direct and the register-indirect form */
}
static jit_word_t
return (w);
}
-/*
- * prolog and epilog not as "optimized" as one would like, but the
- * problem of overallocating stack space to save callee save registers
- * exists on all ports, and is still a todo to use a variable
- * stack_framesize
- * value, what would cause needing to patch some calls, most likely
- * the offset of jit_arg* of stack arguments.
- */
static void
_prolog(jit_state_t *_jit, jit_node_t *node)
{
- jit_int32_t reg;
+ jit_int32_t reg, rreg, offs;
if (_jitc->function->define_frame || _jitc->function->assume_frame) {
jit_int32_t frame = -_jitc->function->frame;
+ jit_check_frame();
assert(_jitc->function->self.aoff >= frame);
if (_jitc->function->assume_frame)
return;
_jitc->function->stack = ((_jitc->function->self.alen -
/* align stack at 16 bytes */
_jitc->function->self.aoff) + 15) & -16;
- STPI_POS(FP_REGNO, LR_REGNO, SP_REGNO, -(stack_framesize >> 3));
- MOV_XSP(FP_REGNO, SP_REGNO);
-#define SPILL(L, R, O) \
- do { \
- if (jit_regset_tstbit(&_jitc->function->regset, _R##L)) { \
- if (jit_regset_tstbit(&_jitc->function->regset, _R##R)) \
- STPI(L, R, SP_REGNO, O); \
- else \
- STRI(L, SP_REGNO, O); \
- } \
- else if (jit_regset_tstbit(&_jitc->function->regset, _R##R)) \
- STRI(R, SP_REGNO, O + 1); \
- } while (0)
- SPILL(19, 20, 2);
- SPILL(21, 22, 4);
- SPILL(23, 24, 6);
- SPILL(25, 26, 8);
- SPILL(27, 28, 10);
-#undef SPILL
-#define SPILL(R, O) \
- do { \
- if (jit_regset_tstbit(&_jitc->function->regset, _V##R)) \
- stxi_d(O, SP_REGNO, R); \
- } while (0)
- SPILL( 8, 96);
- SPILL( 9, 104);
- SPILL(10, 112);
- SPILL(11, 120);
- SPILL(12, 128);
- SPILL(13, 136);
- SPILL(14, 144);
- SPILL(15, 152);
-#undef SPILL
- if (_jitc->function->stack)
+
+ if (!_jitc->function->need_frame) {
+ /* check if any callee save register needs to be saved */
+ for (reg = 0; reg < _jitc->reglen; ++reg)
+ if (jit_regset_tstbit(&_jitc->function->regset, reg) &&
+ (_rvs[reg].spec & jit_class_sav)) {
+ jit_check_frame();
+ break;
+ }
+ }
+
+ if (_jitc->function->need_frame) {
+ STPI_POS(FP_REGNO, LR_REGNO, SP_REGNO, -(jit_framesize() >> 3));
+ MOV_XSP(FP_REGNO, SP_REGNO);
+ }
+ /* callee save registers */
+ for (reg = 0, offs = 2; reg < jit_size(iregs);) {
+ if (jit_regset_tstbit(&_jitc->function->regset, iregs[reg])) {
+ for (rreg = reg + 1; rreg < jit_size(iregs); rreg++) {
+ if (jit_regset_tstbit(&_jitc->function->regset, iregs[rreg]))
+ break;
+ }
+ if (rreg < jit_size(iregs)) {
+ STPI(rn(iregs[reg]), rn(iregs[rreg]), SP_REGNO, offs);
+ offs += 2;
+ reg = rreg + 1;
+ }
+ else {
+ STRI(rn(iregs[reg]), SP_REGNO, offs);
+ ++offs;
+ /* No pair found */
+ break;
+ }
+ }
+ else
+ ++reg;
+ }
+ for (reg = 0, offs <<= 3; reg < jit_size(fregs); reg++) {
+ if (jit_regset_tstbit(&_jitc->function->regset, fregs[reg])) {
+ stxi_d(offs, SP_REGNO, rn(fregs[reg]));
+ offs += sizeof(jit_float64_t);
+ }
+ }
+
+ if (_jitc->function->stack)
subi(SP_REGNO, SP_REGNO, _jitc->function->stack);
if (_jitc->function->allocar) {
reg = jit_get_reg(jit_class_gpr);
jit_unget_reg(reg);
}
+#if !__APPLE__
if (_jitc->function->self.call & jit_call_varargs) {
/* Save gp registers in the save area, if any is a vararg */
for (reg = 8 - _jitc->function->vagp / -8;
stxi_d(_jitc->function->vaoff + offsetof(jit_va_list_t, q0) +
reg * 16 + offsetof(jit_qreg_t, l), FP_REGNO, rn(_V0 - reg));
}
+#endif
}
static void
_epilog(jit_state_t *_jit, jit_node_t *node)
{
+ jit_int32_t reg, rreg, offs; /* reg/rreg index iregs[] and fregs[]; offs is first a slot index, later a byte offset */
if (_jitc->function->assume_frame)
return;
if (_jitc->function->stack)
MOV_XSP(SP_REGNO, FP_REGNO);
-#define LOAD(L, R, O) \
- do { \
- if (jit_regset_tstbit(&_jitc->function->regset, _R##L)) { \
- if (jit_regset_tstbit(&_jitc->function->regset, _R##R)) \
- LDPI(L, R, SP_REGNO, O); \
- else \
- LDRI(L, SP_REGNO, O); \
- } \
- else if (jit_regset_tstbit(&_jitc->function->regset, _R##R)) \
- LDRI(R, SP_REGNO, O + 1); \
- } while (0)
- LOAD(19, 20, 2);
- LOAD(21, 22, 4);
- LOAD(23, 24, 6);
- LOAD(25, 26, 8);
- LOAD(27, 28, 10);
-#undef LOAD
-#define LOAD(R, O) \
- do { \
- if (jit_regset_tstbit(&_jitc->function->regset, _V##R)) \
- ldxi_d(R, SP_REGNO, O); \
- } while (0)
- LOAD( 8, 96);
- LOAD( 9, 104);
- LOAD(10, 112);
- LOAD(11, 120);
- LOAD(12, 128);
- LOAD(13, 136);
- LOAD(14, 144);
- LOAD(15, 152);
-#undef LOAD
- LDPI_PRE(FP_REGNO, LR_REGNO, SP_REGNO, stack_framesize >> 3);
+ /* callee save registers: reload only those marked in the function's regset, packed from slot 2 upward and paired into one LDPI when two are available */
+ for (reg = 0, offs = 2; reg < jit_size(iregs);) { /* no increment clause: reg is advanced inside the body */
+ if (jit_regset_tstbit(&_jitc->function->regset, iregs[reg])) {
+ for (rreg = reg + 1; rreg < jit_size(iregs); rreg++) { /* look for the next used register to pair with iregs[reg] */
+ if (jit_regset_tstbit(&_jitc->function->regset, iregs[rreg]))
+ break;
+ }
+ if (rreg < jit_size(iregs)) {
+ LDPI(rn(iregs[reg]), rn(iregs[rreg]), SP_REGNO, offs); /* a partner exists: restore both with one pair load */
+ offs += 2; /* a pair consumes two slots */
+ reg = rreg + 1; /* resume scanning after the partner */
+ }
+ else {
+ LDRI(rn(iregs[reg]), SP_REGNO, offs); /* unpaired register: single load */
+ ++offs;
+ /* No pair found: no used register follows iregs[reg], so the scan is finished */
+ break;
+ }
+ }
+ else
+ ++reg;
+ }
+ for (reg = 0, offs <<= 3; reg < jit_size(fregs); reg++) { /* offs is multiplied by 8 here, turning the slot index into a byte offset for ldxi_d */
+ if (jit_regset_tstbit(&_jitc->function->regset, fregs[reg])) {
+ ldxi_d(rn(fregs[reg]), SP_REGNO, offs);
+ offs += sizeof(jit_float64_t); /* one double-sized slot per restored register */
+ }
+ }
+
+ if (_jitc->function->need_frame) /* FP/LR are reloaded only when need_frame is set, replacing the former unconditional reload */
+ LDPI_PRE(FP_REGNO, LR_REGNO, SP_REGNO, jit_framesize() >> 3); /* jit_framesize() replaces the removed fixed stack_framesize constant */
RET();
}
static void
_vastart(jit_state_t *_jit, jit_int32_t r0)
{
+#if !__APPLE__ /* the existing code path below is now compiled only when __APPLE__ is not set */
jit_int32_t reg;
assert(_jitc->function->self.call & jit_call_varargs);
reg = jit_get_reg(jit_class_gpr);
/* Initialize stack pointer to the first stack argument. */
- addi(rn(reg), FP_REGNO, _jitc->function->self.size);
+ addi(rn(reg), FP_REGNO, jit_selfsize()); /* jit_selfsize() replaces the direct read of self.size */
stxi(offsetof(jit_va_list_t, stack), r0, rn(reg));
/* Initialize gp top pointer to the first stack argument. */
stxi_i(offsetof(jit_va_list_t, fpoff), r0, rn(reg));
jit_unget_reg(reg);
+#else /* __APPLE__ path: no jit_va_list_t fields are written */
+ assert(_jitc->function->self.call & jit_call_varargs);
+ addi(r0, FP_REGNO, jit_selfsize()); /* r0 itself is set to FP + jit_selfsize(); NOTE(review): presumably the address of the first stack argument, matching the non-Apple comment above - confirm */
+#endif
}
static void
_vaarg(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
{
+#if !__APPLE__
jit_word_t ge_code;
jit_word_t lt_code;
jit_int32_t rg0, rg1;
jit_unget_reg(rg1);
/* Jump over overflow code. */
- lt_code = jmpi_p(_jit->pc.w);
+ lt_code = jmpi(_jit->pc.w);
/* Where to land if argument is in overflow area. */
patch_at(ge_code, _jit->pc.w);
patch_at(lt_code, _jit->pc.w);
jit_unget_reg(rg0);
+#else
+ assert(_jitc->function->self.call & jit_call_varargs);
+ ldr(r0, r1);
+ addi(r1, r1, sizeof(jit_word_t));
+#endif
}
static void
ffc = i.w & 0xffc00000;
if (fc == A64_B || fc == A64_BL) {
d = (label - instr) >> 2;
- assert(d >= -33554432 && d <= 33554431);
+ assert(s26_p(d));
i.imm26.b = d;
u.i[0] = i.w;
}