X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=deps%2Flightning%2Flib%2Fjit_aarch64-cpu.c;h=b0bc26fcba80045e84319fc5940dc0026c6e17d3;hb=d481fb64f2aac7a36532142cda11fa43f5ca792f;hp=7d2a99d699f7e047dbc9c131535d3c03446bb29a;hpb=437b1e617808119c3a24a72c77cd2fa86a5d3220;p=pcsx_rearmed.git diff --git a/deps/lightning/lib/jit_aarch64-cpu.c b/deps/lightning/lib/jit_aarch64-cpu.c index 7d2a99d6..b0bc26fc 100644 --- a/deps/lightning/lib/jit_aarch64-cpu.c +++ b/deps/lightning/lib/jit_aarch64-cpu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013-2019 Free Software Foundation, Inc. + * Copyright (C) 2013-2023 Free Software Foundation, Inc. * * This file is part of GNU lightning. * @@ -210,11 +210,15 @@ typedef union { jit_int32_t w; # undef ui } instr_t; -# define stack_framesize 160 +# define s26_p(d) ((d) >= -33554432 && (d) <= 33554431) # define ii(i) *_jit->pc.ui++ = i # define ldr(r0,r1) ldr_l(r0,r1) +# define ldi(r0,i0) ldi_l(r0,i0) # define ldxr(r0,r1,r2) ldxr_l(r0,r1,r2) # define ldxi(r0,r1,i0) ldxi_l(r0,r1,i0) +# define str(r0,r1) str_l(r0,r1) +# define sti(i0,r0) sti_l(i0,r0) +# define stxr(r0,r1,r2) stxr_l(r0,r1,r2) # define stxi(i0,r0,r1) stxi_l(i0,r0,r1) # define FP_REGNO 0x1d # define LR_REGNO 0x1e @@ -278,9 +282,11 @@ typedef union { # define A64_NEG 0x4b0003e0 # define A64_SUBS 0x6b000000 # define A64_CMP 0x6b00001f +# define A64_BFM 0x33400000 # define A64_SBFM 0x93400000 +# define A64_SBFX 0x13400000 # define A64_UBFM 0x53400000 -# define A64_UBFX 0x53000000 +# define A64_UBFX 0x53400000 # define A64_B 0x14000000 # define A64_BL 0x94000000 # define A64_BR 0xd61f0000 @@ -297,6 +303,8 @@ typedef union { # define A64_LSL 0x1ac02000 # define A64_LSR 0x1ac02400 # define A64_ASR 0x1ac02800 +# define A64_RORV 0x1ac02c00 +# define A64_EXTR 0x13800000 # define A64_MUL 0x1b007c00 # define A64_SMULL 0x9b207c00 # define A64_SMULH 0x9b407c00 @@ -318,6 +326,8 @@ typedef union { # define A64_LDRSB 0x38e06800 # define A64_STR 0xf8206800 # define A64_LDR 0xf8606800 +# define A64_LDAXR 0xc85ffc00 +# define A64_STLXR 0xc800fc00 # define A64_STRH 0x78206800 # define A64_LDRH 0x78606800 # define A64_LDRSH 0x78a06800 @@ -347,15 +357,20 @@ typedef union { # define A64_ORR 0x2a000000 # define A64_MOV 0x2a0003e0 /* AKA orr Rd,xzr,Rm */ # define A64_MVN 0x2a2003e0 +# define A64_CLS 0x5ac01400 +# define A64_CLZ 0x5ac01000 +# define A64_RBIT 0x5ac00000 # define A64_UXTW 0x2a0003e0 /* AKA MOV */ # define A64_EOR 0x4a000000 # define A64_ANDS 0x6a000000 # define A64_MOVN 0x12800000 # define A64_MOVZ 0x52800000 # define A64_MOVK 0x72800000 +# define BFM(Rd,Rn,ImmR,ImmS) oxxrs(A64_BFM|XS,Rd,Rn,ImmR,ImmS) # define SBFM(Rd,Rn,ImmR,ImmS) oxxrs(A64_SBFM|XS,Rd,Rn,ImmR,ImmS) # define UBFM(Rd,Rn,ImmR,ImmS) oxxrs(A64_UBFM|XS,Rd,Rn,ImmR,ImmS) -# define UBFX(Rd,Rn,ImmR,ImmS) oxxrs(A64_UBFX,Rd,Rn,ImmR,ImmS) +# define SBFX(Rd,Rn,ImmR,ImmS) oxxrs(A64_SBFX|XS,Rd,Rn,ImmR,ImmS) +# define UBFX(Rd,Rn,ImmR,ImmS) oxxrs(A64_UBFX|XS,Rd,Rn,ImmR,ImmS) # define CMP(Rn,Rm) oxx_(A64_CMP|XS,Rn,Rm) # define CMPI(Rn,Imm12) oxxi(A64_SUBSI|XS,XZR_REGNO,Rn,Imm12) # define CMPI_12(Rn,Imm12) oxxi(A64_SUBSI|XS|LSL_12,XZR_REGNO,Rn,Imm12) @@ -368,6 +383,9 @@ typedef union { # define MOV(Rd,Rm) ox_x(A64_MOV|XS,Rd,Rm) # define MVN(Rd,Rm) ox_x(A64_MVN|XS,Rd,Rm) # define NEG(Rd,Rm) ox_x(A64_NEG|XS,Rd,Rm) +# define CLS(Rd,Rm) o_xx(A64_CLS|XS,Rd,Rm) +# define CLZ(Rd,Rm) o_xx(A64_CLZ|XS,Rd,Rm) +# define RBIT(Rd,Rm) o_xx(A64_RBIT|XS,Rd,Rm) # define MOVN(Rd,Imm16) ox_h(A64_MOVN|XS,Rd,Imm16) # define MOVN_16(Rd,Imm16) ox_h(A64_MOVN|XS|MOVI_LSL_16,Rd,Imm16) # define MOVN_32(Rd,Imm16) ox_h(A64_MOVN|XS|MOVI_LSL_32,Rd,Imm16) @@ -408,6 +426,9 @@ typedef union { # define ASRI(r0,r1,i0) SBFM(r0,r1,i0,63) # define LSR(Rd,Rn,Rm) oxxx(A64_LSR|XS,Rd,Rn,Rm) # define LSRI(r0,r1,i0) UBFM(r0,r1,i0,63) +# define RORV(Rd,Rn,Rm) oxxx(A64_RORV|XS,Rd,Rn,Rm) +# define EXTR(Rd,Rn,Rm,Im) oxxx6(A64_EXTR|XS|DS,Rm,Im,Rn,Rd) +# define ROR(Rd,Rn,Rm,Im) EXTR(Rd,Rn,Rm,Im) # define AND(Rd,Rn,Rm) oxxx(A64_AND|XS,Rd,Rn,Rm) /* actually should use oxxrs but logical_immediate returns proper encoding */ # define ANDI(Rd,Rn,Imm12) oxxi(A64_ANDI|XS,Rd,Rn,Imm12) @@ -420,8 +441,8 @@ typedef union { # define SXTB(Rd,Rn) SBFM(Rd,Rn,0,7) # define SXTH(Rd,Rn) SBFM(Rd,Rn,0,15) # define SXTW(Rd,Rn) SBFM(Rd,Rn,0,31) -# define UXTB(Rd,Rn) UBFX(Rd,Rn,0,7) -# define UXTH(Rd,Rn) UBFX(Rd,Rn,0,15) +# define UXTB(Rd,Rn) oxxrs(A64_UBFX & ~DS,Rd,Rn,0,7) +# define UXTH(Rd,Rn) oxxrs(A64_UBFX & ~DS,Rd,Rn,0,15) # define UXTW(Rd,Rm) ox_x(A64_UXTW,Rd,Rm) # define REV(Rd,Rn) o_xx(A64_REV,Rd,Rn) # define LDRSB(Rt,Rn,Rm) oxxx(A64_LDRSB,Rt,Rn,Rm) @@ -445,6 +466,8 @@ typedef union { # define LDR(Rt,Rn,Rm) oxxx(A64_LDR,Rt,Rn,Rm) # define LDRI(Rt,Rn,Imm12) oxxi(A64_LDRI,Rt,Rn,Imm12) # define LDUR(Rt,Rn,Imm9) oxx9(A64_LDUR,Rt,Rn,Imm9) +# define LDAXR(Rt,Rn) o_xx(A64_LDAXR,Rt,Rn) +# define STLXR(Rs,Rt,Rn) oxxx(A64_STLXR,Rs,Rn,Rt) # define STRB(Rt,Rn,Rm) oxxx(A64_STRB,Rt,Rn,Rm) # define STRBI(Rt,Rn,Imm12) oxxi(A64_STRBI,Rt,Rn,Imm12) # define STURB(Rt,Rn,Imm9) oxx9(A64_STURB,Rt,Rn,Imm9) @@ -504,6 +527,9 @@ static void _oxxxc(jit_state_t*,jit_int32_t,jit_int32_t, # define oxxx7(Op,Rt,Rt2,Rn,Simm7) _oxxx7(_jit,Op,Rt,Rt2,Rn,Simm7) static void _oxxx7(jit_state_t*,jit_int32_t, jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); +# define oxxx6(Op,Rm,Imm6,Rn,Rd) _oxxx6(_jit,Op,Rm,Imm6,Rn,Rd) +static void _oxxx6(jit_state_t*,jit_int32_t, + jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); # define nop(i0) _nop(_jit,i0) static void _nop(jit_state_t*,jit_int32_t); # define addr(r0,r1,r2) ADD(r0,r1,r2) @@ -529,6 +555,12 @@ static void _rsbi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t); # define mulr(r0,r1,r2) MUL(r0,r1,r2) # define muli(r0,r1,i0) _muli(_jit,r0,r1,i0) static void _muli(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t); +# define hmulr(r0,r1,r2) SMULH(r0,r1,r2) +# define hmuli(r0,r1,i0) _hmuli(_jit,r0,r1,i0) +static void _hmuli(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t); +# define hmulr_u(r0,r1,r2) UMULH(r0,r1,r2) +# define hmuli_u(r0,r1,i0) _hmuli_u(_jit,r0,r1,i0) +static void _hmuli_u(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t); # define qmulr(r0,r1,r2,r3) _qmulr(_jit,r0,r1,r2,r3) static void _qmulr(jit_state_t*,jit_int32_t, jit_int32_t,jit_int32_t,jit_int32_t); @@ -574,12 +606,46 @@ static void _rshi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t); # define rshr_u(r0,r1,r2) LSR(r0,r1,r2) # define rshi_u(r0,r1,i0) _rshi_u(_jit,r0,r1,i0) static void _rshi_u(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t); +# define qlshr(r0,r1,r2,r3) xlshr(1,r0,r1,r2,r3) +# define qlshr_u(r0, r1, r2, r3) xlshr(0, r0, r1, r2, r3) +# define xlshr(s,r0,r1,r2,r3) _xlshr(_jit,s,r0,r1,r2,r3) +static void +_xlshr(jit_state_t*,jit_bool_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); +# define qlshi(r0, r1, r2, i0) xlshi(1, r0, r1, r2, i0) +# define qlshi_u(r0, r1, r2, i0) xlshi(0, r0, r1, r2, i0) +# define xlshi(s, r0, r1, r2, i0) _xlshi(_jit, s, r0, r1, r2, i0) +static void +_xlshi(jit_state_t*,jit_bool_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_word_t); +# define qrshr(r0, r1, r2, r3) xrshr(1, r0, r1, r2, r3) +# define qrshr_u(r0, r1, r2, r3) xrshr(0, r0, r1, r2, r3) +# define xrshr(s, r0, r1, r2, r3) _xrshr(_jit, s, r0, r1, r2, r3) +static void +_xrshr(jit_state_t*,jit_bool_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_int32_t); +# define qrshi(r0, r1, r2, i0) xrshi(1, r0, r1, r2, i0) +# define qrshi_u(r0, r1, r2, i0) xrshi(0, r0, r1, r2, i0) +# define xrshi(s, r0, r1, r2, i0) _xrshi(_jit, s, r0, r1, r2, i0) +static void +_xrshi(jit_state_t*,jit_bool_t,jit_int32_t,jit_int32_t,jit_int32_t,jit_word_t); +# define lrotr(r0,r1,r2) _lrotr(_jit,r0,r1,r2) +static void _lrotr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t); +# define lroti(r0,r1,i0) rroti(r0,r1,64-i0) +# define rrotr(r0,r1,r2) RORV(r0,r1,r2) +# define rroti(r0,r1,i0) ROR(r0,r1,r1,i0) # define movnr(r0,r1,r2) _movnr(_jit,r0,r1,r2) static void _movnr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t); # define movzr(r0,r1,r2) _movzr(_jit,r0,r1,r2) static void _movzr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t); # define negr(r0,r1) NEG(r0,r1) # define comr(r0,r1) MVN(r0,r1) +# define clor(r0, r1) _clor(_jit, r0, r1) +static void _clor(jit_state_t*, jit_int32_t, jit_int32_t); +# define clzr(r0, r1) CLZ(r0,r1) +static void _clzr(jit_state_t*, jit_int32_t, jit_int32_t); +# define ctor(r0, r1) _ctor(_jit, r0, r1) +static void _ctor(jit_state_t*, jit_int32_t, jit_int32_t); +# define ctzr(r0, r1) _ctzr(_jit, r0, r1) +static void _ctzr(jit_state_t*, jit_int32_t, jit_int32_t); +# define rbitr(r0, r1) RBIT(r0, r1) # define andr(r0,r1,r2) AND(r0,r1,r2) # define andi(r0,r1,i0) _andi(_jit,r0,r1,i0) static void _andi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t); @@ -639,6 +705,10 @@ static void _ldxi_ui(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t); # define ldxr_l(r0,r1,r2) LDR(r0,r1,r2) # define ldxi_l(r0,r1,i0) _ldxi_l(_jit,r0,r1,i0) static void _ldxi_l(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t); +# define unldr(r0, r1, i0) generic_unldr(r0, r1, i0) +# define unldi(r0, i0, i1) generic_unldi(r0, i0, i1) +# define unldr_u(r0, r1, i0) generic_unldr_u(r0, r1, i0) +# define unldi_u(r0, i0, i1) generic_unldi_u(r0, i0, i1) # define str_c(r0,r1) STRBI(r1,r0,0) # define sti_c(i0,r0) _sti_c(_jit,i0,r0) static void _sti_c(jit_state_t*,jit_word_t,jit_int32_t); @@ -663,17 +733,30 @@ static void _stxi_i(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t); # define stxr_l(r0,r1,r2) STR(r2,r1,r0) # define stxi_l(i0,r0,r1) _stxi_l(_jit,i0,r0,r1) static void _stxi_l(jit_state_t*,jit_word_t,jit_int32_t,jit_int32_t); +# define unstr(r0, r1, i0) generic_unstr(r0, r1, i0) +# define unsti(i0, r0, i1) generic_unsti(i0, r0, i1) # define bswapr_us(r0,r1) _bswapr_us(_jit,r0,r1) static void _bswapr_us(jit_state_t*,jit_int32_t,jit_int32_t); # define bswapr_ui(r0,r1) _bswapr_ui(_jit,r0,r1) static void _bswapr_ui(jit_state_t*,jit_int32_t,jit_int32_t); # define bswapr_ul(r0,r1) REV(r0,r1) +#define extr(r0,r1,i0,i1) _extr(_jit,r0,r1,i0,i1) +static void _extr(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t,jit_word_t); +#define extr_u(r0,r1,i0,i1) _extr_u(_jit,r0,r1,i0,i1) +static void _extr_u(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t,jit_word_t); +#define depr(r0,r1,i0,i1) _depr(_jit,r0,r1,i0,i1) +static void _depr(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t,jit_word_t); # define extr_c(r0,r1) SXTB(r0,r1) # define extr_uc(r0,r1) UXTB(r0,r1) # define extr_s(r0,r1) SXTH(r0,r1) # define extr_us(r0,r1) UXTH(r0,r1) # define extr_i(r0,r1) SXTW(r0,r1) # define extr_ui(r0,r1) UXTW(r0,r1) +# define casx(r0, r1, r2, r3, i0) _casx(_jit, r0, r1, r2, r3, i0) +static void _casx(jit_state_t *_jit,jit_int32_t,jit_int32_t, + jit_int32_t,jit_int32_t,jit_word_t); +#define casr(r0, r1, r2, r3) casx(r0, r1, r2, r3, 0) +#define casi(r0, i0, r1, r2) casx(r0, _NOREG, r1, r2, i0) # define movr(r0,r1) _movr(_jit,r0,r1) static void _movr(jit_state_t*,jit_int32_t,jit_int32_t); # define movi(r0,i0) _movi(_jit,r0,i0) @@ -772,12 +855,12 @@ _bmxi(jit_state_t*,jit_int32_t,jit_word_t,jit_int32_t,jit_word_t); # define bmci(i0,r0,i1) bmxi(BCC_EQ,i0,r0,i1) # define jmpr(r0) BR(r0) # define jmpi(i0) _jmpi(_jit,i0) -static void _jmpi(jit_state_t*,jit_word_t); +static jit_word_t _jmpi(jit_state_t*,jit_word_t); # define jmpi_p(i0) _jmpi_p(_jit,i0) static jit_word_t _jmpi_p(jit_state_t*,jit_word_t); # define callr(r0) BLR(r0) # define calli(i0) _calli(_jit,i0) -static void _calli(jit_state_t*,jit_word_t); +static jit_word_t _calli(jit_state_t*,jit_word_t); # define calli_p(i0) _calli_p(_jit,i0) static jit_word_t _calli_p(jit_state_t*,jit_word_t); # define prolog(i0) _prolog(_jit,i0) @@ -793,36 +876,17 @@ static void _patch_at(jit_state_t*,jit_word_t,jit_word_t); #endif #if CODE +/* https://dougallj.wordpress.com/2021/10/30/bit-twiddling-optimising-aarch64-logical-immediate-encoding-and-decoding/ */ +#include "aarch64-logical-immediates.c" static jit_int32_t logical_immediate(jit_word_t imm) { - /* There are 5334 possible immediate values, but to avoid the - * need of either too complex code or large lookup tables, - * only check for (simply) encodable common/small values */ - switch (imm) { - case -16: return (0xf3b); - case -15: return (0xf3c); - case -13: return (0xf3d); - case -9: return (0xf3e); - case -8: return (0xf7c); - case -7: return (0xf7d); - case -5: return (0xf7e); - case -4: return (0xfbd); - case -3: return (0xfbe); - case -2: return (0xffe); - case 1: return (0x000); - case 2: return (0xfc0); - case 3: return (0x001); - case 4: return (0xf80); - case 6: return (0xfc1); - case 7: return (0x002); - case 8: return (0xf40); - case 12: return (0xf81); - case 14: return (0xfc2); - case 15: return (0x003); - case 16: return (0xf00); - default: return (-1); + jit_int32_t result = encodeLogicalImmediate64(imm); + if (result != ENCODE_FAILED) { + assert(isValidLogicalImmediate64(result)); + return (result & 0xfff); } + return (-1); } static void @@ -903,7 +967,7 @@ static void _o26(jit_state_t *_jit, jit_int32_t Op, jit_int32_t Simm26) { instr_t i; - assert(Simm26 >= -33554432 && Simm26 <= 33554431); + assert(s26_p(Simm26)); assert(!(Op & ~0xfc000000)); i.w = Op; i.imm26.b = Simm26; @@ -1027,6 +1091,24 @@ _oxxx7(jit_state_t *_jit, jit_int32_t Op, ii(i.w); } +static void +_oxxx6(jit_state_t *_jit, jit_int32_t Op, + jit_int32_t Rm, jit_int32_t Imm6, jit_int32_t Rn, jit_int32_t Rd) +{ + instr_t i; + assert(!(Rm & ~0x1f)); + assert(!(Rn & ~0x1f)); + assert(!(Rd & ~0x1f)); + assert(Imm6 >= 0 && Imm6 <= 63); + assert(!(Op & ~0xffe0fc00)); + i.w = Op; + i.Rm.b = Rm; + i.imm6.b = Imm6; + i.Rn.b = Rn; + i.Rd.b = Rd; + ii(i.w); +} + static void _nop(jit_state_t *_jit, jit_int32_t i0) { @@ -1152,6 +1234,26 @@ _muli(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) jit_unget_reg(reg); } +static void +_hmuli(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) +{ + jit_int32_t reg; + reg = jit_get_reg(jit_class_gpr); + movi(rn(reg), i0); + hmulr(r0, r1, rn(reg)); + jit_unget_reg(reg); +} + +static void +_hmuli_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) +{ + jit_int32_t reg; + reg = jit_get_reg(jit_class_gpr); + movi(rn(reg), i0); + hmulr_u(r0, r1, rn(reg)); + jit_unget_reg(reg); +} + static void _qmulr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) @@ -1375,6 +1477,194 @@ _rshi_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) } } +static void +_xlshr(jit_state_t *_jit, jit_bool_t sign, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + jit_bool_t branch; + jit_word_t over, zero, done, done_over; + jit_int32_t t0, s0, t1, s1, t2, s2, t3, s3; + s0 = jit_get_reg(jit_class_gpr); + t0 = rn(s0); + if (r0 == r2 || r1 == r2) { + s2 = jit_get_reg(jit_class_gpr); + t2 = rn(s2); + movr(t2, r2); + } + else + t2 = r2; + if (r0 == r3 || r1 == r3) { + s3 = jit_get_reg(jit_class_gpr); + t3 = rn(s3); + movr(t3, r3); + } + else + t3 = r3; + if ((s1 = jit_get_reg(jit_class_gpr|jit_class_nospill|jit_class_chk))) { + t1 = rn(s1); + branch = 0; + } + else + branch = 1; + rsbi(t0, t3, __WORDSIZE); + lshr(r0, t2, t3); + if (sign) + rshr(r1, t2, t0); + else + rshr_u(r1, t2, t0); + if (branch) { + zero = beqi(_jit->pc.w, t3, 0); + over = beqi(_jit->pc.w, t3, __WORDSIZE); + done = jmpi(_jit->pc.w); + patch_at(over, _jit->pc.w); + /* overflow */ + movi(r0, 0); + done_over = jmpi(_jit->pc.w); + /* zero */ + patch_at(zero, _jit->pc.w); + if (sign) + rshi(r1, t2, __WORDSIZE - 1); + else + movi(r1, 0); + patch_at(done, _jit->pc.w); + patch_at(done_over, _jit->pc.w); + } + else { + if (sign) + rshi(t0, t2, __WORDSIZE - 1); + else + movi(t0, 0); + /* zero? */ + movzr(r1, t0, t3); + /* Branchless but 4 bytes longer than branching fallback */ + if (sign) + movi(t0, 0); + /* overflow? */ + eqi(t1, t3, __WORDSIZE); + movnr(r0, t0, t1); + jit_unget_reg(s1); + } + jit_unget_reg(s0); + if (t2 != r2) + jit_unget_reg(s2); + if (t3 != r3) + jit_unget_reg(s3); +} + +static void +_xlshi(jit_state_t *_jit, jit_bool_t sign, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_word_t i0) +{ + if (i0 == 0) { + movr(r0, r2); + if (sign) + rshi(r1, r2, __WORDSIZE - 1); + else + movi(r1, 0); + } + else if (i0 == __WORDSIZE) { + movr(r1, r2); + movi(r0, 0); + } + else { + assert((jit_uword_t)i0 <= __WORDSIZE); + if (sign) + rshi(r1, r2, __WORDSIZE - i0); + else + rshi_u(r1, r2, __WORDSIZE - i0); + lshi(r0, r2, i0); + } +} + +static void +_xrshr(jit_state_t *_jit, jit_bool_t sign, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_int32_t r3) +{ + jit_int32_t t0, s0, t2, s2, t3, s3; + s0 = jit_get_reg(jit_class_gpr); + t0 = rn(s0); + if (r0 == r2 || r1 == r2) { + s2 = jit_get_reg(jit_class_gpr); + t2 = rn(s2); + movr(t2, r2); + } + else + t2 = r2; + if (r0 == r3 || r1 == r3) { + s3 = jit_get_reg(jit_class_gpr); + t3 = rn(s3); + movr(t3, r3); + } + else + t3 = r3; + + if (sign) { + /* underflow? */ + eqi(t0, t3, __WORDSIZE); + subr(t0, t3, t0); + rshr(r0, t2, t0); + } + else { + /* underflow? */ + nei(t0, t3, __WORDSIZE); + rshr_u(r0, t2, t3); + movzr(r0, t0, t0); + } + + rsbi(t0, t3, __WORDSIZE); + lshr(r1, t2, t0); + + /* zero? */ + movzr(r1, t3, t3); + + jit_unget_reg(s0); + if (t2 != r2) + jit_unget_reg(s2); + if (t3 != r3) + jit_unget_reg(s3); +} + +static void +_xrshi(jit_state_t *_jit, jit_bool_t sign, + jit_int32_t r0, jit_int32_t r1, jit_int32_t r2, jit_word_t i0) +{ + if (i0 == 0) { + movr(r0, r2); + movi(r1, 0); + } + else if (i0 == __WORDSIZE) { + movr(r1, r2); + if (sign) + rshi(r0, r2, __WORDSIZE - 1); + else + movi(r0, 0); + } + else { + assert((jit_uword_t)i0 <= __WORDSIZE); + lshi(r1, r2, __WORDSIZE - i0); + if (sign) + rshi(r0, r2, i0); + else + rshi_u(r0, r2, i0); + } +} + +static void +_lrotr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) +{ + jit_int32_t reg; + if (r0 != r1 && r0 != r2) { + rsbi(r0, r2, 64); + rrotr(r0, r1, r0); + } + else { + reg = jit_get_reg(jit_class_gpr); + rsbi(rn(reg), r2, 64); + rrotr(r0, r1, rn(reg)); + jit_unget_reg(reg); + } +} + static void _movnr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) { @@ -1389,6 +1679,74 @@ _movzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2) CSEL(r0, r0, r1, CC_EQ); } +static void +_extr(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_word_t i0, jit_word_t i1) +{ + assert(i0 >= 0 && i1 >= 1 && i0 + i1 <= __WORDSIZE); + if ( i1 == __WORDSIZE) + movr(r0, r1); + else { +# if __BYTE_ORDER == __BIG_ENDIAN + i0 = __WORDSIZE - (i0 + i1); +# endif + SBFX(r0, r1, i0, (i0 + i1) - 1); + } +} + +static void +_extr_u(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_word_t i0, jit_word_t i1) +{ + assert(i0 >= 0 && i1 >= 1 && i0 + i1 <= __WORDSIZE); + if (i1 == __WORDSIZE) + movr(r0, r1); + else { +# if __BYTE_ORDER == __BIG_ENDIAN + i0 = __WORDSIZE - (i0 + i1); +# endif + UBFX(r0, r1, i0, (i0 + i1) - 1); + } +} + +static void +_depr(jit_state_t *_jit, + jit_int32_t r0, jit_int32_t r1, jit_word_t i0, jit_word_t i1) +{ + jit_int32_t t0; + jit_word_t mask; + assert(i0 >= 0 && i1 >= 1 && i0 + i1 <= __WORDSIZE); + if (i1 == __WORDSIZE) + movr(r0, r1); + else { +# if __BYTE_ORDER == __BIG_ENDIAN + i0 = __WORDSIZE - (i0 + i1); +# endif + BFM(r0, r1, -i0 & 63, i1 - 1); + } +} + +static void +_clor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + comr(r0, r1); + clzr(r0, r0); +} + +static void +_ctor(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + RBIT(r0, r1); + clor(r0, r0); +} + +static void +_ctzr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) +{ + RBIT(r0, r1); + clzr(r0, r0); +} + static void _andi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0) { @@ -1826,6 +2184,33 @@ _stxi_l(jit_state_t *_jit, jit_word_t i0, jit_int32_t r0, jit_int32_t r1) } } +static void +_casx(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, + jit_int32_t r2, jit_int32_t r3, jit_word_t i0) +{ + jit_int32_t r1_reg, iscasi; + jit_word_t retry, done, jump0, jump1; + if ((iscasi = (r1 == _NOREG))) { + r1_reg = jit_get_reg(jit_class_gpr); + r1 = rn(r1_reg); + movi(r1, i0); + } + /* retry: */ + retry = _jit->pc.w; + LDAXR(r0, r1); + eqr(r0, r0, r2); + jump0 = beqi(_jit->pc.w, r0, 0); /* beqi done r0 0 */ + STLXR(r3, r0, r1); + jump1 = bnei(_jit->pc.w, r0, 0); /* bnei retry r0 0 */ + /* done: */ + CSET(r0, CC_EQ); + done = _jit->pc.w; + patch_at(jump0, done); + patch_at(jump1, retry); + if (iscasi) + jit_unget_reg(r1_reg); +} + static void _movr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) { @@ -2130,20 +2515,22 @@ _bmxi(jit_state_t *_jit, jit_int32_t cc, return (w); } -static void +static jit_word_t _jmpi(jit_state_t *_jit, jit_word_t i0) { - jit_word_t w; jit_int32_t reg; - w = (i0 - _jit->pc.w) >> 2; - if (w >= -33554432 && w <= 33554431) - B(w); + jit_word_t d, w; + w = _jit->pc.w; + d = (i0 - w) >> 2; + if (s26_p(d)) + B(d); else { reg = jit_get_reg(jit_class_gpr|jit_class_nospill); movi(rn(reg), i0); jmpr(rn(reg)); jit_unget_reg(reg); } + return (w); } static jit_word_t @@ -2158,20 +2545,22 @@ _jmpi_p(jit_state_t *_jit, jit_word_t i0) return (w); } -static void +static jit_word_t _calli(jit_state_t *_jit, jit_word_t i0) { - jit_word_t w; jit_int32_t reg; - w = (i0 - _jit->pc.w) >> 2; - if (w >= -33554432 && w <= 33554431) - BL(w); + jit_word_t d, w; + w = _jit->pc.w; + d = (i0 - w) >> 2; + if (s26_p(d)) + BL(d); else { reg = jit_get_reg(jit_class_gpr); movi(rn(reg), i0); callr(rn(reg)); jit_unget_reg(reg); } + return (w); } static jit_word_t @@ -2186,20 +2575,13 @@ _calli_p(jit_state_t *_jit, jit_word_t i0) return (w); } -/* - * prolog and epilog not as "optimized" as one would like, but the - * problem of overallocating stack space to save callee save registers - * exists on all ports, and is still a todo to use a variable - * stack_framesize - * value, what would cause needing to patch some calls, most likely - * the offset of jit_arg* of stack arguments. - */ static void _prolog(jit_state_t *_jit, jit_node_t *node) { - jit_int32_t reg; + jit_int32_t reg, rreg, offs; if (_jitc->function->define_frame || _jitc->function->assume_frame) { jit_int32_t frame = -_jitc->function->frame; + jit_check_frame(); assert(_jitc->function->self.aoff >= frame); if (_jitc->function->assume_frame) return; @@ -2210,40 +2592,51 @@ _prolog(jit_state_t *_jit, jit_node_t *node) _jitc->function->stack = ((_jitc->function->self.alen - /* align stack at 16 bytes */ _jitc->function->self.aoff) + 15) & -16; - STPI_POS(FP_REGNO, LR_REGNO, SP_REGNO, -(stack_framesize >> 3)); - MOV_XSP(FP_REGNO, SP_REGNO); -#define SPILL(L, R, O) \ - do { \ - if (jit_regset_tstbit(&_jitc->function->regset, _R##L)) { \ - if (jit_regset_tstbit(&_jitc->function->regset, _R##R)) \ - STPI(L, R, SP_REGNO, O); \ - else \ - STRI(L, SP_REGNO, O); \ - } \ - else if (jit_regset_tstbit(&_jitc->function->regset, _R##R)) \ - STRI(R, SP_REGNO, O + 1); \ - } while (0) - SPILL(19, 20, 2); - SPILL(21, 22, 4); - SPILL(23, 24, 6); - SPILL(25, 26, 8); - SPILL(27, 28, 10); -#undef SPILL -#define SPILL(R, O) \ - do { \ - if (jit_regset_tstbit(&_jitc->function->regset, _V##R)) \ - stxi_d(O, SP_REGNO, R); \ - } while (0) - SPILL( 8, 96); - SPILL( 9, 104); - SPILL(10, 112); - SPILL(11, 120); - SPILL(12, 128); - SPILL(13, 136); - SPILL(14, 144); - SPILL(15, 152); -#undef SPILL - if (_jitc->function->stack) + + if (!_jitc->function->need_frame) { + /* check if any callee save register needs to be saved */ + for (reg = 0; reg < _jitc->reglen; ++reg) + if (jit_regset_tstbit(&_jitc->function->regset, reg) && + (_rvs[reg].spec & jit_class_sav)) { + jit_check_frame(); + break; + } + } + + if (_jitc->function->need_frame) { + STPI_POS(FP_REGNO, LR_REGNO, SP_REGNO, -(jit_framesize() >> 3)); + MOV_XSP(FP_REGNO, SP_REGNO); + } + /* callee save registers */ + for (reg = 0, offs = 2; reg < jit_size(iregs);) { + if (jit_regset_tstbit(&_jitc->function->regset, iregs[reg])) { + for (rreg = reg + 1; rreg < jit_size(iregs); rreg++) { + if (jit_regset_tstbit(&_jitc->function->regset, iregs[rreg])) + break; + } + if (rreg < jit_size(iregs)) { + STPI(rn(iregs[reg]), rn(iregs[rreg]), SP_REGNO, offs); + offs += 2; + reg = rreg + 1; + } + else { + STRI(rn(iregs[reg]), SP_REGNO, offs); + ++offs; + /* No pair found */ + break; + } + } + else + ++reg; + } + for (reg = 0, offs <<= 3; reg < jit_size(fregs); reg++) { + if (jit_regset_tstbit(&_jitc->function->regset, fregs[reg])) { + stxi_d(offs, SP_REGNO, rn(fregs[reg])); + offs += sizeof(jit_float64_t); + } + } + + if (_jitc->function->stack) subi(SP_REGNO, SP_REGNO, _jitc->function->stack); if (_jitc->function->allocar) { reg = jit_get_reg(jit_class_gpr); @@ -2252,6 +2645,7 @@ _prolog(jit_state_t *_jit, jit_node_t *node) jit_unget_reg(reg); } +#if !__APPLE__ if (_jitc->function->self.call & jit_call_varargs) { /* Save gp registers in the save area, if any is a vararg */ for (reg = 8 - _jitc->function->vagp / -8; @@ -2269,53 +2663,55 @@ _prolog(jit_state_t *_jit, jit_node_t *node) stxi_d(_jitc->function->vaoff + offsetof(jit_va_list_t, q0) + reg * 16 + offsetof(jit_qreg_t, l), FP_REGNO, rn(_V0 - reg)); } +#endif } static void _epilog(jit_state_t *_jit, jit_node_t *node) { + jit_int32_t reg, rreg, offs; if (_jitc->function->assume_frame) return; if (_jitc->function->stack) MOV_XSP(SP_REGNO, FP_REGNO); -#define LOAD(L, R, O) \ - do { \ - if (jit_regset_tstbit(&_jitc->function->regset, _R##L)) { \ - if (jit_regset_tstbit(&_jitc->function->regset, _R##R)) \ - LDPI(L, R, SP_REGNO, O); \ - else \ - LDRI(L, SP_REGNO, O); \ - } \ - else if (jit_regset_tstbit(&_jitc->function->regset, _R##R)) \ - LDRI(R, SP_REGNO, O + 1); \ - } while (0) - LOAD(19, 20, 2); - LOAD(21, 22, 4); - LOAD(23, 24, 6); - LOAD(25, 26, 8); - LOAD(27, 28, 10); -#undef LOAD -#define LOAD(R, O) \ - do { \ - if (jit_regset_tstbit(&_jitc->function->regset, _V##R)) \ - ldxi_d(R, SP_REGNO, O); \ - } while (0) - LOAD( 8, 96); - LOAD( 9, 104); - LOAD(10, 112); - LOAD(11, 120); - LOAD(12, 128); - LOAD(13, 136); - LOAD(14, 144); - LOAD(15, 152); -#undef LOAD - LDPI_PRE(FP_REGNO, LR_REGNO, SP_REGNO, stack_framesize >> 3); + /* callee save registers */ + for (reg = 0, offs = 2; reg < jit_size(iregs);) { + if (jit_regset_tstbit(&_jitc->function->regset, iregs[reg])) { + for (rreg = reg + 1; rreg < jit_size(iregs); rreg++) { + if (jit_regset_tstbit(&_jitc->function->regset, iregs[rreg])) + break; + } + if (rreg < jit_size(iregs)) { + LDPI(rn(iregs[reg]), rn(iregs[rreg]), SP_REGNO, offs); + offs += 2; + reg = rreg + 1; + } + else { + LDRI(rn(iregs[reg]), SP_REGNO, offs); + ++offs; + /* No pair found */ + break; + } + } + else + ++reg; + } + for (reg = 0, offs <<= 3; reg < jit_size(fregs); reg++) { + if (jit_regset_tstbit(&_jitc->function->regset, fregs[reg])) { + ldxi_d(rn(fregs[reg]), SP_REGNO, offs); + offs += sizeof(jit_float64_t); + } + } + + if (_jitc->function->need_frame) + LDPI_PRE(FP_REGNO, LR_REGNO, SP_REGNO, jit_framesize() >> 3); RET(); } static void _vastart(jit_state_t *_jit, jit_int32_t r0) { +#if !__APPLE__ jit_int32_t reg; assert(_jitc->function->self.call & jit_call_varargs); @@ -2326,7 +2722,7 @@ _vastart(jit_state_t *_jit, jit_int32_t r0) reg = jit_get_reg(jit_class_gpr); /* Initialize stack pointer to the first stack argument. */ - addi(rn(reg), FP_REGNO, _jitc->function->self.size); + addi(rn(reg), FP_REGNO, jit_selfsize()); stxi(offsetof(jit_va_list_t, stack), r0, rn(reg)); /* Initialize gp top pointer to the first stack argument. */ @@ -2346,11 +2742,16 @@ _vastart(jit_state_t *_jit, jit_int32_t r0) stxi_i(offsetof(jit_va_list_t, fpoff), r0, rn(reg)); jit_unget_reg(reg); +#else + assert(_jitc->function->self.call & jit_call_varargs); + addi(r0, FP_REGNO, jit_selfsize()); +#endif } static void _vaarg(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) { +#if !__APPLE__ jit_word_t ge_code; jit_word_t lt_code; jit_int32_t rg0, rg1; @@ -2380,7 +2781,7 @@ _vaarg(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) jit_unget_reg(rg1); /* Jump over overflow code. */ - lt_code = jmpi_p(_jit->pc.w); + lt_code = jmpi(_jit->pc.w); /* Where to land if argument is in overflow area. */ patch_at(ge_code, _jit->pc.w); @@ -2399,6 +2800,11 @@ _vaarg(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1) patch_at(lt_code, _jit->pc.w); jit_unget_reg(rg0); +#else + assert(_jitc->function->self.call & jit_call_varargs); + ldr(r0, r1); + addi(r1, r1, sizeof(jit_word_t)); +#endif } static void @@ -2418,7 +2824,7 @@ _patch_at(jit_state_t *_jit, jit_word_t instr, jit_word_t label) ffc = i.w & 0xffc00000; if (fc == A64_B || fc == A64_BL) { d = (label - instr) >> 2; - assert(d >= -33554432 && d <= 33554431); + assert(s26_p(d)); i.imm26.b = d; u.i[0] = i.w; }