From aaea8e3ecde060c3f042ef36bb68f10d186f6904 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 9 Nov 2019 10:30:57 +0100 Subject: [PATCH] sh2 drc: optimizations for MIPS code emitting --- cpu/drc/emit_arm.c | 48 ++++- cpu/drc/emit_arm64.c | 52 +++++- cpu/drc/emit_mips.c | 408 +++++++++++++++++++++++++++++++----------- cpu/drc/emit_x86.c | 25 +++ cpu/sh2/compiler.c | 409 +++++++++++++++++++++++++++---------------- pico/32x/32x.c | 14 +- 6 files changed, 693 insertions(+), 263 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index e35d3471..25a2c72f 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -671,6 +671,8 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) literal_insn[pool_index] += move_offs; } +#define EMITH_HINT_COND(cond) /**/ + #define JMP_POS(ptr) { \ ptr = tcache_ptr; \ EMIT(0,M1(PC),0); \ @@ -721,9 +723,11 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_add_r_r_r_lsl_ptr(d, s1, s2, lslimm) \ emith_add_r_r_r_lsl(d, s1, s2, lslimm) +#define emith_adc_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_ADC_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) + #define emith_addf_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_ADD_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSL,lslimm) - #define emith_addf_r_r_r_lsr(d, s1, s2, lslimm) \ EOP_ADD_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSR,lslimm) @@ -733,6 +737,9 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_sub_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_SUB_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) +#define emith_sbc_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_SBC_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) + #define emith_subf_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_SUB_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSL,lslimm) @@ -741,10 +748,11 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_or_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_ORR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) +#define emith_or_r_r_r_lsr(d, s1, s2, lsrimm) \ + EOP_ORR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSR,lsrimm) #define emith_eor_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_EOR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) - #define emith_eor_r_r_r_lsr(d, s1, s2, lsrimm) \ EOP_EOR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSR,lsrimm) @@ -753,13 +761,20 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_or_r_r_lsl(d, s, lslimm) \ emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, lsrimm) +#define emith_eor_r_r_lsl(d, s, lslimm) \ + emith_eor_r_r_r_lsl(d, d, s, lslimm) #define emith_eor_r_r_lsr(d, s, lsrimm) \ emith_eor_r_r_r_lsr(d, d, s, lsrimm) #define emith_add_r_r_r(d, s1, s2) \ emith_add_r_r_r_lsl(d, s1, s2, 0) +#define emith_adc_r_r_r(d, s1, s2) \ + emith_adc_r_r_r_lsl(d, s1, s2, 0) + #define emith_addf_r_r_r(d, s1, s2) \ emith_addf_r_r_r_lsl(d, s1, s2, 0) @@ -769,6 +784,9 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_sub_r_r_r(d, s1, s2) \ emith_sub_r_r_r_lsl(d, s1, s2, 0) +#define emith_sbc_r_r_r(d, s1, s2) \ + emith_sbc_r_r_r_lsl(d, s1, s2, 0) + #define emith_subf_r_r_r(d, s1, s2) \ emith_subf_r_r_r_lsl(d, s1, s2, 0) @@ -790,11 +808,17 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_add_r_r_ptr(d, s) \ emith_add_r_r_r(d, d, s) +#define emith_adc_r_r(d, s) \ + emith_adc_r_r_r(d, d, s) + #define emith_sub_r_r(d, s) \ emith_sub_r_r_r(d, d, s) -#define emith_adc_r_r(d, s) \ - EOP_ADC_REG(A_COND_AL,0,d,d,s,A_AM1_LSL,0) +#define emith_sbc_r_r(d, s) \ + emith_sbc_r_r_r(d, d, 
s) + +#define emith_negc_r_r(d, s) \ + EOP_C_DOP_IMM(A_COND_AL,A_OP_RSC,0,s,d,0,0) #define emith_and_r_r_c(cond, d, s) \ EOP_AND_REG(cond,0,d,d,s,A_AM1_LSL,0) @@ -987,9 +1011,13 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_rolcf(d) \ emith_adcf_r_r(d, d) +#define emith_rolc(d) \ + emith_adc_r_r(d, d) #define emith_rorcf(d) \ EOP_MOV_REG(A_COND_AL,1,d,d,A_AM1_ROR,0) /* ROR #0 -> RRX */ +#define emith_rorc(d) \ + EOP_MOV_REG(A_COND_AL,0,d,d,A_AM1_ROR,0) /* ROR #0 -> RRX */ #define emith_negcf_r_r(d, s) \ EOP_C_DOP_IMM(A_COND_AL,A_OP_RSC,1,s,d,0,0) @@ -1329,6 +1357,18 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) } \ } while (0) +#define emith_t_to_carry(srr, is_sub) do { \ + if (is_sub) { \ + int t_ = rcache_get_tmp(); \ + emith_eor_r_r_imm(t_, srr, 1); \ + emith_rorf(t_, t_, 1); \ + rcache_free_tmp(t_); \ + } else { \ + emith_rorf(srr, srr, 1); \ + emith_rol(srr, srr, 1); \ + } \ +} while (0) + #define emith_tpop_carry(sr, is_sub) do { \ if (is_sub) \ emith_eor_r_imm(sr, 1); \ diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 0c36b2bc..dc0cf559 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -370,6 +370,8 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; JMP_EMIT_NC(else_ptr); \ } +#define EMITH_HINT_COND(cond) /**/ + // "simple" jump (no more then a few insns) // ARM32 will use conditional instructions here #define EMITH_SJMP_START EMITH_JMP_START @@ -414,6 +416,24 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; #define emith_addf_r_r_r_lsr(d, s1, s2, simm) \ EMIT(A64_ADDS_REG(d, s1, s2, ST_LSR, simm)) +#define emith_adc_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { int _t = rcache_get_tmp(); \ + emith_lsl(_t, s2, simm); \ + emith_adc_r_r_r(d, s1, _t); \ + rcache_free_tmp(_t); \ + } else \ + emith_adc_r_r_r(d, s1, s2); \ +} while (0) + +#define emith_sbc_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { int _t = rcache_get_tmp(); \ + emith_lsl(_t, s2, simm); \ + emith_sbc_r_r_r(d, s1, _t); \ + rcache_free_tmp(_t); \ + } else \ + emith_sbc_r_r_r(d, s1, s2); \ +} while (0) + #define emith_sub_r_r_r_lsl(d, s1, s2, simm) \ EMIT(A64_SUB_REG(d, s1, s2, ST_LSL, simm)) @@ -422,10 +442,11 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; #define emith_or_r_r_r_lsl(d, s1, s2, simm) \ EMIT(A64_OR_REG(d, s1, s2, ST_LSL, simm)) +#define emith_or_r_r_r_lsr(d, s1, s2, simm) \ + EMIT(A64_OR_REG(d, s1, s2, ST_LSR, simm)) #define emith_eor_r_r_r_lsl(d, s1, s2, simm) \ EMIT(A64_EOR_REG(d, s1, s2, ST_LSL, simm)) - #define emith_eor_r_r_r_lsr(d, s1, s2, simm) \ EMIT(A64_EOR_REG(d, s1, s2, ST_LSR, simm)) @@ -434,7 +455,11 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; #define emith_or_r_r_lsl(d, s, lslimm) \ emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, lsrimm) +#define emith_eor_r_r_lsl(d, s, lslimm) \ + emith_eor_r_r_r_lsl(d, d, s, lslimm) #define emith_eor_r_r_lsr(d, s, lsrimm) \ emith_eor_r_r_r_lsr(d, d, s, lsrimm) @@ -472,6 +497,9 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; #define emith_neg_r_r(d, s) \ EMIT(A64_NEG_REG(d, s, ST_LSL, 0)) +#define emith_negc_r_r(d, s) \ + EMIT(A64_NEGC_REG(d, s)) + #define emith_adc_r_r_r(d, s1, s2) \ EMIT(A64_ADC_REG(d, s1, s2)) @@ -481,6 +509,9 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; #define emith_adcf_r_r_r(d, s1, s2) \ EMIT(A64_ADCS_REG(d, s1, s2)) +#define emith_sbc_r_r_r(d, s1, s2) \ + EMIT(A64_SBC_REG(d, s1, s2)) + #define emith_sbcf_r_r_r(d, s1, s2) \ EMIT(A64_SBCS_REG(d, s1, 
s2)) @@ -806,12 +837,19 @@ static void emith_log_imm(int op, int wx, int rd, int rn, u32 imm) #define emith_rolcf(d) \ emith_adcf_r_r(d, d) +#define emith_rolc(d) \ + emith_adc_r_r(d, d) #define emith_rorcf(d) do { \ EMIT(A64_RBIT_REG(d, d)); \ emith_adcf_r_r(d, d); \ EMIT(A64_RBIT_REG(d, d)); \ } while (0) +#define emith_rorc(d) do { \ + EMIT(A64_RBIT_REG(d, d)); \ + emith_adc_r_r(d, d); \ + EMIT(A64_RBIT_REG(d, d)); \ +} while (0) // signed/unsigned extend #define emith_clear_msb(d, s, count) /* bits to clear */ \ @@ -1286,6 +1324,18 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) emith_eor_r_imm(sr, 1); \ } while (0) +#define emith_t_to_carry(srr, is_sub) do { \ + if (is_sub) { \ + int t_ = rcache_get_tmp(); \ + emith_eor_r_r_imm(t_, srr, 1); \ + emith_rorf(t_, t_, 1); \ + rcache_free_tmp(t_); \ + } else { \ + emith_rorf(srr, srr, 1); \ + emith_rol(srr, srr, 1); \ + } \ +} while (0) + #define emith_tpop_carry(sr, is_sub) do { \ if (is_sub) \ emith_eor_r_imm(sr, 1); \ diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 832364e9..82527474 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -173,15 +173,17 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; MIPS_OP_REG(FN_JALR,rd,rs,_) // conditional branches; no condition code, these compare rs against rt or Z0 -#define MIPS_BEQ (OP_BEQ << 5) -#define MIPS_BNE (OP_BNE << 5) -#define MIPS_BLE (OP_BLEZ << 5) -#define MIPS_BGT (OP_BGTZ << 5) -#define MIPS_BLT ((OP__RT << 5)|RT_BLTZ) -#define MIPS_BGE ((OP__RT << 5)|RT_BGEZ) -#define MIPS_BGTL ((OP__RT << 5)|RT_BLTZAL) -#define MIPS_BGEL ((OP__RT << 5)|RT_BGEZAL) - +#define MIPS_BEQ (OP_BEQ << 5) // rs == rt (rt in lower 5 bits) +#define MIPS_BNE (OP_BNE << 5) // rs != rt (ditto) +#define MIPS_BLE (OP_BLEZ << 5) // rs <= 0 +#define MIPS_BGT (OP_BGTZ << 5) // rs > 0 +#define MIPS_BLT ((OP__RT << 5)|RT_BLTZ) // rs < 0 +#define MIPS_BGE ((OP__RT << 5)|RT_BGEZ) // rs >= 0 +#define MIPS_BGTL ((OP__RT << 5)|RT_BLTZAL) // rs > 0, link $ra if jumping +#define MIPS_BGEL ((OP__RT << 5)|RT_BGEZAL) // rs >= 0, link $ra if jumping + +#define MIPS_BCOND(cond, rs, rt, offs16) \ + MIPS_OP_IMM((cond >> 5), rt, rs, (offs16) >> 2) #define MIPS_BCONDZ(cond, rs, offs16) \ MIPS_OP_IMM((cond >> 5), (cond & 0x1f), rs, (offs16) >> 2) #define MIPS_B(offs16) \ @@ -216,25 +218,26 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; ptr = (void *)((u8 *)(ptr) + sizeof(u32)); \ } while (0) -// FIFO for 2 instructions, for delay slot handling -static u32 emith_last_insns[2] = { -1,-1 }; -static int emith_last_idx, emith_last_cnt; +// FIFO for some instructions, for delay slot handling +#define FSZ 4 +static u32 emith_last_insns[FSZ]; +static unsigned emith_last_idx, emith_last_cnt; #define EMIT_PUSHOP() \ do { \ - emith_last_idx ^= 1; \ - if (emith_last_insns[emith_last_idx] != -1) { \ + if (emith_last_cnt > 0) { \ u32 *p = (u32 *)tcache_ptr - emith_last_cnt; \ - EMIT_PTR(p, emith_last_insns[emith_last_idx]);\ + int idx = (emith_last_idx - emith_last_cnt+1) %FSZ; \ + EMIT_PTR(p, emith_last_insns[idx]);\ emith_last_cnt --; \ } \ - emith_last_insns[emith_last_idx] = -1; \ } while (0) #define EMIT(op) \ do { \ - EMIT_PUSHOP(); \ + if (emith_last_cnt >= FSZ) EMIT_PUSHOP(); \ tcache_ptr = (void *)((u32 *)tcache_ptr + 1); \ + emith_last_idx = (emith_last_idx+1) %FSZ; \ emith_last_insns[emith_last_idx] = op; \ emith_last_cnt ++; \ COUNT_OP; \ @@ -242,7 +245,8 @@ static int emith_last_idx, emith_last_cnt; #define emith_flush() \ do { \ - 
int i; for (i = 0; i < 2; i++) EMIT_PUSHOP(); \ + while (emith_last_cnt) EMIT_PUSHOP(); \ + emith_flg_hint = _FHV|_FHC; \ } while (0) #define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr - emith_last_cnt) @@ -279,11 +283,12 @@ static int emith_rt(u32 op) return emith_has_(rt,2,op,26,0x3f) ? (op>>16)&0x1f : 0; } static int emith_rd(u32 op) - { if ((op>>26) == OP__FN) - return emith_has_(rd,0,op, 0,0x3f) ? (op>>11)&0x1f :-1; + { int ret = emith_has_(rd,2,op,26,0x3f) ? (op>>16)&0x1f :-1; + if ((op>>26) == OP__FN) + ret = emith_has_(rd,0,op, 0,0x3f) ? (op>>11)&0x1f :-1; if ((op>>26) == OP__RT) - return -1; - return emith_has_(rd,2,op,26,0x3f) ? (op>>16)&0x1f :-1; + ret = -1; + return (ret ?: -1); // Z0 doesn't have dependencies } static int emith_b_isswap(u32 bop, u32 lop) @@ -292,48 +297,56 @@ static int emith_b_isswap(u32 bop, u32 lop) return bop; else if (emith_is_jr(bop) && emith_rd(lop) != emith_rs(bop)) return bop; - else if (emith_is_b(bop) && emith_rd(lop) != emith_rs(bop)) + else if (emith_is_b(bop) && emith_rd(lop) != emith_rs(bop) && + emith_rd(lop) != emith_rt(bop)) if ((bop & 0xffff) != 0x7fff) // displacement overflow? return (bop & 0xffff0000) | ((bop+1) & 0x0000ffff); return 0; } +static int emith_insn_swappable(u32 op1, u32 op2) +{ + if (emith_rd(op1) != emith_rd(op2) && + emith_rs(op1) != emith_rd(op2) && emith_rt(op1) != emith_rd(op2) && + emith_rs(op2) != emith_rd(op1) && emith_rt(op2) != emith_rd(op1)) + return 1; + return 0; +} + // emit branch, trying to fill the delay slot with one of the last insns static void *emith_branch(u32 op) { - int idx = emith_last_idx; - u32 op1 = emith_last_insns[idx], op2 = emith_last_insns[idx^1]; - u32 bop = 0; + unsigned idx = emith_last_idx, ds = idx; + u32 bop = 0, sop; void *bp; - - // check last insn (op1) - if (op1 != -1 && op1) - bop = emith_b_isswap(op, op1); - // if not, check older insn (op2); mustn't interact with op1 to overtake - if (!bop && op2 != -1 && op2 && emith_rd(op1) != emith_rd(op2) && - emith_rs(op1) != emith_rd(op2) && emith_rt(op1) != emith_rd(op2) && - emith_rs(op2) != emith_rd(op1) && emith_rt(op2) != emith_rd(op1)) { - idx ^= 1; - bop = emith_b_isswap(op, op2); + int i, j, s; + + // check for ds insn; older mustn't interact with newer ones to overtake + for (i = 0; i < emith_last_cnt && !bop; i++) { + ds = (idx-i)%FSZ; + sop = emith_last_insns[ds]; + for (j = i, s = 1; j > 0 && s; j--) + s = emith_insn_swappable(emith_last_insns[(ds+j)%FSZ], sop); + if (s) + bop = emith_b_isswap(op, sop); } - // flush FIFO and branch + // flush FIFO, but omit delay slot insn tcache_ptr = (void *)((u32 *)tcache_ptr - emith_last_cnt); - if (emith_last_insns[idx^1] != -1) - EMIT_PTR(tcache_ptr, emith_last_insns[idx^1]); + idx = (idx-emith_last_cnt+1)%FSZ; + for (i = emith_last_cnt; i > 0; i--, idx = (idx+1)%FSZ) + if (!bop || idx != ds) + EMIT_PTR(tcache_ptr, emith_last_insns[idx]); + emith_last_cnt = 0; + // emit branch and delay slot + bp = tcache_ptr; if (bop) { // can swap - bp = tcache_ptr; EMIT_PTR(tcache_ptr, bop); COUNT_OP; - EMIT_PTR(tcache_ptr, emith_last_insns[idx]); + EMIT_PTR(tcache_ptr, emith_last_insns[ds]); } else { // can't swap - if (emith_last_insns[idx] != -1) - EMIT_PTR(tcache_ptr, emith_last_insns[idx]); - bp = tcache_ptr; EMIT_PTR(tcache_ptr, op); COUNT_OP; EMIT_PTR(tcache_ptr, MIPS_NOP); COUNT_OP; } - emith_last_insns[0] = emith_last_insns[1] = -1; - emith_last_cnt = 0; return bp; } @@ -403,34 +416,56 @@ static void *emith_branch(u32 op) // flag emulation creates 2 (ie cmp #0/beq) up to 9 (ie adcf/ble) extra 
insns. // flag handling shortcuts may reduce this by 1-4 insns, see emith_cond_check() -static int emith_flg_rs, emith_flg_rt; // registers used in FNZ=rs-rt (cmp_r_r) +static int emith_cmp_rs, emith_cmp_rt; // registers used in cmp_r_r/cmp_r_imm +static s32 emith_cmp_imm; // immediate value used in cmp_r_imm +enum { _FHC=1, _FHV=2 } emith_flg_hint; // C/V flag usage hinted by compiler static int emith_flg_noV; // V flag known not to be set +#define EMITH_HINT_COND(cond) do { \ + /* only need to check cond>>1 since the lowest bit inverts the cond */ \ + unsigned _mv = BITMASK3(DCOND_VS>>1,DCOND_GE>>1,DCOND_GT>>1); \ + unsigned _mc = _mv | BITMASK2(DCOND_HS>>1,DCOND_HI>>1); \ + emith_flg_hint = (_mv & BITMASK1(cond >> 1) ? _FHV : 0); \ + emith_flg_hint |= (_mc & BITMASK1(cond >> 1) ? _FHC : 0); \ +} while (0) + // store minimal cc information: rd, rt^rs, carry // NB: the result *must* first go to FNZ, in case rd == rs or rd == rt. // NB: for adcf and sbcf, carry-in must be dealt with separately (see there) -static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) +static void emith_set_arith_flags(int rd, int rs, int rt, s32 imm, int sub) { - if (sub && rd == FNZ && rt > AT && rs > AT) // is this cmp_r_r? - emith_flg_rs = rs, emith_flg_rt = rt; - else emith_flg_rs = emith_flg_rt = 0; - - if (sub) // C = sub:rt 0) // Nt^Ns - EMIT(MIPS_XOR_REG(FV, rt, rs)); - else if (imm < 0) - EMIT(MIPS_NOR_REG(FV, rt, Z0)); - else if (imm > 0) - EMIT(MIPS_OR_REG(FV, rt, Z0)); // Nt^Ns in FV, bit 31 - else emith_flg_noV = 1; // imm #0, never overflows + if (emith_flg_hint & _FHC) { + if (sub) // C = sub:rt= 0) // Nt^Ns in FV, bit 31 + EMIT(MIPS_XOR_REG(FV, rs, rt)); + else if (imm == 0) + emith_flg_noV = 1; // imm #0 can't overflow + else if ((imm < 0) == !sub) + EMIT(MIPS_NOR_REG(FV, rs, Z0)); + else if ((imm > 0) == !sub) + EMIT(MIPS_OR_REG(FV, rs, Z0)); + } // full V = Nd^Nt^Ns^C calculation is deferred until really needed - if (rd != FNZ) + if (rd && rd != FNZ) EMIT(MIPS_MOVE_REG(rd, FNZ)); // N,Z via result value in FNZ + emith_cmp_rs = emith_cmp_rt = -1; +} + +// since MIPS has less-than and compare-branch insns, handle cmp separately by +// storing the involved regs for later use in one of those MIPS insns. +// This works for all conditions but VC/VS, but this is fortunately never used. 
+static void emith_set_compare_flags(int rs, int rt, s32 imm) +{ + emith_cmp_rt = rt; + emith_cmp_rs = rs; + emith_cmp_imm = imm; } // data processing, register @@ -510,6 +545,13 @@ static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) } else EMIT(MIPS_OR_REG(d, s1, s2)); \ } while (0) +#define emith_or_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSR_IMM(AT, s2, simm)); \ + EMIT(MIPS_OR_REG(d, s1, AT)); \ + } else EMIT(MIPS_OR_REG(d, s1, s2)); \ +} while (0) + #define emith_eor_r_r_r_lsl(d, s1, s2, simm) do { \ if (simm) { \ EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ @@ -533,7 +575,11 @@ static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) #define emith_or_r_r_lsl(d, s, lslimm) \ emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, lsrimm) +#define emith_eor_r_r_lsl(d, s, lslimm) \ + emith_eor_r_r_r_lsl(d, d, s, lslimm) #define emith_eor_r_r_lsr(d, s, lsrimm) \ emith_eor_r_r_r_lsr(d, d, s, lsrimm) @@ -570,13 +616,21 @@ static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) EMIT(MIPS_NEG_REG(d, s)) #define emith_adc_r_r_r(d, s1, s2) do { \ - emith_add_r_r_r(AT, s1, FC); \ - emith_add_r_r_r(d, AT, s2); \ + emith_add_r_r_r(AT, s2, FC); \ + emith_add_r_r_r(d, s1, AT); \ +} while (0) + +#define emith_sbc_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(AT, s2, FC); \ + emith_sub_r_r_r(d, s1, AT); \ } while (0) #define emith_adc_r_r(d, s) \ emith_adc_r_r_r(d, d, s) +#define emith_negc_r_r(d, s) \ + emith_sbc_r_r_r(d, Z0, s) + // NB: the incoming carry Cin can cause Cout if s2+Cin=0 (or s1+Cin=0 FWIW) // moreover, if s2+Cin=0 caused Cout, s1+s2+Cin=s1+0 can't cause another Cout #define emith_adcf_r_r_r(d, s1, s2) do { \ @@ -606,16 +660,23 @@ static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) #define emith_eor_r_r(d, s) \ emith_eor_r_r_r(d, d, s) -#define emith_tst_r_r_ptr(d, s) \ - emith_and_r_r_r(FNZ, d, s) +#define emith_tst_r_r_ptr(d, s) do { \ + if (d != s) { \ + emith_and_r_r_r(FNZ, d, s); \ + emith_cmp_rs = emith_cmp_rt = -1; \ + } else emith_cmp_rs = s, emith_cmp_rt = Z0; \ +} while (0) #define emith_tst_r_r(d, s) \ emith_tst_r_r_ptr(d, s) -#define emith_teq_r_r(d, s) \ - emith_eor_r_r_r(FNZ, d, s) +#define emith_teq_r_r(d, s) do { \ + emith_eor_r_r_r(FNZ, d, s); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) #define emith_cmp_r_r(d, s) \ - emith_subf_r_r_r(FNZ, d, s) + emith_set_compare_flags(d, s, 0) +// emith_subf_r_r_r(FNZ, d, s) #define emith_addf_r_r(d, s) \ emith_addf_r_r_r(d, d, s) @@ -705,8 +766,8 @@ static void emith_arith_imm(int op, int rd, int rs, u32 imm) emith_adcf_r_r_imm(r, r, imm) #define emith_cmp_r_imm(r, imm) \ - emith_subf_r_r_imm(FNZ, r, (s16)imm) - + emith_set_compare_flags(r, -1, imm) +// emith_subf_r_r_imm(FNZ, r, (s16)imm) #define emith_add_r_r_ptr_imm(d, s, imm) \ emith_arith_imm(OP_ADDIU, d, s, imm) @@ -716,7 +777,7 @@ static void emith_arith_imm(int op, int rd, int rs, u32 imm) #define emith_addf_r_r_imm(d, s, imm) do { \ emith_add_r_r_imm(FNZ, s, imm); \ - emith_set_arith_flags(d, s, 0, imm, 0); \ + emith_set_arith_flags(d, s, -1, imm, 0); \ } while (0) #define emith_adc_r_r_imm(d, s, imm) do { \ @@ -725,11 +786,16 @@ static void emith_arith_imm(int op, int rd, int rs, u32 imm) } while (0) #define emith_adcf_r_r_imm(d, s, imm) do { \ - emith_add_r_r_r(FNZ, s, FC); \ - EMIT(MIPS_SLTU_REG(AT, FNZ, FC)); \ - emith_add_r_r_imm(FNZ, FNZ, imm); \ - emith_set_arith_flags(d, s, 0, imm, 0); \ - emith_or_r_r(FC, 
AT); \ + if (imm == 0) { \ + emith_add_r_r_r(FNZ, s, FC); \ + emith_set_arith_flags(d, s, -1, 1, 0); \ + } else { \ + emith_add_r_r_r(FNZ, s, FC); \ + EMIT(MIPS_SLTU_REG(AT, FNZ, FC)); \ + emith_add_r_r_imm(FNZ, FNZ, imm); \ + emith_set_arith_flags(d, s, -1, imm, 0); \ + emith_or_r_r(FC, AT); \ + } \ } while (0) // NB: no SUBI in MIPS II, since ADDI takes a signed imm @@ -740,7 +806,7 @@ static void emith_arith_imm(int op, int rd, int rs, u32 imm) #define emith_subf_r_r_imm(d, s, imm) do { \ emith_sub_r_r_imm(FNZ, s, imm); \ - emith_set_arith_flags(d, s, 0, imm, 1); \ + emith_set_arith_flags(d, s, -1, imm, 1); \ } while (0) // logical, immediate @@ -777,8 +843,10 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) #define emith_bic_r_imm_c(cond, r, imm) \ emith_bic_r_imm(r, imm) -#define emith_tst_r_imm(r, imm) \ - emith_log_imm(OP_ANDI, FNZ, r, imm) +#define emith_tst_r_imm(r, imm) do { \ + emith_log_imm(OP_ANDI, FNZ, r, imm); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) #define emith_tst_r_imm_c(cond, r, imm) \ emith_tst_r_imm(r, imm) @@ -816,6 +884,17 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) EMIT(MIPS_OR_REG(d, d, AT)); \ } while (0) +#define emith_rorc(d) do { \ + emith_lsr(d, d, 1); \ + emith_lsl(AT, FC, 31); \ + emith_or_r_r(d, AT); \ +} while (0) + +#define emith_rolc(d) do { \ + emith_lsl(d, d, 1); \ + emith_or_r_r(d, FC); \ +} while (0) + // NB: all flag setting shifts make V undefined // NB: mips32r2 has EXT (useful for extracting C) #define emith_lslf(d, s, cnt) do { \ @@ -829,6 +908,7 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) emith_lsl(d, _s, 1); \ } \ emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ } while (0) #define emith_lsrf(d, s, cnt) do { \ @@ -842,6 +922,7 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) emith_lsr(d, _s, 1); \ } \ emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ } while (0) #define emith_asrf(d, s, cnt) do { \ @@ -855,18 +936,21 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) emith_asr(d, _s, 1); \ } \ emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ } while (0) #define emith_rolf(d, s, cnt) do { \ emith_rol(d, s, cnt); \ emith_and_r_r_imm(FC, d, 1); \ emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ } while (0) #define emith_rorf(d, s, cnt) do { \ emith_ror(d, s, cnt); \ emith_lsr(FC, d, 31); \ emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ } while (0) #define emith_rolcf(d) do { \ @@ -875,6 +959,7 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) emith_or_r_r(d, FC); \ emith_move_r_r(FC, AT); \ emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ } while (0) #define emith_rorcf(d) do { \ @@ -884,6 +969,7 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) emith_or_r_r(d, FC); \ emith_move_r_r(FC, AT); \ emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ } while (0) // signed/unsigned extend @@ -1108,26 +1194,84 @@ static void emith_lohi_nops(void) (((cond) >> 5) == OP__RT ? 
(cond) ^ 0x01 : (cond) ^ 0x20) // evaluate the emulated condition, returns a register/branch type pair -static int emith_cond_check(int cond, int *r) +static int emith_cmpr_check(int rs, int rt, int cond, int *r) { int b = 0; - // shortcut for comparing 2 registers - if (emith_flg_rs || emith_flg_rt) switch (cond) { - case DCOND_LS: EMIT(MIPS_SLTU_REG(AT, emith_flg_rs, emith_flg_rt)); + // condition check for comparing 2 registers + switch (cond) { + case DCOND_EQ: *r = rs; b = MIPS_BEQ|rt; break; + case DCOND_NE: *r = rs; b = MIPS_BNE|rt; break; + case DCOND_LO: EMIT(MIPS_SLTU_REG(AT, rs, rt)); + *r = AT, b = MIPS_BNE; break; // s < t unsigned + case DCOND_HS: EMIT(MIPS_SLTU_REG(AT, rs, rt)); + *r = AT, b = MIPS_BEQ; break; // s >= t unsigned + case DCOND_LS: EMIT(MIPS_SLTU_REG(AT, rt, rs)); *r = AT, b = MIPS_BEQ; break; // s <= t unsigned - case DCOND_HI: EMIT(MIPS_SLTU_REG(AT, emith_flg_rs, emith_flg_rt)); + case DCOND_HI: EMIT(MIPS_SLTU_REG(AT, rt, rs)); *r = AT, b = MIPS_BNE; break; // s > t unsigned - case DCOND_LT: EMIT(MIPS_SLT_REG(AT, emith_flg_rt, emith_flg_rs)); + case DCOND_LT: if (rt == 0) { *r = rs, b = MIPS_BLT; break; } // s < 0 + EMIT(MIPS_SLT_REG(AT, rs, rt)); *r = AT, b = MIPS_BNE; break; // s < t - case DCOND_GE: EMIT(MIPS_SLT_REG(AT, emith_flg_rt, emith_flg_rs)); + case DCOND_GE: if (rt == 0) { *r = rs, b = MIPS_BGE; break; } // s >= 0 + EMIT(MIPS_SLT_REG(AT, rs, rt)); *r = AT, b = MIPS_BEQ; break; // s >= t - case DCOND_LE: EMIT(MIPS_SLT_REG(AT, emith_flg_rs, emith_flg_rt)); + case DCOND_LE: if (rt == 0) { *r = rs, b = MIPS_BLE; break; } // s <= 0 + EMIT(MIPS_SLT_REG(AT, rt, rs)); *r = AT, b = MIPS_BEQ; break; // s <= t - case DCOND_GT: EMIT(MIPS_SLT_REG(AT, emith_flg_rs, emith_flg_rt)); + case DCOND_GT: if (rt == 0) { *r = rs, b = MIPS_BGT; break; } // s > 0 + EMIT(MIPS_SLT_REG(AT, rt, rs)); *r = AT, b = MIPS_BNE; break; // s > t } + return b; +} + +static int emith_cmpi_check(int rs, s32 imm, int cond, int *r) +{ + int b = 0; + + // condition check for comparing register with immediate + if (imm == 0) return emith_cmpr_check(rs, Z0, cond, r); + switch (cond) { + case DCOND_EQ: emith_move_r_imm(AT, imm); + *r = rs; b = MIPS_BEQ|AT; break; + case DCOND_NE: emith_move_r_imm(AT, imm); + *r = rs; b = MIPS_BNE|AT; break; + case DCOND_LO: EMIT(MIPS_SLTU_IMM(AT, rs, imm)); + *r = AT, b = MIPS_BNE; break; // s < imm unsigned + case DCOND_HS: EMIT(MIPS_SLTU_IMM(AT, rs, imm)); + *r = AT, b = MIPS_BEQ; break; // s >= imm unsigned + case DCOND_LS: emith_move_r_imm(AT, imm); + EMIT(MIPS_SLTU_REG(AT, AT, rs)); + *r = AT, b = MIPS_BEQ; break; // s <= imm unsigned + case DCOND_HI: emith_move_r_imm(AT, imm); + EMIT(MIPS_SLTU_REG(AT, AT, rs)); + *r = AT, b = MIPS_BNE; break; // s > imm unsigned + case DCOND_LT: EMIT(MIPS_SLT_IMM(AT, rs, imm)); + *r = AT, b = MIPS_BNE; break; // s < imm + case DCOND_GE: EMIT(MIPS_SLT_IMM(AT, rs, imm)); + *r = AT, b = MIPS_BEQ; break; // s >= imm + case DCOND_LE: emith_move_r_imm(AT, imm); + EMIT(MIPS_SLT_REG(AT, AT, rs)); + *r = AT, b = MIPS_BEQ; break; // s <= imm + case DCOND_GT: emith_move_r_imm(AT, imm); + EMIT(MIPS_SLT_REG(AT, AT, rs)); + *r = AT, b = MIPS_BNE; break; // s > imm + } + return b; +} + +static int emith_cond_check(int cond, int *r) +{ + int b = 0; + + if (emith_cmp_rs >= 0) { + if (emith_cmp_rt != -1) + b = emith_cmpr_check(emith_cmp_rs,emith_cmp_rt, cond,r); + else b = emith_cmpi_check(emith_cmp_rs,emith_cmp_imm,cond,r); + } + // shortcut for V known to be 0 if (!b && emith_flg_noV) switch (cond) { case DCOND_VS: *r = Z0; b = 
MIPS_BNE; break; // never @@ -1373,8 +1517,10 @@ static int emith_cond_check(int cond, int *r) #define emith_sh2_div1_step(rn, rm, sr) do { \ emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ EMITH_JMP3_START(DCOND_EQ); \ + EMITH_HINT_COND(DCOND_CS); \ emith_addf_r_r(rn, rm); \ EMITH_JMP3_MID(DCOND_EQ); \ + EMITH_HINT_COND(DCOND_CS); \ emith_subf_r_r(rn, rm); \ EMITH_JMP3_END(); \ emith_eor_r_r(sr, FC); \ @@ -1433,23 +1579,27 @@ static int emith_cond_check(int cond, int *r) } while (0) #define emith_write_sr(sr, srcr) do { \ - emith_lsr(sr, sr, 10); \ - emith_or_r_r_r_lsl(sr, sr, srcr, 22); \ - emith_ror(sr, sr, 22); \ + emith_lsr(sr, sr , 10); emith_lsl(sr, sr, 10); \ + emith_lsl(AT, srcr, 22); emith_lsr(AT, AT, 22); \ + emith_or_r_r(sr, AT); \ +} while (0) + +#define emith_carry_to_t(sr, is_sub) do { \ + emith_and_r_imm(sr, 0xfffffffe); \ + emith_or_r_r(sr, FC); \ } while (0) -#define emith_carry_to_t(srr, is_sub) do { \ - emith_lsr(sr, sr, 1); \ - emith_adc_r_r(sr, sr); \ +#define emith_t_to_carry(sr, is_sub) do { \ + emith_and_r_r_imm(FC, sr, 1); \ } while (0) #define emith_tpop_carry(sr, is_sub) do { \ emith_and_r_r_imm(FC, sr, 1); \ - emith_lsr(sr, sr, 1); \ + emith_eor_r_r(sr, FC); \ } while (0) #define emith_tpush_carry(sr, is_sub) \ - emith_adc_r_r(sr, sr) + emith_or_r_r(sr, FC) #ifdef T // T bit handling @@ -1463,9 +1613,61 @@ static void emith_clr_t_cond(int sr) static void emith_set_t_cond(int sr, int cond) { - EMITH_SJMP_START(emith_invert_cond(cond)); - emith_or_r_imm_c(cond, sr, T); - EMITH_SJMP_END(emith_invert_cond(cond)); + int b, r; + u8 *ptr; + u32 val = 0, inv = 0; + + // try to avoid jumping around if possible + if (emith_cmp_rs >= 0) { + if (emith_cmp_rt >= 0) + b = emith_cmpr_check(emith_cmp_rs, emith_cmp_rt, cond, &r); + else + b = emith_cmpi_check(emith_cmp_rs, emith_cmp_imm, cond, &r); + + // XXX this relies on the inner workings of cmp_check... 
+ if (r == AT) + // result of slt check which returns either 0 or 1 in AT + val++, inv = (b == MIPS_BEQ); + } else { + b = emith_cond_check(cond, &r); + if (r == Z0) { + if (b == MIPS_BEQ || b == MIPS_BLE || b == MIPS_BGE) + emith_or_r_imm(sr, T); + return; + } else if (r == FC) + val++, inv = (b == MIPS_BEQ); + } + + if (!val) switch (b) { // cases: b..z r, aka cmp r,Z0 or cmp r,#0 + case MIPS_BEQ: EMIT(MIPS_SLTU_IMM(AT, r, 1)); r=AT; val++; break; + case MIPS_BNE: EMIT(MIPS_SLTU_REG(AT,Z0, r)); r=AT; val++; break; + case MIPS_BLT: EMIT(MIPS_SLT_REG(AT, r, Z0)); r=AT; val++; break; + case MIPS_BGE: EMIT(MIPS_SLT_REG(AT, r, Z0)); r=AT; val++; inv++; break; + case MIPS_BLE: EMIT(MIPS_SLT_REG(AT, Z0, r)); r=AT; val++; inv++; break; + case MIPS_BGT: EMIT(MIPS_SLT_REG(AT, Z0, r)); r=AT; val++; break; + default: // cases: beq/bne r,s, aka cmp r,s + if ((b>>5) == OP_BEQ) { + EMIT(MIPS_XOR_REG(AT, r, b&0x1f)); + EMIT(MIPS_SLTU_IMM(AT,AT, 1)); r=AT; val++; break; + } else if ((b>>5) == OP_BNE) { + EMIT(MIPS_XOR_REG(AT, r, b&0x1f)); + EMIT(MIPS_SLTU_IMM(AT,Z0,AT)); r=AT; val++; break; + } + } + if (val) { + emith_or_r_r(sr, r); + if (inv) + emith_eor_r_imm(sr, T); + return; + } + + // can't obtain result directly, use presumably slower jump !cond + or sr,T + b = emith_invert_branch(b); + ptr = emith_branch(MIPS_BCONDZ(b, r, 0)); + emith_or_r_imm(sr, T); + emith_flush(); // prohibit delay slot switching across jump targets + val = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; + EMIT_PTR(ptr, MIPS_BCONDZ(b, r, val & 0x0003ffff)); } #define emith_get_t_cond() -1 diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 39f3a1d7..e7284499 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -340,11 +340,29 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common rcache_free_tmp(tmp_); \ } else emith_or_r_r_r(d, s1, s2); \ } while (0) +#define emith_or_r_r_r_lsr(d, s1, s2, lsrimm) do { \ + if (lsrimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsr(tmp_, s2, lsrimm); \ + emith_or_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_or_r_r_r(d, s1, s2); \ +} while (0) // _r_r_shift #define emith_or_r_r_lsl(d, s, lslimm) \ emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, lsrimm) +#define emith_eor_r_r_lsl(d, s, lslimm) do { \ + if (lslimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s, lslimm); \ + emith_eor_r_r(d, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_eor_r_r(d, s); \ +} while (0) #define emith_eor_r_r_lsr(d, s, lsrimm) do { \ if (lsrimm) { \ int tmp_ = rcache_get_tmp(); \ @@ -972,6 +990,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define EMITH_SJMP2_END(cond) \ EMITH_SJMP3_END() +#define EMITH_HINT_COND(cond) /**/ + #define emith_pass_arg_r(arg, reg) do { \ int rd = 7; \ host_arg2reg(rd, arg); \ @@ -1255,6 +1275,11 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common emith_rol(sr, sr, 1); \ } while (0) +#define emith_t_to_carry(sr, is_sub) do { \ + emith_ror(sr, sr, 1); \ + emith_rol(sr, sr, 1); \ +} while (0) + #define emith_tpop_carry(sr, is_sub) \ emith_lsr(sr, sr, 1) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 3cf7a0d9..2320c501 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -45,6 +45,7 @@ #define REMAP_REGISTER 1 #define LOOP_DETECTION 1 #define LOOP_OPTIMIZER 1 +#define T_OPTIMIZER 1 // limits (per block) #define MAX_BLOCK_SIZE (BLOCK_INSN_LIMIT * 6 * 6) @@ -108,7 +109,7 @@ static int insns_compiled, 
hash_collisions, host_insn_count; #define GET_Rn() \ ((op >> 8) & 0x0f) -#define SHR_T SHR_SR // might make them separate someday +#define SHR_T 30 // separate T for not-used detection #define SHR_MEM 31 #define SHR_TMP -1 @@ -122,6 +123,7 @@ static int insns_compiled, hash_collisions, host_insn_count; #define I_SHIFT 4 #define Q_SHIFT 8 #define M_SHIFT 9 +#define T_SHIFT 11 static struct op_data { u8 op; @@ -263,7 +265,6 @@ static void REGPARM(3) *sh2_drc_log_entry(void *block, SH2 *sh2, u32 sr) return block; } #endif -// } debug #define TCACHE_BUFFERS 3 @@ -1527,7 +1528,7 @@ static void rcache_unmap_vreg(int x) FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, i, if (guest_regs[i].flags & GRF_DIRTY) { // if a dirty reg is unmapped save its value to context - if (~rcache_regs_discard & (1 << i)) + if ((~rcache_regs_discard | rcache_regs_now) & (1 << i)) emith_ctx_write(cache_regs[x].hreg, i * 4); guest_regs[i].flags &= ~GRF_DIRTY; } @@ -1565,7 +1566,7 @@ static void rcache_clean_vreg(int x) if (guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) { if (guest_regs[r].vreg != guest_regs[r].sreg && !cache_regs[guest_regs[r].sreg].locked && - (~rcache_regs_discard & (1 << r)) && + ((~rcache_regs_discard | rcache_regs_now) & (1 << r)) && !(rns & cache_regs[guest_regs[r].sreg].gregs)) { // statically mapped reg not in its sreg. move back to sreg rcache_evict_vreg(guest_regs[r].sreg); @@ -1578,7 +1579,7 @@ static void rcache_clean_vreg(int x) // cannot remap. keep dirty for writeback in unmap cache_regs[x].flags |= HRF_DIRTY; } else { - if (~rcache_regs_discard & (1 << r)) + if ((~rcache_regs_discard | rcache_regs_now) & (1 << r)) emith_ctx_write(cache_regs[x].hreg, r * 4); guest_regs[r].flags &= ~GRF_DIRTY; } @@ -1875,9 +1876,22 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && guest_regs[r].sreg == dst && !tr->locked) { // split aliases if r is STATIC in sreg and dst isn't already locked - rcache_lock_vreg(dst); // lock to avoid evicting dst - x = rcache_allocate_vreg(rsp_d & ali); - rcache_unlock_vreg(dst); + int t; + FOR_ALL_BITS_SET_DO(ali, t, + if ((guest_regs[t].flags & (GRF_STATIC|GRF_PINNED)) && + !(ali & ~(1 << t)) && + !cache_regs[guest_regs[t].sreg].locked && + !(rsp_d & cache_regs[guest_regs[t].sreg].gregs)) { + // alias is a single STATIC and its sreg is available + x = guest_regs[t].sreg; + rcache_evict_vreg(x); + } else { + rcache_lock_vreg(dst); // lock to avoid evicting dst + x = rcache_allocate_vreg(rsp_d & ali); + rcache_unlock_vreg(dst); + } + break; + ) if (x >= 0) { src = x; rcache_move_vreg(src, dst); @@ -2855,11 +2869,11 @@ static void emit_do_static_regs(int is_write, int tmpr) } #define DELAY_SAVE_T(sr) { \ + int t_ = rcache_get_tmp(); \ emith_bic_r_imm(sr, T_save); \ - emith_tst_r_imm(sr, T); \ - EMITH_SJMP_START(DCOND_EQ); \ - emith_or_r_imm_c(DCOND_NE, sr, T_save); \ - EMITH_SJMP_END(DCOND_EQ); \ + emith_and_r_r_imm(t_, sr, 1); \ + emith_or_r_r_lsl(sr, t_, T_SHIFT); \ + rcache_free_tmp(t_); \ } #define FLUSH_CYCLES(sr) \ @@ -2961,6 +2975,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) ADD_TO_ARRAY(branch_target_pc, branch_target_count, pc, ); if (ops[i].op == OP_LDC && (ops[i].dest & BITMASK1(SHR_SR)) && pc+2 < end_pc) op_flags[i+1] |= OF_BTARGET; // RTE entrypoint in case of SR.IMASK change + // unify T and SR since rcache doesn't know about "virtual" guest regs + if (ops[i].source & BITMASK1(SHR_T)) ops[i].source |= BITMASK1(SHR_SR); + if (ops[i].dest & BITMASK1(SHR_T)) 
ops[i].dest |= BITMASK1(SHR_SR); #if LOOP_DETECTION // loop types detected: // 1. target: ... BRA target -> idle loop @@ -3014,15 +3031,15 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.pending_branch_indirect = 1; // conditions g,h - cond.branch // poll/idle loops terminate with their backwards branch to the loop start if (drcf.pending_branch_direct && !(op_flags[i+1] & OF_DELAY_OP)) { - m2 &= ~(m1 | BITMASK2(SHR_PC, SHR_SR)); // conditions d,e + g,h + m2 &= ~(m1 | BITMASK3(SHR_PC, SHR_SR, SHR_T)); // conditions d,e + g,h if (m2 || ((op == OF_IDLE_LOOP) == (drcf.pending_branch_indirect))) op = 0; // conditions not met op_flags[v] = (op_flags[v] & ~OF_LOOP) | op; // set loop type drcf.loop_type = 0; #if LOOP_OPTIMIZER if (op_flags[v] & OF_BASIC_LOOP) { - m3 &= ~rcache_regs_static & ~BITMASK4(SHR_PC, SHR_PR, SHR_SR, SHR_MEM); - if (m3 && count_bits(m3) < count_bits(rcache_hregs_reg) && + m3 &= ~rcache_regs_static & ~BITMASK5(SHR_PC, SHR_PR, SHR_SR, SHR_T, SHR_MEM); + if (m3 && count_bits(m3) < count_bits(rcache_vregs_reg) && pinned_loop_count < ARRAY_SIZE(pinned_loop_pc)-1) { pinned_loop_mask[pinned_loop_count] = m3; pinned_loop_pc[pinned_loop_count++] = base_pc + 2*v; @@ -3154,48 +3171,63 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_free_tmp(tmp3); #endif + // check cycles + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + #if LOOP_OPTIMIZER if (op_flags[i] & OF_BASIC_LOOP) { if (pinned_loop_pc[pinned_loop_count] == pc) { // pin needed regs on loop entry FOR_ALL_BITS_SET_DO(pinned_loop_mask[pinned_loop_count], v, rcache_pin_reg(v)); emith_flush(); + // store current PC as loop target pinned_loop_ptr[pinned_loop_count] = tcache_ptr; } else op_flags[i] &= ~OF_BASIC_LOOP; } -#endif - // check cycles - sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); - emith_cmp_r_imm(sr, 0); - -#if LOOP_OPTIMIZER - void *jp = NULL; if (op_flags[i] & OF_BASIC_LOOP) { // if exiting a pinned loop pinned regs must be written back to ctx // since they are reloaded in the loop entry code - jp = tcache_ptr; - emith_jump_cond_patchable(DCOND_GT, jp); // XXX need API for JMP_POS + emith_cmp_r_imm(sr, 0); + EMITH_JMP_START(DCOND_GT); rcache_save_pinned(); - } + + if (blx_target_count < ARRAY_SIZE(blx_target_pc)) { + // exit via stub in blx table (saves some 1-3 insns in the main flow) + blx_target_ptr[blx_target_count] = tcache_ptr; + blx_target_pc[blx_target_count] = pc|1; + blx_target_bl[blx_target_count++] = NULL; + emith_jump_patchable(tcache_ptr); + } else { + // blx table full, must inline exit code + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, pc); + emith_jump(sh2_drc_exit); + rcache_free_tmp(tmp); + } + EMITH_JMP_END(DCOND_GT); + } else #endif - if (blx_target_count < ARRAY_SIZE(blx_target_pc)) { - // exit via stub in blx table (saves some 1-3 insns in the main flow) - blx_target_pc[blx_target_count] = pc|1; - blx_target_bl[blx_target_count] = NULL; - blx_target_ptr[blx_target_count++] = tcache_ptr; - } else { - // blx table full, must inline exit code - tmp = rcache_get_tmp_arg(0); - emith_move_r_imm_c(DCOND_LE, tmp, pc); - rcache_free_tmp(tmp); + { + if (blx_target_count < ARRAY_SIZE(blx_target_pc)) { + // exit via stub in blx table (saves some 1-3 insns in the main flow) + blx_target_pc[blx_target_count] = pc|1; + blx_target_bl[blx_target_count] = NULL; + emith_cmp_r_imm(sr, 0); + blx_target_ptr[blx_target_count++] = tcache_ptr; + emith_jump_cond_patchable(DCOND_LE, tcache_ptr); + } else { + // blx table full, must inline exit code + tmp = 
rcache_get_tmp_arg(0); + emith_cmp_r_imm(sr, 0); + EMITH_SJMP_START(DCOND_GT); + emith_move_r_imm_c(DCOND_LE, tmp, pc); + emith_jump_cond(DCOND_LE, sh2_drc_exit); + EMITH_SJMP_END(DCOND_GT); + rcache_free_tmp(tmp); + } } - emith_jump_cond_patchable(DCOND_LE, tcache_ptr); -#if LOOP_OPTIMIZER - if (op_flags[i] & OF_BASIC_LOOP) - emith_jump_patch(jp, tcache_ptr, NULL); -#endif #if (DRC_DEBUG & 32) // block hit counter @@ -3328,7 +3360,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_set_usage_now(opd[0].source); // current insn rcache_set_usage_soon(soon); // insns 1-4 rcache_set_usage_late(late & ~soon); // insns 5-9 - rcache_set_usage_discard(write & ~(late|soon|opd[0].source)); + rcache_set_usage_discard(write & ~(late|soon)); if (v <= 9) // upcoming rcache_flush, start writing back unused dirty stuff rcache_clean_masked(rcache_dirty_mask() & ~(write|opd[0].dest)); @@ -3512,11 +3544,17 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { case 0: // CLRT 0000000000001000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_set_t(sr, 0); +#if T_OPTIMIZER + if (~rcache_regs_discard & BITMASK1(SHR_T)) +#endif + emith_set_t(sr, 0); break; case 1: // SETT 0000000000011000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_set_t(sr, 1); +#if T_OPTIMIZER + if (~rcache_regs_discard & BITMASK1(SHR_T)) +#endif + emith_set_t(sr, 1); break; case 2: // CLRMAC 0000000000101000 emit_move_r_imm32(SHR_MACL, 0); @@ -3602,20 +3640,16 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_tmp(); emith_invalidate_t(); emith_bic_r_imm(sr, M|Q|T); - emith_tst_r_imm(tmp2, (1<<31)); - EMITH_SJMP_START(DCOND_EQ); - emith_or_r_imm_c(DCOND_NE, sr, Q); - EMITH_SJMP_END(DCOND_EQ); - emith_tst_r_imm(tmp3, (1<<31)); - EMITH_SJMP_START(DCOND_EQ); - emith_or_r_imm_c(DCOND_NE, sr, M); - EMITH_SJMP_END(DCOND_EQ); - emith_teq_r_r(tmp2, tmp3); - EMITH_SJMP_START(DCOND_PL); - emith_or_r_imm_c(DCOND_MI, sr, T); - EMITH_SJMP_END(DCOND_PL); + emith_lsr(tmp, tmp2, 31); // Q = Nn + emith_or_r_r_lsl(sr, tmp, Q_SHIFT); + emith_lsr(tmp, tmp3, 31); // M = Nm + emith_or_r_r_lsl(sr, tmp, M_SHIFT); + emith_eor_r_r_lsr(tmp, tmp2, 31); + emith_or_r_r(sr, tmp); // T = Q^M + rcache_free(tmp); goto end_op; case 0x08: // TST Rm,Rn 0010nnnnmmmm1000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); @@ -3708,26 +3742,27 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); - emith_clr_t_cond(sr); - emith_cmp_r_r(tmp2, tmp3); switch (op & 0x07) { case 0x00: // CMP/EQ - emith_set_t_cond(sr, DCOND_EQ); + tmp = DCOND_EQ; break; case 0x02: // CMP/HS - emith_set_t_cond(sr, DCOND_HS); + tmp = DCOND_HS; break; case 0x03: // CMP/GE - emith_set_t_cond(sr, DCOND_GE); + tmp = DCOND_GE; break; case 0x06: // CMP/HI - emith_set_t_cond(sr, DCOND_HI); + tmp = DCOND_HI; break; case 0x07: // CMP/GT - emith_set_t_cond(sr, DCOND_GT); + tmp = DCOND_GT; break; } + emith_clr_t_cond(sr); + emith_cmp_r_r(tmp2, tmp3); + emith_set_t_cond(sr, tmp); goto end_op; case 0x04: // DIV1 Rm,Rn 0011nnnnmmmm0100 // Q1 = carry(Rn = (Rn << 1) | T) @@ -3738,29 +3773,27 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // Q = M ^ Q1 ^ Q2 // T = (Q == M) = !(Q ^ M) = !(Q1 ^ Q2) tmp3 = 
rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp4); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_sync_t(sr); + EMITH_HINT_COND(DCOND_CS); emith_tpop_carry(sr, 0); - emith_adcf_r_r_r(tmp2, tmp, tmp); + emith_adcf_r_r_r(tmp2, tmp4, tmp4); emith_tpush_carry(sr, 0); // keep Q1 in T for now - rcache_free(tmp); - tmp4 = rcache_get_tmp(); - emith_and_r_r_imm(tmp4, sr, M); - emith_eor_r_r_lsr(sr, tmp4, M_SHIFT - Q_SHIFT); // Q ^= M - rcache_free_tmp(tmp4); + rcache_free(tmp4); + tmp = rcache_get_tmp(); + emith_and_r_r_imm(tmp, sr, M); + emith_eor_r_r_lsr(sr, tmp, M_SHIFT - Q_SHIFT); // Q ^= M + rcache_free_tmp(tmp); // add or sub, invert T if carry to get Q1 ^ Q2 // in: (Q ^ M) passed in Q, Q1 in T emith_sh2_div1_step(tmp2, tmp3, sr); - emith_bic_r_imm(sr, Q); - emith_tst_r_imm(sr, M); - EMITH_SJMP_START(DCOND_EQ); - emith_or_r_imm_c(DCOND_NE, sr, Q); // Q = M - EMITH_SJMP_END(DCOND_EQ); - emith_tst_r_imm(sr, T); - EMITH_SJMP_START(DCOND_EQ); - emith_eor_r_imm_c(DCOND_NE, sr, Q); // Q = M ^ Q1 ^ Q2 - EMITH_SJMP_END(DCOND_EQ); + tmp = rcache_get_tmp(); + emith_bic_r_imm(sr, Q); // Q = M + emith_and_r_r_imm(tmp, sr, M); + emith_or_r_r_lsr(sr, tmp, M_SHIFT - Q_SHIFT); + emith_and_r_r_imm(tmp, sr, T); // Q = M ^ Q1 ^ Q2 + emith_eor_r_r_lsl(sr, tmp, Q_SHIFT); emith_eor_r_imm(sr, T); // T = !(Q1 ^ Q2) goto end_op; case 0x05: // DMULU.L Rm,Rn 0011nnnnmmmm0101 @@ -3791,14 +3824,28 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_sync_t(sr); - if (op & 4) { // adc - emith_tpop_carry(sr, 0); - emith_adcf_r_r_r(tmp, tmp3, tmp2); - emith_tpush_carry(sr, 0); - } else { - emith_tpop_carry(sr, 1); - emith_sbcf_r_r_r(tmp, tmp3, tmp2); - emith_tpush_carry(sr, 1); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + if (op & 4) { + emith_t_to_carry(sr, 0); + emith_adc_r_r_r(tmp, tmp3, tmp2); + } else { + emith_t_to_carry(sr, 1); + emith_sbc_r_r_r(tmp, tmp3, tmp2); + } + } else +#endif + { + EMITH_HINT_COND(DCOND_CS); + if (op & 4) { // adc + emith_tpop_carry(sr, 0); + emith_adcf_r_r_r(tmp, tmp3, tmp2); + emith_tpush_carry(sr, 0); + } else { + emith_tpop_carry(sr, 1); + emith_sbcf_r_r_r(tmp, tmp3, tmp2); + emith_tpush_carry(sr, 1); + } } goto end_op; case 0x0b: // SUBV Rm,Rn 0011nnnnmmmm1011 @@ -3806,12 +3853,23 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_clr_t_cond(sr); - if (op & 4) { - emith_addf_r_r_r(tmp, tmp3, tmp2); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + if (op & 4) + emith_add_r_r_r(tmp,tmp3,tmp2); + else + emith_sub_r_r_r(tmp,tmp3,tmp2); } else - emith_subf_r_r_r(tmp, tmp3, tmp2); - emith_set_t_cond(sr, DCOND_VS); +#endif + { + emith_clr_t_cond(sr); + EMITH_HINT_COND(DCOND_VS); + if (op & 4) + emith_addf_r_r_r(tmp, tmp3, tmp2); + else + emith_subf_r_r_r(tmp, tmp3, tmp2); + emith_set_t_cond(sr, DCOND_VS); + } goto end_op; case 0x0d: // DMULS.L Rm,Rn 0011nnnnmmmm1101 tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); @@ -3834,9 +3892,16 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 2: // SHAL Rn 0100nnnn00100000 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_invalidate_t(); - emith_lslf(tmp, tmp2, 
1); - emith_carry_to_t(sr, 0); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) + emith_lsl(tmp, tmp2, 1); + else +#endif + { + emith_invalidate_t(); + emith_lslf(tmp, tmp2, 1); + emith_carry_to_t(sr, 0); + } goto end_op; case 1: // DT Rn 0100nnnn00010000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); @@ -3850,6 +3915,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #endif tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); emith_clr_t_cond(sr); + EMITH_HINT_COND(DCOND_EQ); emith_subf_r_r_imm(tmp, tmp2, 1); emith_set_t_cond(sr, DCOND_EQ); goto end_op; @@ -3862,12 +3928,22 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 2: // SHAR Rn 0100nnnn00100001 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_invalidate_t(); - if (op & 0x20) { - emith_asrf(tmp, tmp2, 1); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + if (op & 0x20) + emith_asr(tmp,tmp2,1); + else + emith_lsr(tmp,tmp2,1); } else - emith_lsrf(tmp, tmp2, 1); - emith_carry_to_t(sr, 0); +#endif + { + emith_invalidate_t(); + if (op & 0x20) { + emith_asrf(tmp, tmp2, 1); + } else + emith_lsrf(tmp, tmp2, 1); + emith_carry_to_t(sr, 0); + } goto end_op; case 1: // CMP/PZ Rn 0100nnnn00010001 tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); @@ -3919,24 +3995,45 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x05: // ROTR Rn 0100nnnn00000101 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_invalidate_t(); - if (op & 1) { - emith_rorf(tmp, tmp2, 1); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + if (op & 1) + emith_ror(tmp, tmp2, 1); + else + emith_rol(tmp, tmp2, 1); } else - emith_rolf(tmp, tmp2, 1); - emith_carry_to_t(sr, 0); +#endif + { + emith_invalidate_t(); + if (op & 1) + emith_rorf(tmp, tmp2, 1); + else + emith_rolf(tmp, tmp2, 1); + emith_carry_to_t(sr, 0); + } goto end_op; case 0x24: // ROTCL Rn 0100nnnn00100100 case 0x25: // ROTCR Rn 0100nnnn00100101 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, NULL); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_sync_t(sr); - emith_tpop_carry(sr, 0); - if (op & 1) { - emith_rorcf(tmp); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + emith_t_to_carry(sr, 0); + if (op & 1) + emith_rorc(tmp); + else + emith_rolc(tmp); } else - emith_rolcf(tmp); - emith_tpush_carry(sr, 0); +#endif + { + emith_tpop_carry(sr, 0); + if (op & 1) + emith_rorcf(tmp); + else + emith_rolcf(tmp); + emith_tpush_carry(sr, 0); + } goto end_op; case 0x15: // CMP/PL Rn 0100nnnn00010101 tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); @@ -4131,9 +4228,18 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0a: // NEGC Rm,Rn 0110nnnnmmmm1010 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_sync_t(sr); - emith_tpop_carry(sr, 1); - emith_negcf_r_r(tmp2, tmp); - emith_tpush_carry(sr, 1); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + emith_t_to_carry(sr, 1); + emith_negc_r_r(tmp2, tmp); + } else +#endif + { + EMITH_HINT_COND(DCOND_CS); + emith_tpop_carry(sr, 1); + emith_negcf_r_r(tmp2, tmp); + emith_tpush_carry(sr, 1); + } break; case 0x0b: // NEG Rm,Rn 0110nnnnmmmm1011 emith_neg_r_r(tmp2, tmp); @@ -4639,9 +4745,6 @@ static void sh2_generate_utils(void) host_arg2reg(arg2, 2); host_arg2reg(arg3, 3); emith_move_r_r(arg0, arg0); // nop - emith_move_r_r(arg1, arg1); // nop - emith_move_r_r(arg2, arg2); // nop - emith_move_r_r(arg3, arg3); // nop emith_flush(); // 
sh2_drc_write8(u32 a, u32 d) @@ -4665,6 +4768,7 @@ static void sh2_generate_utils(void) // d = sh2_drc_read8(u32 a) sh2_drc_read8 = (void *)tcache_ptr; emith_ctx_read_ptr(arg1, offsetof(SH2, read8_map)); + EMITH_HINT_COND(DCOND_CS); emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CS); emith_and_r_r_c(DCOND_CC, arg0, arg3); @@ -4679,6 +4783,7 @@ static void sh2_generate_utils(void) // d = sh2_drc_read16(u32 a) sh2_drc_read16 = (void *)tcache_ptr; emith_ctx_read_ptr(arg1, offsetof(SH2, read16_map)); + EMITH_HINT_COND(DCOND_CS); emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CS); emith_and_r_r_c(DCOND_CC, arg0, arg3); @@ -4692,6 +4797,7 @@ static void sh2_generate_utils(void) // d = sh2_drc_read32(u32 a) sh2_drc_read32 = (void *)tcache_ptr; emith_ctx_read_ptr(arg1, offsetof(SH2, read32_map)); + EMITH_HINT_COND(DCOND_CS); emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CS); emith_and_r_r_c(DCOND_CC, arg0, arg3); @@ -4706,6 +4812,7 @@ static void sh2_generate_utils(void) // d = sh2_drc_read8_poll(u32 a) sh2_drc_read8_poll = (void *)tcache_ptr; emith_ctx_read_ptr(arg1, offsetof(SH2, read8_map)); + EMITH_HINT_COND(DCOND_CS); emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CC); emith_move_r_r_ptr_c(DCOND_CS, arg1, CONTEXT_REG); @@ -4723,6 +4830,7 @@ static void sh2_generate_utils(void) // d = sh2_drc_read16_poll(u32 a) sh2_drc_read16_poll = (void *)tcache_ptr; emith_ctx_read_ptr(arg1, offsetof(SH2, read16_map)); + EMITH_HINT_COND(DCOND_CS); emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CC); emith_move_r_r_ptr_c(DCOND_CS, arg1, CONTEXT_REG); @@ -4739,6 +4847,7 @@ static void sh2_generate_utils(void) // d = sh2_drc_read32_poll(u32 a) sh2_drc_read32_poll = (void *)tcache_ptr; emith_ctx_read_ptr(arg1, offsetof(SH2, read32_map)); + EMITH_HINT_COND(DCOND_CS); emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CC); emith_move_r_r_ptr_c(DCOND_CS, arg1, CONTEXT_REG); @@ -4834,16 +4943,19 @@ static void sh2_generate_utils(void) emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx)); emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg2, 0); emith_read_r_r_offs(arg3, arg1, offsetof(SH2, rts_cache)); - emith_cmp_r_r(arg0, arg3); #if (DRC_DEBUG & 128) + emith_cmp_r_r(arg0, arg3); EMITH_SJMP_START(DCOND_EQ); emith_move_r_ptr_imm(arg3, (uptr)&rcmiss); emith_read_r_r_offs_c(DCOND_NE, arg1, arg3, 0); emith_add_r_imm_c(DCOND_NE, arg1, 1); emith_write_r_r_offs_c(DCOND_NE, arg1, arg3, 0); + emith_jump_cond(DCOND_NE, sh2_drc_dispatcher); EMITH_SJMP_END(DCOND_EQ); -#endif +#else + emith_cmp_r_r(arg0, arg3); emith_jump_cond(DCOND_NE, sh2_drc_dispatcher); +#endif emith_read_r_r_offs_ptr(arg0, arg1, offsetof(SH2, rts_cache) + sizeof(void *)); emith_sub_r_imm(arg2, 2*sizeof(void *)); emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); @@ -4874,7 +4986,7 @@ static void sh2_generate_utils(void) emith_sub_r_imm(tmp, 4*2); rcache_clean(); // push SR - tmp = rcache_get_reg_arg(0, SHR_SP,&tmp2); + tmp = rcache_get_reg_arg(0, SHR_SP, &tmp2); emith_add_r_r_imm(tmp, tmp2, 4); tmp = rcache_get_reg_arg(1, SHR_SR, NULL); emith_clear_msb(tmp, tmp, 22); @@ -5478,6 +5590,8 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, else if ((lowest_mova && lowest_mova <= pc) || (lowest_literal && lowest_literal <= pc)) break; // text area collides with data area + else if ((op_flags[i] & OF_BTARGET) && dr_get_entry(pc, is_slave, &i_end)) + break; // branch target already compiled op = FETCH_OP(pc); switch ((op & 
0xf000) >> 12) @@ -5490,19 +5604,19 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, switch (GET_Fx()) { case 0: // STC SR,Rn 0000nnnn00000010 - tmp = SHR_SR; + tmp = BITMASK2(SHR_SR, SHR_T); break; case 1: // STC GBR,Rn 0000nnnn00010010 - tmp = SHR_GBR; + tmp = BITMASK1(SHR_GBR); break; case 2: // STC VBR,Rn 0000nnnn00100010 - tmp = SHR_VBR; + tmp = BITMASK1(SHR_VBR); break; default: goto undefined; } opd->op = OP_MOVE; - opd->source = BITMASK1(tmp); + opd->source = tmp; opd->dest = BITMASK1(GET_Rn()); break; case 0x03: @@ -5549,7 +5663,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->imm = 1; break; case 2: // CLRMAC 0000000000101000 - opd->dest = BITMASK3(SHR_T, SHR_MACL, SHR_MACH); + opd->dest = BITMASK2(SHR_MACL, SHR_MACH); break; default: goto undefined; @@ -5612,7 +5726,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 2: // RTE 0000000000101011 opd->op = OP_RTE; opd->source = BITMASK1(SHR_SP); - opd->dest = BITMASK3(SHR_SP, SHR_SR, SHR_PC); + opd->dest = BITMASK4(SHR_SP, SHR_SR, SHR_T, SHR_PC); opd->cycles = 4; next_is_delay = 1; end_block = !(op_flags[i+1+next_is_delay] & OF_BTARGET); @@ -5664,7 +5778,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, break; case 0x07: // DIV0S Rm,Rn 0010nnnnmmmm0111 opd->source = BITMASK2(GET_Rm(), GET_Rn()); - opd->dest = BITMASK1(SHR_SR); + opd->dest = BITMASK2(SHR_SR, SHR_T); break; case 0x08: // TST Rm,Rn 0010nnnnmmmm1000 opd->source = BITMASK2(GET_Rm(), GET_Rn()); @@ -5707,8 +5821,8 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->dest = BITMASK1(SHR_T); break; case 0x04: // DIV1 Rm,Rn 0011nnnnmmmm0100 - opd->source = BITMASK3(GET_Rm(), GET_Rn(), SHR_SR); - opd->dest = BITMASK2(GET_Rn(), SHR_SR); + opd->source = BITMASK4(GET_Rm(), GET_Rn(), SHR_SR, SHR_T); + opd->dest = BITMASK3(GET_Rn(), SHR_SR, SHR_T); break; case 0x05: // DMULU.L Rm,Rn 0011nnnnmmmm0101 case 0x0d: // DMULS.L Rm,Rn 0011nnnnmmmm1101 @@ -5778,30 +5892,30 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, switch (op & 0x3f) { case 0x02: // STS.L MACH,@-Rn 0100nnnn00000010 - tmp = SHR_MACH; + tmp = BITMASK1(SHR_MACH); break; case 0x12: // STS.L MACL,@-Rn 0100nnnn00010010 - tmp = SHR_MACL; + tmp = BITMASK1(SHR_MACL); break; case 0x22: // STS.L PR,@-Rn 0100nnnn00100010 - tmp = SHR_PR; + tmp = BITMASK1(SHR_PR); break; case 0x03: // STC.L SR,@-Rn 0100nnnn00000011 - tmp = SHR_SR; + tmp = BITMASK2(SHR_SR, SHR_T); opd->cycles = 2; break; case 0x13: // STC.L GBR,@-Rn 0100nnnn00010011 - tmp = SHR_GBR; + tmp = BITMASK1(SHR_GBR); opd->cycles = 2; break; case 0x23: // STC.L VBR,@-Rn 0100nnnn00100011 - tmp = SHR_VBR; + tmp = BITMASK1(SHR_VBR); opd->cycles = 2; break; default: goto undefined; } - opd->source = BITMASK2(GET_Rn(), tmp); + opd->source = BITMASK1(GET_Rn()) | tmp; opd->dest = BITMASK2(GET_Rn(), SHR_MEM); break; case 0x04: @@ -5831,26 +5945,26 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, switch (op & 0x3f) { case 0x06: // LDS.L @Rm+,MACH 0100mmmm00000110 - tmp = SHR_MACH; + tmp = BITMASK1(SHR_MACH); break; case 0x16: // LDS.L @Rm+,MACL 0100mmmm00010110 - tmp = SHR_MACL; + tmp = BITMASK1(SHR_MACL); break; case 0x26: // LDS.L @Rm+,PR 0100mmmm00100110 - tmp = SHR_PR; + tmp = BITMASK1(SHR_PR); break; case 0x07: // LDC.L @Rm+,SR 0100mmmm00000111 - tmp = SHR_SR; + tmp = BITMASK2(SHR_SR, SHR_T); opd->op = OP_LDC; opd->cycles = 3; break; case 0x17: // LDC.L @Rm+,GBR 0100mmmm00010111 
- tmp = SHR_GBR; + tmp = BITMASK1(SHR_GBR); opd->op = OP_LDC; opd->cycles = 3; break; case 0x27: // LDC.L @Rm+,VBR 0100mmmm00100111 - tmp = SHR_VBR; + tmp = BITMASK1(SHR_VBR); opd->op = OP_LDC; opd->cycles = 3; break; @@ -5858,7 +5972,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, goto undefined; } opd->source = BITMASK2(GET_Rn(), SHR_MEM); - opd->dest = BITMASK2(GET_Rn(), tmp); + opd->dest = BITMASK1(GET_Rn()) | tmp; break; case 0x08: case 0x09: @@ -5931,20 +6045,20 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, switch (GET_Fx()) { case 0: // LDC Rm,SR 0100mmmm00001110 - tmp = SHR_SR; + tmp = BITMASK2(SHR_SR, SHR_T); break; case 1: // LDC Rm,GBR 0100mmmm00011110 - tmp = SHR_GBR; + tmp = BITMASK1(SHR_GBR); break; case 2: // LDC Rm,VBR 0100mmmm00101110 - tmp = SHR_VBR; + tmp = BITMASK1(SHR_VBR); break; default: goto undefined; } opd->op = OP_LDC; opd->source = BITMASK1(GET_Rn()); - opd->dest = BITMASK1(tmp); + opd->dest = tmp; break; case 0x0f: // MAC.W @Rm+,@Rn+ 0100nnnnmmmm1111 @@ -6130,7 +6244,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, break; case 0x0300: // TRAPA #imm 11000011iiiiiiii opd->op = OP_TRAPA; - opd->source = BITMASK3(SHR_SP, SHR_PC, SHR_SR); + opd->source = BITMASK4(SHR_SP, SHR_PC, SHR_SR, SHR_T); opd->dest = BITMASK2(SHR_SP, SHR_PC); opd->imm = (op & 0xff); opd->cycles = 8; @@ -6256,9 +6370,6 @@ end: last_btarget = 0; op = 0; // delay/poll insns counter for (i = 0, pc = base_pc; i < i_end; i++, pc += 2) { - int null; - if ((op_flags[i] & OF_BTARGET) && dr_get_entry(pc, is_slave, &null)) - break; // branch target already compiled opd = &ops[i]; crc += FETCH_OP(pc); diff --git a/pico/32x/32x.c b/pico/32x/32x.c index 9993bfa8..7e2e039e 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -38,17 +38,19 @@ void p32x_update_irls(SH2 *active_sh2, unsigned int m68k_cycles) if (active_sh2 != NULL) m68k_cycles = sh2_cycles_done_m68k(active_sh2); + // find top bit = highest irq number (0 <= irl <= 14/2) by binary search + // msh2 irqs = Pico32x.sh2irqs | Pico32x.sh2irqi[0]; - while ((irqs >>= 1)) - mlvl++; - mlvl *= 2; + if (irqs >= 0x10) mlvl += 8, irqs >>= 4; + if (irqs >= 0x04) mlvl += 4, irqs >>= 2; + if (irqs >= 0x02) mlvl += 2, irqs >>= 1; // ssh2 irqs = Pico32x.sh2irqs | Pico32x.sh2irqi[1]; - while ((irqs >>= 1)) - slvl++; - slvl *= 2; + if (irqs >= 0x10) slvl += 8, irqs >>= 4; + if (irqs >= 0x04) slvl += 4, irqs >>= 2; + if (irqs >= 0x02) slvl += 2, irqs >>= 1; mrun = sh2_irl_irq(&msh2, mlvl, msh2.state & SH2_STATE_RUN); if (mrun) { -- 2.39.5
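
Note on the branchless T bit handling introduced in emit_mips.c above: since MIPS has no flags register, emith_set_t_cond now materializes most conditions as a 0/1 value with SLT/SLTU (inverting with XOR where the natural slt sense is the opposite one) and ORs the result into SR, instead of jumping around the OR as before. A minimal stand-alone C model of that lowering, with illustrative names only (sltu/slt stand in for the MIPS instructions; this is not code from the patch):

    #include <assert.h>
    #include <stdint.h>

    #define T 1u                          /* SR.T lives in bit 0 */

    /* stand-ins for the MIPS SLTU/SLT instructions: result is 0 or 1 */
    static uint32_t sltu(uint32_t a, uint32_t b) { return a < b; }
    static uint32_t slt (int32_t  a, int32_t  b) { return a < b; }

    /* "or sr,cond01; xor sr,T if inverted" - the sequence emith_set_t_cond
     * emits once the condition is available as 0/1 in a register; T is
     * assumed clear on entry (emith_clr_t_cond ran before the compare) */
    static uint32_t set_t(uint32_t sr, uint32_t cond01, int inverted)
    {
        sr |= cond01;
        if (inverted)                     /* decided at translation time */
            sr ^= T;
        return sr;
    }

    int main(void)
    {
        uint32_t sr = 0x3f0;              /* some SR value with T clear */

        /* CMP/HS: T = Rn >= Rm unsigned; SLTU yields Rn < Rm, so the
         * emitter marks the result as inverted (MIPS_BEQ branch sense) */
        assert(set_t(sr, sltu(2, 3), 1) == sr);        /* 2 >= 3: T=0 */
        assert(set_t(sr, sltu(3, 2), 1) == (sr | T));  /* 3 >= 2: T=1 */

        /* CMP/GT: T = Rn > Rm signed = slt(Rm, Rn), not inverted */
        assert(set_t(sr, slt(-5, 4), 0) == (sr | T));  /* 4 > -5: T=1 */

        /* CMP/EQ: XOR, then SLTU against 1, as in emith_set_t_cond */
        assert(set_t(sr, sltu(7 ^ 7, 1), 0) == (sr | T));
        return 0;
    }

This is also why emith_cmp_r_r/emith_cmp_r_imm above only record their operands via emith_set_compare_flags: the choice between slt, sltu, a beq/bne against the other operand, or a compare-and-branch is deferred until the consuming condition (branch or T bit) is known.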