From 7869213d35f3ec020083a1c2b3f35c107e0c52a7 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 10 Oct 2019 23:52:39 +0200 Subject: [PATCH] sh2 drc: speed optimization and bugfixing --- Makefile | 2 +- cpu/drc/emit_arm64.c | 18 ++++++++------ cpu/drc/emit_mips.c | 59 +++++++++++++++++++++++++------------------- cpu/drc/emit_x86.c | 5 ++++ cpu/sh2/compiler.c | 32 +++++++++--------------- 5 files changed, 61 insertions(+), 55 deletions(-) diff --git a/Makefile b/Makefile index 63e9c833..15549dca 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ ifeq "$(DEBUG)" "0" CFLAGS += -O3 -DNDEBUG endif -# This is actually needed, bevieve me. +# This is actually needed, believe me. # If you really have to disable this, set NO_ALIGN_FUNCTIONS elsewhere. ifndef NO_ALIGN_FUNCTIONS CFLAGS += -falign-functions=2 diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 688649b5..3ef402b4 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -160,7 +160,7 @@ enum { XT_UXTW=0x4, XT_UXTX=0x6, XT_LSL=0x7, XT_SXTW=0xc, XT_SXTX=0xe }; #define A64_ROR_REG(rd, rn, rm) \ A64_INSN(0xd,0x0,0x3,_,rm,_,0xb,rn,rd) -// rd = REVERSE(n) rn +// rd = REVERSE(rn) #define A64_RBIT_REG(rd, rn) \ A64_INSN(0xd,0x2,0x3,_,_,_,_,rn,rd) @@ -327,9 +327,10 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; // if-then-else conditional execution helpers -#define JMP_POS(ptr) \ +#define JMP_POS(ptr) { \ ptr = tcache_ptr; \ - EMIT(A64_B(0)); + EMIT(A64_B(0)); \ +} #define JMP_EMIT(cond, ptr) { \ u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr); \ @@ -1225,9 +1226,9 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) emith_tst_r_imm(sr, S); \ EMITH_SJMP_START(DCOND_EQ); \ /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ - /* to check: add MACH[15] to MACH[31:16]. this is 0 if no overflow */ \ - emith_asrf(rn, mh, 16); /* sum = (MACH>>16) + ((MACH>>15)&1) */ \ - emith_adcf_r_imm(rn, 0); /* (MACH>>15) is in carry after shift */ \ + /* to check: add MACH >> 31 to MACH >> 15. this is 0 if no overflow */ \ + emith_asr(rn, mh, 15); \ + emith_addf_r_r_r_lsr(rn, rn, mh, 31); \ EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ @@ -1280,11 +1281,12 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_tpop_carry(sr, is_sub) do { \ if (is_sub) \ emith_eor_r_imm(sr, 1); \ - emith_lsrf(sr, sr, 1); \ + emith_ror(sr, sr, 1); \ + emith_addf_r_r(sr, sr); \ } while (0) #define emith_tpush_carry(sr, is_sub) do { \ - emith_adc_r_r(sr, sr); \ + emith_adc_r_r(sr, Z0); \ if (is_sub) \ emith_eor_r_imm(sr, 1); \ } while (0) diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index fadf5744..4a452a68 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -21,7 +21,7 @@ #define AT 1 // used to hold intermediate results #define FNZ 15 // emulated processor flags: N (bit 31) ,Z (all bits) #define FC 24 // emulated processor flags: C (bit 0), others 0 -#define FV 25 // emulated processor flags: Nt^Ns (bit 31). others ? +#define FV 25 // emulated processor flags: Nt^Ns (bit 31). others x // unified conditions; virtual, not corresponding to anything real on MIPS @@ -208,8 +208,8 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; } while (0) // FIFO for 2 instructions, for delay slot handling -u32 emith_last_insns[2] = { -1,-1 }; -int emith_last_idx, emith_last_cnt; +static u32 emith_last_insns[2] = { -1,-1 }; +static int emith_last_idx, emith_last_cnt; #define EMIT_PUSHOP() \ do { \ @@ -248,7 +248,7 @@ static int emith_is_b(u32 op) // B ((op>>26) == OP__RT && ((op>>16) & 036) == RT_BLTZ); } // register usage for dependency evaluation XXX better do this as in emit_arm? static uint64_t emith_has_rs[3] = // OP__FN, OP__RT, others - { 0x00fffffffffa0ff0ULL, 0x000fff0fUL, 0xffffffff0f007f30ULL }; + { 0x00fffffffffa0ff0ULL, 0x000fff0fUL, 0xffffffff0f007ff0ULL }; static uint64_t emith_has_rt[3] = // OP__FN, OP__RT, others { 0xff00fffffff00cffULL, 0x00000000UL, 0x8000ff0000000030ULL }; static uint64_t emith_has_rd[3] = // OP__FN, OP__RT, others (rt instead of rd) @@ -308,21 +308,23 @@ static void *emith_branch(u32 op) bop = emith_b_isswap(op, op2); } + // flush FIFO and branch + tcache_ptr = (void *)((u32 *)tcache_ptr - emith_last_cnt); + if (emith_last_insns[idx^1] != -1) + EMIT_PTR(tcache_ptr, emith_last_insns[idx^1]); if (bop) { // can swap - tcache_ptr = (void *)((u32 *)tcache_ptr - emith_last_cnt); - if (emith_last_insns[idx^1] != -1) - EMIT_PTR(tcache_ptr, emith_last_insns[idx^1]); bp = tcache_ptr; EMIT_PTR(tcache_ptr, bop); COUNT_OP; EMIT_PTR(tcache_ptr, emith_last_insns[idx]); - emith_last_insns[0] = emith_last_insns[1] = -1; - emith_last_cnt = 0; } else { // can't swap - emith_flush(); + if (emith_last_insns[idx] != -1) + EMIT_PTR(tcache_ptr, emith_last_insns[idx]); bp = tcache_ptr; EMIT_PTR(tcache_ptr, op); COUNT_OP; EMIT_PTR(tcache_ptr, MIPS_NOP); COUNT_OP; } + emith_last_insns[0] = emith_last_insns[1] = -1; + emith_last_cnt = 0; return bp; } @@ -392,8 +394,8 @@ static void *emith_branch(u32 op) // flag emulation creates 2 (ie cmp #0/beq) up to 9 (ie adcf/ble) extra insns. // flag handling shortcuts may reduce this by 1-4 insns, see emith_cond_check() -int emith_flg_rs, emith_flg_rt; // registers used in FNZ=rs-rt (aka cmp_r_r) -int emith_flg_noV; // V flag known not to be set +static int emith_flg_rs, emith_flg_rt; // registers used in FNZ=rs-rt (cmp_r_r) +static int emith_flg_noV; // V flag known not to be set // store minimal cc information: rd, rt^rs, carry // NB: the result *must* first go to FNZ, in case rd == rs or rd == rt. @@ -625,7 +627,11 @@ static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) // move immediate static void emith_move_imm(int r, uintptr_t imm) { - if ((s16)imm != imm) { + if ((s16)imm == imm) { + EMIT(MIPS_ADD_IMM(r, Z0, imm)); + } else if (!(imm >> 16)) { + EMIT(MIPS_OR_IMM(r, Z0, imm)); + } else { int s = Z0; if (imm >> 16) { EMIT(MIPS_MOVT_IMM(r, imm >> 16)); @@ -633,8 +639,7 @@ static void emith_move_imm(int r, uintptr_t imm) } if ((u16)imm) EMIT(MIPS_OR_IMM(r, s, (u16)imm)); - } else - EMIT(MIPS_ADD_IMM(r, Z0, imm)); + } } #define emith_move_r_ptr_imm(r, imm) \ @@ -1372,16 +1377,17 @@ static int emith_cond_check(int cond, int *r) emith_tst_r_imm(sr, S); \ EMITH_SJMP_START(DCOND_EQ); \ /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ - /* to check: add MACH[15] to MACH[31:16]. this is 0 if no overflow */ \ - emith_asrf(rn, mh, 16); /* sum = (MACH>>16) + ((MACH>>15)&1) */ \ - emith_adcf_r_imm(rn, 0); /* (MACH>>15) is in carry after shift */ \ + /* to check: add MACH >> 31 to MACH >> 15. this is 0 if no overflow */ \ + emith_asr(rn, mh, 15); \ + emith_add_r_r_r_lsr(rn, rn, mh, 31); /* sum = (MACH>>31)+(MACH>>15) */ \ + emith_teq_r_r(rn, Z0); /* (need only N and Z flags) */ \ EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ - EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> +ovl */ \ - emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0xffffffff */ \ - emith_sub_r_imm_c(DCOND_GT, mh, 1); /* 0x00007fff */ \ - EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_START(DCOND_PL); /* sum > 0 -> +ovl */ \ + emith_sub_r_imm_c(DCOND_MI, ml, 1); /* 0xffffffff */ \ + emith_sub_r_imm_c(DCOND_MI, mh, 1); /* 0x00007fff */ \ + EMITH_SJMP_END(DCOND_PL); \ EMITH_SJMP_END(DCOND_EQ); \ EMITH_SJMP_END(DCOND_EQ); \ } while (0) @@ -1399,14 +1405,15 @@ static int emith_cond_check(int cond, int *r) /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ /* to check: add MACL[31] to MACH. this is 0 if no overflow */ \ emith_lsr(rn, ml, 31); \ - emith_addf_r_r(rn, mh); /* sum = MACH + ((MACL>>31)&1) */ \ + emith_add_r_r(rn, mh); /* sum = MACH + ((MACL>>31)&1) */ \ + emith_teq_r_r(rn, Z0); /* (need only N and Z flags) */ \ EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ /* XXX: LSB signalling only in SH1, or in SH2 too? */ \ emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ emith_move_r_imm_c(DCOND_NE, ml, 0x80000000); /* negative ovrfl */ \ - EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> positive ovrfl */ \ - emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0x7fffffff */ \ - EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_START(DCOND_PL); /* sum > 0 -> positive ovrfl */ \ + emith_sub_r_imm_c(DCOND_MI, ml, 1); /* 0x7fffffff */ \ + EMITH_SJMP_END(DCOND_PL); \ EMITH_SJMP_END(DCOND_EQ); \ EMITH_SJMP_END(DCOND_EQ); \ } while (0) diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 451fa8d0..44e10ecf 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -1225,6 +1225,11 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common rcache_free_tmp(tmp_); \ } while (0) +#define emith_carry_to_t(sr, is_sub) do { \ + emith_rorc(sr); \ + emith_rol(sr, sr, 1); \ +} while (0) + #define emith_tpop_carry(sr, is_sub) \ emith_lsr(sr, sr, 1) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 09546634..2c1e8cff 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -69,7 +69,7 @@ // 800 - state dump on exit // { #ifndef DRC_DEBUG -#define DRC_DEBUG 0//x8c7 +#define DRC_DEBUG 0//x847 #endif #if DRC_DEBUG @@ -2999,6 +2999,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) void *block_entry_ptr; struct block_desc *block; struct block_entry *entry; + struct block_link *bl; u16 *dr_pc_base; struct op_data *opd; int blkid_main = 0; @@ -3245,6 +3246,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (pinned_loop_pc[pinned_loop_count] == pc) { // pin needed regs on loop entry FOR_ALL_BITS_SET_DO(pinned_loop_mask[pinned_loop_count], v, rcache_pin_reg(v)); + emith_flush(); pinned_loop_ptr[pinned_loop_count] = tcache_ptr; } else op_flags[i] &= ~OF_BASIC_LOOP; @@ -3920,9 +3922,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_invalidate_t(); - emith_tpop_carry(sr, 0); // dummy emith_lslf(tmp, tmp2, 1); - emith_tpush_carry(sr, 0); + emith_carry_to_t(sr, 0); goto end_op; case 1: // DT Rn 0100nnnn00010000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); @@ -3949,12 +3950,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_invalidate_t(); - emith_tpop_carry(sr, 0); // dummy if (op & 0x20) { emith_asrf(tmp, tmp2, 1); } else emith_lsrf(tmp, tmp2, 1); - emith_tpush_carry(sr, 0); + emith_carry_to_t(sr, 0); goto end_op; case 1: // CMP/PZ Rn 0100nnnn00010001 tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); @@ -4007,12 +4007,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_invalidate_t(); - emith_tpop_carry(sr, 0); // dummy if (op & 1) { emith_rorf(tmp, tmp2, 1); } else emith_rolf(tmp, tmp2, 1); - emith_tpush_carry(sr, 0); + emith_carry_to_t(sr, 0); goto end_op; case 0x24: // ROTCL Rn 0100nnnn00100100 case 0x25: // ROTCR Rn 0100nnnn00100101 @@ -4391,7 +4390,6 @@ end_op: int cond = -1; int ctaken = 0; void *target = NULL; - struct block_link *bl = NULL; if (OP_ISBRACND(opd_b->op)) ctaken = (op_flags[i] & OF_DELAY_OP) ? 1 : 2; @@ -4545,8 +4543,6 @@ end_op: } } - if (bl) - memcpy(bl->jdisp, bl->jump, emith_jump_at_size()); #if CALL_STACK if (rtsadd) emith_move_r_imm_s8_patch(rtsadd, tcache_ptr - (u8 *)rtsret); @@ -4565,7 +4561,6 @@ end_op: } else if (drcf.pending_branch_indirect) { u32 target_pc; - struct block_link *bl = NULL; tmp = rcache_get_reg_arg(0, SHR_PC, NULL); @@ -4629,8 +4624,6 @@ end_op: if (! OP_ISBRAUC(opd->op)) { - struct block_link *bl; - tmp = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(tmp); emith_sync_t(tmp); @@ -4645,18 +4638,15 @@ end_op: emith_move_r_imm(tmp, pc); emith_jump_patchable(sh2_drc_dispatcher); rcache_invalidate(); - - if (bl) - memcpy(bl->jdisp, bl->jump, emith_jump_at_size()); } else rcache_flush(); // emit blx area for (i = 0; i < blx_target_count; i++) { void *target = (blx_target_pc[i] & 1 ? sh2_drc_exit : sh2_drc_dispatcher); - struct block_link *bl = blx_target_bl[i]; emith_pool_check(); + bl = blx_target_bl[i]; if (bl) bl->blx = tcache_ptr; emith_jump_patch(blx_target_ptr[i], tcache_ptr, NULL); @@ -4664,9 +4654,6 @@ end_op: emith_move_r_imm(tmp, blx_target_pc[i] & ~1); emith_jump(target); rcache_invalidate(); - - if (bl) - memcpy(bl->jdisp, bl->blx, emith_jump_at_size()); } emith_flush(); @@ -4692,6 +4679,11 @@ end_op: emith_jump_patch(branch_patch_ptr[i], target, NULL); } + // fill blx backup; do this last to backup final patched code + for (i = 0; i < block->entry_count; i++) + for (bl = block->entryp[i].o_links; bl; bl = bl->o_next) + memcpy(bl->jdisp, bl->blx ?: bl->jump, emith_jump_at_size()); + tcache_ptrs[tcache_id] = tcache_ptr; host_instructions_updated(block_entry_ptr, tcache_ptr); -- 2.39.5