From 904fb98e6ca213ca4cb31a092e1bbf8dc38b1ea6 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 6 May 2020 23:06:10 +0200 Subject: [PATCH] sh2: optimisations in drc --- cpu/drc/cmn.h | 24 +++++------ cpu/sh2/compiler.c | 99 ++++++++++++++++++++++++++++------------------ 2 files changed, 72 insertions(+), 51 deletions(-) diff --git a/cpu/drc/cmn.h b/cpu/drc/cmn.h index 2eb52aad..9c041e70 100644 --- a/cpu/drc/cmn.h +++ b/cpu/drc/cmn.h @@ -17,18 +17,18 @@ void drc_cmn_cleanup(void); // binary search approach, since we don't have CLZ on ARM920T #define FOR_ALL_BITS_SET_DO(mask, bit, code) { \ u32 __mask = mask; \ - for (bit = 31; bit >= 0 && mask; bit--, __mask <<= 1) { \ - if (!(__mask & (0xffff << 16))) \ - bit -= 16, __mask <<= 16; \ - if (!(__mask & (0xff << 24))) \ - bit -= 8, __mask <<= 8; \ - if (!(__mask & (0xf << 28))) \ - bit -= 4, __mask <<= 4; \ - if (!(__mask & (0x3 << 30))) \ - bit -= 2, __mask <<= 2; \ - if (!(__mask & (0x1 << 31))) \ - bit -= 1, __mask <<= 1; \ - if (__mask & (0x1 << 31)) { \ + for (bit = 0; bit < 32 && mask; bit++, __mask >>= 1) { \ + if (!(__mask & 0xffff)) \ + bit += 16,__mask >>= 16; \ + if (!(__mask & 0xff)) \ + bit += 8, __mask >>= 8; \ + if (!(__mask & 0xf)) \ + bit += 4, __mask >>= 4; \ + if (!(__mask & 0x3)) \ + bit += 2, __mask >>= 2; \ + if (!(__mask & 0x1)) \ + bit += 1, __mask >>= 1; \ + if (__mask & 0x1) { \ code; \ } \ } \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index ad1983bf..bfd9ec06 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -1549,22 +1549,31 @@ static u32 rcache_regs_clean; // regs needing cleaning static void rcache_lock_vreg(int x) { if (x >= 0) { + cache_regs[x].locked ++; +#if DRC_DEBUG & 64 if (cache_regs[x].type == HR_FREE) { printf("locking free vreg %x, aborting\n", x); exit(1); } - cache_regs[x].locked ++; + if (!cache_regs[x].locked) { + printf("locking overflow vreg %x, aborting\n", x); + exit(1); + } +#endif } } static void rcache_unlock_vreg(int x) { if (x >= 0) { +#if DRC_DEBUG & 64 if (cache_regs[x].type == HR_FREE) { printf("unlocking free vreg %x, aborting\n", x); exit(1); } - cache_regs[x].locked --; +#endif + if (cache_regs[x].locked) + cache_regs[x].locked --; } } @@ -1582,7 +1591,7 @@ static void rcache_unmap_vreg(int x) FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, i, if (guest_regs[i].flags & GRF_DIRTY) { // if a dirty reg is unmapped save its value to context - if (~rcache_regs_discard & (1 << i)) + if ((~rcache_regs_discard | rcache_regs_now) & (1 << i)) emith_ctx_write(cache_regs[x].hreg, i * 4); guest_regs[i].flags &= ~GRF_DIRTY; } @@ -1700,26 +1709,28 @@ static int rcache_allocate(int what, int minprio) continue; if (cache_regs[i].type == HR_FREE || cache_regs[i].type == HR_TEMP) { // REG is free - prio = 6; + prio = 10; oldest = i; break; } if (cache_regs[i].type == HR_CACHED) { if (rcache_regs_now & cache_regs[i].gregs) // REGs needed for the current insn - i_prio = 1; + i_prio = 0; else if (rcache_regs_soon & cache_regs[i].gregs) // REGs needed in the next insns i_prio = 2; else if (rcache_regs_late & cache_regs[i].gregs) // REGs needed in some future insn - i_prio = 3; - else if (!(~rcache_regs_discard & cache_regs[i].gregs)) - // REGs not needed in the foreseeable future i_prio = 4; + else if (~rcache_regs_discard & cache_regs[i].gregs) + // REGs not needed in the foreseeable future + i_prio = 6; else // REGs soon overwritten anyway - i_prio = 5; + i_prio = 8; + if (!(cache_regs[i].flags & HRF_DIRTY)) i_prio ++; + if (prio < i_prio || (prio == i_prio && cache_regs[i].stamp < min_stamp)) { min_stamp = cache_regs[i].stamp; oldest = i; @@ -1744,21 +1755,21 @@ static int rcache_allocate_vreg(int needed) { int x; - x = rcache_allocate(1, needed ? 0 : 3); + x = rcache_allocate(1, needed ? 0 : 4); if (x < 0) - x = rcache_allocate(-1, 1); + x = rcache_allocate(-1, 0); return x; } static int rcache_allocate_nontemp(void) { - int x = rcache_allocate(0, 3); + int x = rcache_allocate(0, 4); return x; } static int rcache_allocate_temp(void) { - int x = rcache_allocate(-1, 1); + int x = rcache_allocate(-1, 0); if (x < 0) x = rcache_allocate(0, 0); return x; @@ -1821,20 +1832,25 @@ static void rcache_remap_vreg(int x) int d; // x must be a cached vreg - if (cache_regs[x].type != HR_CACHED) + if (cache_regs[x].type != HR_CACHED || cache_regs[x].locked) return; - // don't do it if x is already a REG or isn't used or to be cleaned anyway - if ((cache_regs[x].htype & HRT_REG) || - !(rsl_d & cache_regs[x].gregs)) { + // don't do it if x isn't used + if (!(rsl_d & cache_regs[x].gregs)) { // clean here to avoid data loss on invalidation rcache_clean_vreg(x); return; } - if (cache_regs[x].locked) { - printf("remap vreg %d is locked\n", x); - exit(1); - } + FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, d, + if ((guest_regs[d].flags & (GRF_STATIC|GRF_PINNED)) && + !cache_regs[guest_regs[d].sreg].locked && + !((rsl_d|rcache_regs_now) & cache_regs[guest_regs[d].sreg].gregs)) { + // STATIC not in its sreg and sreg is available + rcache_evict_vreg(guest_regs[d].sreg); + rcache_move_vreg(guest_regs[d].sreg, x); + return; + } + ) // allocate a non-TEMP vreg rcache_lock_vreg(x); // lock to avoid evicting x @@ -1891,8 +1907,8 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr { int src, dst, ali; cache_reg_t *tr; - u32 rsp_d = (rcache_regs_now | rcache_regs_soon | - rcache_regs_static | rcache_regs_pinned) & ~rcache_regs_discard; + u32 rsp_d = (rcache_regs_soon | rcache_regs_static | rcache_regs_pinned) & + ~rcache_regs_discard; dst = src = guest_regs[r].vreg; @@ -1901,7 +1917,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && src != guest_regs[r].sreg && (src < 0 || mode != RC_GR_READ) && !cache_regs[guest_regs[r].sreg].locked && - !(rsp_d & cache_regs[guest_regs[r].sreg].gregs)) { + !((rsp_d|rcache_regs_now) & cache_regs[guest_regs[r].sreg].gregs)) { dst = guest_regs[r].sreg; rcache_evict_vreg(dst); } else if (dst < 0) { @@ -1926,7 +1942,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr ali = tr->gregs & ~(1 << r); if (mode != RC_GR_READ && src == dst && ali) { int x = -1; - if (rsp_d & ali) { + if ((rsp_d|rcache_regs_now) & ali) { if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && guest_regs[r].sreg == dst && !tr->locked) { // split aliases if r is STATIC in sreg and dst isn't already locked @@ -1935,7 +1951,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr if ((guest_regs[t].flags & (GRF_STATIC|GRF_PINNED)) && !(ali & ~(1 << t)) && !cache_regs[guest_regs[t].sreg].locked && - !(rsp_d & cache_regs[guest_regs[t].sreg].gregs)) { + !((rsp_d|rcache_regs_now) & cache_regs[guest_regs[t].sreg].gregs)) { // alias is a single STATIC and its sreg is available x = guest_regs[t].sreg; rcache_evict_vreg(x); @@ -1947,8 +1963,9 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr break; ) if (x >= 0) { - src = x; - rcache_move_vreg(src, dst); + rcache_remove_vreg_alias(src, r); + src = dst; + rcache_move_vreg(x, dst); } } else { // split r @@ -1956,6 +1973,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr x = rcache_allocate_vreg(rsp_d & (1 << r)); rcache_unlock_vreg(src); if (x >= 0) { + rcache_remove_vreg_alias(src, r); dst = x; tr = &cache_regs[dst]; tr->stamp = rcache_counter; @@ -1965,8 +1983,6 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr if (x < 0) // aliases not needed or no vreg available, remove them rcache_evict_vreg_aliases(dst, r); - else if (src != dst) - rcache_remove_vreg_alias(src, r); } // assign r to dst @@ -2342,13 +2358,16 @@ static void rcache_clean_tmp(void) static void rcache_clean_masked(u32 mask) { int i, r, hr; + u32 m; rcache_regs_clean |= mask; mask = rcache_regs_clean; - // clean constants where all aliases are covered by the mask + // clean constants where all aliases are covered by the mask, exempt statics + // to avoid flushing them to context if sreg isn't available + m = mask & ~(rcache_regs_static | rcache_regs_pinned); for (i = 0; i < ARRAY_SIZE(gconsts); i++) - if ((gconsts[i].gregs & mask) && !(gconsts[i].gregs & ~mask)) { + if ((gconsts[i].gregs & m) && !(gconsts[i].gregs & ~mask)) { FOR_ALL_BITS_SET_DO(gconsts[i].gregs, r, if (guest_regs[r].flags & GRF_CDIRTY) { hr = rcache_get_reg_(r, RC_GR_READ, 0, NULL); @@ -2479,6 +2498,9 @@ static void rcache_create(void) } // create static host register mapping for SH2 regs + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { + guest_regs[i] = (guest_reg_t){.sreg = -1}; + } for (i = 0; i < ARRAY_SIZE(regs_static); i += 2) { for (x = ARRAY_SIZE(cache_regs)-1; x >= 0; x--) if (cache_regs[x].hreg == regs_static[i+1]) break; @@ -2486,8 +2508,7 @@ static void rcache_create(void) guest_regs[regs_static[i]] = (guest_reg_t){.flags = GRF_STATIC,.sreg = x}; rcache_regs_static |= (1 << regs_static[i]); rcache_vregs_reg &= ~(1 << x); - } else - guest_regs[regs_static[i]] = (guest_reg_t){.sreg = -1}; + } } printf("DRC registers created, %ld host regs (%d REG, %d STATIC, 1 CTX)\n", @@ -3501,7 +3522,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_set_usage_now(opd[0].source); // current insn rcache_set_usage_soon(soon); // insns 1-4 rcache_set_usage_late(late & ~soon); // insns 5-9 - rcache_set_usage_discard(write & ~(late|soon|opd[0].source)); + rcache_set_usage_discard(write & ~(late|soon)); if (v <= 9) // upcoming rcache_flush, start writing back unused dirty stuff rcache_clean_masked(rcache_dirty_mask() & ~(write|opd[0].dest)); @@ -4717,7 +4738,7 @@ end_op: // branch not taken, correct cycle count if (ctaken) - emith_add_r_imm(sr, ctaken << 12); + cycles -= ctaken; // set T bit to reflect branch not taken for OP_BRANCH_CT/CF if (emith_get_t_cond() >= 0) // T is synced for all other cases emith_set_t(sr, opd_b->op == OP_BRANCH_CF); @@ -5263,11 +5284,11 @@ static void sh2_smc_rm_blocks(u32 a, int len, int tcache_id, u32 shift) start_lit = block->addr_lit & wtmask; end_lit = start_lit + block->size_lit; // disable/delete block if it covers the modified address - if ((start_addr <= a+len && a < end_addr) || - (start_lit <= a+len && a < end_lit)) + if ((start_addr < a+len && a < end_addr) || + (start_lit < a+len && a < end_lit)) { dbg(2, "smc remove @%08x", a); - end_addr = (start_lit <= a+len && block->size_lit ? a : 0); + end_addr = (start_lit < a+len && block->size_lit ? a : 0); dr_rm_block_entry(block, tcache_id, end_addr, 0); #if (DRC_DEBUG & 2) removed = 1; -- 2.39.2