From f53e166cf471684e5325b595fb7bb65df7c5b093 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 17 Sep 2019 22:48:32 +0200 Subject: [PATCH] various smallish optimizations, cleanups, and bug fixes --- Makefile | 4 +++- cpu/drc/emit_arm.c | 48 +++++++++++++++++++++++++++++++++++++++++--- cpu/drc/emit_arm64.c | 4 ++++ cpu/drc/emit_mips.c | 40 ++++++++++++++++++++++-------------- cpu/drc/emit_x86.c | 4 ++-- cpu/sh2/compiler.c | 24 +++++++++++++++++++--- cpu/sh2/compiler.h | 16 +++++++-------- pico/32x/32x.c | 4 ++-- 8 files changed, 109 insertions(+), 35 deletions(-) diff --git a/Makefile b/Makefile index 47463d51..63e9c833 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,9 @@ endif ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1")) # very small caches, avoid optimization options making the binary much bigger -CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp -fno-common -fno-stack-protector -ffast-math +CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp +# this gets you about 20% better execution speed on 32bit arm/mips +CFLAGS += -fno-common -fno-stack-protector -fno-guess-branch-probability -fno-caller-saves -fno-tree-loop-if-convert -ffast-math endif # default settings diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 1d70866c..66a5b065 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -36,6 +36,47 @@ #define M5(x,y,z,a,b) (M4(x,y,z,a)|M1(b)) #define M10(a,b,c,d,e,f,g,h,i,j) (M5(a,b,c,d,e)|M5(f,g,h,i,j)) +// sys_cacheflush always flushes whole pages, and it's rather expensive on ARMs +// hold a list of pending cache updates and merge requests to reduce cacheflush +static struct { void *base, *end; } pageflush[4]; +static unsigned pagesize = 4096; + +static void emith_update_cache(void) +{ + int i; + + for (i = 0; i < 4 && pageflush[i].base; i++) { + cache_flush_d_inval_i(pageflush[i].base, pageflush[i].end + pagesize-1); + pageflush[i].base = NULL; + } +} + +static inline void emith_update_add(void *base, void *end) +{ + void *p_base = (void *)((uintptr_t)(base) & ~(pagesize-1)); + void *p_end = (void *)((uintptr_t)(end ) & ~(pagesize-1)); + int i; + + for (i = 0; i < 4 && pageflush[i].base; i++) { + if (p_base <= pageflush[i].end+pagesize && p_end >= pageflush[i].end) { + if (p_base < pageflush[i].base) pageflush[i].base = p_base; + pageflush[i].end = p_end; + return; + } + if (p_base <= pageflush[i].base && p_end >= pageflush[i].base-pagesize) { + if (p_end > pageflush[i].end) pageflush[i].end = p_end; + pageflush[i].base = p_base; + return; + } + } + if (i == 4) { + /* list full and not mergeable -> flush list */ + emith_update_cache(); + i = 0; + } + pageflush[i].base = p_base, pageflush[i].end = p_end; +} + // peephole optimizer. 
ATM only tries to reduce interlock #define EMIT_CACHE_SIZE 3 struct emit_op { @@ -48,8 +89,8 @@ static struct emit_op emit_cache[EMIT_CACHE_SIZE+3]; static int emit_index; #define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr-emit_index) -static int emith_pool_index(int tcache_offs); -static void emith_pool_adjust(int pool_index, int move_offs); +static inline int emith_pool_index(int tcache_offs); +static inline void emith_pool_adjust(int pool_index, int move_offs); static NOINLINE void EMIT(u32 op, u32 dst, u32 src) { @@ -1106,6 +1147,7 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) (u8 *)ptr; \ }) +#define emith_jump_cond_inrange(target) !0 #define emith_jump_patch_size() 4 #define emith_jump_at(ptr, target) do { \ @@ -1170,7 +1212,7 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) } while (0) #define host_instructions_updated(base, end) \ - cache_flush_d_inval_i(base, end) + emith_update_add(base, end) #define host_arg2reg(rd, arg) \ rd = arg diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index de587619..8ce2ef38 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -1038,6 +1038,9 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_jump_cond_patchable(cond, target) \ emith_bcond(tcache_ptr, 1, cond, target) +#define emith_jump_cond_inrange(target) \ + !(((u8 *)target - (u8 *)tcache_ptr + 0x100000) >> 22) + #define emith_jump_patch(ptr, target) ({ \ u32 *ptr_ = (u32 *)ptr; \ u32 disp_ = (u8 *)(target) - (u8 *)(ptr_); \ @@ -1116,6 +1119,7 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_insn_ptr() ((u8 *)tcache_ptr) #define emith_flush() /**/ #define host_instructions_updated(base, end) __builtin___clear_cache(base, end) +#define emith_update_cache() /**/ #define emith_jump_patch_size() 8 #define emith_rw_offs_max() 0xff diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index e200db0a..0e85f92a 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -209,20 +209,25 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; // FIFO for 2 instructions, for delay slot handling u32 emith_last_insns[2] = { -1,-1 }; -int emith_last_idx; +int emith_last_idx, emith_last_cnt; #define EMIT_PUSHOP() \ do { \ emith_last_idx ^= 1; \ - if (emith_last_insns[emith_last_idx] != -1) \ - EMIT_PTR(tcache_ptr, emith_last_insns[emith_last_idx]);\ + if (emith_last_insns[emith_last_idx] != -1) { \ + u32 *p = (u32 *)tcache_ptr - emith_last_cnt; \ + EMIT_PTR(p, emith_last_insns[emith_last_idx]);\ + emith_last_cnt --; \ + } \ emith_last_insns[emith_last_idx] = -1; \ } while (0) #define EMIT(op) \ do { \ EMIT_PUSHOP(); \ + tcache_ptr = (void *)((u32 *)tcache_ptr + 1); \ emith_last_insns[emith_last_idx] = op; \ + emith_last_cnt ++; \ COUNT_OP; \ } while (0) @@ -231,8 +236,7 @@ int emith_last_idx; int i; for (i = 0; i < 2; i++) EMIT_PUSHOP(); \ } while (0) -#define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr + \ - (emith_last_insns[0] != -1) + (emith_last_insns[1] != -1)) +#define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr - emith_last_cnt) // delay slot stuff static int emith_is_j(u32 op) // J, JAL @@ -305,12 +309,14 @@ static void *emith_branch(u32 op) } if (bop) { // can swap + tcache_ptr = (void *)((u32 *)tcache_ptr - emith_last_cnt); if (emith_last_insns[idx^1] != -1) EMIT_PTR(tcache_ptr, emith_last_insns[idx^1]); bp = tcache_ptr; EMIT_PTR(tcache_ptr, bop); COUNT_OP; EMIT_PTR(tcache_ptr, emith_last_insns[idx]); emith_last_insns[0] = 
emith_last_insns[1] = -1; + emith_last_cnt = 0; } else { // can't swap emith_flush(); bp = tcache_ptr; @@ -325,13 +331,13 @@ static void *emith_branch(u32 op) ptr = emith_branch(MIPS_BCONDZ(cond_m, cond_r, 0)); #define JMP_EMIT(cond, ptr) { \ - u32 val_ = emith_insn_ptr() - (u8 *)(ptr) - 4; \ + u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; \ EMIT_PTR(ptr, MIPS_BCONDZ(cond_m, cond_r, val_ & 0x0003ffff)); \ emith_flush(); /* NO delay slot handling across jump targets */ \ } #define JMP_EMIT_NC(ptr) { \ - u32 val_ = emith_insn_ptr() - (u8 *)(ptr) - 4; \ + u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; \ EMIT_PTR(ptr, MIPS_B(val_ & 0x0003ffff)); \ emith_flush(); \ } @@ -881,14 +887,14 @@ static u8 *last_lohi; static void emith_lohi_nops(void) { u32 d; - while ((d = emith_insn_ptr() - last_lohi) < 8 && d >= 0) EMIT(MIPS_NOP); + while ((d = (u8 *)tcache_ptr - last_lohi) < 8 && d >= 0) EMIT(MIPS_NOP); } #define emith_mul(d, s1, s2) do { \ emith_lohi_nops(); \ EMIT(MIPS_MULTU(s1, s2)); \ EMIT(MIPS_MFLO(d)); \ - last_lohi = emith_insn_ptr(); \ + last_lohi = (u8 *)tcache_ptr; \ } while (0) #define emith_mul_u64(dlo, dhi, s1, s2) do { \ @@ -896,7 +902,7 @@ static void emith_lohi_nops(void) EMIT(MIPS_MULTU(s1, s2)); \ EMIT(MIPS_MFLO(dlo)); \ EMIT(MIPS_MFHI(dhi)); \ - last_lohi = emith_insn_ptr(); \ + last_lohi = (u8 *)tcache_ptr; \ } while (0) #define emith_mul_s64(dlo, dhi, s1, s2) do { \ @@ -904,7 +910,7 @@ static void emith_lohi_nops(void) EMIT(MIPS_MULT(s1, s2)); \ EMIT(MIPS_MFLO(dlo)); \ EMIT(MIPS_MFHI(dhi)); \ - last_lohi = emith_insn_ptr(); \ + last_lohi = (u8 *)tcache_ptr; \ } while (0) #define emith_mula_s64(dlo, dhi, s1, s2) do { \ @@ -915,7 +921,7 @@ static void emith_lohi_nops(void) emith_add_r_r(dlo, AT); \ EMIT(MIPS_SLTU_REG(t_, dlo, AT)); \ EMIT(MIPS_MFHI(AT)); \ - last_lohi = emith_insn_ptr(); \ + last_lohi = (u8 *)tcache_ptr; \ emith_add_r_r(dhi, AT); \ emith_add_r_r(dhi, t_); \ rcache_free_tmp(t_); \ @@ -1174,14 +1180,14 @@ static int emith_cond_check(int cond, int *r) // NB: MIPS conditional branches have only +/- 128KB range #define emith_jump_cond(cond, target) do { \ int r_, mcond_ = emith_cond_check(cond, &r_); \ - u32 disp_ = (u8 *)target - emith_insn_ptr() - 4; \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr - 4; \ if (disp_ >= 0xfffe0000 || disp_ <= 0x0001ffff) { /* can use near B */ \ emith_branch(MIPS_BCONDZ(mcond_,r_,disp_ & 0x0003ffff)); \ } else { /* far branch if near branch isn't possible */ \ mcond_ = emith_invert_branch(mcond_); \ u8 *bp = emith_branch(MIPS_BCONDZ(mcond_, r_, 0)); \ emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)); \ - EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, emith_insn_ptr()-bp-4)); \ + EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, (u8 *)tcache_ptr-bp-4)); \ } \ } while (0) @@ -1190,9 +1196,12 @@ static int emith_cond_check(int cond, int *r) mcond_ = emith_invert_branch(mcond_); \ u8 *bp = emith_branch(MIPS_BCONDZ(mcond_, r_, 0));\ emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)); \ - EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, emith_insn_ptr()-bp-4)); \ + EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, (u8 *)tcache_ptr-bp-4)); \ } while (0) +#define emith_jump_cond_inrange(target) \ + !(((u8 *)target - (u8 *)tcache_ptr + 0x10000) >> 18) + // NB: returns position of patch for cache maintenance #define emith_jump_patch(ptr, target) ({ \ u32 *ptr_ = (u32 *)ptr-1; /* must skip condition check code */ \ @@ -1261,6 +1270,7 @@ static int emith_cond_check(int cond, int *r) #define emith_pool_commit(j) /**/ // NB: mips32r2 has SYNCI #define host_instructions_updated(base, end) 
__builtin___clear_cache(base, end) +#define emith_update_cache() /**/ #define emith_jump_patch_size() 4 #define emith_rw_offs_max() 0x7fff diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index d515cd23..caade3a6 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -877,6 +877,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common ptr; \ }) +#define emith_jump_cond_inrange(ptr) !0 #define emith_jump_patch_size() 6 #define emith_jump_at(ptr, target) do { \ @@ -986,6 +987,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common } while (0) #define host_instructions_updated(base, end) +#define emith_update_cache() /**/ #define emith_rw_offs_max() 0xffffffff @@ -993,7 +995,6 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define HOST_REGS 16 #define PTR_SCALE 3 -#define NA_TMP_REG xAX // non-arg tmp from reg_temp[] #define EMIT_XREX_IF(w, r, rm, rs) do { \ int xr_ = (r) > 7 ? 1 : 0; \ @@ -1078,7 +1079,6 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define HOST_REGS 8 #define PTR_SCALE 2 -#define NA_TMP_REG xBX // non-arg tmp from reg_temp[] #define EMIT_REX_IF(w, r, rm) do { \ assert((u32)(r) < 8u); \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 677c8adf..6eaf7123 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -2920,6 +2920,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // mark memory for overwrite detection dr_mark_memory(1, block, tcache_id, 0); block->active = 1; + emith_update_cache(); return block->entryp[0].tcache_ptr; } @@ -3113,8 +3114,15 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = rcache_get_tmp_arg(0); sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); emith_cmp_r_imm(sr, 0); - emith_move_r_imm_c(DCOND_LE, tmp, pc); - emith_jump_cond(DCOND_LE, sh2_drc_exit); + if (emith_jump_cond_inrange(sh2_drc_exit)) { + emith_move_r_imm_c(DCOND_LE, tmp, pc); + emith_jump_cond(DCOND_LE, sh2_drc_exit); + } else { + EMITH_JMP_START(DCOND_GT); + emith_move_r_imm(tmp, pc); + emith_jump(sh2_drc_exit); + EMITH_JMP_END(DCOND_GT); + } rcache_free_tmp(tmp); #if (DRC_DEBUG & 32) @@ -3249,7 +3257,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } } rcache_set_usage_now(opd[0].source); // current insn - rcache_set_usage_soon(late); // insns 1-3 + rcache_set_usage_soon(soon); // insns 1-3 rcache_set_usage_late(late & ~soon); // insns 4-9 rcache_set_usage_discard(write & ~(late|soon) & ~opd[0].source); @@ -4442,12 +4450,16 @@ end_op: fflush(stdout); #endif + emith_update_cache(); return block_entry_ptr; } static void sh2_generate_utils(void) { int arg0, arg1, arg2, arg3, sr, tmp, tmp2; +#if DRC_DEBUG + int hic = host_insn_count; // don't count utils for insn statistics +#endif host_arg2reg(arg0, 0); host_arg2reg(arg1, 1); @@ -4794,6 +4806,10 @@ static void sh2_generate_utils(void) host_dasm_new_symbol(sh2_drc_read16_poll); host_dasm_new_symbol(sh2_drc_read32_poll); #endif + +#if DRC_DEBUG + host_insn_count = hic; +#endif } static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit, int free) @@ -4847,6 +4863,7 @@ static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nol bd->addr = bd->size = bd->addr_lit = bd->size_lit = 0; bd->entry_count = 0; } + emith_update_cache(); } static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) @@ -5197,6 +5214,7 @@ int sh2_drc_init(SH2 *sh2) tcache_ptr = tcache; sh2_generate_utils(); host_instructions_updated(tcache, 
tcache_ptr); + emith_update_cache(); tcache_bases[0] = tcache_ptrs[0] = tcache_ptr; tcache_limit[0] = tcache_bases[0] + tcache_sizes[0] - (tcache_ptr-tcache); diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 1ad922b7..187ad716 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -33,26 +33,24 @@ unsigned short scan_block(unsigned int base_pc, int is_slave, #if defined(DRC_SH2) // direct access to some host CPU registers used by the DRC -// XXX MUST match definitions in cpu/sh2/compiler.c +// XXX MUST match definitions for SHR_SR in cpu/sh2/compiler.c #if defined(__arm__) -#define DRC_SR_REG r10 +#define DRC_SR_REG "r10" #elif defined(__aarch64__) -#define DRC_SR_REG r22 +#define DRC_SR_REG "r22" #elif defined(__mips__) -#define DRC_SR_REG s6 +#define DRC_SR_REG "s6" #elif defined(__i386__) -#define DRC_SR_REG edi +#define DRC_SR_REG "edi" #elif defined(__x86_64__) -#define DRC_SR_REG ebx +#define DRC_SR_REG "ebx" #else #warning "direct DRC register access not available for this host" #endif #endif #ifdef DRC_SR_REG -#define __DRC_DECLARE_SR(SR) register int sh2_sr asm(#SR) -#define _DRC_DECLARE_SR(SR) __DRC_DECLARE_SR(SR) -#define DRC_DECLARE_SR _DRC_DECLARE_SR(DRC_SR_REG) +#define DRC_DECLARE_SR register int sh2_sr asm(DRC_SR_REG) #define DRC_SAVE_SR(sh2) \ if ((sh2->state & (SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN) \ sh2->sr = sh2_sr; diff --git a/pico/32x/32x.c b/pico/32x/32x.c index e9d8ff6d..f6d1a153 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -471,7 +471,7 @@ void sync_sh2s_normal(unsigned int m68k_target) if (!(ssh2.state & SH2_IDLE_STATES)) { cycles = target - ssh2.m68krcycles_done; if (cycles > 0) { - run_sh2(&ssh2, cycles > 20 ? cycles : 20); + run_sh2(&ssh2, cycles > 20U ? cycles : 20U); if (event_time_next && CYCLES_GT(target, event_time_next)) target = event_time_next; @@ -483,7 +483,7 @@ void sync_sh2s_normal(unsigned int m68k_target) if (!(msh2.state & SH2_IDLE_STATES)) { cycles = target - msh2.m68krcycles_done; if (cycles > 0) { - run_sh2(&msh2, cycles > 20 ? cycles : 20); + run_sh2(&msh2, cycles > 20U ? cycles : 20U); if (event_time_next && CYCLES_GT(target, event_time_next)) target = event_time_next; -- 2.39.5
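
Notes on the main changes; the code below consists of illustrative sketches,
not parts of the patch.

The pageflush[] scheme added to emit_arm.c batches icache maintenance:
sys_cacheflush works on whole pages and is expensive, so flush requests are
rounded to page boundaries, merged into up to four pending ranges, and only
handed to the kernel when emith_update_cache() runs (or when the list
overflows). A standalone sketch of the same idea, assuming a 4 KB page size;
the names (pending[], queue_flush, flush_all) are illustrative and the merge
test is condensed into one symmetric adjacency check, where the real code
uses two directional cases and calls cache_flush_d_inval_i():

    #include <stdint.h>
    #include <stdio.h>

    #define NPENDING 4
    static const uintptr_t pagesize = 4096;
    static struct { uintptr_t base, end; int used; } pending[NPENDING];

    static void flush_all(void)                /* ~ emith_update_cache() */
    {
        int i;
        for (i = 0; i < NPENDING && pending[i].used; i++) {
            /* flush whole pages: base .. end+pagesize-1 */
            printf("flush %#lx..%#lx\n", (unsigned long)pending[i].base,
                   (unsigned long)(pending[i].end + pagesize - 1));
            pending[i].used = 0;
        }
    }

    static void queue_flush(void *code_base, void *code_end) /* ~ emith_update_add() */
    {
        uintptr_t p_base = (uintptr_t)code_base & ~(pagesize - 1);
        uintptr_t p_end  = (uintptr_t)code_end  & ~(pagesize - 1);
        int i;

        for (i = 0; i < NPENDING && pending[i].used; i++) {
            /* merge if the new range overlaps or directly adjoins an entry */
            if (p_base <= pending[i].end + pagesize &&
                p_end + pagesize >= pending[i].base) {
                if (p_base < pending[i].base) pending[i].base = p_base;
                if (p_end  > pending[i].end)  pending[i].end  = p_end;
                return;
            }
        }
        if (i == NPENDING) {   /* list full, nothing merged: flush everything */
            flush_all();
            i = 0;
        }
        pending[i].base = p_base, pending[i].end = p_end, pending[i].used = 1;
    }

    int main(void)
    {
        queue_flush((void *)0x10000, (void *)0x10040);  /* one page */
        queue_flush((void *)0x10ffc, (void *)0x11004);  /* merges with above */
        queue_flush((void *)0x40000, (void *)0x40100);  /* separate entry */
        flush_all();
        return 0;
    }

host_instructions_updated() on ARM now only queues ranges this way, and the
new emith_update_cache() calls in compiler.c commit them right before the
translated code can be executed.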
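
The new emith_jump_cond_inrange(target) macros let sh2_translate() ask
whether a host conditional branch can reach the target directly from the
current emit position. Each backend uses the same bias-and-shift trick; the
helper below is a generic restatement of it (branch_in_range is not a name
from the patch), where (disp + bias) >> shift == 0 accepts byte
displacements in [-bias, (1 << shift) - bias):

    #include <stdint.h>

    static int branch_in_range(const void *target, const void *emit_ptr,
                               uintptr_t bias, unsigned shift)
    {
        uintptr_t disp = (uintptr_t)((const char *)target -
                                     (const char *)emit_ptr);
        return !((disp + bias) >> shift);
    }

    /*
     * With the constants from the patch, emit_ptr playing the role of
     * tcache_ptr:
     *   MIPS:    branch_in_range(target, emit_ptr, 0x10000,  18)  B<cond>
     *   ARM64:   branch_in_range(target, emit_ptr, 0x100000, 22)  B.<cond>
     *   ARM/x86: always true ("!0"), since the 24-bit ARM branch and the
     *            x86 rel32 displacement cover the whole translation cache.
     */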
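
emit_mips.c keeps a 2-deep FIFO of the most recent instructions so a branch
can pull the preceding op into its delay slot. After the patch the output
pointer is advanced as soon as an op is queued and pending ops are written
*behind* it, tracked by emith_last_cnt, so the current emit position no
longer has to be corrected when computing branch displacements. A minimal
model of that bookkeeping; all names are illustrative, out[]/out_ptr stand
in for the translation cache, and the real code also handles J/JAL/JR and
register dependencies before deciding it can swap:

    #include <stdint.h>

    #define PENDING 2
    static uint32_t out[1024], *out_ptr = out;
    static uint32_t fifo[PENDING] = { -1u, -1u };
    static int fifo_idx, fifo_cnt;

    static void emit(uint32_t op)            /* ~ EMIT() */
    {
        fifo_idx ^= 1;
        if (fifo[fifo_idx] != -1u) {         /* retire the oldest queued op */
            out_ptr[-fifo_cnt] = fifo[fifo_idx];
            fifo_cnt--;
        }
        out_ptr++;                           /* reserve its slot right away */
        fifo[fifo_idx] = op;
        fifo_cnt++;
    }

    static void emit_branch(uint32_t bop, int can_swap) /* ~ emith_branch() */
    {
        int idx = fifo_idx;

        out_ptr -= fifo_cnt;                 /* rewind over the queued ops */
        if (can_swap && fifo[idx] != -1u) {
            /* move the last queued op into the branch delay slot */
            if (fifo[idx ^ 1] != -1u) *out_ptr++ = fifo[idx ^ 1];
            *out_ptr++ = bop;
            *out_ptr++ = fifo[idx];
        } else {
            /* flush the queue and pad the delay slot with a NOP (0) */
            if (fifo[idx ^ 1] != -1u) *out_ptr++ = fifo[idx ^ 1];
            if (fifo[idx]     != -1u) *out_ptr++ = fifo[idx];
            *out_ptr++ = bop;
            *out_ptr++ = 0;
        }
        fifo[0] = fifo[1] = -1u;
        fifo_cnt = 0;
    }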
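
In compiler.c, the cycle-count exit check now picks its shape based on
emith_jump_cond_inrange(): a single conditional jump to sh2_drc_exit when it
is close enough, otherwise a short skip branch with the inverted condition
over an unconditional far jump. A sketch of that shape; the emit_-prefixed
identifiers are stand-ins for the emith_ macros used in the patch and are
declared here only to make the structure clear:

    enum cond { COND_LE, COND_GT };

    void  emit_move_imm_cond(enum cond c, int reg, unsigned imm);
    void  emit_move_imm(int reg, unsigned imm);
    void  emit_jump_cond(enum cond c, void *target);
    void  emit_jump(void *target);
    void *emit_skip_start(enum cond c);            /* ~ EMITH_JMP_START */
    void  emit_skip_end(enum cond c, void *bp);    /* ~ EMITH_JMP_END   */
    int   emit_jump_cond_in_range(void *target);   /* ~ emith_jump_cond_inrange */

    static void emit_cycle_exit(void *drc_exit, int tmp_reg, unsigned pc)
    {
        if (emit_jump_cond_in_range(drc_exit)) {
            /* near: conditionally load the resume pc and take the exit */
            emit_move_imm_cond(COND_LE, tmp_reg, pc);
            emit_jump_cond(COND_LE, drc_exit);
        } else {
            /* far: if cycles remain (SR > 0), skip over the exit sequence */
            void *bp = emit_skip_start(COND_GT);
            emit_move_imm(tmp_reg, pc);
            emit_jump(drc_exit);
            emit_skip_end(COND_GT, bp);
        }
    }

The same hunk also fixes rcache_set_usage_soon() to be fed the 'soon' mask
instead of 'late', so registers needed in the next few insns are tracked
correctly.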
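
compiler.h now stores the host register that mirrors SH2 SR as a string
literal ("r10", "s6", "edi", ...), so DRC_DECLARE_SR can pass it straight to
GCC's register-variable syntax without the extra stringify macros, while the
comment reminds that it must match the static SHR_SR allocation in
compiler.c. A cut-down illustration for 32-bit ARM; sr_t_bit() is not from
the patch and only shows how the declaration is meant to be used:

    #if defined(__arm__)
    #define DRC_SR_REG "r10"   /* must match SHR_SR's host reg in compiler.c */
    #endif

    #ifdef DRC_SR_REG
    #define DRC_DECLARE_SR register int sh2_sr asm(DRC_SR_REG)

    static int sr_t_bit(void)
    {
        DRC_DECLARE_SR;        /* sh2_sr is pinned to r10 in this scope */
        return sh2_sr & 1;     /* low bit of SR (T), purely illustrative */
    }
    #endif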