From d760c90f3a5537231ff0aeaec308ea149f150ba8 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 4 Apr 2019 20:29:39 +0200 Subject: [PATCH] added branch cache to sh2 drc to improve cross-tcache jump speed --- cpu/drc/emit_arm.c | 15 +++++- cpu/drc/emit_x86.c | 29 +++++++++++- cpu/sh2/compiler.c | 112 +++++++++++++++++++++++++++++++++++++++------ cpu/sh2/sh2.h | 3 ++ 4 files changed, 143 insertions(+), 16 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index c255a8b8..3f782bb6 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -179,6 +179,7 @@ /* ldr and str */ #define EOP_LDR_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,1,0,1,rn,rd,offset_12) #define EOP_LDRB_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,1,1,1,rn,rd,offset_12) +#define EOP_STR_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,(offset_12) >= 0,0,0,rn,rd,abs(offset_12)) #define EOP_LDR_IMM( rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,1,0,1,rn,rd,offset_12) #define EOP_LDR_NEGIMM(rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,0,0,1,rn,rd,offset_12) @@ -478,6 +479,9 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_add_r_r(d, s) \ emith_add_r_r_r(d, d, s) +#define emith_add_r_r_ptr(d, s) \ + emith_add_r_r_r(d, d, s) + #define emith_sub_r_r(d, s) \ EOP_SUB_REG(A_COND_AL,0,d,d,s,A_AM1_LSL,0) @@ -684,6 +688,8 @@ static int emith_xbranch(int cond, void *target, int is_call) // misc #define emith_read_r_r_offs_c(cond, r, rs, offs) \ EOP_LDR_IMM2(cond, r, rs, offs) +#define emith_read_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_read_r_r_offs_c(cond, r, rs, offs) #define emith_read_r_r_r_c(cond, r, rs, rm) \ EOP_LDR_REG_LSL(cond, r, rs, rm, 0) #define emith_read_r_r_r(r, rs, rm) \ @@ -716,8 +722,15 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_read16_r_r_offs(r, rs, offs) \ emith_read16_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_write_r_r_offs_c(cond, r, rs, offs) \ + EOP_STR_IMM2(cond, r, rs, offs) +#define 
emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_write_r_r_offs_c(cond, r, rs, offs) + +#define emith_ctx_read_c(cond, r, offs) \ + emith_read_r_r_offs_c(cond, r, CONTEXT_REG, offs) #define emith_ctx_read(r, offs) \ - emith_read_r_r_offs(r, CONTEXT_REG, offs) + emith_ctx_read_c(A_COND_AL, r, offs) #define emith_ctx_read_ptr(r, offs) \ emith_ctx_read(r, offs) diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 816e9294..58476a94 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -122,7 +122,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMIT_OP_MODRM(0x01, 3, s, d) #define emith_add_r_r_ptr(d, s) do { \ - EMIT_REX_IF(1, dst, src); \ + EMIT_REX_IF(1, s, d); \ EMIT_OP_MODRM64(0x01, 3, s, d); \ } while (0) @@ -260,6 +260,21 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; } \ } while (0) +// _r_r_r_shift +#define emith_add_r_r_r_lsl(d, s1, s2, lslimm) do { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s2, lslimm); \ + emith_add_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ +} while (0) + +#define emith_add_r_r_r_lsr(d, s1, s2, lslimm) do { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsr(tmp_, s2, lslimm); \ + emith_add_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ +} while (0) + // _r_r_shift #define emith_or_r_r_lsl(d, s, lslimm) do { \ int tmp_ = rcache_get_tmp(); \ @@ -361,8 +376,12 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_read_r_r_offs_c(cond, r, rs, offs) \ emith_read_r_r_offs(r, rs, offs) +#define emith_read_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_read_r_r_offs_ptr(r, rs, offs) #define emith_write_r_r_offs_c(cond, r, rs, offs) \ emith_write_r_r_offs(r, rs, offs) +#define emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_write_r_r_offs_ptr(r, rs, offs) #define emith_read8_r_r_offs_c(cond, r, rs, offs) \ emith_read8_r_r_offs(r, rs, offs) #define emith_write8_r_r_offs_c(cond, r, rs, offs) \ @@ -583,9 +602,15 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define 
emith_read_r_r_offs(r, rs, offs) \ emith_deref_op(0x8b, r, rs, offs) +#define emith_read_r_r_offs_ptr(r, rs, offs) \ + EMIT_REX_IF(1, r, rs); \ + emith_deref_op(0x8b, r, rs, offs) #define emith_write_r_r_offs(r, rs, offs) \ emith_deref_op(0x89, r, rs, offs) +#define emith_write_r_r_offs_ptr(r, rs, offs) \ + EMIT_REX_IF(1, r, rs); \ + emith_deref_op(0x89, r, rs, offs) // note: don't use prefixes on this #define emith_read8_r_r_offs(r, rs, offs) do { \ @@ -664,6 +689,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_ctx_read(r, offs) \ emith_read_r_r_offs(r, CONTEXT_REG, offs) +#define emith_ctx_read_c(cond, r, offs) \ + emith_ctx_read(r, offs) #define emith_ctx_read_ptr(r, offs) do { \ EMIT_REX_IF(1, r, CONTEXT_REG); \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index bfd98e2b..d54d204e 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -38,6 +38,7 @@ // features #define PROPAGATE_CONSTANTS 1 #define LINK_BRANCHES 1 +#define BRANCH_CACHE 1 #define ALIAS_REGISTERS 1 #define REMAP_REGISTER 1 @@ -57,10 +58,11 @@ // 10 - smc self-check // 100 - write trace // 200 - compare trace -// 400 - print block entry backtrace +// 400 - block entry backtrace on exit +// 800 - state dump on exit // { #ifndef DRC_DEBUG -#define DRC_DEBUG 0 +#define DRC_DEBUG 0x800 #endif #if DRC_DEBUG @@ -159,8 +161,6 @@ static char sh2dasm_buff[64]; #define do_host_disasm(x) #endif -#if (DRC_DEBUG & (8|256|512|1024)) || defined(PDB) - #define SH2_DUMP(sh2, reason) { \ char ms = (sh2)->is_slave ? 
's' : 'm'; \ printf("%csh2 %s %08x\n", ms, reason, (sh2)->pc); \ @@ -178,6 +178,8 @@ static char sh2dasm_buff[64]; (sh2)->pdb_io_csum[0], (sh2)->pdb_io_csum[1], (sh2)->state, \ (sh2)->poll_addr, (sh2)->poll_cycles, (sh2)->poll_cnt); \ } + +#if (DRC_DEBUG & (8|256|512|1024)) || defined(PDB) static SH2 csh2[2][4]; static void REGPARM(3) *sh2_drc_log_entry(void *block, SH2 *sh2, u32 sr) { @@ -631,6 +633,14 @@ static void REGPARM(1) flush_tcache(int tcid) memset(Pico32xMem->drcblk_da[tcid - 1], 0, sizeof(Pico32xMem->drcblk_da[0])); } +#if BRANCH_CACHE + if (tcid) + memset32(sh2s[tcid-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + else { + memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); + } +#endif #if (DRC_DEBUG & 4) tcache_dsm_ptrs[tcid] = tcache_bases[tcid]; #endif @@ -3727,14 +3737,35 @@ static void sh2_generate_utils(void) // sh2_drc_dispatcher(void) sh2_drc_dispatcher = (void *)tcache_ptr; - sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); - emith_cmp_r_imm(sr, 0); - emith_jump_cond(DCOND_LT, sh2_drc_exit); - rcache_invalidate(); emith_ctx_read(arg0, SHR_PC * 4); +#if BRANCH_CACHE + // check if PC is in branch target cache + emith_and_r_r_imm(arg1, arg0, (ARRAY_SIZE(sh2s->branch_cache)-1)*4); + // TODO implement emith_add_r_r_r_lsl_ptr, saves one insn on 32bit ARM + emith_lsl(arg1, arg1, sizeof(void *) == 8 ? 
2 : 1); + emith_add_r_r_ptr(arg1, CONTEXT_REG); + emith_read_r_r_offs(arg2, arg1, offsetof(SH2, branch_cache)); + emith_cmp_r_r(arg2, arg0); + EMITH_SJMP_START(DCOND_NE); + emith_read_r_r_offs_ptr_c(DCOND_EQ, RET_REG, arg1, offsetof(SH2, branch_cache) + sizeof(void *)); + emith_jump_reg_c(DCOND_EQ, RET_REG); + EMITH_SJMP_END(DCOND_NE); +#endif emith_ctx_read(arg1, offsetof(SH2, is_slave)); emith_add_r_r_ptr_imm(arg2, CONTEXT_REG, offsetof(SH2, drc_tmp)); emith_call(dr_lookup_block); +#if BRANCH_CACHE + // store PC and block entry ptr (in RET_REG) in branch target cache + emith_tst_r_r_ptr(RET_REG, RET_REG); + EMITH_SJMP_START(DCOND_EQ); + emith_ctx_read_c(DCOND_NE, arg2, SHR_PC * 4); + emith_and_r_r_imm(arg1, arg2, (ARRAY_SIZE(sh2s->branch_cache)-1)*4); + emith_lsl(arg1, arg1, sizeof(void *) == 8 ? 2 : 1); + emith_add_r_r_ptr(arg1, CONTEXT_REG); + emith_write_r_r_offs_c(DCOND_NE, arg2, arg1, offsetof(SH2, branch_cache)); + emith_write_r_r_offs_ptr_c(DCOND_NE, RET_REG, arg1, offsetof(SH2, branch_cache) + sizeof(void *)); + EMITH_SJMP_END(DCOND_EQ); +#endif emit_block_entry(); // lookup failed, call sh2_translate() emith_move_r_r_ptr(arg0, CONTEXT_REG); @@ -3904,6 +3935,15 @@ static void sh2_smc_rm_block(struct block_desc *bd, int tcache_id, u32 ram_mask) bd->addr = bd->size = bd->size_nolit = 0; bd->entry_count = 0; + +#if BRANCH_CACHE + if (tcache_id) + memset32(sh2s[tcache_id-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + else { + memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); + } +#endif } /* @@ -4015,9 +4055,9 @@ int sh2_execute_drc(SH2 *sh2c, int cycles) return ret_cycles; } -#if (DRC_DEBUG & 2) -void block_stats(void) +static void block_stats(void) { +#if (DRC_DEBUG & 2) int c, b, i, total = 0; printf("block stats:\n"); @@ -4048,12 +4088,10 @@ void block_stats(void) for (b = 0; b < ARRAY_SIZE(block_tables); b++) for (i = 0; i < block_counts[b]; i++) 
block_tables[b][i].refcount = 0; -} -#else -#define block_stats() #endif +} -void sh2_drc_flush_all(void) +static void backtrace(void) { #if (DRC_DEBUG & 1024) int i; @@ -4064,6 +4102,52 @@ void sh2_drc_flush_all(void) for (i = 0; i < ARRAY_SIZE(csh2[1]); i++) SH2_DUMP(&csh2[1][i], "bt ssh2"); #endif +} + +static void state_dump(void) +{ +#if (DRC_DEBUG & 2048) + int i; + + SH2_DUMP(&sh2s[0], "master"); + printf("VBR msh2: %x\n", sh2s[0].vbr); + for (i = 0; i < 0x60; i++) { + printf("%08x ",p32x_sh2_read32(sh2s[0].vbr + i*4, &sh2s[0])); + if ((i+1) % 8 == 0) printf("\n"); + } + printf("stack msh2: %x\n", sh2s[0].r[15]); + for (i = -0x30; i < 0x30; i++) { + printf("%08x ",p32x_sh2_read32(sh2s[0].r[15] + i*4, &sh2s[0])); + if ((i+1) % 8 == 0) printf("\n"); + } + printf("branch cache master:\n"); + for (i = 0; i < ARRAY_SIZE(sh2s[0].branch_cache); i++) { + printf("%08x ",sh2s[0].branch_cache[i].pc); + if ((i+1) % 8 == 0) printf("\n"); + } + SH2_DUMP(&sh2s[1], "slave"); + printf("VBR ssh2: %x\n", sh2s[1].vbr); + for (i = 0; i < 0x60; i++) { + printf("%08x ",p32x_sh2_read32(sh2s[1].vbr + i*4, &sh2s[1])); + if ((i+1) % 8 == 0) printf("\n"); + } + printf("stack ssh2: %x\n", sh2s[1].r[15]); + for (i = -0x30; i < 0x30; i++) { + printf("%08x ",p32x_sh2_read32(sh2s[1].r[15] + i*4, &sh2s[1])); + if ((i+1) % 8 == 0) printf("\n"); + } + printf("branch cache slave:\n"); + for (i = 0; i < ARRAY_SIZE(sh2s[1].branch_cache); i++) { + printf("%08x ",sh2s[1].branch_cache[i].pc); + if ((i+1) % 8 == 0) printf("\n"); + } +#endif +} + +void sh2_drc_flush_all(void) +{ + backtrace(); + state_dump(); block_stats(); flush_tcache(0); flush_tcache(1); diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index a073d43f..e53bbf05 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -50,6 +50,9 @@ typedef struct SH2_ int poll_cycles; int poll_cnt; + // DRC branch cache. 
size must be 2^n and <=128 + struct { unsigned int pc; void *code; } branch_cache[128]; + // interpreter stuff int icount; // cycles left in current timeslice unsigned int ea; -- 2.39.5