// ARM load/store emitters, addressing mode 2 (word/byte) and mode 3 (halfword).
// The numeric args feed EOP_C_AM2_REG / EOP_C_AM2_IMM bit fields; the _WB
// variants pass 3/2 instead of 1/0 in the same slot — presumably setting the
// W (writeback) bit next to L, TODO confirm against the EOP_C_AM2_REG encoder.
#define EOP_STR_SIMPLE(rd,rn) EOP_C_AM2_IMM(A_COND_AL,1,0,0,rn,rd,0)
#define EOP_LDR_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,1,rn,rd,shift_imm,A_AM1_LSL,rm)
+#define EOP_LDR_REG_LSL_WB(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,3,rn,rd,shift_imm,A_AM1_LSL,rm)
// fixed: dropped a stray trailing ';' from the expansion below — siblings have
// none, and the extra empty statement breaks "if (...) macro(); else" usage
#define EOP_LDRB_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,1,1,rn,rd,shift_imm,A_AM1_LSL,rm)
+#define EOP_STR_REG_LSL_WB(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,2,rn,rd,shift_imm,A_AM1_LSL,rm)
#define EOP_LDRH_IMM2(cond,rd,rn,offset_8) EOP_C_AM3_IMM(cond,(offset_8) >= 0,1,rn,rd,0,1,abs(offset_8))
#define EOP_LDRH_REG2(cond,rd,rn,rm) EOP_C_AM3_REG(cond,1,1,rn,rd,0,1,rm)
EOP_LDR_REG_LSL(cond, r, rs, rm, 0)
// load word: r = mem[rs + offs], unconditional
#define emith_read_r_r_offs(r, rs, offs) \
emith_read_r_r_offs_c(A_COND_AL, r, rs, offs)
+// pointer-sized load — same expansion as the word load (this is the 32-bit
+// ARM emitter, so pointer == word here; the _ptr name keeps call sites portable)
+#define emith_read_r_r_offs_ptr(r, rs, offs) \
+ emith_read_r_r_offs_c(A_COND_AL, r, rs, offs)
// load word, register offset: r = mem[rs + rm]
#define emith_read_r_r_r(r, rs, rm) \
EOP_LDR_REG_LSL(A_COND_AL, r, rs, rm, 0)
+// as above, but writeback variant: base register rs is updated (rs += rm,
+// per the _WB opcode macro — verify pre/post-index against EOP_C_AM2_REG)
+#define emith_read_r_r_r_wb(r, rs, rm) \
+ EOP_LDR_REG_LSL_WB(A_COND_AL, r, rs, rm, 0)
// load byte: r = mem8[rs + offs], conditional
#define emith_read8_r_r_offs_c(cond, r, rs, offs) \
EOP_LDRB_IMM2(cond, r, rs, offs)
EOP_STR_IMM2(cond, r, rs, offs)
// pointer-sized conditional store — aliases the word store (32-bit target)
#define emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \
emith_write_r_r_offs_c(cond, r, rs, offs)
+// store word: mem[rs + offs] = r, unconditional
+#define emith_write_r_r_offs(r, rs, offs) \
+ emith_write_r_r_offs_c(A_COND_AL, r, rs, offs)
+// pointer-sized unconditional store — same expansion as the word store
+#define emith_write_r_r_offs_ptr(r, rs, offs) \
+ emith_write_r_r_offs_c(A_COND_AL, r, rs, offs)
+// store word with base writeback (rs updated by rm via the _WB opcode macro)
+#define emith_write_r_r_r_wb(r, rs, rm) \
+ EOP_STR_REG_LSL_WB(A_COND_AL, r, rs, rm, 0)
// load a word from the SH2 context block: r = ctx[offs], conditional
#define emith_ctx_read_c(cond, r, offs) \
emith_read_r_r_offs_c(cond, r, CONTEXT_REG, offs)
emith_jump_ctx(offs); \
} while (0)
+// jump to target while leaving the return address in register r.
+// NOTE(review): relies on ARM-state PC reading as current insn + 8, which
+// makes r point at the insn *after* the emith_jump — confirm the move is
+// always emitted immediately before the branch and nothing is scheduled
+// in between
+#define emith_call_link(r, target) do { \
+ emith_move_r_r(r, PC); \
+ emith_jump(target); \
+} while (0)
+
// conditional return: jump to the address in LR
#define emith_ret_c(cond) \
emith_jump_reg_c(cond, LR)
// compile-time feature switches for the SH2 recompiler
#define PROPAGATE_CONSTANTS 1
#define LINK_BRANCHES 1
#define BRANCH_CACHE 1
+// return-address (rts_cache) shortcut for BSR/JSR -> RTS pairs;
+// newly added and disabled by default
+#define CALL_STACK 0
#define ALIAS_REGISTERS 1
#define REMAP_REGISTER 1
#define LOOP_DETECTION 1
// 08 - runtime block entry log
// 10 - smc self-check
// 20 - runtime block entry counter
+// 80 - branch cache statistics
// 100 - write trace
// 200 - compare trace
// 400 - block entry backtrace on exit
// 800 - state dump on exit
// {
#ifndef DRC_DEBUG
-#define DRC_DEBUG 0
+#define DRC_DEBUG 0x0
#endif
#if DRC_DEBUG
#define HASH_FUNC(hash_tab, addr, mask) \
(hash_tab)[(((addr) >> 20) ^ ((addr) >> 2)) & (mask)]
+#if (DRC_DEBUG & 128)
+#if BRANCH_CACHE
+// branch target cache hit/miss counters, printed by bcache_stats();
+// static: referenced only within this file (emitted code takes their
+// addresses via emith_move_r_ptr_imm), so keep them out of the global
+// namespace
+static int bchit, bcmiss;
+#endif
+#if CALL_STACK
+// return (rts) cache hit/miss counters, printed by bcache_stats()
+static int rchit, rcmiss;
+#endif
+#endif
+
// host register tracking
enum {
HR_FREE,
static void REGPARM(1) (*sh2_drc_entry)(SH2 *sh2);
static void (*sh2_drc_dispatcher)(void);
+#if CALL_STACK
+static void REGPARM(1) (*sh2_drc_dispatcher_call)(uptr host_pc);
+static void (*sh2_drc_dispatcher_return)(void);
+#endif
static void (*sh2_drc_exit)(void);
static void (*sh2_drc_test_irq)(void);
memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram));
memset(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache));
memset(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache));
+ memset(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache));
+ memset(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache));
+ sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0;
} else {
memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram));
memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram));
memset(Pico32xMem->drcblk_da[tcid - 1], 0, sizeof(Pico32xMem->drcblk_da[tcid - 1]));
memset(Pico32xMem->drclit_da[tcid - 1], 0, sizeof(Pico32xMem->drclit_da[tcid - 1]));
memset(sh2s[tcid - 1].branch_cache, -1, sizeof(sh2s[0].branch_cache));
+ memset(sh2s[tcid - 1].rts_cache, -1, sizeof(sh2s[0].rts_cache));
+ sh2s[tcid - 1].rts_cache_idx = 0;
}
}
#if (DRC_DEBUG & 4)
static u8 *dr_prepare_cache(int tcache_id, int insn_count)
{
-#if BRANCH_CACHE
u8 *limit = tcache_limit[tcache_id];
-#endif
// if no block desc available
if (block_counts[tcache_id] == block_limit[tcache_id])
while (tcache_limit[tcache_id] - tcache_ptrs[tcache_id] < insn_count * 128)
dr_free_oldest_block(tcache_id);
-#if BRANCH_CACHE
if (limit != tcache_limit[tcache_id]) {
+#if BRANCH_CACHE
if (tcache_id)
memset32(sh2s[tcache_id-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4);
else {
memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4);
memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4);
}
- }
#endif
+#if CALL_STACK
+ if (tcache_id) {
+ memset32(sh2s[tcache_id-1].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4);
+ sh2s[tcache_id-1].rts_cache_idx = 0;
+ } else {
+ memset32(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4);
+ memset32(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)/4);
+ sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0;
+ }
+#endif
+ }
return (u8 *)tcache_ptrs[tcache_id];
}
// branch handling
if (drcf.pending_branch_direct)
{
- struct op_data *opd_b =
- (op_flags[i] & OF_DELAY_OP) ? opd-1 : opd;
+ struct op_data *opd_b = (op_flags[i] & OF_DELAY_OP) ? opd-1 : opd;
u32 target_pc = opd_b->imm;
int cond = -1;
void *target = NULL;
int ctaken = 0;
- if (OP_ISBRACND(opd_b->op)) {
+ if (OP_ISBRACND(opd_b->op))
ctaken = (op_flags[i] & OF_DELAY_OP) ? 1 : 2;
- }
cycles += ctaken; // assume branch taken
#if LOOP_DETECTION
if ((drcf.loop_type == OF_IDLE_LOOP ||
emit_move_r_imm32(SHR_PC, target_pc);
rcache_clean();
- target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id);
- if (target == NULL)
- return NULL;
+#if CALL_STACK
+ // direct branch that also sets PR and has code following it in this
+ // block: a BSR call. Push the return address via the call dispatcher.
+ // target stays NULL here, so the unconditional-jump emit below is
+ // skipped (guarded by target != NULL)
+ if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) {
+ // BSR
+ tmp = rcache_get_tmp_arg(0);
+ emith_call_link(tmp, sh2_drc_dispatcher_call);
+ rcache_free_tmp(tmp);
+ } else
+#endif
+ target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id);
}
if (cond != -1) {
emith_jump_cond_patchable(cond, target);
}
- else {
+ else if (target != NULL) {
emith_jump_patchable(target);
rcache_invalidate();
}
drcf.polling = drcf.loop_type = 0;
}
else if (drcf.pending_branch_indirect) {
- struct op_data *opd_b =
- (op_flags[i] & OF_DELAY_OP) ? opd-1 : opd;
void *target;
u32 target_pc;
sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL);
FLUSH_CYCLES(sr);
rcache_clean();
+#if CALL_STACK
+ // indirect branch: detect the call/return idioms first
+ struct op_data *opd_b = (op_flags[i] & OF_DELAY_OP) ? opd-1 : opd;
+ if (opd_b->rm == SHR_PR) {
+ // RTS — return through the rts_cache fast path
+ emith_jump(sh2_drc_dispatcher_return);
+ } else if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) {
+ // JSR/BSRF — push return address via the call dispatcher.
+ // fixed: free the temp like the BSR path does; it was leaked here,
+ // losing a register-cache slot for the rest of the translation
+ tmp = rcache_get_tmp_arg(0);
+ emith_call_link(tmp, sh2_drc_dispatcher_call);
+ rcache_free_tmp(tmp);
+ } else
+#endif
if (gconst_get(SHR_PC, &target_pc)) {
// JMP const, treat like unconditional direct branch
target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id);
- if (target == NULL)
- return NULL;
emith_jump_patchable(target);
} else {
// JMP
emith_sh2_drc_exit();
emith_flush();
+#if CALL_STACK
+ // sh2_drc_dispatcher_call(uptr host_pc)
+ // push a {guest PR, host return address} pair onto the per-SH2
+ // rts_cache ring buffer, then fall through to the generic dispatcher
+ // to resolve and enter the callee block. Entries appear to be
+ // 2*sizeof(void *) each (pc + host ptr) — matches the stride below.
+ sh2_drc_dispatcher_call = (void *)tcache_ptr;
+ // advance and wrap the ring index
+ emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx));
+ emith_add_r_imm(arg2, 2*sizeof(void *));
+ emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *));
+ emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx));
+ // store PR at the new slot (writeback leaves arg1 -> slot),
+ // then the host return address (arg0) right after it
+ emith_add_r_r_ptr_imm(arg1, CONTEXT_REG, offsetof(SH2, rts_cache));
+ emith_ctx_read(arg3, offsetof(SH2, pr));
+ emith_write_r_r_r_wb(arg3, arg1, arg2);
+ emith_write_r_r_offs_ptr(arg0, arg1, sizeof(void *));
+ emith_flush();
+ // FALLTHROUGH
+#endif
// sh2_drc_dispatcher(void)
sh2_drc_dispatcher = (void *)tcache_ptr;
emith_ctx_read(arg0, SHR_PC * 4);
emith_read_r_r_offs(arg2, arg1, offsetof(SH2, branch_cache));
emith_cmp_r_r(arg2, arg0);
EMITH_SJMP_START(DCOND_NE);
+#if (DRC_DEBUG & 128)
+ emith_move_r_ptr_imm(arg2, (uptr)&bchit);
+ emith_read_r_r_offs_c(DCOND_EQ, arg3, arg2, 0);
+ emith_add_r_imm_c(DCOND_EQ, arg3, 1);
+ emith_write_r_r_offs_c(DCOND_EQ, arg3, arg2, 0);
+#endif
emith_read_r_r_offs_ptr_c(DCOND_EQ, RET_REG, arg1, offsetof(SH2, branch_cache) + sizeof(void *));
emith_jump_reg_c(DCOND_EQ, RET_REG);
EMITH_SJMP_END(DCOND_NE);
// store PC and block entry ptr (in arg0) in branch target cache
emith_tst_r_r_ptr(RET_REG, RET_REG);
EMITH_SJMP_START(DCOND_EQ);
+#if (DRC_DEBUG & 128)
+ emith_move_r_ptr_imm(arg2, (uptr)&bcmiss);
+ emith_read_r_r_offs_c(DCOND_NE, arg3, arg2, 0);
+ emith_add_r_imm_c(DCOND_NE, arg3, 1);
+ emith_write_r_r_offs_c(DCOND_NE, arg3, arg2, 0);
+#endif
emith_ctx_read_c(DCOND_NE, arg2, SHR_PC * 4);
emith_and_r_r_imm(arg1, arg2, (ARRAY_SIZE(sh2s->branch_cache)-1)*4);
emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg1, sizeof(void *) == 8 ? 2 : 1);
emith_call(dr_failure);
emith_flush();
+#if CALL_STACK
+ // sh2_drc_dispatcher_return(void)
+ // RTS fast path: compare the current guest PC against the PC on top
+ // of the rts_cache; on a hit, pop the entry and jump straight to the
+ // cached host address, bypassing the hash lookup in the dispatcher
+ sh2_drc_dispatcher_return = (void *)tcache_ptr;
+ emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx));
+ emith_add_r_r_ptr_imm(arg1, CONTEXT_REG, offsetof(SH2, rts_cache));
+ emith_ctx_read(arg0, offsetof(SH2, pc));
+ // load cached PC; base writeback leaves arg1 -> current slot
+ emith_read_r_r_r_wb(arg3, arg1, arg2);
+ emith_cmp_r_r(arg0, arg3);
+#if (DRC_DEBUG & 128)
+ EMITH_SJMP_START(DCOND_EQ);
+ emith_move_r_ptr_imm(arg2, (uptr)&rcmiss);
+ emith_read_r_r_offs_c(DCOND_NE, arg1, arg2, 0);
+ emith_add_r_imm_c(DCOND_NE, arg1, 1);
+ emith_write_r_r_offs_c(DCOND_NE, arg1, arg2, 0);
+ EMITH_SJMP_END(DCOND_EQ);
+#endif
+ // miss: fall back to the generic dispatcher. NOTE(review): the index
+ // is not rewound here, so the stale entry stays on the stack —
+ // presumably acceptable since it will simply miss again; confirm
+ emith_jump_cond(DCOND_NE, sh2_drc_dispatcher);
+ // hit: fetch host address and pop the entry (idx -= one pair, wrapped)
+ emith_read_r_r_offs_ptr(arg0, arg1, sizeof(void *));
+ emith_sub_r_imm(arg2, 2*sizeof(void *));
+ emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *));
+ emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx));
+#if (DRC_DEBUG & 128)
+ emith_move_r_ptr_imm(arg2, (uptr)&rchit);
+ emith_read_r_r_offs(arg1, arg2, 0);
+ emith_add_r_imm(arg1, 1);
+ emith_write_r_r_offs(ar1, arg2, 0);
+#endif
+ emith_jump_reg(arg0);
+ emith_flush();
+#endif
+
// sh2_drc_test_irq(void)
// assumes it's called from main function (may jump to dispatcher)
sh2_drc_test_irq = (void *)tcache_ptr;
#if (DRC_DEBUG & 4)
host_dasm_new_symbol(sh2_drc_entry);
host_dasm_new_symbol(sh2_drc_dispatcher);
+#if CALL_STACK
+ host_dasm_new_symbol(sh2_drc_dispatcher_call);
+ host_dasm_new_symbol(sh2_drc_dispatcher_return);
+#endif
host_dasm_new_symbol(sh2_drc_exit);
host_dasm_new_symbol(sh2_drc_test_irq);
host_dasm_new_symbol(sh2_drc_write8);
memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4);
}
#endif
+#if CALL_STACK
+ if (tcache_id) {
+ memset32(sh2s[tcache_id-1].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4);
+ sh2s[tcache_id-1].rts_cache_idx = 0;
+ } else {
+ memset32(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4);
+ memset32(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)/4);
+ sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0;
+ }
+#endif
}
void sh2_drc_wcheck_ram(unsigned int a, int val, SH2 *sh2)
printf("%08x ",p32x_sh2_read32(sh2s[0].r[15] + i*4, &sh2s[0]));
if ((i+1) % 8 == 0) printf("\n");
}
- printf("branch cache master:\n");
- for (i = 0; i < ARRAY_SIZE(sh2s[0].branch_cache); i++) {
- printf("%08x ",sh2s[0].branch_cache[i].pc);
- if ((i+1) % 8 == 0) printf("\n");
- }
SH2_DUMP(&sh2s[1], "slave");
printf("VBR ssh2: %x\n", sh2s[1].vbr);
for (i = 0; i < 0x60; i++) {
printf("%08x ",p32x_sh2_read32(sh2s[1].r[15] + i*4, &sh2s[1]));
if ((i+1) % 8 == 0) printf("\n");
}
+#endif
+}
+
+// dump branch/return cache statistics; only active when DRC_DEBUG & 0x80
+// is set (counters are maintained by the emitted dispatcher code).
+// called from the tcache flush path alongside block_stats()/entry_stats()
+static void bcache_stats(void)
+{
+#if (DRC_DEBUG & 128)
+ int i;
+#if CALL_STACK
+ // estimate maximum observed call depth: slots are initialized to -1,
+ // so scan for the first slot still unused by both SH2s. If every slot
+ // was used, the loop falls through with i == ARRAY_SIZE (full depth)
+ for (i = 1; i < ARRAY_SIZE(sh2s->rts_cache); i++)
+ if (sh2s[0].rts_cache[i].pc == -1 && sh2s[1].rts_cache[i].pc == -1) break;
+
+ printf("return cache hits:%d misses:%d depth: %d\n", rchit, rcmiss, i);
+#endif
+#if BRANCH_CACHE
+ printf("branch cache hits:%d misses:%d\n", bchit, bcmiss);
+ // raw dump of the cached guest PCs, 8 per line
+ printf("branch cache master:\n");
+ for (i = 0; i < ARRAY_SIZE(sh2s[0].branch_cache); i++) {
+ printf("%08x ",sh2s[0].branch_cache[i].pc);
+ if ((i+1) % 8 == 0) printf("\n");
+ }
printf("branch cache slave:\n");
for (i = 0; i < ARRAY_SIZE(sh2s[1].branch_cache); i++) {
printf("%08x ",sh2s[1].branch_cache[i].pc);
if ((i+1) % 8 == 0) printf("\n");
}
#endif
+#endif
}
void sh2_drc_flush_all(void)
state_dump();
block_stats();
entry_stats();
+ bcache_stats();
flush_tcache(0);
flush_tcache(1);
flush_tcache(2);
#endif
}
memset(sh2->branch_cache, -1, sizeof(sh2->branch_cache));
+ memset(sh2->rts_cache, -1, sizeof(sh2->rts_cache));
+ sh2->rts_cache_idx = 0;
return 0;