From: kub Date: Wed, 20 Mar 2019 22:39:45 +0000 (+0100) Subject: improved sh2 clock handling, bug fixing + small improvement to drc emitters X-Git-Tag: v2.00~880 X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2fa02d5a63e4b6dea2d6ed809507480576f6bba0;p=picodrive.git improved sh2 clock handling, bug fixing + small improvement to drc emitters --- diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 91b47402..89582e8d 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -86,7 +86,7 @@ #define A_OP_TST 0x8 #define A_OP_TEQ 0x9 #define A_OP_CMP 0xa -#define A_OP_CMN 0xa +#define A_OP_CMN 0xb #define A_OP_ORR 0xc #define A_OP_MOV 0xd #define A_OP_BIC 0xe @@ -250,7 +250,16 @@ #define EOP_MOVT(rd,imm) \ EMIT(0xe3400000 | ((rd)<<12) | (((imm)>>16)&0xfff) | (((imm)>>12)&0xf0000)) -// XXX: AND, RSB, *C, will break if 1 insn is not enough +static int count_bits(unsigned val) +{ + val = (val & 0x55555555) + ((val >> 1) & 0x55555555); + val = (val & 0x33333333) + ((val >> 2) & 0x33333333); + val = (val & 0x0f0f0f0f) + ((val >> 4) & 0x0f0f0f0f); + val = (val & 0x00ff00ff) + ((val >> 8) & 0x00ff00ff); + return (val & 0xffff) + (val >> 16); +} + +// XXX: RSB, *S will break if 1 insn is not enough static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int imm) { int ror2; @@ -259,23 +268,11 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int switch (op) { case A_OP_MOV: rn = 0; - if (~imm < 0x10000) { + // count bits in imm and use MVN if more bits 1 than 0 + if (count_bits(imm) > 16) { imm = ~imm; op = A_OP_MVN; } -#ifdef HAVE_ARMV7 - for (v = imm, ror2 = 0; v && !(v & 3); v >>= 2) - ror2--; - if (v >> 8) { - /* 2+ insns needed - prefer movw/movt */ - if (op == A_OP_MVN) - imm = ~imm; - EOP_MOVW(rd, imm); - if (imm & 0xffff0000) - EOP_MOVT(rd, imm); - return; - } -#endif break; case A_OP_EOR: @@ -283,27 +280,37 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int case A_OP_ADD: case A_OP_ORR: case A_OP_BIC: - if (s == 0 && imm == 0) + if (s == 0 && imm == 0 && rd == rn) return; break; } - for (v = imm, ror2 = 0; ; ror2 -= 8/2) { - /* shift down to get 'best' rot2 */ - for (; v && !(v & 3); v >>= 2) - ror2--; - - EOP_C_DOP_IMM(cond, op, s, rn, rd, ror2 & 0x0f, v & 0xff); - - v >>= 8; - if (v == 0) - break; - if (op == A_OP_MOV) - op = A_OP_ORR; - if (op == A_OP_MVN) + again: + v = imm, ror2 = 32/2; // arm imm shift is ROR, so rotate for best fit + while ((v >> 24) && !(v & 0xc0)) + v = (v << 2) | (v >> 30), ror2++; + do { + // shift down to get 'best' rot2 + while (v > 0xff && !(v & 3)) + v >>= 2, ror2--; + // AND must fit into 1 insn. if not, use BIC + if (op == A_OP_AND && v != (v & 0xff)) { + imm = ~imm; op = A_OP_BIC; + goto again; + } + EOP_C_DOP_IMM(cond, op, s, rn, rd, ror2 & 0xf, v & 0xff); + + switch (op) { + case A_OP_MOV: op = A_OP_ORR; break; + case A_OP_MVN: op = A_OP_BIC; break; + case A_OP_ADC: op = A_OP_ADD; break; + case A_OP_SBC: op = A_OP_SUB; break; + } rn = rd; - } + + v >>= 8, ror2 -= 8/2; + } while (v); } #define emith_op_imm(cond, s, op, r, imm) \ @@ -491,7 +498,7 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_cmp_r_imm(r, imm) { \ u32 op = A_OP_CMP, imm_ = imm; \ if (~imm_ < 0x100) { \ - imm_ = ~imm_; \ + imm_ = -imm_; \ op = A_OP_CMN; \ } \ emith_top_imm(A_COND_AL, op, r, imm); \ @@ -652,12 +659,10 @@ static int emith_xbranch(int cond, void *target, int is_call) if ((count) <= 8) { \ t = (count) - 8; \ t = (0xff << t) & 0xff; \ - EOP_BIC_IMM(d,s,8/2,t); \ EOP_C_DOP_IMM(cond,A_OP_BIC,0,s,d,8/2,t); \ } else if ((count) >= 24) { \ t = (count) - 24; \ t = 0xff >> t; \ - EOP_AND_IMM(d,s,0,t); \ EOP_C_DOP_IMM(cond,A_OP_AND,0,s,d,0,t); \ } else { \ EOP_MOV_REG(cond,0,d,s,A_AM1_LSL,count); \ diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 865aab4b..e5f2adef 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -421,13 +421,10 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; rmr = s2; \ } \ EMIT_OP_MODRM(0xf7, 3, op, rmr); /* xMUL rmr */ \ - /* XXX: using push/pop for the case of edx->eax; eax->edx */ \ - if (dhi != xDX && dhi != -1) \ - emith_push(xDX); \ if (dlo != xAX) \ - emith_move_r_r(dlo, xAX); \ - if (dhi != xDX && dhi != -1) \ - emith_pop(dhi); \ + EMIT_OP(0x90 + (dlo)); /* XCHG eax, dlo */ \ + if (dhi != xDX && dhi != -1 && !(dhi == xAX && dlo == xDX)) \ + emith_move_r_r(dhi, (dlo == xDX ? xAX : xDX)); \ if (dlo != xDX && dhi != xDX) \ emith_pop(xDX); \ if (dlo != xAX && dhi != xAX) \ @@ -474,12 +471,12 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_deref_op(op, r, rs, offs) do { \ /* mov r <-> [ebp+#offs] */ \ - if ((offs) >= 0x80) { \ + if (abs(offs) >= 0x80) { \ EMIT_OP_MODRM64(op, 2, r, rs); \ EMIT(offs, u32); \ } else { \ EMIT_OP_MODRM64(op, 1, r, rs); \ - EMIT(offs, u8); \ + EMIT((u8)offs, u8); \ } \ } while (0) @@ -496,7 +493,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; int r_ = r; \ if (!is_abcdx(r)) \ r_ = rcache_get_tmp(); \ - emith_deref_op(0x8a, r_, rs, offs); \ + EMIT(0x0f, u8); \ + emith_deref_op(0xb6, r_, rs, offs); \ if ((r) != r_) { \ emith_move_r_r(r, r_); \ rcache_free_tmp(r_); \ @@ -515,8 +513,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; } while (0) #define emith_read16_r_r_offs(r, rs, offs) do { \ - EMIT(0x66, u8); /* operand override */ \ - emith_read_r_r_offs(r, rs, offs); \ + EMIT(0x0f, u8); \ + emith_deref_op(0xb7, r, rs, offs); \ } while (0) #define emith_write16_r_r_offs(r, rs, offs) do { \ @@ -688,6 +686,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; case 0: rd = xDI; break; \ case 1: rd = xSI; break; \ case 2: rd = xDX; break; \ + case 2: rd = xBX; break; \ } #define emith_sh2_drc_entry() { \ diff --git a/cpu/sh2/sh2.c b/cpu/sh2/sh2.c index 403c4c70..ba260718 100644 --- a/cpu/sh2/sh2.c +++ b/cpu/sh2/sh2.c @@ -84,7 +84,7 @@ int sh2_irl_irq(SH2 *sh2, int level, int nested_call) // do this to avoid missing irqs that other SH2 might clear int vector = sh2->irq_callback(sh2, level); sh2_do_irq(sh2, level, vector); - sh2->m68krcycles_done += C_SH2_TO_M68K(*sh2, 13); + sh2->m68krcycles_done += C_SH2_TO_M68K(sh2, 13); } else sh2->test_irq = 1; diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index 407270f1..69abf8cd 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -72,9 +72,9 @@ typedef struct SH2_ #define CYCLE_MULT_SHIFT 10 #define C_M68K_TO_SH2(xsh2, c) \ - ((int)((c) * (xsh2).mult_m68k_to_sh2) >> CYCLE_MULT_SHIFT) + ((int)((long long)(c) * (xsh2)->mult_m68k_to_sh2) >> CYCLE_MULT_SHIFT) #define C_SH2_TO_M68K(xsh2, c) \ - ((int)((c + 3) * (xsh2).mult_sh2_to_m68k) >> CYCLE_MULT_SHIFT) + ((int)((long long)(c+3) * (xsh2)->mult_sh2_to_m68k) >> CYCLE_MULT_SHIFT) int sh2_init(SH2 *sh2, int is_slave, SH2 *other_sh2); void sh2_finish(SH2 *sh2); diff --git a/pico/32x/32x.c b/pico/32x/32x.c index 9bfbefac..3ee8c2ea 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -254,8 +254,8 @@ static void p32x_start_blank(void) } p32x_trigger_irq(NULL, SekCyclesDone(), P32XI_VINT); - p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, 0); - p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, 0); + p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, SekCyclesDone()); + p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, SekCyclesDone()); } void p32x_schedule_hint(SH2 *sh2, int m68k_cycles) @@ -323,8 +323,12 @@ void p32x_event_schedule_sh2(SH2 *sh2, enum p32x_event event, int after) p32x_event_schedule(now, event, after); - left_to_next = (event_time_next - now) * 3; - sh2_end_run(sh2, left_to_next); + left_to_next = C_M68K_TO_SH2(sh2, (int)(event_time_next - now)); + if (sh2_cycles_left(sh2) > left_to_next) { + if (left_to_next < 1) + left_to_next = 1; + sh2_end_run(sh2, left_to_next); + } } static void p32x_run_events(unsigned int until) @@ -372,13 +376,13 @@ static void run_sh2(SH2 *sh2, int m68k_cycles) pevt_log_sh2_o(sh2, EVT_RUN_START); sh2->state |= SH2_STATE_RUN; - cycles = C_M68K_TO_SH2(*sh2, m68k_cycles); + cycles = C_M68K_TO_SH2(sh2, m68k_cycles); elprintf_sh2(sh2, EL_32X, "+run %u %d @%08x", sh2->m68krcycles_done, cycles, sh2->pc); done = sh2_execute(sh2, cycles, PicoIn.opt & POPT_EN_DRC); - sh2->m68krcycles_done += C_SH2_TO_M68K(*sh2, done); + sh2->m68krcycles_done += C_SH2_TO_M68K(sh2, done); sh2->state &= ~SH2_STATE_RUN; pevt_log_sh2_o(sh2, EVT_RUN_END); elprintf_sh2(sh2, EL_32X, "-run %u %d", @@ -412,8 +416,7 @@ void p32x_sync_other_sh2(SH2 *sh2, unsigned int m68k_target) // there might be new event to schedule current sh2 to if (event_time_next) { - left_to_event = event_time_next - m68k_target; - left_to_event *= 3; + left_to_event = C_M68K_TO_SH2(sh2, (int)(event_time_next - m68k_target)); if (sh2_cycles_left(sh2) > left_to_event) { if (left_to_event < 1) left_to_event = 1; @@ -446,6 +449,7 @@ void sync_sh2s_normal(unsigned int m68k_target) now = ssh2.m68krcycles_done; timer_cycles = now; + pprof_start(m68k); while (CYCLES_GT(m68k_target, now)) { if (event_time_next && CYCLES_GE(now, event_time_next)) @@ -463,6 +467,7 @@ void sync_sh2s_normal(unsigned int m68k_target) target - msh2.m68krcycles_done, target - ssh2.m68krcycles_done, m68k_target - now, Pico32x.emu_flags); + pprof_start(ssh2); if (!(ssh2.state & SH2_IDLE_STATES)) { cycles = target - ssh2.m68krcycles_done; if (cycles > 0) { @@ -472,7 +477,9 @@ void sync_sh2s_normal(unsigned int m68k_target) target = event_time_next; } } + pprof_end(ssh2); + pprof_start(msh2); if (!(msh2.state & SH2_IDLE_STATES)) { cycles = target - msh2.m68krcycles_done; if (cycles > 0) { @@ -482,6 +489,7 @@ void sync_sh2s_normal(unsigned int m68k_target) target = event_time_next; } } + pprof_end(msh2); now = target; if (!(msh2.state & SH2_IDLE_STATES)) { @@ -497,6 +505,7 @@ void sync_sh2s_normal(unsigned int m68k_target) p32x_timers_do(now - timer_cycles); timer_cycles = now; } + pprof_end_sub(m68k); // advance idle CPUs if (msh2.state & SH2_IDLE_STATES) { @@ -553,8 +562,8 @@ void PicoFrame32x(void) if (!(Pico32x.sh2_regs[0] & 0x80)) p32x_schedule_hint(NULL, SekCyclesDone()); - p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, 0); - p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, 0); + p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, SekCyclesDone()); + p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, SekCyclesDone()); if (PicoIn.AHW & PAHW_MCD) pcd_prepare_frame(); diff --git a/pico/32x/memory.c b/pico/32x/memory.c index eff0ab07..d815853d 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -146,7 +146,7 @@ static void sh2s_sync_on_read(SH2 *sh2) cycles = sh2_cycles_done(sh2); if (cycles > 600) - p32x_sync_other_sh2(sh2, sh2->m68krcycles_done + cycles / 3); + p32x_sync_other_sh2(sh2, sh2->m68krcycles_done + C_SH2_TO_M68K(sh2, cycles)); } // SH2 faking diff --git a/pico/cd/mcd.c b/pico/cd/mcd.c index 5e3629a3..8a2f230d 100644 --- a/pico/cd/mcd.c +++ b/pico/cd/mcd.c @@ -125,6 +125,7 @@ static void SekRunS68k(unsigned int to) if (SekShouldInterrupt()) Pico_mcd->m.s68k_poll_a = 0; + pprof_start(s68k); SekCycleCntS68k += cyc_do; #if defined(EMU_C68K) PicoCpuCS68k.cycles = cyc_do; @@ -137,6 +138,7 @@ static void SekRunS68k(unsigned int to) #elif defined(EMU_F68K) SekCycleCntS68k += fm68k_emulate(&PicoCpuFS68k, cyc_do, 0) - cyc_do; #endif + pprof_end(s68k); } static void pcd_set_cycle_mult(void) diff --git a/pico/pico_int.h b/pico/pico_int.h index 7225cab8..cca7f954 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -241,11 +241,11 @@ extern SH2 sh2s[2]; # define sh2_pc(sh2) (sh2)->pc #endif -#define sh2_cycles_done(sh2) ((int)(sh2)->cycles_timeslice - sh2_cycles_left(sh2)) +#define sh2_cycles_done(sh2) ((unsigned)(sh2)->cycles_timeslice - sh2_cycles_left(sh2)) #define sh2_cycles_done_t(sh2) \ - ((sh2)->m68krcycles_done * 3 + sh2_cycles_done(sh2)) + (unsigned)(C_M68K_TO_SH2(sh2, (sh2)->m68krcycles_done) + sh2_cycles_done(sh2)) #define sh2_cycles_done_m68k(sh2) \ - ((sh2)->m68krcycles_done + (sh2_cycles_done(sh2) / 3)) + (unsigned)((sh2)->m68krcycles_done + C_SH2_TO_M68K(sh2, sh2_cycles_done(sh2))) #define sh2_reg(c, x) (c) ? ssh2.r[x] : msh2.r[x] #define sh2_gbr(c) (c) ? ssh2.gbr : msh2.gbr