From: kub <derkub@gmail.com>
Date: Wed, 20 Mar 2019 22:39:45 +0000 (+0100)
Subject: improved sh2 clock handling, bug fixing + small improvement to drc emitters
X-Git-Tag: v2.00~880
X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2fa02d5a63e4b6dea2d6ed809507480576f6bba0;p=picodrive.git

improved sh2 clock handling, bug fixing + small improvement to drc emitters
---

diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c
index 91b47402..89582e8d 100644
--- a/cpu/drc/emit_arm.c
+++ b/cpu/drc/emit_arm.c
@@ -86,7 +86,7 @@
 #define A_OP_TST 0x8
 #define A_OP_TEQ 0x9
 #define A_OP_CMP 0xa
-#define A_OP_CMN 0xa
+#define A_OP_CMN 0xb /* ARM data-processing opcode 1011b; must differ from A_OP_CMP (0xa) */
 #define A_OP_ORR 0xc
 #define A_OP_MOV 0xd
 #define A_OP_BIC 0xe
@@ -250,7 +250,16 @@
 #define EOP_MOVT(rd,imm) \
 	EMIT(0xe3400000 | ((rd)<<12) | (((imm)>>16)&0xfff) | (((imm)>>12)&0xf0000))
 
-// XXX: AND, RSB, *C, will break if 1 insn is not enough
+static int count_bits(unsigned val)
+{	// population count by folding: 1-, 2-, 4-, 8-, then 16-bit partial sums
+	val = ((val >> 1) & 0x55555555) + (val & 0x55555555);
+	val = ((val >> 2) & 0x33333333) + (val & 0x33333333);
+	val = ((val >> 4) + val) & 0x0f0f0f0f;	// nibble sums are <= 4 each, so no carry out
+	val = ((val >> 8) + val) & 0x00ff00ff;	// byte sums are <= 8 each, so no carry out
+	return (val >> 16) + (val & 0xffff);
+}
+
+// XXX: RSB, *S will break if 1 insn is not enough
 static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int imm)
 {
 	int ror2;
@@ -259,23 +268,11 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int
 	switch (op) {
 	case A_OP_MOV:
 		rn = 0;
-		if (~imm < 0x10000) {
+		// count bits in imm and use MVN if more bits 1 than 0
+		if (count_bits(imm) > 16) {
 			imm = ~imm;
 			op = A_OP_MVN;
 		}
-#ifdef HAVE_ARMV7
-		for (v = imm, ror2 = 0; v && !(v & 3); v >>= 2)
-			ror2--;
-		if (v >> 8) {
-			/* 2+ insns needed - prefer movw/movt */
-			if (op == A_OP_MVN)
-				imm = ~imm;
-			EOP_MOVW(rd, imm);
-			if (imm & 0xffff0000)
-				EOP_MOVT(rd, imm);
-			return;
-		}
-#endif
 		break;
 
 	case A_OP_EOR:
@@ -283,27 +280,37 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int
 	case A_OP_ADD:
 	case A_OP_ORR:
 	case A_OP_BIC:
-		if (s == 0 && imm == 0)
+		if (s == 0 && imm == 0 && rd == rn)
 			return;
 		break;
 	}
 
-	for (v = imm, ror2 = 0; ; ror2 -= 8/2) {
-		/* shift down to get 'best' rot2 */
-		for (; v && !(v & 3); v >>= 2)
-			ror2--;
-
-		EOP_C_DOP_IMM(cond, op, s, rn, rd, ror2 & 0x0f, v & 0xff);
-
-		v >>= 8;
-		if (v == 0)
-			break;
-		if (op == A_OP_MOV)
-			op = A_OP_ORR;
-		if (op == A_OP_MVN)
+	again:
+	v = imm, ror2 = 32/2; // arm imm shift is ROR, so rotate for best fit
+	while ((v >> 24) && !(v & 0xc0))
+		v = (v << 2) | (v >> 30), ror2++;
+	do {
+		// shift down to get 'best' rot2
+		while (v > 0xff && !(v & 3))
+			v >>= 2, ror2--;
+		// AND must fit into 1 insn. if not, use BIC
+		if (op == A_OP_AND && v != (v & 0xff)) {
+			imm = ~imm;
 			op = A_OP_BIC;
+			goto again;
+		}
+		EOP_C_DOP_IMM(cond, op, s, rn, rd, ror2 & 0xf, v & 0xff);
+
+		switch (op) {
+		case A_OP_MOV:	op = A_OP_ORR; break;
+		case A_OP_MVN:	op = A_OP_BIC; break;
+		case A_OP_ADC:	op = A_OP_ADD; break;
+		case A_OP_SBC:	op = A_OP_SUB; break;
+		}
 		rn = rd;
-	}
+
+		v >>= 8, ror2 -= 8/2;
+	} while (v);
 }
 
 #define emith_op_imm(cond, s, op, r, imm) \
@@ -491,7 +498,7 @@ static int emith_xbranch(int cond, void *target, int is_call)
 #define emith_cmp_r_imm(r, imm) { \
 	u32 op = A_OP_CMP, imm_ = imm; \
 	if (~imm_ < 0x100) { \
-		imm_ = ~imm_; \
+		imm_ = -imm_; \
 		op = A_OP_CMN; \
 	} \
 	emith_top_imm(A_COND_AL, op, r, imm); \
@@ -652,12 +659,10 @@ static int emith_xbranch(int cond, void *target, int is_call)
 	if ((count) <= 8) { \
 		t = (count) - 8; \
 		t = (0xff << t) & 0xff; \
-		EOP_BIC_IMM(d,s,8/2,t); \
 		EOP_C_DOP_IMM(cond,A_OP_BIC,0,s,d,8/2,t); \
 	} else if ((count) >= 24) { \
 		t = (count) - 24; \
 		t = 0xff >> t; \
-		EOP_AND_IMM(d,s,0,t); \
 		EOP_C_DOP_IMM(cond,A_OP_AND,0,s,d,0,t); \
 	} else { \
 		EOP_MOV_REG(cond,0,d,s,A_AM1_LSL,count); \
diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c
index 865aab4b..e5f2adef 100644
--- a/cpu/drc/emit_x86.c
+++ b/cpu/drc/emit_x86.c
@@ -421,13 +421,10 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI };
 		rmr = s2; \
 	} \
 	EMIT_OP_MODRM(0xf7, 3, op, rmr); /* xMUL rmr */ \
-	/* XXX: using push/pop for the case of edx->eax; eax->edx */ \
-	if (dhi != xDX && dhi != -1) \
-		emith_push(xDX); \
 	if (dlo != xAX) \
-		emith_move_r_r(dlo, xAX); \
-	if (dhi != xDX && dhi != -1) \
-		emith_pop(dhi); \
+		EMIT_OP(0x90 + (dlo)); /* XCHG eax, dlo */ \
+	if (dhi != xDX && dhi != -1 && !(dhi == xAX && dlo == xDX)) \
+		emith_move_r_r(dhi, (dlo == xDX ? xAX : xDX)); \
 	if (dlo != xDX && dhi != xDX) \
 		emith_pop(xDX); \
 	if (dlo != xAX && dhi != xAX) \
@@ -474,12 +471,12 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI };
 
 #define emith_deref_op(op, r, rs, offs) do { \
 	/* mov r <-> [ebp+#offs] */ \
-	if ((offs) >= 0x80) { \
+	if (abs(offs) >= 0x80) { /* mod=2 with a 32-bit displacement once |offs| no longer fits the byte form */ \
 		EMIT_OP_MODRM64(op, 2, r, rs); \
 		EMIT(offs, u32); \
 	} else { \
 		EMIT_OP_MODRM64(op, 1, r, rs); \
-		EMIT(offs, u8); \
+		EMIT((u8)(offs), u8); /* mod=1: 8-bit displacement; offs parenthesized so the cast covers a compound argument */ \
 	} \
 } while (0)
 
@@ -496,7 +493,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI };
 	int r_ = r; \
 	if (!is_abcdx(r)) \
 		r_ = rcache_get_tmp(); \
-	emith_deref_op(0x8a, r_, rs, offs); \
+	EMIT(0x0f, u8); \
+	emith_deref_op(0xb6, r_, rs, offs); \
 	if ((r) != r_) { \
 		emith_move_r_r(r, r_); \
 		rcache_free_tmp(r_); \
@@ -515,8 +513,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI };
 } while (0)
 
 #define emith_read16_r_r_offs(r, rs, offs) do { \
-	EMIT(0x66, u8); /* operand override */ \
-	emith_read_r_r_offs(r, rs, offs); \
+	EMIT(0x0f, u8); /* two-byte opcode escape */ \
+	emith_deref_op(0xb7, r, rs, offs); /* 0x0f 0xb7 = movzx r32, r/m16: zero-extending 16-bit load from [rs+offs] */ \
 } while (0)
 
 #define emith_write16_r_r_offs(r, rs, offs) do { \
@@ -688,6 +686,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI };
 	case 0: rd = xDI; break; \
 	case 1: rd = xSI; break; \
 	case 2: rd = xDX; break; \
+	case 3: rd = xBX; break; /* 4th argument (index 3); case labels within one switch must be unique */ \
 	}
 
 #define emith_sh2_drc_entry() { \
diff --git a/cpu/sh2/sh2.c b/cpu/sh2/sh2.c
index 403c4c70..ba260718 100644
--- a/cpu/sh2/sh2.c
+++ b/cpu/sh2/sh2.c
@@ -84,7 +84,7 @@ int sh2_irl_irq(SH2 *sh2, int level, int nested_call)
 			// do this to avoid missing irqs that other SH2 might clear
 			int vector = sh2->irq_callback(sh2, level);
 			sh2_do_irq(sh2, level, vector);
-			sh2->m68krcycles_done += C_SH2_TO_M68K(*sh2, 13);
+			sh2->m68krcycles_done += C_SH2_TO_M68K(sh2, 13);
 		}
 		else
 			sh2->test_irq = 1;
diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h
index 407270f1..69abf8cd 100644
--- a/cpu/sh2/sh2.h
+++ b/cpu/sh2/sh2.h
@@ -72,9 +72,9 @@ typedef struct SH2_
 
 #define CYCLE_MULT_SHIFT 10
 #define C_M68K_TO_SH2(xsh2, c) \
-	((int)((c) * (xsh2).mult_m68k_to_sh2) >> CYCLE_MULT_SHIFT)
+	((int)(((long long)(c) * (xsh2)->mult_m68k_to_sh2) >> CYCLE_MULT_SHIFT)) /* shift the 64-bit product first, then narrow to int */
 #define C_SH2_TO_M68K(xsh2, c) \
-	((int)((c + 3) * (xsh2).mult_sh2_to_m68k) >> CYCLE_MULT_SHIFT)
+	((int)(((long long)((c) + 3) * (xsh2)->mult_sh2_to_m68k) >> CYCLE_MULT_SHIFT)) /* +3 bias on the SH2 count; shift in 64 bit, then narrow */
 
 int  sh2_init(SH2 *sh2, int is_slave, SH2 *other_sh2);
 void sh2_finish(SH2 *sh2);
diff --git a/pico/32x/32x.c b/pico/32x/32x.c
index 9bfbefac..3ee8c2ea 100644
--- a/pico/32x/32x.c
+++ b/pico/32x/32x.c
@@ -254,8 +254,8 @@ static void p32x_start_blank(void)
   }
 
   p32x_trigger_irq(NULL, SekCyclesDone(), P32XI_VINT);
-  p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, 0);
-  p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, 0);
+  p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, SekCyclesDone());
+  p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, SekCyclesDone());
 }
 
 void p32x_schedule_hint(SH2 *sh2, int m68k_cycles)
@@ -323,8 +323,12 @@ void p32x_event_schedule_sh2(SH2 *sh2, enum p32x_event event, int after)
 
   p32x_event_schedule(now, event, after);
 
-  left_to_next = (event_time_next - now) * 3;
-  sh2_end_run(sh2, left_to_next);
+  left_to_next = C_M68K_TO_SH2(sh2, (int)(event_time_next - now));
+  if (sh2_cycles_left(sh2) > left_to_next) {
+    if (left_to_next < 1)
+      left_to_next = 1;
+    sh2_end_run(sh2, left_to_next);
+  }
 }
 
 static void p32x_run_events(unsigned int until)
@@ -372,13 +376,13 @@ static void run_sh2(SH2 *sh2, int m68k_cycles)
 
   pevt_log_sh2_o(sh2, EVT_RUN_START);
   sh2->state |= SH2_STATE_RUN;
-  cycles = C_M68K_TO_SH2(*sh2, m68k_cycles);
+  cycles = C_M68K_TO_SH2(sh2, m68k_cycles);
   elprintf_sh2(sh2, EL_32X, "+run %u %d @%08x",
     sh2->m68krcycles_done, cycles, sh2->pc);
 
   done = sh2_execute(sh2, cycles, PicoIn.opt & POPT_EN_DRC);
 
-  sh2->m68krcycles_done += C_SH2_TO_M68K(*sh2, done);
+  sh2->m68krcycles_done += C_SH2_TO_M68K(sh2, done);
   sh2->state &= ~SH2_STATE_RUN;
   pevt_log_sh2_o(sh2, EVT_RUN_END);
   elprintf_sh2(sh2, EL_32X, "-run %u %d",
@@ -412,8 +416,7 @@ void p32x_sync_other_sh2(SH2 *sh2, unsigned int m68k_target)
 
   // there might be new event to schedule current sh2 to
   if (event_time_next) {
-    left_to_event = event_time_next - m68k_target;
-    left_to_event *= 3;
+    left_to_event = C_M68K_TO_SH2(sh2, (int)(event_time_next - m68k_target));
     if (sh2_cycles_left(sh2) > left_to_event) {
       if (left_to_event < 1)
         left_to_event = 1;
@@ -446,6 +449,7 @@ void sync_sh2s_normal(unsigned int m68k_target)
     now = ssh2.m68krcycles_done;
   timer_cycles = now;
 
+  pprof_start(m68k);
   while (CYCLES_GT(m68k_target, now))
   {
     if (event_time_next && CYCLES_GE(now, event_time_next))
@@ -463,6 +467,7 @@ void sync_sh2s_normal(unsigned int m68k_target)
         target - msh2.m68krcycles_done, target - ssh2.m68krcycles_done,
         m68k_target - now, Pico32x.emu_flags);
 
+      pprof_start(ssh2);
       if (!(ssh2.state & SH2_IDLE_STATES)) {
         cycles = target - ssh2.m68krcycles_done;
         if (cycles > 0) {
@@ -472,7 +477,9 @@ void sync_sh2s_normal(unsigned int m68k_target)
             target = event_time_next;
         }
       }
+      pprof_end(ssh2);
 
+      pprof_start(msh2);
       if (!(msh2.state & SH2_IDLE_STATES)) {
         cycles = target - msh2.m68krcycles_done;
         if (cycles > 0) {
@@ -482,6 +489,7 @@ void sync_sh2s_normal(unsigned int m68k_target)
             target = event_time_next;
         }
       }
+      pprof_end(msh2);
 
       now = target;
       if (!(msh2.state & SH2_IDLE_STATES)) {
@@ -497,6 +505,7 @@ void sync_sh2s_normal(unsigned int m68k_target)
     p32x_timers_do(now - timer_cycles);
     timer_cycles = now;
   }
+  pprof_end_sub(m68k);
 
   // advance idle CPUs
   if (msh2.state & SH2_IDLE_STATES) {
@@ -553,8 +562,8 @@ void PicoFrame32x(void)
 
   if (!(Pico32x.sh2_regs[0] & 0x80))
     p32x_schedule_hint(NULL, SekCyclesDone());
-  p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, 0);
-  p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, 0);
+  p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, SekCyclesDone());
+  p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, SekCyclesDone());
 
   if (PicoIn.AHW & PAHW_MCD)
     pcd_prepare_frame();
diff --git a/pico/32x/memory.c b/pico/32x/memory.c
index eff0ab07..d815853d 100644
--- a/pico/32x/memory.c
+++ b/pico/32x/memory.c
@@ -146,7 +146,7 @@ static void sh2s_sync_on_read(SH2 *sh2)
 
   cycles = sh2_cycles_done(sh2);
   if (cycles > 600)
-    p32x_sync_other_sh2(sh2, sh2->m68krcycles_done + cycles / 3);
+    p32x_sync_other_sh2(sh2, sh2->m68krcycles_done + C_SH2_TO_M68K(sh2, cycles));
 }
 
 // SH2 faking
diff --git a/pico/cd/mcd.c b/pico/cd/mcd.c
index 5e3629a3..8a2f230d 100644
--- a/pico/cd/mcd.c
+++ b/pico/cd/mcd.c
@@ -125,6 +125,7 @@ static void SekRunS68k(unsigned int to)
   if (SekShouldInterrupt())
     Pico_mcd->m.s68k_poll_a = 0;
 
+  pprof_start(s68k);
   SekCycleCntS68k += cyc_do;
 #if defined(EMU_C68K)
   PicoCpuCS68k.cycles = cyc_do;
@@ -137,6 +138,7 @@ static void SekRunS68k(unsigned int to)
 #elif defined(EMU_F68K)
   SekCycleCntS68k += fm68k_emulate(&PicoCpuFS68k, cyc_do, 0) - cyc_do;
 #endif
+  pprof_end(s68k);
 }
 
 static void pcd_set_cycle_mult(void)
diff --git a/pico/pico_int.h b/pico/pico_int.h
index 7225cab8..cca7f954 100644
--- a/pico/pico_int.h
+++ b/pico/pico_int.h
@@ -241,11 +241,11 @@ extern SH2 sh2s[2];
 # define sh2_pc(sh2) (sh2)->pc
 #endif
 
-#define sh2_cycles_done(sh2) ((int)(sh2)->cycles_timeslice - sh2_cycles_left(sh2))
+#define sh2_cycles_done(sh2) ((unsigned)(sh2)->cycles_timeslice - sh2_cycles_left(sh2)) /* cycles_timeslice minus the cycles still left, as unsigned */
 #define sh2_cycles_done_t(sh2) \
-  ((sh2)->m68krcycles_done * 3 + sh2_cycles_done(sh2))
+  (unsigned)(C_M68K_TO_SH2(sh2, (sh2)->m68krcycles_done) + sh2_cycles_done(sh2)) /* m68krcycles_done converted via C_M68K_TO_SH2, plus sh2_cycles_done */
 #define sh2_cycles_done_m68k(sh2) \
-  ((sh2)->m68krcycles_done + (sh2_cycles_done(sh2) / 3))
+  (unsigned)((sh2)->m68krcycles_done + C_SH2_TO_M68K(sh2, sh2_cycles_done(sh2))) /* m68krcycles_done plus sh2_cycles_done converted via C_SH2_TO_M68K */
 
 #define sh2_reg(c, x) (c) ? ssh2.r[x] : msh2.r[x]
 #define sh2_gbr(c)    (c) ? ssh2.gbr : msh2.gbr