From a5e51c16e6bf3d2e5bbc09e517a99c046fc2e111 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 13 Dec 2019 18:23:03 +0100 Subject: [PATCH] sh2 drc: fix speed regression --- cpu/drc/emit_arm.c | 2 +- cpu/drc/emit_arm64.c | 2 +- cpu/drc/emit_mips.c | 2 +- cpu/drc/emit_riscv.c | 2 +- cpu/drc/emit_x86.c | 2 +- cpu/sh2/compiler.h | 32 ++++++++++++++++++++++++-------- cpu/sh2/sh2.h | 1 + pico/32x/memory.c | 10 +++++----- 8 files changed, 35 insertions(+), 18 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 8ea148eb..af9491f1 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -20,7 +20,7 @@ #define TEMPORARY_REGS { 12, 14 } #define CONTEXT_REG 11 -#define STATIC_SH2_REGS { SHR_SR,10 , SHR_R0,8 , SHR_R0+1,9 } +#define STATIC_SH2_REGS { SHR_SR,10 , SHR_R(0),8 , SHR_R(1),9 } // XXX: tcache_ptr type for SVP and SH2 compilers differs.. #define EMIT_PTR(ptr, x) \ diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 26fede3a..8d1a7dd1 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -15,7 +15,7 @@ #define TEMPORARY_REGS { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 } #define CONTEXT_REG 29 -#define STATIC_SH2_REGS { SHR_SR,28 , SHR_R0,27 , SHR_R0+1,26 } +#define STATIC_SH2_REGS { SHR_SR,28 , SHR_R(0),27 , SHR_R(1),26 } // R31 doesn't exist, it aliases either with zero or SP #define SP 31 // stack pointer diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 765986a6..8cb094de 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -17,7 +17,7 @@ #define TEMPORARY_REGS { 2, 3, 8, 9, 10, 11, 12, 13, 14 } // v0-v1,t0-t6 #define CONTEXT_REG 23 // s7 -#define STATIC_SH2_REGS { SHR_SR,22 , SHR_R0,21 , SHR_R0+1,20 } +#define STATIC_SH2_REGS { SHR_SR,22 , SHR_R(0),21 , SHR_R(1),20 } // NB: the ubiquitous JZ74[46]0 uses MIPS32 Release 1, a slight MIPS II superset #ifndef __mips_isa_rev diff --git a/cpu/drc/emit_riscv.c b/cpu/drc/emit_riscv.c index ed45e01c..90234b22 100644 --- a/cpu/drc/emit_riscv.c +++ b/cpu/drc/emit_riscv.c @@ 
-16,7 +16,7 @@ #define TEMPORARY_REGS { 5, 6, 7 } // t0-t2 #define CONTEXT_REG 9 // s1 -#define STATIC_SH2_REGS { SHR_SR,27 , SHR_R0,26 , SHR_R0+1,25 } +#define STATIC_SH2_REGS { SHR_SR,27 , SHR_R(0),26 , SHR_R(1),25 } // registers usable for user code: r1-r25, others reserved or special #define Z0 0 // zero register diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 0b3f7697..ec13551e 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -1072,7 +1072,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define PARAM_REGS { xCX, xDX, xR8, xR9 } #define PRESERVED_REGS { xSI, xDI, xR12, xR13, xR14, xR15, xBX, xBP } #define TEMPORARY_REGS { xAX, xR10, xR11 } -#define STATIC_SH2_REGS { SHR_SR,xBX , SHR_R0,xR15 , SH2_R0+1,xR14 } +#define STATIC_SH2_REGS { SHR_SR,xBX , SHR_R(0),xR15 , SHR_R(1),xR14 } #define host_arg2reg(rd, arg) \ switch (arg) { \ diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 804f2a70..dd37d470 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -33,34 +33,50 @@ unsigned short scan_block(uint32_t base_pc, int is_slave, uint32_t *base_literals, uint32_t *end_literals); #if defined(DRC_SH2) && defined(__GNUC__) -// direct access to some host CPU registers used by the DRC -// XXX MUST match definitions for SHR_SR in cpu/drc/emit_*.c +// direct access to some host CPU registers used by the DRC if gcc is used. +// XXX MUST match SHR_SR definitions in cpu/drc/emit_*.c; should be moved there +// XXX yuck, there's no portable way to determine register size.
Use long long +// if target is 64 bit and data model is ILP32 or LLP64(windows), else long #if defined(__arm__) #define DRC_SR_REG "r10" +#define DRC_REG_LL 0 // 32 bit #elif defined(__aarch64__) #define DRC_SR_REG "r28" +#define DRC_REG_LL (__ILP32__ || _WIN32) #elif defined(__mips__) #define DRC_SR_REG "s6" +#define DRC_REG_LL (_MIPS_SIM == _ABIN32) #elif defined(__riscv__) || defined(__riscv) #define DRC_SR_REG "s11" +#define DRC_REG_LL 0 // no ABI for (__ILP32__ && __riscv_xlen != 32) #elif defined(__i386__) #define DRC_SR_REG "edi" +#define DRC_REG_LL 0 // 32 bit #elif defined(__x86_64__) -#define DRC_SR_REG "ebx" +#define DRC_SR_REG "rbx" +#define DRC_REG_LL (__ILP32__ || _WIN32) #endif #endif #ifdef DRC_SR_REG +// XXX this is more clear but produces too much overhead for slow platforms extern void REGPARM(1) (*sh2_drc_save_sr)(SH2 *sh2); extern void REGPARM(1) (*sh2_drc_restore_sr)(SH2 *sh2); -#define DRC_DECLARE_SR register int32_t sh2_sr asm(DRC_SR_REG) +// NB: sh2_sr MUST have register size if optimizing with -O3 (-fif-conversion) +#if DRC_REG_LL +#define DRC_DECLARE_SR register long long _sh2_sr asm(DRC_SR_REG) +#else +#define DRC_DECLARE_SR register long _sh2_sr asm(DRC_SR_REG) +#endif #define DRC_SAVE_SR(sh2) \ - if (likely((sh2->state & (SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN)) \ - sh2_drc_save_sr(sh2) + if (likely((sh2->state&(SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN)) \ + sh2->sr = (s32)_sh2_sr +// sh2_drc_save_sr(sh2) #define DRC_RESTORE_SR(sh2) \ - if (likely((sh2->state & (SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN)) \ - sh2_drc_restore_sr(sh2) + if (likely((sh2->state&(SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN)) \ + _sh2_sr = (s32)sh2->sr +// sh2_drc_restore_sr(sh2) #else #define DRC_DECLARE_SR #define DRC_SAVE_SR(sh2) diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index 2d73db59..2f2dfd92 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -10,6 +10,7 @@ typedef enum { SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, SH2_REGS // 
register set size } sh2_reg_e; +#define SHR_R(n) (SHR_R0+(n)) typedef struct SH2_ { diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 44bc72d7..30d9b577 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -254,14 +254,14 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) sh2_poll_rd[hix] = rd; sh2_poll_wr[hix] = wr; } -u32 REGPARM(3) p32x_sh2_poll_memory8(unsigned int a, u32 d, SH2 *sh2) +u32 REGPARM(3) p32x_sh2_poll_memory8(u32 a, u32 d, SH2 *sh2) { int shift = (a & 1 ? 0 : 8); d = (s8)(p32x_sh2_poll_memory16(a & ~1, d << shift, sh2) >> shift); return d; } -u32 REGPARM(3) p32x_sh2_poll_memory16(unsigned int a, u32 d, SH2 *sh2) +u32 REGPARM(3) p32x_sh2_poll_memory16(u32 a, u32 d, SH2 *sh2) { unsigned char *p = sh2->p_drcblk_ram; unsigned int cycles; @@ -281,7 +281,7 @@ u32 REGPARM(3) p32x_sh2_poll_memory16(unsigned int a, u32 d, SH2 *sh2) return d; } -u32 REGPARM(3) p32x_sh2_poll_memory32(unsigned int a, u32 d, SH2 *sh2) +u32 REGPARM(3) p32x_sh2_poll_memory32(u32 a, u32 d, SH2 *sh2) { unsigned char *p = sh2->p_drcblk_ram; unsigned int cycles; @@ -2017,9 +2017,9 @@ int p32x_sh2_memcpy(u32 dst, u32 src, int count, int size, SH2 *sh2) // ----------------------------------------------------------------- -static void z80_md_bank_write_32x(unsigned int a, unsigned char d) +static void z80_md_bank_write_32x(u32 a, unsigned char d) { - unsigned int addr68k; + u32 addr68k; addr68k = Pico.m.z80_bank68k << 15; addr68k += a & 0x7fff; -- 2.39.2