From 8284ab710785099f861cdd10d7b1170eaf40828c Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 16 Aug 2019 15:14:41 +0200 Subject: [PATCH] various small fixes and optimisations --- Makefile | 8 ++++++++ Makefile.libretro | 3 ++- cpu/drc/emit_arm.c | 2 ++ cpu/drc/emit_arm64.c | 1 + cpu/drc/emit_mips.c | 5 +++-- cpu/drc/emit_x86.c | 2 ++ cpu/sh2/compiler.c | 16 ++++++++-------- pico/32x/memory.c | 27 ++++++++++++--------------- tools/mkoffsets.sh | 2 +- 9 files changed, 39 insertions(+), 27 deletions(-) diff --git a/Makefile b/Makefile index 5f64f713..96ccb6ca 100644 --- a/Makefile +++ b/Makefile @@ -236,6 +236,14 @@ pico/cd/cd_file.o: CFLAGS += -fno-strict-aliasing pico/cd/pcm.o: CFLAGS += -fno-strict-aliasing pico/cd/LC89510.o: CFLAGS += -fno-strict-aliasing pico/cd/gfx_cd.o: CFLAGS += -fno-strict-aliasing +ifeq (1,$(use_sh2drc)) +ifneq (,$(findstring -flto,$(CFLAGS))) +# if using the DRC, memory and sh2soc use a global register variable to avoid +# saving and reloading the SH2 SR. However, this collides with the use of LTO. 
+pico/32x/memory.o: CFLAGS += -fno-lto +pico/32x/sh2soc.o: CFLAGS += -fno-lto +endif +endif # fame needs ~2GB of RAM to compile on gcc 4.8 # on x86, this is reduced by ~300MB when debug info is off (but not on ARM) diff --git a/Makefile.libretro b/Makefile.libretro index 1e07d50f..51da9828 100644 --- a/Makefile.libretro +++ b/Makefile.libretro @@ -26,7 +26,7 @@ CFLAGS ?= STATIC_LINKING:= 0 TARGET_NAME := picodrive LIBM := -lm -GIT_VERSION ?= " $(shell git rev-parse --short HEAD || echo unknown)" +GIT_VERSION ?= $(shell git rev-parse --short HEAD || echo unknown) ifneq ($(GIT_VERSION)," unknown") CFLAGS += -DGIT_VERSION=\"$(GIT_VERSION)\" endif @@ -427,6 +427,7 @@ else ifeq ($(platform), gcw0) use_fame = 1 use_drz80 = 0 use_cz80 = 1 + use_sh2drc = 1 # Windows else diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 72542a3f..a4aa2ec6 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -1174,6 +1174,8 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define host_arg2reg(rd, arg) \ rd = arg +#define emith_rw_offs_max() 0xff + /* SH2 drc specific */ /* pushes r12 for eabi alignment */ #define emith_sh2_drc_entry() \ diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 90010d80..a67f6819 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -1117,6 +1117,7 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_flush() /**/ #define host_instructions_updated(base, end) __builtin___clear_cache(base, end) #define emith_jump_patch_size() 8 +#define emith_rw_offs_max() 0xff // SH2 drc specific diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index f56b89a3..91d493b5 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -394,7 +394,7 @@ int emith_flg_noV; // V flag known not to be set // NB: for adcf and sbcf, carry-in must be dealt with separately (see there) static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) { - if (sub && rd == FNZ && rt && 
rs) // is this cmp_r_r? + if (sub && rd == FNZ && rt > AT && rs > AT) // is this cmp_r_r? emith_flg_rs = rs, emith_flg_rt = rt; else emith_flg_rs = emith_flg_rt = 0; @@ -858,7 +858,7 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) // NB: mips32r2 has EXT and INS #define emith_clear_msb(d, s, count) /* bits to clear */ do { \ u32 t; \ - if ((count) > 16) { \ + if ((count) >= 16) { \ t = (count) - 16; \ t = 0xffff >> t; \ emith_and_r_r_imm(d, s, t); \ @@ -1262,6 +1262,7 @@ static int emith_cond_check(int cond, int *r) // NB: mips32r2 has SYNCI #define host_instructions_updated(base, end) __builtin___clear_cache(base, end) #define emith_jump_patch_size() 4 +#define emith_rw_offs_max() 0x7fff // SH2 drc specific #define emith_sh2_drc_entry() do { \ diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index a40c0f8c..2177541c 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -986,6 +986,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define host_instructions_updated(base, end) +#define emith_rw_offs_max() 0xffffffff + #ifdef __x86_64__ #define HOST_REGS 16 diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 0083dc42..677c8adf 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -419,8 +419,8 @@ typedef struct { static int rcache_get_tmp(void); static void rcache_free_tmp(int hr); -// Note: cache_regs[] must have at least the amount of REG and TEMP registers -// used by handlers in worst case (currently 4). +// Note: cache_regs[] must have at least the amount of HRF_REG registers used +// by handlers in worst case (currently 4). // Register assignment goes by ABI convention. Caller save registers are TEMP, // the others are either static or REG. SR must be static, R0 very recommended. // VBR, PC, PR must not be static (read from context in utils). 
@@ -2418,7 +2418,7 @@ static void rcache_init(void) // NB may return either REG or TEMP static int emit_get_rbase_and_offs(SH2 *sh2, sh2_reg_e r, int rmode, u32 *offs) { - uptr omask = 0xff; // offset mask, XXX: ARM oriented.. + uptr omask = emith_rw_offs_max(); // offset mask u32 mask = 0; u32 a; int poffs; @@ -4447,7 +4447,7 @@ end_op: static void sh2_generate_utils(void) { - int arg0, arg1, arg2, arg3, sr, tmp; + int arg0, arg1, arg2, arg3, sr, tmp, tmp2; host_arg2reg(arg0, 0); host_arg2reg(arg1, 1); @@ -4689,18 +4689,18 @@ static void sh2_generate_utils(void) emith_sub_r_imm(tmp, 4*2); rcache_clean(); // push SR - tmp = rcache_get_reg_arg(0, SHR_SP, NULL); - emith_add_r_imm(tmp, 4); + tmp = rcache_get_reg_arg(0, SHR_SP,&tmp2); + emith_add_r_r_imm(tmp, tmp2, 4); tmp = rcache_get_reg_arg(1, SHR_SR, NULL); emith_clear_msb(tmp, tmp, 22); emith_move_r_r_ptr(arg2, CONTEXT_REG); - rcache_invalidate(); + rcache_invalidate_tmp(); emith_call(p32x_sh2_write32); // XXX: use sh2_drc_write32? // push PC rcache_get_reg_arg(0, SHR_SP, NULL); emith_ctx_read(arg1, SHR_PC * 4); emith_move_r_r_ptr(arg2, CONTEXT_REG); - rcache_invalidate(); + rcache_invalidate_tmp(); emith_call(p32x_sh2_write32); // update I, cycles, do callback emith_ctx_read(arg1, offsetof(SH2, pending_level)); diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 7148d41c..8d5ca725 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -197,24 +197,19 @@ static NOINLINE u32 sh2_poll_read(u32 a, u32 d, unsigned int cycles, SH2* sh2) // fetch oldest write to address from fifo, but stop when reaching the present idx = sh2_poll_rd[hix]; while (idx != sh2_poll_wr[hix] && CYCLES_GE(cycles, fifo[idx].cycles)) { -// int oidx = idx; p = &fifo[idx]; idx = (idx+1) % PFIFO_SZ; - if (CYCLES_GT(cycles, p->cycles+80)) { - // drop older fifo stores that may cause synchronisation problems. 
- // NB unfortunately this cycle diff is quite sensitive: - // observed in Brutal Unleashed: min 80, observed in Afterburner: max 110 - sh2_poll_rd[hix] = idx; - } else if (p->a == a) { - // replace current data with fifo value and discard fifo entry - if (cpu != p->cpu) { + if (cpu != p->cpu) { + if (CYCLES_GT(cycles, p->cycles+80)) { + // drop older fifo stores that may cause synchronisation problems. + sh2_poll_rd[hix] = idx; + } else if (p->a == a) { + // replace current data with fifo value and discard fifo entry d = p->d; p->a = -1; -// if (oidx == sh2_poll_rd[hix]) -// sh2_poll_rd[hix] = idx; + break; } - break; } } return d; @@ -224,7 +219,6 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) { int hix = (a >> 1) % PFIFO_CNT; struct sh2_poll_fifo *fifo = sh2_poll_fifo[hix]; - struct sh2_poll_fifo *p = &fifo[sh2_poll_wr[hix]]; struct sh2_poll_fifo *q = &fifo[(sh2_poll_wr[hix]-1) % PFIFO_SZ]; int cpu = sh2 ? sh2->is_slave+1 : 0; @@ -233,15 +227,16 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) // intermediate values that may cause synchronisation problems. 
// NB this can take an eternity on m68k: mov.b , needs // 28 m68k-cycles (~80 sh2-cycles) to complete (observed in Metal Head) - if (q->a == a && !CYCLES_GT(cycles,q->cycles+30)) { + if (q->a == a && sh2_poll_wr[hix] != sh2_poll_rd[hix] && !CYCLES_GT(cycles,q->cycles+30)) { q->d = d; } else { // store write to poll address in fifo + fifo[sh2_poll_wr[hix]] = + (struct sh2_poll_fifo){ .cycles = cycles, .a = a, .d = d, .cpu = cpu }; sh2_poll_wr[hix] = (sh2_poll_wr[hix]+1) % PFIFO_SZ; if (sh2_poll_wr[hix] == sh2_poll_rd[hix]) // fifo overflow, discard oldest value sh2_poll_rd[hix] = (sh2_poll_rd[hix]+1) % PFIFO_SZ; - *p = (struct sh2_poll_fifo){ .cycles = cycles, .a = a, .d = d, .cpu = cpu }; } } @@ -2369,6 +2364,8 @@ void PicoMemSetup32x(void) sh2_drc_mem_setup(&msh2); sh2_drc_mem_setup(&ssh2); + memset(sh2_poll_rd, 0, sizeof(sh2_poll_rd)); + memset(sh2_poll_wr, 0, sizeof(sh2_poll_wr)); // z80 hack z80_map_set(z80_write_map, 0x8000, 0xffff, z80_md_bank_write_32x, 1); diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index a573f7a4..e7632593 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -11,7 +11,7 @@ ENDIAN= # compile with target C compiler and extract value from .rodata section compile_rodata () { - $CC $CFLAGS -I .. -c /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 + $CC $CFLAGS -I .. -shared /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 # find the name of the .rodata section (in case -fdata-sections is used) rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata' | sed 's/^[^.]*././;s/ .*//') -- 2.39.2