From 630b122be82914a74fac752688abe5d5dd798aa8 Mon Sep 17 00:00:00 2001 From: notaz Date: Tue, 14 Dec 2021 01:14:23 +0200 Subject: [PATCH] overwrite dynarec related code with upstream version This gives a fast arm64 dynarec and many other ari64 dynarec fixes from upstream. Although I tried to take care not to overwrite libretro specific changes like lightrec, some things may have got lost or broken. Only tested on rpi4 in 64bit mode. Warning: untested on Android and most other platforms. If there are issues from this merge, post a comment and tag me and I'll try to take a look at it while I'm still active, at least for the time being. --- Makefile | 38 +- Makefile.libretro | 7 +- configure | 20 +- frontend/cspace_neon.S | 8 +- frontend/libretro.c | 31 +- frontend/libretro_core_options.h | 34 +- frontend/plugin_lib.c | 2 + jni/Android.mk | 18 +- libpcsxcore/database.c | 47 + libpcsxcore/database.h | 6 + libpcsxcore/gte.c | 73 +- libpcsxcore/gte.h | 8 + libpcsxcore/gte_neon.S | 2 +- libpcsxcore/lightrec/plugin.c | 24 +- libpcsxcore/misc.c | 22 +- libpcsxcore/new_dynarec/arm/assem_arm.c | 4157 -------- libpcsxcore/new_dynarec/arm/assem_arm.h | 62 - libpcsxcore/new_dynarec/arm/linkage_offsets.h | 45 - libpcsxcore/new_dynarec/assem_arm.c | 2417 +++++ libpcsxcore/new_dynarec/assem_arm.h | 44 + libpcsxcore/new_dynarec/assem_arm64.c | 2093 ++++ libpcsxcore/new_dynarec/assem_arm64.h | 49 + .../new_dynarec/{backends/psx => }/emu_if.c | 196 +- .../new_dynarec/{backends/psx => }/emu_if.h | 27 +- .../new_dynarec/{arm => }/linkage_arm.S | 133 +- libpcsxcore/new_dynarec/linkage_arm64.S | 414 + libpcsxcore/new_dynarec/linkage_offsets.h | 45 + libpcsxcore/new_dynarec/new_dynarec.c | 8965 ++++++++--------- libpcsxcore/new_dynarec/new_dynarec.h | 18 +- libpcsxcore/new_dynarec/new_dynarec_config.h | 9 +- libpcsxcore/new_dynarec/patches/trace_drc_chk | 133 + libpcsxcore/new_dynarec/patches/trace_intr | 323 + .../new_dynarec/{backends/psx => }/pcsxmem.c | 40 +- 
.../new_dynarec/{backends/psx => }/pcsxmem.h | 4 - .../{backends/psx => }/pcsxmem_inline.c | 14 +- libpcsxcore/plugins.c | 2 - libpcsxcore/psxbios.c | 10 +- libpcsxcore/psxcommon.h | 8 + libpcsxcore/psxcounters.c | 16 +- libpcsxcore/psxinterpreter.c | 286 +- libpcsxcore/psxinterpreter.h | 7 + libpcsxcore/psxmem.c | 91 +- libpcsxcore/psxmem.h | 6 - libpcsxcore/r3000a.c | 27 +- libpcsxcore/r3000a.h | 12 +- plugins/gpulib/vout_pl.c | 2 +- 46 files changed, 10156 insertions(+), 9839 deletions(-) create mode 100644 libpcsxcore/database.c create mode 100644 libpcsxcore/database.h delete mode 100644 libpcsxcore/new_dynarec/arm/assem_arm.c delete mode 100644 libpcsxcore/new_dynarec/arm/assem_arm.h delete mode 100644 libpcsxcore/new_dynarec/arm/linkage_offsets.h create mode 100644 libpcsxcore/new_dynarec/assem_arm.c create mode 100644 libpcsxcore/new_dynarec/assem_arm.h create mode 100644 libpcsxcore/new_dynarec/assem_arm64.c create mode 100644 libpcsxcore/new_dynarec/assem_arm64.h rename libpcsxcore/new_dynarec/{backends/psx => }/emu_if.c (87%) rename libpcsxcore/new_dynarec/{backends/psx => }/emu_if.h (86%) rename libpcsxcore/new_dynarec/{arm => }/linkage_arm.S (91%) create mode 100644 libpcsxcore/new_dynarec/linkage_arm64.S create mode 100644 libpcsxcore/new_dynarec/linkage_offsets.h create mode 100644 libpcsxcore/new_dynarec/patches/trace_drc_chk create mode 100644 libpcsxcore/new_dynarec/patches/trace_intr rename libpcsxcore/new_dynarec/{backends/psx => }/pcsxmem.c (95%) rename libpcsxcore/new_dynarec/{backends/psx => }/pcsxmem.h (76%) rename libpcsxcore/new_dynarec/{backends/psx => }/pcsxmem_inline.c (78%) create mode 100644 libpcsxcore/psxinterpreter.h diff --git a/Makefile b/Makefile index 1d70f643..3f33bd37 100644 --- a/Makefile +++ b/Makefile @@ -48,7 +48,7 @@ CFLAGS += -DPCNT endif # core -OBJS += libpcsxcore/cdriso.o libpcsxcore/cdrom.o libpcsxcore/cheat.o \ +OBJS += libpcsxcore/cdriso.o libpcsxcore/cdrom.o libpcsxcore/cheat.o libpcsxcore/database.o \ 
libpcsxcore/decode_xa.o libpcsxcore/mdec.o \ libpcsxcore/misc.o libpcsxcore/plugins.o libpcsxcore/ppf.o libpcsxcore/psxbios.o \ libpcsxcore/psxcommon.o libpcsxcore/psxcounters.o libpcsxcore/psxdma.o libpcsxcore/psxhle.o \ @@ -113,26 +113,28 @@ CFLAGS += -Ideps/mman OBJS += deps/mman/mman.o endif else ifeq "$(DYNAREC)" "ari64" -CFLAGS += -DNEW_DYNAREC -OBJS += libpcsxcore/new_dynarec/backends/psx/emu_if.o \ - libpcsxcore/new_dynarec/new_dynarec.o \ - libpcsxcore/new_dynarec/arm/linkage_arm.o \ - libpcsxcore/new_dynarec/backends/psx/pcsxmem.o -libpcsxcore/new_dynarec/new_dynarec.o: libpcsxcore/new_dynarec/arm/assem_arm.c \ - libpcsxcore/new_dynarec/backends/psx/pcsxmem_inline.c +OBJS += libpcsxcore/new_dynarec/new_dynarec.o +OBJS += libpcsxcore/new_dynarec/pcsxmem.o + ifeq "$(ARCH)" "arm" + OBJS += libpcsxcore/new_dynarec/linkage_arm.o + libpcsxcore/new_dynarec/new_dynarec.o: libpcsxcore/new_dynarec/assem_arm.c + else ifneq (,$(findstring $(ARCH),aarch64 arm64)) + OBJS += libpcsxcore/new_dynarec/linkage_arm64.o + libpcsxcore/new_dynarec/new_dynarec.o: libpcsxcore/new_dynarec/assem_arm64.c + else + $(error no dynarec support for architecture $(ARCH)) + endif else -OBJS += libpcsxcore/new_dynarec/backends/psx/emu_if.o -libpcsxcore/new_dynarec/backends/psx/emu_if.o: CFLAGS += -DDRC_DISABLE -frontend/libretro.o: CFLAGS += -DDRC_DISABLE +CFLAGS += -DDRC_DISABLE endif +OBJS += libpcsxcore/new_dynarec/emu_if.o +libpcsxcore/new_dynarec/new_dynarec.o: libpcsxcore/new_dynarec/pcsxmem_inline.c ifdef DRC_DBG -libpcsxcore/new_dynarec/backends/psx/emu_if.o: CFLAGS += -D_FILE_OFFSET_BITS=64 +libpcsxcore/new_dynarec/emu_if.o: CFLAGS += -D_FILE_OFFSET_BITS=64 CFLAGS += -DDRC_DBG endif -ifeq "$(DRC_CACHE_BASE)" "1" -libpcsxcore/new_dynarec/%.o: CFLAGS += -DBASE_ADDR_FIXED=1 -libpcsxcore/new_dynarec/backends/psx/%.o: CFLAGS += -DBASE_ADDR_FIXED=1 -libpcsxcore/new_dynarec/arm/%.o: CFLAGS += -DBASE_ADDR_FIXED=1 +ifeq "$(BASE_ADDR_DYNAMIC)" "1" +libpcsxcore/new_dynarec/%.o: CFLAGS += 
-DBASE_ADDR_DYNAMIC=1 endif # spu @@ -312,9 +314,6 @@ OBJS += libretro-common/time/rtime.o OBJS += libretro-common/vfs/vfs_implementation.o CFLAGS += -DUSE_LIBRETRO_VFS endif -ifeq "$(ENABLE_ICACHE_EMULATION)" "1" -CFLAGS += -DICACHE_EMULATION -endif OBJS += frontend/libretro.o CFLAGS += -Ilibretro-common/include CFLAGS += -DFRONTEND_SUPPORTS_RGB565 @@ -331,6 +330,7 @@ ifeq "$(USE_PLUGIN_LIB)" "1" OBJS += frontend/plugin_lib.o OBJS += frontend/libpicofe/linux/plat.o OBJS += frontend/libpicofe/readpng.o frontend/libpicofe/fonts.o +frontend/libpicofe/linux/plat.o: CFLAGS += -DNO_HOME_DIR ifeq "$(HAVE_NEON)" "1" OBJS += frontend/libpicofe/arm/neon_scale2x.o OBJS += frontend/libpicofe/arm/neon_eagle2x.o diff --git a/Makefile.libretro b/Makefile.libretro index e1ba9474..59bc5758 100644 --- a/Makefile.libretro +++ b/Makefile.libretro @@ -4,7 +4,6 @@ DEBUG ?= 0 WANT_ZLIB ?= 1 HAVE_CHD ?= 1 USE_LIBRETRO_VFS ?= 0 -ENABLE_ICACHE_EMULATION ?= 1 # Dynarec options: lightrec, ari64 DYNAREC ?= lightrec @@ -76,7 +75,7 @@ else ifneq (,$(findstring h5,$(platform))) fpic := -fPIC SHARED := -shared -Wl,-version-script=link.T ARCH = arm64 - DYNAREC ?= lightrec + DYNAREC ?= ari64 CFLAGS += -fomit-frame-pointer -ffast-math -DARM CPUFLAGS += -march=armv8-a+crc -mfpu=neon-fp-armv8 -mcpu=cortex-a53 -mtune=cortex-a53 @@ -344,7 +343,7 @@ else ifeq ($(platform), rpi3_64) TARGET := $(TARGET_NAME)_libretro.so ARCH := arm64 BUILTIN_GPU = unai - DYNAREC = lightrec + DYNAREC = ari64 fpic := -fPIC CFLAGS += -march=armv8-a+crc+simd -mtune=cortex-a53 -ftree-vectorize @@ -364,7 +363,7 @@ else ifeq ($(platform), rpi4_64) TARGET := $(TARGET_NAME)_libretro.so ARCH := arm64 BUILTIN_GPU = unai - DYNAREC = lightrec + DYNAREC = ari64 fpic := -fPIC CFLAGS += -march=armv8-a+crc+simd -mtune=cortex-a72 -ftree-vectorize diff --git a/configure b/configure index f3a50d17..a618231a 100755 --- a/configure +++ b/configure @@ -45,7 +45,6 @@ sound_driver_list="oss alsa pulseaudio sdl libretro" sound_drivers="" 
plugins="plugins/spunull/spunull.so \ plugins/dfxvideo/gpu_peops.so plugins/gpu_unai/gpu_unai.so" -ram_fixed="no" drc_cache_base="no" have_armv5="" have_armv6="" @@ -76,6 +75,7 @@ config_mak="config.mak" fail() { echo "$@" + if test -n "$DUMP_CONFIG_LOG"; then cat config.log; fi exit 1 } @@ -88,21 +88,18 @@ set_platform() ;; pandora) sound_drivers="oss alsa" - ram_fixed="yes" drc_cache_base="yes" optimize_cortexa8="yes" have_arm_neon="yes" need_xlib="yes" ;; maemo) - ram_fixed="yes" drc_cache_base="yes" optimize_cortexa8="yes" have_arm_neon="yes" ;; caanoo) sound_drivers="oss" - ram_fixed="yes" drc_cache_base="yes" optimize_arm926ej="yes" need_warm="yes" @@ -278,7 +275,11 @@ arm*) echo " CFLAGS=-march=armv7-a ./configure ..." fi ;; +aarch64) + ;; *) + # dynarec only available on ARM + enable_dynarec="no" ;; esac @@ -291,10 +292,6 @@ if [ "$ARCH" != "arm" -o "$have_armv6" = "yes" ]; then PLUGIN_CFLAGS="$PLUGIN_CFLAGS -fPIC" fi -if [ "$ram_fixed" = "yes" ]; then - CFLAGS="$CFLAGS -DRAM_FIXED" -fi - case "$platform" in generic) need_sdl="yes" @@ -548,15 +545,12 @@ echo >> $config_mak if [ "$platform" = "libretro" ]; then echo "TARGET = libretro.so" >> $config_mak - echo "HAVE_CHD = 1" >> $config_mak fi echo "ARCH = $ARCH" >> $config_mak echo "PLATFORM = $platform" >> $config_mak echo "BUILTIN_GPU = $builtin_gpu" >> $config_mak echo "SOUND_DRIVERS = $sound_drivers" >> $config_mak -if [ "$platform" != "libretro" ]; then - echo "PLUGINS = $plugins" >> $config_mak -fi +echo "PLUGINS = $plugins" >> $config_mak if [ "$have_arm_neon" = "yes" ]; then echo "HAVE_NEON = 1" >> $config_mak fi @@ -572,7 +566,7 @@ if [ "$enable_dynarec" = "yes" ]; then echo "USE_DYNAREC = 1" >> $config_mak fi if [ "$drc_cache_base" = "yes" ]; then - echo "DRC_CACHE_BASE = 1" >> $config_mak + echo "BASE_ADDR_DYNAMIC = 1" >> $config_mak fi if [ "$have_c64x_dsp" = "yes" ]; then echo "HAVE_C64_TOOLS = 1" >> $config_mak diff --git a/frontend/cspace_neon.S b/frontend/cspace_neon.S index 
56ab3044..4cb3d4c8 100644 --- a/frontend/cspace_neon.S +++ b/frontend/cspace_neon.S @@ -183,8 +183,8 @@ FUNCTION(bgr888_to_rgb888): @ dst, src, bytes umull r12,r2, r3, r2 0: pld [r1, #48*3] - vld3.8 {d0-d2}, [r1, :64]! - vld3.8 {d3-d5}, [r1, :64]! + vld3.8 {d0-d2}, [r1]! + vld3.8 {d3-d5}, [r1]! vswp d0, d2 vswp d3, d5 vst3.8 {d0-d2}, [r0, :64]! @@ -207,8 +207,8 @@ FUNCTION(bgr888_to_rgb565): @ dst, src, bytes vdup.16 q15, r3 0: pld [r1, #48*3] - vld3.8 {d1-d3}, [r1, :64]! - vld3.8 {d5-d7}, [r1, :64]! + vld3.8 {d1-d3}, [r1]! + vld3.8 {d5-d7}, [r1]! vshll.u8 q8, d2, #3 @ g vshll.u8 q9, d6, #3 diff --git a/frontend/libretro.c b/frontend/libretro.c index 6fee42a5..8252e15b 100644 --- a/frontend/libretro.c +++ b/frontend/libretro.c @@ -1628,7 +1628,7 @@ static void update_variables(bool in_flight) display_internal_fps = true; } -#if defined(LIGHTREC) || defined(NEW_DYNAREC) +#ifndef DRC_DISABLE var.value = NULL; var.key = "pcsx_rearmed_drc"; @@ -1661,7 +1661,8 @@ static void update_variables(bool in_flight) psxCpu->Reset(); // not really a reset.. 
} } -#endif /* LIGHTREC || NEW_DYNAREC */ +#endif /* !DRC_DISABLE */ + psxCpu->ApplyConfig(); var.value = NULL; var.key = "pcsx_rearmed_spu_reverb"; @@ -1700,7 +1701,6 @@ static void update_variables(bool in_flight) Config.RCntFix = 1; } -#ifdef ICACHE_EMULATION var.value = NULL; var.key = "pcsx_rearmed_icache_emulation"; @@ -1711,7 +1711,6 @@ static void update_variables(bool in_flight) else if (strcmp(var.value, "enabled") == 0) Config.icache_emulation = 1; } -#endif var.value = NULL; var.key = "pcsx_rearmed_inuyasha_fix"; @@ -2054,7 +2053,7 @@ static void update_variables(bool in_flight) GunconAdjustRatioY = atof(var.value); } -#ifdef NEW_DYNAREC +#if !defined(DRC_DISABLE) && !defined(LIGHTREC) var.value = NULL; var.key = "pcsx_rearmed_nosmccheck"; if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value) @@ -2093,7 +2092,27 @@ static void update_variables(bool in_flight) int psxclock = atoi(var.value); cycle_multiplier = 10000 / psxclock; } -#endif /* NEW_DYNAREC */ + + var.value = NULL; + var.key = "pcsx_rearmed_nocompathacks"; + if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value) + { + if (strcmp(var.value, "enabled") == 0) + new_dynarec_hacks |= NDHACK_NO_COMPAT_HACKS; + else + new_dynarec_hacks &= ~NDHACK_NO_COMPAT_HACKS; + } +#endif /* !DRC_DISABLE && !LIGHTREC */ + + var.value = NULL; + var.key = "pcsx_rearmed_nostalls"; + if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value) + { + if (strcmp(var.value, "enabled") == 0) + Config.DisableStalls = 1; + else + Config.DisableStalls = 0; + } var.value = NULL; var.key = "pcsx_rearmed_input_sensitivity"; diff --git a/frontend/libretro_core_options.h b/frontend/libretro_core_options.h index 6a754cf8..3e1daf2b 100644 --- a/frontend/libretro_core_options.h +++ b/frontend/libretro_core_options.h @@ -479,7 +479,7 @@ struct retro_core_option_definition option_defs_us[] = { #endif }, -#if defined(LIGHTREC) || defined(NEW_DYNAREC) +#ifndef DRC_DISABLE { "pcsx_rearmed_drc", "Dynamic 
Recompiler", @@ -491,9 +491,9 @@ struct retro_core_option_definition option_defs_us[] = { }, "enabled", }, -#endif /* LIGHTREC || NEW_DYNAREC */ +#endif -#ifdef NEW_DYNAREC +#if !defined(DRC_DISABLE) && !defined(LIGHTREC) { "pcsx_rearmed_psxclock", "PSX CPU Clock", @@ -582,7 +582,7 @@ struct retro_core_option_definition option_defs_us[] = { "57", #endif }, -#endif /* NEW_DYNAREC */ +#endif /* !DRC_DISABLE && !LIGHTREC */ #ifdef GPU_NEON { @@ -969,7 +969,7 @@ struct retro_core_option_definition option_defs_us[] = { "disabled", }, -#ifdef NEW_DYNAREC +#if !defined(DRC_DISABLE) && !defined(LIGHTREC) { "pcsx_rearmed_nosmccheck", "(Speed Hack) Disable SMC Checks", @@ -1003,7 +1003,29 @@ struct retro_core_option_definition option_defs_us[] = { }, "disabled", }, -#endif /* NEW_DYNAREC */ + { + "pcsx_rearmed_nostalls", + "Disable CPU/GTE stalls", + "Will cause some games to run too fast.", + { + { "disabled", NULL }, + { "enabled", NULL }, + { NULL, NULL }, + }, + "disabled", + }, + { + "pcsx_rearmed_nocompathacks", + "Disable compat hacks", + "Disables game-specific compatibility hacks.", + { + { "disabled", NULL }, + { "enabled", NULL }, + { NULL, NULL }, + }, + "disabled", + }, +#endif /* !DRC_DISABLE && !LIGHTREC */ { NULL, NULL, NULL, {{0}}, NULL }, }; diff --git a/frontend/plugin_lib.c b/frontend/plugin_lib.c index eee255b7..eb9d48ea 100644 --- a/frontend/plugin_lib.c +++ b/frontend/plugin_lib.c @@ -400,6 +400,8 @@ static void pl_vout_flip(const void *vram, int stride, int bgr24, int w, int h) #endif else { + src = (void *)((uintptr_t)src & ~3); // align for the blitter + for (; h1-- > 0; dest += dstride * 2, src += stride) { bgr555_to_rgb565(dest, src, w * 2); diff --git a/jni/Android.mk b/jni/Android.mk index 644e2afe..50aa6969 100644 --- a/jni/Android.mk +++ b/jni/Android.mk @@ -25,6 +25,7 @@ EXTRA_INCLUDES := SOURCES_C := $(CORE_DIR)/cdriso.c \ $(CORE_DIR)/cdrom.c \ $(CORE_DIR)/cheat.c \ + $(CORE_DIR)/database.c \ $(CORE_DIR)/decode_xa.c \ $(CORE_DIR)/mdec.c \ 
$(CORE_DIR)/misc.c \ @@ -115,7 +116,7 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) else ifeq ($(TARGET_ARCH_ABI),armeabi) HAVE_ARI64=1 else ifeq ($(TARGET_ARCH_ABI),arm64-v8a) - HAVE_LIGHTREC=1 + HAVE_ARI64=1 else ifeq ($(TARGET_ARCH_ABI),x86_64) HAVE_LIGHTREC=1 else ifeq ($(TARGET_ARCH_ABI),x86) @@ -125,13 +126,17 @@ else endif ifeq ($(HAVE_ARI64),1) - COREFLAGS += -DNEW_DYNAREC - SOURCES_ASM += $(CORE_DIR)/gte_arm.S \ - $(SPU_DIR)/arm_utils.S \ - $(DYNAREC_DIR)/arm/linkage_arm.S SOURCES_C += $(DYNAREC_DIR)/new_dynarec.c \ - $(DYNAREC_DIR)/backends/psx/pcsxmem.c + $(DYNAREC_DIR)/pcsxmem.c + ifeq ($(TARGET_ARCH_ABI),arm64-v8a) + SOURCES_ASM += $(DYNAREC_DIR)/linkage_arm64.S + else + SOURCES_ASM += $(CORE_DIR)/gte_arm.S \ + $(SPU_DIR)/arm_utils.S \ + $(DYNAREC_DIR)/linkage_arm.S + endif endif + SOURCES_C += $(DYNAREC_DIR)/emu_if.c ifeq ($(HAVE_LIGHTREC),1) COREFLAGS += -DLIGHTREC -DLIGHTREC_STATIC @@ -164,7 +169,6 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) $(NEON_DIR)/psx_gpu/psx_gpu_arm_neon.S \ $(FRONTEND_DIR)/cspace_neon.S SOURCES_C += $(NEON_DIR)/psx_gpu_if.c - SOURCES_C += $(DYNAREC_DIR)/backends/psx/emu_if.c else ifeq ($(TARGET_ARCH_ABI),armeabi) COREFLAGS += -DUSE_GPULIB=1 -DGPU_UNAI SOURCES_ASM += $(UNAI_DIR)/gpu_arm.S \ diff --git a/libpcsxcore/database.c b/libpcsxcore/database.c new file mode 100644 index 00000000..61312e09 --- /dev/null +++ b/libpcsxcore/database.c @@ -0,0 +1,47 @@ +#include "misc.h" +#include "sio.h" +#include "new_dynarec/new_dynarec.h" + +/* It's duplicated from emu_if.c */ +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) + +static const char MemorycardHack_db[8][10] = +{ + /* Lifeforce Tenka, also known as Codename Tenka */ + {"SLES00613"}, + {"SLED00690"}, + {"SLES00614"}, + {"SLES00615"}, + {"SLES00616"}, + {"SLES00617"}, + {"SCUS94409"} +}; + +/* Function for automatic patching according to GameID. */ +void Apply_Hacks_Cdrom() +{ + uint32_t i; + + /* Apply Memory card hack for Codename Tenka. 
(The game needs one of the memory card slots to be empty) */ + for(i=0;iCP2C.p[26].sw.l) -#define gteH (regs->CP2C.p[26].w.l) +//#define gteH (psxRegs.CP2C.p[26].sw.l) +#define gteH (psxRegs.CP2C.p[26].w.l) #define gteDQA (regs->CP2C.p[27].sw.l) #define gteDQB (((s32 *)regs->CP2C.r)[28]) #define gteZSF3 (regs->CP2C.p[29].sw.l) @@ -260,6 +260,7 @@ static inline u32 limE_(psxCP2Regs *regs, u32 result) { #define A3U(x) (x) #endif + //senquack - n param should be unsigned (will be 'gteH' reg which is u16) #ifdef GTE_USE_NATIVE_DIVIDE INLINE u32 DIVIDE(u16 n, u16 d) { @@ -274,6 +275,32 @@ INLINE u32 DIVIDE(u16 n, u16 d) { #ifndef FLAGLESS +const unsigned char gte_cycletab[64] = { + /* 1 2 3 4 5 6 7 8 9 a b c d e f */ + 0, 15, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 6, 0, 0, 0, + 8, 8, 8, 19, 13, 0, 44, 0, 0, 0, 0, 17, 11, 0, 14, 0, + 30, 0, 0, 0, 0, 0, 0, 0, 5, 8, 17, 0, 0, 5, 6, 0, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 39, +}; + +// warning: called by the dynarec +int gteCheckStallRaw(u32 op_cycles, psxRegisters *regs) { + u32 left = regs->gteBusyCycle - regs->cycle; + int stall = 0; + + if (left <= 44) { + //printf("c %2u stall %2u %u\n", op_cycles, left, regs->cycle); + regs->cycle = regs->gteBusyCycle; + stall = left; + } + regs->gteBusyCycle = regs->cycle + op_cycles; + return stall; +} + +void gteCheckStall(u32 op) { + gteCheckStallRaw(gte_cycletab[op], &psxRegs); +} + u32 MFC2(int reg) { psxCP2Regs *regs = &psxRegs.CP2; switch (reg) { @@ -321,9 +348,10 @@ void MTC2(u32 value, int reg) { case 28: gteIRGB = value; - gteIR1 = (value & 0x1f) << 7; - gteIR2 = (value & 0x3e0) << 2; - gteIR3 = (value & 0x7c00) >> 3; + // not gteIR1 etc. 
just to be consistent with dynarec + regs->CP2D.n.ir1 = (value & 0x1f) << 7; + regs->CP2D.n.ir2 = (value & 0x3e0) << 2; + regs->CP2D.n.ir3 = (value & 0x7c00) >> 3; break; case 30: @@ -377,13 +405,11 @@ void CTC2(u32 value, int reg) { } void gteMFC2() { - psxRegs.cycle += 1; if (!_Rt_) return; psxRegs.GPR.r[_Rt_] = MFC2(_Rd_); } void gteCFC2() { - psxRegs.cycle += 1; if (!_Rt_) return; psxRegs.GPR.r[_Rt_] = psxRegs.CP2C.r[_Rd_]; } @@ -403,10 +429,19 @@ void gteLWC2() { } void gteSWC2() { - //psxRegs.cycle += 1; psxMemWrite32(_oB_, MFC2(_Rt_)); } +void gteLWC2_stall() { + gteCheckStall(0); + gteLWC2(); +} + +void gteSWC2_stall() { + gteCheckStall(0); + gteSWC2(); +} + #endif // FLAGLESS #if 0 @@ -428,7 +463,6 @@ void gteRTPS(psxCP2Regs *regs) { #ifdef GTE_LOG GTE_LOG("GTE RTPS\n"); #endif - psxRegs.cycle += 15; gteFLAG = 0; gteMAC1 = A1((((s64)gteTRX << 12) + (gteR11 * gteVX0) + (gteR12 * gteVY0) + (gteR13 * gteVZ0)) >> 12); @@ -461,7 +495,6 @@ void gteRTPT(psxCP2Regs *regs) { #ifdef GTE_LOG GTE_LOG("GTE RTPT\n"); #endif - psxRegs.cycle += 23; gteFLAG = 0; gteSZ0 = gteSZ3; @@ -500,7 +533,6 @@ void gteMVMVA(psxCP2Regs *regs) { GTE_LOG("GTE MVMVA\n"); #endif gteFLAG = 0; - psxRegs.cycle += 8; gteMAC1 = A1((((s64)CV1(cv) << 12) + (MX11(mx) * vx) + (MX12(mx) * vy) + (MX13(mx) * vz)) >> shift); gteMAC2 = A2((((s64)CV2(cv) << 12) + (MX21(mx) * vx) + (MX22(mx) * vy) + (MX23(mx) * vz)) >> shift); @@ -516,7 +548,6 @@ void gteNCLIP(psxCP2Regs *regs) { GTE_LOG("GTE NCLIP\n"); #endif gteFLAG = 0; - psxRegs.cycle += 8; gteMAC0 = F((s64)gteSX0 * (gteSY1 - gteSY2) + gteSX1 * (gteSY2 - gteSY0) + @@ -528,7 +559,6 @@ void gteAVSZ3(psxCP2Regs *regs) { GTE_LOG("GTE AVSZ3\n"); #endif gteFLAG = 0; - psxRegs.cycle += 5; gteMAC0 = F((s64)gteZSF3 * (gteSZ1 + gteSZ2 + gteSZ3)); gteOTZ = limD(gteMAC0 >> 12); @@ -539,7 +569,6 @@ void gteAVSZ4(psxCP2Regs *regs) { GTE_LOG("GTE AVSZ4\n"); #endif gteFLAG = 0; - psxRegs.cycle += 6; gteMAC0 = F((s64)gteZSF4 * (gteSZ0 + gteSZ1 + gteSZ2 + gteSZ3)); 
gteOTZ = limD(gteMAC0 >> 12); @@ -553,7 +582,6 @@ void gteSQR(psxCP2Regs *regs) { GTE_LOG("GTE SQR\n"); #endif gteFLAG = 0; - psxRegs.cycle += 5; gteMAC1 = (gteIR1 * gteIR1) >> shift; gteMAC2 = (gteIR2 * gteIR2) >> shift; @@ -568,7 +596,6 @@ void gteNCCS(psxCP2Regs *regs) { GTE_LOG("GTE NCCS\n"); #endif gteFLAG = 0; - psxRegs.cycle += 17; gteMAC1 = ((s64)(gteL11 * gteVX0) + (gteL12 * gteVY0) + (gteL13 * gteVZ0)) >> 12; gteMAC2 = ((s64)(gteL21 * gteVX0) + (gteL22 * gteVY0) + (gteL23 * gteVZ0)) >> 12; @@ -605,7 +632,6 @@ void gteNCCT(psxCP2Regs *regs) { GTE_LOG("GTE NCCT\n"); #endif gteFLAG = 0; - psxRegs.cycle += 39; for (v = 0; v < 3; v++) { vx = VX(v); @@ -644,7 +670,6 @@ void gteNCDS(psxCP2Regs *regs) { GTE_LOG("GTE NCDS\n"); #endif gteFLAG = 0; - psxRegs.cycle += 19; gteMAC1 = ((s64)(gteL11 * gteVX0) + (gteL12 * gteVY0) + (gteL13 * gteVZ0)) >> 12; gteMAC2 = ((s64)(gteL21 * gteVX0) + (gteL22 * gteVY0) + (gteL23 * gteVZ0)) >> 12; @@ -681,7 +706,6 @@ void gteNCDT(psxCP2Regs *regs) { GTE_LOG("GTE NCDT\n"); #endif gteFLAG = 0; - psxRegs.cycle += 44; for (v = 0; v < 3; v++) { vx = VX(v); @@ -723,7 +747,6 @@ void gteOP(psxCP2Regs *regs) { GTE_LOG("GTE OP\n"); #endif gteFLAG = 0; - psxRegs.cycle += 6; gteMAC1 = ((gteR22 * gteIR3) - (gteR33 * gteIR2)) >> shift; gteMAC2 = ((gteR33 * gteIR1) - (gteR11 * gteIR3)) >> shift; @@ -744,7 +767,6 @@ void gteDCPL(psxCP2Regs *regs) { GTE_LOG("GTE DCPL\n"); #endif gteFLAG = 0; - psxRegs.cycle += 8; gteMAC1 = RIR1 + ((gteIR0 * limB1(A1U((s64)gteRFC - RIR1), 0)) >> 12); gteMAC2 = GIR2 + ((gteIR0 * limB1(A2U((s64)gteGFC - GIR2), 0)) >> 12); @@ -769,7 +791,6 @@ void gteGPF(psxCP2Regs *regs) { GTE_LOG("GTE GPF\n"); #endif gteFLAG = 0; - psxRegs.cycle += 5; gteMAC1 = (gteIR0 * gteIR1) >> shift; gteMAC2 = (gteIR0 * gteIR2) >> shift; @@ -793,7 +814,6 @@ void gteGPL(psxCP2Regs *regs) { GTE_LOG("GTE GPL\n"); #endif gteFLAG = 0; - psxRegs.cycle += 5; gteMAC1 = A1((((s64)gteMAC1 << shift) + (gteIR0 * gteIR1)) >> shift); gteMAC2 = 
A2((((s64)gteMAC2 << shift) + (gteIR0 * gteIR2)) >> shift); @@ -817,7 +837,6 @@ void gteDPCS(psxCP2Regs *regs) { GTE_LOG("GTE DPCS\n"); #endif gteFLAG = 0; - psxRegs.cycle += 8; gteMAC1 = ((gteR << 16) + (gteIR0 * limB1(A1U(((s64)gteRFC - (gteR << 4)) << (12 - shift)), 0))) >> 12; gteMAC2 = ((gteG << 16) + (gteIR0 * limB2(A2U(((s64)gteGFC - (gteG << 4)) << (12 - shift)), 0))) >> 12; @@ -841,7 +860,6 @@ void gteDPCT(psxCP2Regs *regs) { GTE_LOG("GTE DPCT\n"); #endif gteFLAG = 0; - psxRegs.cycle += 17; for (v = 0; v < 3; v++) { gteMAC1 = ((gteR0 << 16) + (gteIR0 * limB1(A1U((s64)gteRFC - (gteR0 << 4)), 0))) >> 12; @@ -865,7 +883,6 @@ void gteNCS(psxCP2Regs *regs) { GTE_LOG("GTE NCS\n"); #endif gteFLAG = 0; - psxRegs.cycle += 14; gteMAC1 = ((s64)(gteL11 * gteVX0) + (gteL12 * gteVY0) + (gteL13 * gteVZ0)) >> 12; gteMAC2 = ((s64)(gteL21 * gteVX0) + (gteL22 * gteVY0) + (gteL23 * gteVZ0)) >> 12; @@ -896,7 +913,6 @@ void gteNCT(psxCP2Regs *regs) { GTE_LOG("GTE NCT\n"); #endif gteFLAG = 0; - psxRegs.cycle += 30; for (v = 0; v < 3; v++) { vx = VX(v); @@ -928,7 +944,6 @@ void gteCC(psxCP2Regs *regs) { GTE_LOG("GTE CC\n"); #endif gteFLAG = 0; - psxRegs.cycle += 11; gteMAC1 = A1((((s64)gteRBK << 12) + (gteLR1 * gteIR1) + (gteLR2 * gteIR2) + (gteLR3 * gteIR3)) >> 12); gteMAC2 = A2((((s64)gteGBK << 12) + (gteLG1 * gteIR1) + (gteLG2 * gteIR2) + (gteLG3 * gteIR3)) >> 12); @@ -959,7 +974,6 @@ void gteINTPL(psxCP2Regs *regs) { GTE_LOG("GTE INTPL\n"); #endif gteFLAG = 0; - psxRegs.cycle += 8; gteMAC1 = ((gteIR1 << 12) + (gteIR0 * limB1(A1U((s64)gteRFC - gteIR1), 0))) >> shift; gteMAC2 = ((gteIR2 << 12) + (gteIR0 * limB2(A2U((s64)gteGFC - gteIR2), 0))) >> shift; @@ -980,7 +994,6 @@ void gteCDP(psxCP2Regs *regs) { GTE_LOG("GTE CDP\n"); #endif gteFLAG = 0; - psxRegs.cycle += 13; gteMAC1 = A1((((s64)gteRBK << 12) + (gteLR1 * gteIR1) + (gteLR2 * gteIR2) + (gteLR3 * gteIR3)) >> 12); gteMAC2 = A2((((s64)gteGBK << 12) + (gteLG1 * gteIR1) + (gteLG2 * gteIR2) + (gteLG3 * gteIR3)) >> 12); diff 
--git a/libpcsxcore/gte.h b/libpcsxcore/gte.h index 8bc6988d..8f133f51 100644 --- a/libpcsxcore/gte.h +++ b/libpcsxcore/gte.h @@ -67,6 +67,12 @@ extern "C" { struct psxCP2Regs; +extern const unsigned char gte_cycletab[64]; + +int gteCheckStallRaw(u32 op_cycles, psxRegisters *regs); +void gteCheckStall(u32 op); + +// for lightrec u32 MFC2(int reg); void MTC2(u32 value, int reg); void CTC2(u32 value, int reg); @@ -77,6 +83,8 @@ void gteMTC2(); void gteCTC2(); void gteLWC2(); void gteSWC2(); +void gteLWC2_stall(); +void gteSWC2_stall(); void gteRTPS(struct psxCP2Regs *regs); void gteOP(struct psxCP2Regs *regs); diff --git a/libpcsxcore/gte_neon.S b/libpcsxcore/gte_neon.S index 60065f8a..2799caaa 100644 --- a/libpcsxcore/gte_neon.S +++ b/libpcsxcore/gte_neon.S @@ -6,7 +6,7 @@ */ #include "arm_features.h" -#include "new_dynarec/arm/linkage_offsets.h" +#include "new_dynarec/linkage_offsets.h" .syntax unified .text diff --git a/libpcsxcore/lightrec/plugin.c b/libpcsxcore/lightrec/plugin.c index 3e68a9ca..bb4138b9 100644 --- a/libpcsxcore/lightrec/plugin.c +++ b/libpcsxcore/lightrec/plugin.c @@ -37,6 +37,9 @@ # define unlikely(x) (x) #endif +psxRegisters psxRegs; +Rcnt rcnts[4]; + static struct lightrec_state *lightrec_state; static char *name = "retroarch.exe"; @@ -47,18 +50,6 @@ static bool lightrec_debug; static bool lightrec_very_debug; static u32 lightrec_begin_cycles; -int stop; -u32 cycle_multiplier; -int new_dynarec_hacks; - -/* Unused for now */ -u32 event_cycles[PSXINT_COUNT]; -u32 next_interupt; - -void new_dyna_before_save() {} -void new_dyna_after_save() {} -void new_dyna_freeze(void *f, int i) {} - enum my_cp2_opcodes { OP_CP2_RTPS = 0x01, OP_CP2_NCLIP = 0x06, @@ -578,7 +569,6 @@ static void lightrec_plugin_clear(u32 addr, u32 size) lightrec_invalidate(lightrec_state, addr, size * 4); } -#ifdef ICACHE_EMULATION static void lightrec_plugin_notify(int note, void *data) { /* @@ -595,7 +585,10 @@ static void lightrec_plugin_notify(int note, void *data) break; }*/ 
} -#endif + +static void lightrec_plugin_apply_config() +{ +} static void lightrec_plugin_shutdown(void) { @@ -615,8 +608,7 @@ R3000Acpu psxRec = lightrec_plugin_execute, lightrec_plugin_execute_block, lightrec_plugin_clear, -#ifdef ICACHE_EMULATION lightrec_plugin_notify, -#endif + lightrec_plugin_apply_config, lightrec_plugin_shutdown, }; diff --git a/libpcsxcore/misc.c b/libpcsxcore/misc.c index 8911bac3..be501a2c 100644 --- a/libpcsxcore/misc.c +++ b/libpcsxcore/misc.c @@ -21,11 +21,13 @@ * Miscellaneous functions, including savestates and CD-ROM loading. */ +#include #include "misc.h" #include "cdrom.h" #include "mdec.h" #include "gpu.h" #include "ppf.h" +#include "database.h" #include char CdromId[10] = ""; @@ -388,17 +390,25 @@ int CheckCdrom() { strcpy(CdromId, "SLUS99999"); if (Config.PsxAuto) { // autodetect system (pal or ntsc) - if (CdromId[2] == 'e' || CdromId[2] == 'E') + if ( + /* Make sure Wild Arms SCUS-94608 is not detected as a PAL game. */ + ((CdromId[0] == 's' || CdromId[0] == 'S') && (CdromId[2] == 'e' || CdromId[2] == 'E')) || + !strncmp(CdromId, "DTLS3035", 8) || + !strncmp(CdromId, "PBPX95001", 9) || // according to redump.org, these PAL + !strncmp(CdromId, "PBPX95007", 9) || // discs have a non-standard ID; + !strncmp(CdromId, "PBPX95008", 9)) // add more serials if they are discovered. 
Config.PsxType = PSX_TYPE_PAL; // pal else Config.PsxType = PSX_TYPE_NTSC; // ntsc } if (CdromLabel[0] == ' ') { - memcpy(CdromLabel, CdromId, 9); + strncpy(CdromLabel, CdromId, 9); } SysPrintf(_("CD-ROM Label: %.32s\n"), CdromLabel); SysPrintf(_("CD-ROM ID: %.9s\n"), CdromId); SysPrintf(_("CD-ROM EXE Name: %.255s\n"), exename); + + Apply_Hacks_Cdrom(); BuildPPFCache(); @@ -621,7 +631,8 @@ int SaveState(const char *file) { SaveFuncs.write(f, psxM, 0x00200000); SaveFuncs.write(f, psxR, 0x00080000); SaveFuncs.write(f, psxH, 0x00010000); - SaveFuncs.write(f, (void *)&psxRegs, sizeof(psxRegs)); + // only partial save of psxRegisters to maintain savestate compat + SaveFuncs.write(f, &psxRegs, offsetof(psxRegisters, gteBusyCycle)); // gpu gpufP = (GPUFreeze_t *)malloc(sizeof(GPUFreeze_t)); @@ -690,7 +701,8 @@ int LoadState(const char *file) { SaveFuncs.read(f, psxM, 0x00200000); SaveFuncs.read(f, psxR, 0x00080000); SaveFuncs.read(f, psxH, 0x00010000); - SaveFuncs.read(f, (void *)&psxRegs, sizeof(psxRegs)); + SaveFuncs.read(f, &psxRegs, offsetof(psxRegisters, gteBusyCycle)); + psxRegs.gteBusyCycle = psxRegs.cycle; if (Config.HLE) psxBiosFreeze(0); @@ -777,7 +789,7 @@ int RecvPcsxInfo() { NET_recvData(&Config.Cpu, sizeof(Config.Cpu), PSE_NET_BLOCKING); if (tmp != Config.Cpu) { psxCpu->Shutdown(); -#if defined(NEW_DYNAREC) || defined(LIGHTREC) +#ifndef DRC_DISABLE if (Config.Cpu == CPU_INTERPRETER) psxCpu = &psxInt; else psxCpu = &psxRec; #else diff --git a/libpcsxcore/new_dynarec/arm/assem_arm.c b/libpcsxcore/new_dynarec/arm/assem_arm.c deleted file mode 100644 index a373bd33..00000000 --- a/libpcsxcore/new_dynarec/arm/assem_arm.c +++ /dev/null @@ -1,4157 +0,0 @@ -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * - * Mupen64plus/PCSX - assem_arm.c * - * Copyright (C) 2009-2011 Ari64 * - * Copyright (C) 2010-2011 Gražvydas "notaz" Ignotas * - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of 
the GNU General Public License as published by * - * the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License * - * along with this program; if not, write to the * - * Free Software Foundation, Inc., * - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * - * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - -#include "../../gte.h" -#define FLAGLESS -#include "../../gte.h" -#undef FLAGLESS -#include "../../gte_arm.h" -#include "../../gte_neon.h" -#include "pcnt.h" -#include "arm_features.h" - -#if defined(BASE_ADDR_FIXED) -#elif defined(BASE_ADDR_DYNAMIC) -char *translation_cache; -#else -char translation_cache[1 << TARGET_SIZE_2] __attribute__((aligned(4096))); -#endif - -#ifndef __MACH__ -#define CALLER_SAVE_REGS 0x100f -#else -#define CALLER_SAVE_REGS 0x120f -#endif - -#define unused __attribute__((unused)) - -extern int cycle_count; -extern int last_count; -extern int pcaddr; -extern int pending_exception; -extern int branch_target; -extern uint64_t readmem_dword; -extern void *dynarec_local; -extern u_int mini_ht[32][2]; - -void indirect_jump_indexed(); -void indirect_jump(); -void do_interrupt(); -void jump_vaddr_r0(); -void jump_vaddr_r1(); -void jump_vaddr_r2(); -void jump_vaddr_r3(); -void jump_vaddr_r4(); -void jump_vaddr_r5(); -void jump_vaddr_r6(); -void jump_vaddr_r7(); -void jump_vaddr_r8(); -void jump_vaddr_r9(); -void jump_vaddr_r10(); -void jump_vaddr_r12(); - -const u_int jump_vaddr_reg[16] = { - (int)jump_vaddr_r0, - (int)jump_vaddr_r1, - (int)jump_vaddr_r2, - (int)jump_vaddr_r3, - (int)jump_vaddr_r4, - (int)jump_vaddr_r5, 
- (int)jump_vaddr_r6, - (int)jump_vaddr_r7, - (int)jump_vaddr_r8, - (int)jump_vaddr_r9, - (int)jump_vaddr_r10, - 0, - (int)jump_vaddr_r12, - 0, - 0, - 0}; - -void invalidate_addr_r0(); -void invalidate_addr_r1(); -void invalidate_addr_r2(); -void invalidate_addr_r3(); -void invalidate_addr_r4(); -void invalidate_addr_r5(); -void invalidate_addr_r6(); -void invalidate_addr_r7(); -void invalidate_addr_r8(); -void invalidate_addr_r9(); -void invalidate_addr_r10(); -void invalidate_addr_r12(); - -const u_int invalidate_addr_reg[16] = { - (int)invalidate_addr_r0, - (int)invalidate_addr_r1, - (int)invalidate_addr_r2, - (int)invalidate_addr_r3, - (int)invalidate_addr_r4, - (int)invalidate_addr_r5, - (int)invalidate_addr_r6, - (int)invalidate_addr_r7, - (int)invalidate_addr_r8, - (int)invalidate_addr_r9, - (int)invalidate_addr_r10, - 0, - (int)invalidate_addr_r12, - 0, - 0, - 0}; - -static u_int needs_clear_cache[1<<(TARGET_SIZE_2-17)]; - -/* Linker */ - -static void set_jump_target(int addr,u_int target) -{ - u_char *ptr=(u_char *)addr; - u_int *ptr2=(u_int *)ptr; - if(ptr[3]==0xe2) { - assert((target-(u_int)ptr2-8)<1024); - assert((addr&3)==0); - assert((target&3)==0); - *ptr2=(*ptr2&0xFFFFF000)|((target-(u_int)ptr2-8)>>2)|0xF00; - //printf("target=%x addr=%x insn=%x\n",target,addr,*ptr2); - } - else if(ptr[3]==0x72) { - // generated by emit_jno_unlikely - if((target-(u_int)ptr2-8)<1024) { - assert((addr&3)==0); - assert((target&3)==0); - *ptr2=(*ptr2&0xFFFFF000)|((target-(u_int)ptr2-8)>>2)|0xF00; - } - else if((target-(u_int)ptr2-8)<4096&&!((target-(u_int)ptr2-8)&15)) { - assert((addr&3)==0); - assert((target&3)==0); - *ptr2=(*ptr2&0xFFFFF000)|((target-(u_int)ptr2-8)>>4)|0xE00; - } - else *ptr2=(0x7A000000)|(((target-(u_int)ptr2-8)<<6)>>8); - } - else { - assert((ptr[3]&0x0e)==0xa); - *ptr2=(*ptr2&0xFF000000)|(((target-(u_int)ptr2-8)<<6)>>8); - } -} - -// This optionally copies the instruction from the target of the branch into -// the space before the branch. 
Works, but the difference in speed is -// usually insignificant. -#if 0 -static void set_jump_target_fillslot(int addr,u_int target,int copy) -{ - u_char *ptr=(u_char *)addr; - u_int *ptr2=(u_int *)ptr; - assert(!copy||ptr2[-1]==0xe28dd000); - if(ptr[3]==0xe2) { - assert(!copy); - assert((target-(u_int)ptr2-8)<4096); - *ptr2=(*ptr2&0xFFFFF000)|(target-(u_int)ptr2-8); - } - else { - assert((ptr[3]&0x0e)==0xa); - u_int target_insn=*(u_int *)target; - if((target_insn&0x0e100000)==0) { // ALU, no immediate, no flags - copy=0; - } - if((target_insn&0x0c100000)==0x04100000) { // Load - copy=0; - } - if(target_insn&0x08000000) { - copy=0; - } - if(copy) { - ptr2[-1]=target_insn; - target+=4; - } - *ptr2=(*ptr2&0xFF000000)|(((target-(u_int)ptr2-8)<<6)>>8); - } -} -#endif - -/* Literal pool */ -static void add_literal(int addr,int val) -{ - assert(literalcount>6)+8; -} - -// Find the "clean" entry point from a "dirty" entry point -// by skipping past the call to verify_code -static u_int get_clean_addr(int addr) -{ - int *ptr=(int *)addr; - #ifndef HAVE_ARMV7 - ptr+=4; - #else - ptr+=6; - #endif - if((*ptr&0xFF000000)!=0xeb000000) ptr++; - assert((*ptr&0xFF000000)==0xeb000000); // bl instruction - ptr++; - if((*ptr&0xFF000000)==0xea000000) { - return (int)ptr+((*ptr<<8)>>6)+8; // follow jump - } - return (u_int)ptr; -} - -static int verify_dirty(u_int *ptr) -{ - #ifndef HAVE_ARMV7 - u_int offset; - // get from literal pool - assert((*ptr&0xFFFF0000)==0xe59f0000); - offset=*ptr&0xfff; - u_int source=*(u_int*)((void *)ptr+offset+8); - ptr++; - assert((*ptr&0xFFFF0000)==0xe59f0000); - offset=*ptr&0xfff; - u_int copy=*(u_int*)((void *)ptr+offset+8); - ptr++; - assert((*ptr&0xFFFF0000)==0xe59f0000); - offset=*ptr&0xfff; - u_int len=*(u_int*)((void *)ptr+offset+8); - ptr++; - ptr++; - #else - // ARMv7 movw/movt - assert((*ptr&0xFFF00000)==0xe3000000); - u_int source=(ptr[0]&0xFFF)+((ptr[0]>>4)&0xF000)+((ptr[2]<<16)&0xFFF0000)+((ptr[2]<<12)&0xF0000000); - u_int 
copy=(ptr[1]&0xFFF)+((ptr[1]>>4)&0xF000)+((ptr[3]<<16)&0xFFF0000)+((ptr[3]<<12)&0xF0000000); - u_int len=(ptr[4]&0xFFF)+((ptr[4]>>4)&0xF000); - ptr+=6; - #endif - if((*ptr&0xFF000000)!=0xeb000000) ptr++; - assert((*ptr&0xFF000000)==0xeb000000); // bl instruction - //printf("verify_dirty: %x %x %x\n",source,copy,len); - return !memcmp((void *)source,(void *)copy,len); -} - -// This doesn't necessarily find all clean entry points, just -// guarantees that it's not dirty -static int isclean(int addr) -{ - #ifndef HAVE_ARMV7 - u_int *ptr=((u_int *)addr)+4; - #else - u_int *ptr=((u_int *)addr)+6; - #endif - if((*ptr&0xFF000000)!=0xeb000000) ptr++; - if((*ptr&0xFF000000)!=0xeb000000) return 1; // bl instruction - if((int)ptr+((*ptr<<8)>>6)+8==(int)verify_code) return 0; - if((int)ptr+((*ptr<<8)>>6)+8==(int)verify_code_vm) return 0; - if((int)ptr+((*ptr<<8)>>6)+8==(int)verify_code_ds) return 0; - return 1; -} - -// get source that block at addr was compiled from (host pointers) -static void get_bounds(int addr,u_int *start,u_int *end) -{ - u_int *ptr=(u_int *)addr; - #ifndef HAVE_ARMV7 - u_int offset; - // get from literal pool - assert((*ptr&0xFFFF0000)==0xe59f0000); - offset=*ptr&0xfff; - u_int source=*(u_int*)((void *)ptr+offset+8); - ptr++; - //assert((*ptr&0xFFFF0000)==0xe59f0000); - //offset=*ptr&0xfff; - //u_int copy=*(u_int*)((void *)ptr+offset+8); - ptr++; - assert((*ptr&0xFFFF0000)==0xe59f0000); - offset=*ptr&0xfff; - u_int len=*(u_int*)((void *)ptr+offset+8); - ptr++; - ptr++; - #else - // ARMv7 movw/movt - assert((*ptr&0xFFF00000)==0xe3000000); - u_int source=(ptr[0]&0xFFF)+((ptr[0]>>4)&0xF000)+((ptr[2]<<16)&0xFFF0000)+((ptr[2]<<12)&0xF0000000); - //u_int copy=(ptr[1]&0xFFF)+((ptr[1]>>4)&0xF000)+((ptr[3]<<16)&0xFFF0000)+((ptr[3]<<12)&0xF0000000); - u_int len=(ptr[4]&0xFFF)+((ptr[4]>>4)&0xF000); - ptr+=6; - #endif - if((*ptr&0xFF000000)!=0xeb000000) ptr++; - assert((*ptr&0xFF000000)==0xeb000000); // bl instruction - *start=source; - *end=source+len; -} - -/* 
Register allocation */ - -// Note: registers are allocated clean (unmodified state) -// if you intend to modify the register, you must call dirty_reg(). -static void alloc_reg(struct regstat *cur,int i,signed char reg) -{ - int r,hr; - int preferred_reg = (reg&7); - if(reg==CCREG) preferred_reg=HOST_CCREG; - if(reg==PTEMP||reg==FTEMP) preferred_reg=12; - - // Don't allocate unused registers - if((cur->u>>reg)&1) return; - - // see if it's already allocated - for(hr=0;hrregmap[hr]==reg) return; - } - - // Keep the same mapping if the register was already allocated in a loop - preferred_reg = loop_reg(i,reg,preferred_reg); - - // Try to allocate the preferred register - if(cur->regmap[preferred_reg]==-1) { - cur->regmap[preferred_reg]=reg; - cur->dirty&=~(1<isconst&=~(1<regmap[preferred_reg]; - if(r<64&&((cur->u>>r)&1)) { - cur->regmap[preferred_reg]=reg; - cur->dirty&=~(1<isconst&=~(1<=64&&((cur->uu>>(r&63))&1)) { - cur->regmap[preferred_reg]=reg; - cur->dirty&=~(1<isconst&=~(1<regmap[hr]; - if(r>=0) { - if(r<64) { - if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;} - } - else - { - if((cur->uu>>(r&63))&1) {cur->regmap[hr]=-1;break;} - } - } - } - // Try to allocate any available register, but prefer - // registers that have not been used recently. - if(i>0) { - for(hr=0;hrregmap[hr]==-1) { - if(regs[i-1].regmap[hr]!=rs1[i-1]&®s[i-1].regmap[hr]!=rs2[i-1]&®s[i-1].regmap[hr]!=rt1[i-1]&®s[i-1].regmap[hr]!=rt2[i-1]) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<regmap[hr]==-1) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]); - //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]); - if(i>0) { - // Don't evict the cycle count at entry points, otherwise the entry - // stub will have to write it. 
- if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2; - if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2; - for(j=10;j>=3;j--) - { - // Alloc preferred register if available - if(hsn[r=cur->regmap[preferred_reg]&63]==j) { - for(hr=0;hrregmap[hr]&63)==r) { - cur->regmap[hr]=-1; - cur->dirty&=~(1<isconst&=~(1<regmap[preferred_reg]=reg; - return; - } - for(r=1;r<=MAXREG;r++) - { - if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { - for(hr=0;hrregmap[hr]==r+64) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<regmap[hr]==r) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<=0;j--) - { - for(r=1;r<=MAXREG;r++) - { - if(hsn[r]==j) { - for(hr=0;hrregmap[hr]==r+64) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<regmap[hr]==r) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<uu>>reg)&1) return; - - // see if the upper half is already allocated - for(hr=0;hrregmap[hr]==reg+64) return; - } - - // Keep the same mapping if the register was already allocated in a loop - preferred_reg = loop_reg(i,reg,preferred_reg); - - // Try to allocate the preferred register - if(cur->regmap[preferred_reg]==-1) { - cur->regmap[preferred_reg]=reg|64; - cur->dirty&=~(1<isconst&=~(1<regmap[preferred_reg]; - if(r<64&&((cur->u>>r)&1)) { - cur->regmap[preferred_reg]=reg|64; - cur->dirty&=~(1<isconst&=~(1<=64&&((cur->uu>>(r&63))&1)) { - cur->regmap[preferred_reg]=reg|64; - cur->dirty&=~(1<isconst&=~(1<=0;hr--) - { - r=cur->regmap[hr]; - if(r>=0) { - if(r<64) { - if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;} - } - else - { - if((cur->uu>>(r&63))&1) {cur->regmap[hr]=-1;break;} - } - } - } - // Try to allocate any available register, but prefer - // registers that have not been used recently. 
- if(i>0) { - for(hr=0;hrregmap[hr]==-1) { - if(regs[i-1].regmap[hr]!=rs1[i-1]&®s[i-1].regmap[hr]!=rs2[i-1]&®s[i-1].regmap[hr]!=rt1[i-1]&®s[i-1].regmap[hr]!=rt2[i-1]) { - cur->regmap[hr]=reg|64; - cur->dirty&=~(1<isconst&=~(1<regmap[hr]==-1) { - cur->regmap[hr]=reg|64; - cur->dirty&=~(1<isconst&=~(1<regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]); - //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]); - if(i>0) { - // Don't evict the cycle count at entry points, otherwise the entry - // stub will have to write it. - if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2; - if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2; - for(j=10;j>=3;j--) - { - // Alloc preferred register if available - if(hsn[r=cur->regmap[preferred_reg]&63]==j) { - for(hr=0;hrregmap[hr]&63)==r) { - cur->regmap[hr]=-1; - cur->dirty&=~(1<isconst&=~(1<regmap[preferred_reg]=reg|64; - return; - } - for(r=1;r<=MAXREG;r++) - { - if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { - for(hr=0;hrregmap[hr]==r+64) { - cur->regmap[hr]=reg|64; - cur->dirty&=~(1<isconst&=~(1<regmap[hr]==r) { - cur->regmap[hr]=reg|64; - cur->dirty&=~(1<isconst&=~(1<=0;j--) - { - for(r=1;r<=MAXREG;r++) - { - if(hsn[r]==j) { - for(hr=0;hrregmap[hr]==r+64) { - cur->regmap[hr]=reg|64; - cur->dirty&=~(1<isconst&=~(1<regmap[hr]==r) { - cur->regmap[hr]=reg|64; - cur->dirty&=~(1<isconst&=~(1<regmap[hr]==reg) return; - } - - // Try to allocate any available register - for(hr=HOST_REGS-1;hr>=0;hr--) { - if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<=0;hr--) - { - r=cur->regmap[hr]; - if(r>=0) { - if(r<64) { - if((cur->u>>r)&1) { - if(i==0||((unneeded_reg[i-1]>>r)&1)) { - cur->regmap[hr]=reg; 
- cur->dirty&=~(1<isconst&=~(1<uu>>(r&63))&1) { - if(i==0||((unneeded_reg_upper[i-1]>>(r&63))&1)) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]); - if(i>0) { - // Don't evict the cycle count at entry points, otherwise the entry - // stub will have to write it. - if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2; - if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2; - for(j=10;j>=3;j--) - { - for(r=1;r<=MAXREG;r++) - { - if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { - for(hr=0;hr2) { - if(cur->regmap[hr]==r+64) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<2) { - if(cur->regmap[hr]==r) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<=0;j--) - { - for(r=1;r<=MAXREG;r++) - { - if(hsn[r]==j) { - for(hr=0;hrregmap[hr]==r+64) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<regmap[hr]==r) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<regmap[n]==reg) { - dirty=(cur->dirty>>n)&1; - cur->regmap[n]=-1; - } - } - - cur->regmap[hr]=reg; - cur->dirty&=~(1<dirty|=dirty<isconst&=~(1<0) - { - if(imm<256) { - *encoded=((i&30)<<7)|imm; - return 1; - } - imm=(imm>>2)|(imm<<30);i-=2; - } - return 0; -} - -static void genimm_checked(u_int imm,u_int *encoded) -{ - u_int ret=genimm(imm,encoded); - assert(ret); - (void)ret; -} - -static u_int genjmp(u_int addr) -{ - int offset=addr-(int)out-8; - if(offset<-33554432||offset>=33554432) { - if (addr>2) { - SysPrintf("genjmp: out of range: %08x\n", offset); - exit(1); - } - return 0; - } - return ((u_int)offset>>2)&0xffffff; -} - -static void emit_mov(int rs,int rt) -{ - assem_debug("mov %s,%s\n",regname[rt],regname[rs]); - output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)); -} - -static void emit_movs(int rs,int rt) -{ - assem_debug("movs 
%s,%s\n",regname[rt],regname[rs]); - output_w32(0xe1b00000|rd_rn_rm(rt,0,rs)); -} - -static void emit_add(int rs1,int rs2,int rt) -{ - assem_debug("add %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0xe0800000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_adds(int rs1,int rs2,int rt) -{ - assem_debug("adds %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0xe0900000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_adcs(int rs1,int rs2,int rt) -{ - assem_debug("adcs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0xe0b00000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_sbc(int rs1,int rs2,int rt) -{ - assem_debug("sbc %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0xe0c00000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_sbcs(int rs1,int rs2,int rt) -{ - assem_debug("sbcs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0xe0d00000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_neg(int rs, int rt) -{ - assem_debug("rsb %s,%s,#0\n",regname[rt],regname[rs]); - output_w32(0xe2600000|rd_rn_rm(rt,rs,0)); -} - -static void emit_negs(int rs, int rt) -{ - assem_debug("rsbs %s,%s,#0\n",regname[rt],regname[rs]); - output_w32(0xe2700000|rd_rn_rm(rt,rs,0)); -} - -static void emit_sub(int rs1,int rs2,int rt) -{ - assem_debug("sub %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0xe0400000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_subs(int rs1,int rs2,int rt) -{ - assem_debug("subs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0xe0500000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_zeroreg(int rt) -{ - assem_debug("mov %s,#0\n",regname[rt]); - output_w32(0xe3a00000|rd_rn_rm(rt,0,0)); -} - -static void emit_loadlp(u_int imm,u_int rt) -{ - add_literal((int)out,imm); - assem_debug("ldr %s,pc+? 
[=%x]\n",regname[rt],imm); - output_w32(0xe5900000|rd_rn_rm(rt,15,0)); -} - -static void emit_movw(u_int imm,u_int rt) -{ - assert(imm<65536); - assem_debug("movw %s,#%d (0x%x)\n",regname[rt],imm,imm); - output_w32(0xe3000000|rd_rn_rm(rt,0,0)|(imm&0xfff)|((imm<<4)&0xf0000)); -} - -static void emit_movt(u_int imm,u_int rt) -{ - assem_debug("movt %s,#%d (0x%x)\n",regname[rt],imm&0xffff0000,imm&0xffff0000); - output_w32(0xe3400000|rd_rn_rm(rt,0,0)|((imm>>16)&0xfff)|((imm>>12)&0xf0000)); -} - -static void emit_movimm(u_int imm,u_int rt) -{ - u_int armval; - if(genimm(imm,&armval)) { - assem_debug("mov %s,#%d\n",regname[rt],imm); - output_w32(0xe3a00000|rd_rn_rm(rt,0,0)|armval); - }else if(genimm(~imm,&armval)) { - assem_debug("mvn %s,#%d\n",regname[rt],imm); - output_w32(0xe3e00000|rd_rn_rm(rt,0,0)|armval); - }else if(imm<65536) { - #ifndef HAVE_ARMV7 - assem_debug("mov %s,#%d\n",regname[rt],imm&0xFF00); - output_w32(0xe3a00000|rd_rn_imm_shift(rt,0,imm>>8,8)); - assem_debug("add %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF); - output_w32(0xe2800000|rd_rn_imm_shift(rt,rt,imm&0xff,0)); - #else - emit_movw(imm,rt); - #endif - }else{ - #ifndef HAVE_ARMV7 - emit_loadlp(imm,rt); - #else - emit_movw(imm&0x0000FFFF,rt); - emit_movt(imm&0xFFFF0000,rt); - #endif - } -} - -static void emit_pcreladdr(u_int rt) -{ - assem_debug("add %s,pc,#?\n",regname[rt]); - output_w32(0xe2800000|rd_rn_rm(rt,15,0)); -} - -static void emit_loadreg(int r, int hr) -{ - if(r&64) { - SysPrintf("64bit load in 32bit mode!\n"); - assert(0); - return; - } - if((r&63)==0) - emit_zeroreg(hr); - else { - int addr=((int)reg)+((r&63)<>4); - if((r&63)==HIREG) addr=(int)&hi+((r&64)>>4); - if((r&63)==LOREG) addr=(int)&lo+((r&64)>>4); - if(r==CCREG) addr=(int)&cycle_count; - if(r==CSREG) addr=(int)&Status; - if(r==FSREG) addr=(int)&FCR31; - if(r==INVCP) addr=(int)&invc_ptr; - u_int offset = addr-(u_int)&dynarec_local; - assert(offset<4096); - assem_debug("ldr %s,fp+%d\n",regname[hr],offset); - 
output_w32(0xe5900000|rd_rn_rm(hr,FP,0)|offset); - } -} - -static void emit_storereg(int r, int hr) -{ - if(r&64) { - SysPrintf("64bit store in 32bit mode!\n"); - assert(0); - return; - } - int addr=((int)reg)+((r&63)<>4); - if((r&63)==HIREG) addr=(int)&hi+((r&64)>>4); - if((r&63)==LOREG) addr=(int)&lo+((r&64)>>4); - if(r==CCREG) addr=(int)&cycle_count; - if(r==FSREG) addr=(int)&FCR31; - u_int offset = addr-(u_int)&dynarec_local; - assert(offset<4096); - assem_debug("str %s,fp+%d\n",regname[hr],offset); - output_w32(0xe5800000|rd_rn_rm(hr,FP,0)|offset); -} - -static void emit_test(int rs, int rt) -{ - assem_debug("tst %s,%s\n",regname[rs],regname[rt]); - output_w32(0xe1100000|rd_rn_rm(0,rs,rt)); -} - -static void emit_testimm(int rs,int imm) -{ - u_int armval; - assem_debug("tst %s,#%d\n",regname[rs],imm); - genimm_checked(imm,&armval); - output_w32(0xe3100000|rd_rn_rm(0,rs,0)|armval); -} - -static void emit_testeqimm(int rs,int imm) -{ - u_int armval; - assem_debug("tsteq %s,$%d\n",regname[rs],imm); - genimm_checked(imm,&armval); - output_w32(0x03100000|rd_rn_rm(0,rs,0)|armval); -} - -static void emit_not(int rs,int rt) -{ - assem_debug("mvn %s,%s\n",regname[rt],regname[rs]); - output_w32(0xe1e00000|rd_rn_rm(rt,0,rs)); -} - -static void emit_mvnmi(int rs,int rt) -{ - assem_debug("mvnmi %s,%s\n",regname[rt],regname[rs]); - output_w32(0x41e00000|rd_rn_rm(rt,0,rs)); -} - -static void emit_and(u_int rs1,u_int rs2,u_int rt) -{ - assem_debug("and %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0xe0000000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_or(u_int rs1,u_int rs2,u_int rt) -{ - assem_debug("orr %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0xe1800000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_or_and_set_flags(int rs1,int rs2,int rt) -{ - assem_debug("orrs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0xe1900000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_orrshl_imm(u_int rs,u_int imm,u_int rt) -{ - 
assert(rs<16); - assert(rt<16); - assert(imm<32); - assem_debug("orr %s,%s,%s,lsl #%d\n",regname[rt],regname[rt],regname[rs],imm); - output_w32(0xe1800000|rd_rn_rm(rt,rt,rs)|(imm<<7)); -} - -static void emit_orrshr_imm(u_int rs,u_int imm,u_int rt) -{ - assert(rs<16); - assert(rt<16); - assert(imm<32); - assem_debug("orr %s,%s,%s,lsr #%d\n",regname[rt],regname[rt],regname[rs],imm); - output_w32(0xe1800020|rd_rn_rm(rt,rt,rs)|(imm<<7)); -} - -static void emit_xor(u_int rs1,u_int rs2,u_int rt) -{ - assem_debug("eor %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0xe0200000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_addimm(u_int rs,int imm,u_int rt) -{ - assert(rs<16); - assert(rt<16); - if(imm!=0) { - u_int armval; - if(genimm(imm,&armval)) { - assem_debug("add %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe2800000|rd_rn_rm(rt,rs,0)|armval); - }else if(genimm(-imm,&armval)) { - assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],-imm); - output_w32(0xe2400000|rd_rn_rm(rt,rs,0)|armval); - #ifdef HAVE_ARMV7 - }else if(rt!=rs&&(u_int)imm<65536) { - emit_movw(imm&0x0000ffff,rt); - emit_add(rs,rt,rt); - }else if(rt!=rs&&(u_int)-imm<65536) { - emit_movw(-imm&0x0000ffff,rt); - emit_sub(rs,rt,rt); - #endif - }else if((u_int)-imm<65536) { - assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],(-imm)&0xFF00); - assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF); - output_w32(0xe2400000|rd_rn_imm_shift(rt,rs,(-imm)>>8,8)); - output_w32(0xe2400000|rd_rn_imm_shift(rt,rt,(-imm)&0xff,0)); - }else { - do { - int shift = (ffs(imm) - 1) & ~1; - int imm8 = imm & (0xff << shift); - genimm_checked(imm8,&armval); - assem_debug("add %s,%s,#0x%x\n",regname[rt],regname[rs],imm8); - output_w32(0xe2800000|rd_rn_rm(rt,rs,0)|armval); - rs = rt; - imm &= ~imm8; - } - while (imm != 0); - } - } - else if(rs!=rt) emit_mov(rs,rt); -} - -static void emit_addimm_and_set_flags(int imm,int rt) -{ - assert(imm>-65536&&imm<65536); - u_int armval; - 
if(genimm(imm,&armval)) { - assem_debug("adds %s,%s,#%d\n",regname[rt],regname[rt],imm); - output_w32(0xe2900000|rd_rn_rm(rt,rt,0)|armval); - }else if(genimm(-imm,&armval)) { - assem_debug("subs %s,%s,#%d\n",regname[rt],regname[rt],imm); - output_w32(0xe2500000|rd_rn_rm(rt,rt,0)|armval); - }else if(imm<0) { - assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF00); - assem_debug("subs %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF); - output_w32(0xe2400000|rd_rn_imm_shift(rt,rt,(-imm)>>8,8)); - output_w32(0xe2500000|rd_rn_imm_shift(rt,rt,(-imm)&0xff,0)); - }else{ - assem_debug("add %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF00); - assem_debug("adds %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF); - output_w32(0xe2800000|rd_rn_imm_shift(rt,rt,imm>>8,8)); - output_w32(0xe2900000|rd_rn_imm_shift(rt,rt,imm&0xff,0)); - } -} - -static void emit_addimm_no_flags(u_int imm,u_int rt) -{ - emit_addimm(rt,imm,rt); -} - -static void emit_addnop(u_int r) -{ - assert(r<16); - assem_debug("add %s,%s,#0 (nop)\n",regname[r],regname[r]); - output_w32(0xe2800000|rd_rn_rm(r,r,0)); -} - -static void emit_adcimm(u_int rs,int imm,u_int rt) -{ - u_int armval; - genimm_checked(imm,&armval); - assem_debug("adc %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe2a00000|rd_rn_rm(rt,rs,0)|armval); -} - -static void emit_rscimm(int rs,int imm,u_int rt) -{ - assert(0); - u_int armval; - genimm_checked(imm,&armval); - assem_debug("rsc %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe2e00000|rd_rn_rm(rt,rs,0)|armval); -} - -static void emit_addimm64_32(int rsh,int rsl,int imm,int rth,int rtl) -{ - // TODO: if(genimm(imm,&armval)) ... 
- // else - emit_movimm(imm,HOST_TEMPREG); - emit_adds(HOST_TEMPREG,rsl,rtl); - emit_adcimm(rsh,0,rth); -} - -static void emit_andimm(int rs,int imm,int rt) -{ - u_int armval; - if(imm==0) { - emit_zeroreg(rt); - }else if(genimm(imm,&armval)) { - assem_debug("and %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe2000000|rd_rn_rm(rt,rs,0)|armval); - }else if(genimm(~imm,&armval)) { - assem_debug("bic %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe3c00000|rd_rn_rm(rt,rs,0)|armval); - }else if(imm==65535) { - #ifndef HAVE_ARMV6 - assem_debug("bic %s,%s,#FF000000\n",regname[rt],regname[rs]); - output_w32(0xe3c00000|rd_rn_rm(rt,rs,0)|0x4FF); - assem_debug("bic %s,%s,#00FF0000\n",regname[rt],regname[rt]); - output_w32(0xe3c00000|rd_rn_rm(rt,rt,0)|0x8FF); - #else - assem_debug("uxth %s,%s\n",regname[rt],regname[rs]); - output_w32(0xe6ff0070|rd_rn_rm(rt,0,rs)); - #endif - }else{ - assert(imm>0&&imm<65535); - #ifndef HAVE_ARMV7 - assem_debug("mov r14,#%d\n",imm&0xFF00); - output_w32(0xe3a00000|rd_rn_imm_shift(HOST_TEMPREG,0,imm>>8,8)); - assem_debug("add r14,r14,#%d\n",imm&0xFF); - output_w32(0xe2800000|rd_rn_imm_shift(HOST_TEMPREG,HOST_TEMPREG,imm&0xff,0)); - #else - emit_movw(imm,HOST_TEMPREG); - #endif - assem_debug("and %s,%s,r14\n",regname[rt],regname[rs]); - output_w32(0xe0000000|rd_rn_rm(rt,rs,HOST_TEMPREG)); - } -} - -static void emit_orimm(int rs,int imm,int rt) -{ - u_int armval; - if(imm==0) { - if(rs!=rt) emit_mov(rs,rt); - }else if(genimm(imm,&armval)) { - assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe3800000|rd_rn_rm(rt,rs,0)|armval); - }else{ - assert(imm>0&&imm<65536); - assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF00); - assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF); - output_w32(0xe3800000|rd_rn_imm_shift(rt,rs,imm>>8,8)); - output_w32(0xe3800000|rd_rn_imm_shift(rt,rt,imm&0xff,0)); - } -} - -static void emit_xorimm(int rs,int imm,int rt) -{ - u_int armval; - if(imm==0) { 
- if(rs!=rt) emit_mov(rs,rt); - }else if(genimm(imm,&armval)) { - assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe2200000|rd_rn_rm(rt,rs,0)|armval); - }else{ - assert(imm>0&&imm<65536); - assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF00); - assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF); - output_w32(0xe2200000|rd_rn_imm_shift(rt,rs,imm>>8,8)); - output_w32(0xe2200000|rd_rn_imm_shift(rt,rt,imm&0xff,0)); - } -} - -static void emit_shlimm(int rs,u_int imm,int rt) -{ - assert(imm>0); - assert(imm<32); - //if(imm==1) ... - assem_debug("lsl %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|(imm<<7)); -} - -static void emit_lsls_imm(int rs,int imm,int rt) -{ - assert(imm>0); - assert(imm<32); - assem_debug("lsls %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe1b00000|rd_rn_rm(rt,0,rs)|(imm<<7)); -} - -static unused void emit_lslpls_imm(int rs,int imm,int rt) -{ - assert(imm>0); - assert(imm<32); - assem_debug("lslpls %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0x51b00000|rd_rn_rm(rt,0,rs)|(imm<<7)); -} - -static void emit_shrimm(int rs,u_int imm,int rt) -{ - assert(imm>0); - assert(imm<32); - assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x20|(imm<<7)); -} - -static void emit_sarimm(int rs,u_int imm,int rt) -{ - assert(imm>0); - assert(imm<32); - assem_debug("asr %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x40|(imm<<7)); -} - -static void emit_rorimm(int rs,u_int imm,int rt) -{ - assert(imm>0); - assert(imm<32); - assem_debug("ror %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x60|(imm<<7)); -} - -static void emit_shldimm(int rs,int rs2,u_int imm,int rt) -{ - assem_debug("shld %%%s,%%%s,%d\n",regname[rt],regname[rs2],imm); - assert(imm>0); - assert(imm<32); - //if(imm==1) ... 
- assem_debug("lsl %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|(imm<<7)); - assem_debug("orr %s,%s,%s,lsr #%d\n",regname[rt],regname[rt],regname[rs2],32-imm); - output_w32(0xe1800020|rd_rn_rm(rt,rt,rs2)|((32-imm)<<7)); -} - -static void emit_shrdimm(int rs,int rs2,u_int imm,int rt) -{ - assem_debug("shrd %%%s,%%%s,%d\n",regname[rt],regname[rs2],imm); - assert(imm>0); - assert(imm<32); - //if(imm==1) ... - assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe1a00020|rd_rn_rm(rt,0,rs)|(imm<<7)); - assem_debug("orr %s,%s,%s,lsl #%d\n",regname[rt],regname[rt],regname[rs2],32-imm); - output_w32(0xe1800000|rd_rn_rm(rt,rt,rs2)|((32-imm)<<7)); -} - -static void emit_signextend16(int rs,int rt) -{ - #ifndef HAVE_ARMV6 - emit_shlimm(rs,16,rt); - emit_sarimm(rt,16,rt); - #else - assem_debug("sxth %s,%s\n",regname[rt],regname[rs]); - output_w32(0xe6bf0070|rd_rn_rm(rt,0,rs)); - #endif -} - -static void emit_signextend8(int rs,int rt) -{ - #ifndef HAVE_ARMV6 - emit_shlimm(rs,24,rt); - emit_sarimm(rt,24,rt); - #else - assem_debug("sxtb %s,%s\n",regname[rt],regname[rs]); - output_w32(0xe6af0070|rd_rn_rm(rt,0,rs)); - #endif -} - -static void emit_shl(u_int rs,u_int shift,u_int rt) -{ - assert(rs<16); - assert(rt<16); - assert(shift<16); - //if(imm==1) ... 
- assem_debug("lsl %s,%s,%s\n",regname[rt],regname[rs],regname[shift]); - output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x10|(shift<<8)); -} - -static void emit_shr(u_int rs,u_int shift,u_int rt) -{ - assert(rs<16); - assert(rt<16); - assert(shift<16); - assem_debug("lsr %s,%s,%s\n",regname[rt],regname[rs],regname[shift]); - output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x30|(shift<<8)); -} - -static void emit_sar(u_int rs,u_int shift,u_int rt) -{ - assert(rs<16); - assert(rt<16); - assert(shift<16); - assem_debug("asr %s,%s,%s\n",regname[rt],regname[rs],regname[shift]); - output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x50|(shift<<8)); -} - -static void emit_orrshl(u_int rs,u_int shift,u_int rt) -{ - assert(rs<16); - assert(rt<16); - assert(shift<16); - assem_debug("orr %s,%s,%s,lsl %s\n",regname[rt],regname[rt],regname[rs],regname[shift]); - output_w32(0xe1800000|rd_rn_rm(rt,rt,rs)|0x10|(shift<<8)); -} - -static void emit_orrshr(u_int rs,u_int shift,u_int rt) -{ - assert(rs<16); - assert(rt<16); - assert(shift<16); - assem_debug("orr %s,%s,%s,lsr %s\n",regname[rt],regname[rt],regname[rs],regname[shift]); - output_w32(0xe1800000|rd_rn_rm(rt,rt,rs)|0x30|(shift<<8)); -} - -static void emit_cmpimm(int rs,int imm) -{ - u_int armval; - if(genimm(imm,&armval)) { - assem_debug("cmp %s,#%d\n",regname[rs],imm); - output_w32(0xe3500000|rd_rn_rm(0,rs,0)|armval); - }else if(genimm(-imm,&armval)) { - assem_debug("cmn %s,#%d\n",regname[rs],imm); - output_w32(0xe3700000|rd_rn_rm(0,rs,0)|armval); - }else if(imm>0) { - assert(imm<65536); - emit_movimm(imm,HOST_TEMPREG); - assem_debug("cmp %s,r14\n",regname[rs]); - output_w32(0xe1500000|rd_rn_rm(0,rs,HOST_TEMPREG)); - }else{ - assert(imm>-65536); - emit_movimm(-imm,HOST_TEMPREG); - assem_debug("cmn %s,r14\n",regname[rs]); - output_w32(0xe1700000|rd_rn_rm(0,rs,HOST_TEMPREG)); - } -} - -static void emit_cmovne_imm(int imm,int rt) -{ - assem_debug("movne %s,#%d\n",regname[rt],imm); - u_int armval; - genimm_checked(imm,&armval); - 
output_w32(0x13a00000|rd_rn_rm(rt,0,0)|armval); -} - -static void emit_cmovl_imm(int imm,int rt) -{ - assem_debug("movlt %s,#%d\n",regname[rt],imm); - u_int armval; - genimm_checked(imm,&armval); - output_w32(0xb3a00000|rd_rn_rm(rt,0,0)|armval); -} - -static void emit_cmovb_imm(int imm,int rt) -{ - assem_debug("movcc %s,#%d\n",regname[rt],imm); - u_int armval; - genimm_checked(imm,&armval); - output_w32(0x33a00000|rd_rn_rm(rt,0,0)|armval); -} - -static void emit_cmovs_imm(int imm,int rt) -{ - assem_debug("movmi %s,#%d\n",regname[rt],imm); - u_int armval; - genimm_checked(imm,&armval); - output_w32(0x43a00000|rd_rn_rm(rt,0,0)|armval); -} - -static void emit_cmove_reg(int rs,int rt) -{ - assem_debug("moveq %s,%s\n",regname[rt],regname[rs]); - output_w32(0x01a00000|rd_rn_rm(rt,0,rs)); -} - -static void emit_cmovne_reg(int rs,int rt) -{ - assem_debug("movne %s,%s\n",regname[rt],regname[rs]); - output_w32(0x11a00000|rd_rn_rm(rt,0,rs)); -} - -static void emit_cmovl_reg(int rs,int rt) -{ - assem_debug("movlt %s,%s\n",regname[rt],regname[rs]); - output_w32(0xb1a00000|rd_rn_rm(rt,0,rs)); -} - -static void emit_cmovs_reg(int rs,int rt) -{ - assem_debug("movmi %s,%s\n",regname[rt],regname[rs]); - output_w32(0x41a00000|rd_rn_rm(rt,0,rs)); -} - -static void emit_slti32(int rs,int imm,int rt) -{ - if(rs!=rt) emit_zeroreg(rt); - emit_cmpimm(rs,imm); - if(rs==rt) emit_movimm(0,rt); - emit_cmovl_imm(1,rt); -} - -static void emit_sltiu32(int rs,int imm,int rt) -{ - if(rs!=rt) emit_zeroreg(rt); - emit_cmpimm(rs,imm); - if(rs==rt) emit_movimm(0,rt); - emit_cmovb_imm(1,rt); -} - -static void emit_slti64_32(int rsh,int rsl,int imm,int rt) -{ - assert(rsh!=rt); - emit_slti32(rsl,imm,rt); - if(imm>=0) - { - emit_test(rsh,rsh); - emit_cmovne_imm(0,rt); - emit_cmovs_imm(1,rt); - } - else - { - emit_cmpimm(rsh,-1); - emit_cmovne_imm(0,rt); - emit_cmovl_imm(1,rt); - } -} - -static void emit_sltiu64_32(int rsh,int rsl,int imm,int rt) -{ - assert(rsh!=rt); - emit_sltiu32(rsl,imm,rt); - 
if(imm>=0) - { - emit_test(rsh,rsh); - emit_cmovne_imm(0,rt); - } - else - { - emit_cmpimm(rsh,-1); - emit_cmovne_imm(1,rt); - } -} - -static void emit_cmp(int rs,int rt) -{ - assem_debug("cmp %s,%s\n",regname[rs],regname[rt]); - output_w32(0xe1500000|rd_rn_rm(0,rs,rt)); -} - -static void emit_set_gz32(int rs, int rt) -{ - //assem_debug("set_gz32\n"); - emit_cmpimm(rs,1); - emit_movimm(1,rt); - emit_cmovl_imm(0,rt); -} - -static void emit_set_nz32(int rs, int rt) -{ - //assem_debug("set_nz32\n"); - if(rs!=rt) emit_movs(rs,rt); - else emit_test(rs,rs); - emit_cmovne_imm(1,rt); -} - -static void emit_set_gz64_32(int rsh, int rsl, int rt) -{ - //assem_debug("set_gz64\n"); - emit_set_gz32(rsl,rt); - emit_test(rsh,rsh); - emit_cmovne_imm(1,rt); - emit_cmovs_imm(0,rt); -} - -static void emit_set_nz64_32(int rsh, int rsl, int rt) -{ - //assem_debug("set_nz64\n"); - emit_or_and_set_flags(rsh,rsl,rt); - emit_cmovne_imm(1,rt); -} - -static void emit_set_if_less32(int rs1, int rs2, int rt) -{ - //assem_debug("set if less (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]); - if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt); - emit_cmp(rs1,rs2); - if(rs1==rt||rs2==rt) emit_movimm(0,rt); - emit_cmovl_imm(1,rt); -} - -static void emit_set_if_carry32(int rs1, int rs2, int rt) -{ - //assem_debug("set if carry (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]); - if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt); - emit_cmp(rs1,rs2); - if(rs1==rt||rs2==rt) emit_movimm(0,rt); - emit_cmovb_imm(1,rt); -} - -static void emit_set_if_less64_32(int u1, int l1, int u2, int l2, int rt) -{ - //assem_debug("set if less64 (%%%s,%%%s,%%%s,%%%s),%%%s\n",regname[u1],regname[l1],regname[u2],regname[l2],regname[rt]); - assert(u1!=rt); - assert(u2!=rt); - emit_cmp(l1,l2); - emit_movimm(0,rt); - emit_sbcs(u1,u2,HOST_TEMPREG); - emit_cmovl_imm(1,rt); -} - -static void emit_set_if_carry64_32(int u1, int l1, int u2, int l2, int rt) -{ - //assem_debug("set if carry64 
(%%%s,%%%s,%%%s,%%%s),%%%s\n",regname[u1],regname[l1],regname[u2],regname[l2],regname[rt]); - assert(u1!=rt); - assert(u2!=rt); - emit_cmp(l1,l2); - emit_movimm(0,rt); - emit_sbcs(u1,u2,HOST_TEMPREG); - emit_cmovb_imm(1,rt); -} - -static void emit_call(int a) -{ - assem_debug("bl %x (%x+%x)\n",a,(int)out,a-(int)out-8); - u_int offset=genjmp(a); - output_w32(0xeb000000|offset); -} - -static void emit_jmp(int a) -{ - assem_debug("b %x (%x+%x)\n",a,(int)out,a-(int)out-8); - u_int offset=genjmp(a); - output_w32(0xea000000|offset); -} - -static void emit_jne(int a) -{ - assem_debug("bne %x\n",a); - u_int offset=genjmp(a); - output_w32(0x1a000000|offset); -} - -static void emit_jeq(int a) -{ - assem_debug("beq %x\n",a); - u_int offset=genjmp(a); - output_w32(0x0a000000|offset); -} - -static void emit_js(int a) -{ - assem_debug("bmi %x\n",a); - u_int offset=genjmp(a); - output_w32(0x4a000000|offset); -} - -static void emit_jns(int a) -{ - assem_debug("bpl %x\n",a); - u_int offset=genjmp(a); - output_w32(0x5a000000|offset); -} - -static void emit_jl(int a) -{ - assem_debug("blt %x\n",a); - u_int offset=genjmp(a); - output_w32(0xba000000|offset); -} - -static void emit_jge(int a) -{ - assem_debug("bge %x\n",a); - u_int offset=genjmp(a); - output_w32(0xaa000000|offset); -} - -static void emit_jno(int a) -{ - assem_debug("bvc %x\n",a); - u_int offset=genjmp(a); - output_w32(0x7a000000|offset); -} - -static void emit_jc(int a) -{ - assem_debug("bcs %x\n",a); - u_int offset=genjmp(a); - output_w32(0x2a000000|offset); -} - -static void emit_jcc(int a) -{ - assem_debug("bcc %x\n",a); - u_int offset=genjmp(a); - output_w32(0x3a000000|offset); -} - -static void emit_callreg(u_int r) -{ - assert(r<15); - assem_debug("blx %s\n",regname[r]); - output_w32(0xe12fff30|r); -} - -static void emit_jmpreg(u_int r) -{ - assem_debug("mov pc,%s\n",regname[r]); - output_w32(0xe1a00000|rd_rn_rm(15,0,r)); -} - -static void emit_readword_indexed(int offset, int rs, int rt) -{ - 
assert(offset>-4096&&offset<4096); - assem_debug("ldr %s,%s+%d\n",regname[rt],regname[rs],offset); - if(offset>=0) { - output_w32(0xe5900000|rd_rn_rm(rt,rs,0)|offset); - }else{ - output_w32(0xe5100000|rd_rn_rm(rt,rs,0)|(-offset)); - } -} - -static void emit_readword_dualindexedx4(int rs1, int rs2, int rt) -{ - assem_debug("ldr %s,%s,%s lsl #2\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0xe7900000|rd_rn_rm(rt,rs1,rs2)|0x100); -} - -static void emit_ldrcc_dualindexed(int rs1, int rs2, int rt) -{ - assem_debug("ldrcc %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0x37900000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_ldrccb_dualindexed(int rs1, int rs2, int rt) -{ - assem_debug("ldrccb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0x37d00000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_ldrccsb_dualindexed(int rs1, int rs2, int rt) -{ - assem_debug("ldrccsb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0x319000d0|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_ldrcch_dualindexed(int rs1, int rs2, int rt) -{ - assem_debug("ldrcch %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0x319000b0|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_ldrccsh_dualindexed(int rs1, int rs2, int rt) -{ - assem_debug("ldrccsh %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0x319000f0|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_readword_indexed_tlb(int addr, int rs, int map, int rt) -{ - if(map<0) emit_readword_indexed(addr, rs, rt); - else { - assert(addr==0); - emit_readword_dualindexedx4(rs, map, rt); - } -} - -static void emit_readdword_indexed_tlb(int addr, int rs, int map, int rh, int rl) -{ - if(map<0) { - if(rh>=0) emit_readword_indexed(addr, rs, rh); - emit_readword_indexed(addr+4, rs, rl); - }else{ - assert(rh!=rs); - if(rh>=0) emit_readword_indexed_tlb(addr, rs, map, rh); - emit_addimm(map,1,map); - emit_readword_indexed_tlb(addr, rs, map, rl); - } -} - -static void 
emit_movsbl_indexed(int offset, int rs, int rt) -{ - assert(offset>-256&&offset<256); - assem_debug("ldrsb %s,%s+%d\n",regname[rt],regname[rs],offset); - if(offset>=0) { - output_w32(0xe1d000d0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf)); - }else{ - output_w32(0xe15000d0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf)); - } -} - -static void emit_movsbl_indexed_tlb(int addr, int rs, int map, int rt) -{ - if(map<0) emit_movsbl_indexed(addr, rs, rt); - else { - if(addr==0) { - emit_shlimm(map,2,map); - assem_debug("ldrsb %s,%s+%s\n",regname[rt],regname[rs],regname[map]); - output_w32(0xe19000d0|rd_rn_rm(rt,rs,map)); - }else{ - assert(addr>-256&&addr<256); - assem_debug("add %s,%s,%s,lsl #2\n",regname[rt],regname[rs],regname[map]); - output_w32(0xe0800000|rd_rn_rm(rt,rs,map)|(2<<7)); - emit_movsbl_indexed(addr, rt, rt); - } - } -} - -static void emit_movswl_indexed(int offset, int rs, int rt) -{ - assert(offset>-256&&offset<256); - assem_debug("ldrsh %s,%s+%d\n",regname[rt],regname[rs],offset); - if(offset>=0) { - output_w32(0xe1d000f0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf)); - }else{ - output_w32(0xe15000f0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf)); - } -} - -static void emit_movzbl_indexed(int offset, int rs, int rt) -{ - assert(offset>-4096&&offset<4096); - assem_debug("ldrb %s,%s+%d\n",regname[rt],regname[rs],offset); - if(offset>=0) { - output_w32(0xe5d00000|rd_rn_rm(rt,rs,0)|offset); - }else{ - output_w32(0xe5500000|rd_rn_rm(rt,rs,0)|(-offset)); - } -} - -static void emit_movzbl_dualindexedx4(int rs1, int rs2, int rt) -{ - assem_debug("ldrb %s,%s,%s lsl #2\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0xe7d00000|rd_rn_rm(rt,rs1,rs2)|0x100); -} - -static void emit_movzbl_indexed_tlb(int addr, int rs, int map, int rt) -{ - if(map<0) emit_movzbl_indexed(addr, rs, rt); - else { - if(addr==0) { - emit_movzbl_dualindexedx4(rs, map, rt); - }else{ - emit_addimm(rs,addr,rt); - emit_movzbl_dualindexedx4(rt, map, rt); 
- } - } -} - -static void emit_movzwl_indexed(int offset, int rs, int rt) -{ - assert(offset>-256&&offset<256); - assem_debug("ldrh %s,%s+%d\n",regname[rt],regname[rs],offset); - if(offset>=0) { - output_w32(0xe1d000b0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf)); - }else{ - output_w32(0xe15000b0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf)); - } -} - -static void emit_ldrd(int offset, int rs, int rt) -{ - assert(offset>-256&&offset<256); - assem_debug("ldrd %s,%s+%d\n",regname[rt],regname[rs],offset); - if(offset>=0) { - output_w32(0xe1c000d0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf)); - }else{ - output_w32(0xe14000d0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf)); - } -} - -static void emit_readword(int addr, int rt) -{ - u_int offset = addr-(u_int)&dynarec_local; - assert(offset<4096); - assem_debug("ldr %s,fp+%d\n",regname[rt],offset); - output_w32(0xe5900000|rd_rn_rm(rt,FP,0)|offset); -} - -static unused void emit_movsbl(int addr, int rt) -{ - u_int offset = addr-(u_int)&dynarec_local; - assert(offset<256); - assem_debug("ldrsb %s,fp+%d\n",regname[rt],offset); - output_w32(0xe1d000d0|rd_rn_rm(rt,FP,0)|((offset<<4)&0xf00)|(offset&0xf)); -} - -static unused void emit_movswl(int addr, int rt) -{ - u_int offset = addr-(u_int)&dynarec_local; - assert(offset<256); - assem_debug("ldrsh %s,fp+%d\n",regname[rt],offset); - output_w32(0xe1d000f0|rd_rn_rm(rt,FP,0)|((offset<<4)&0xf00)|(offset&0xf)); -} - -static unused void emit_movzbl(int addr, int rt) -{ - u_int offset = addr-(u_int)&dynarec_local; - assert(offset<4096); - assem_debug("ldrb %s,fp+%d\n",regname[rt],offset); - output_w32(0xe5d00000|rd_rn_rm(rt,FP,0)|offset); -} - -static unused void emit_movzwl(int addr, int rt) -{ - u_int offset = addr-(u_int)&dynarec_local; - assert(offset<256); - assem_debug("ldrh %s,fp+%d\n",regname[rt],offset); - output_w32(0xe1d000b0|rd_rn_rm(rt,FP,0)|((offset<<4)&0xf00)|(offset&0xf)); -} - -static void emit_writeword_indexed(int rt, int offset, 
int rs) -{ - assert(offset>-4096&&offset<4096); - assem_debug("str %s,%s+%d\n",regname[rt],regname[rs],offset); - if(offset>=0) { - output_w32(0xe5800000|rd_rn_rm(rt,rs,0)|offset); - }else{ - output_w32(0xe5000000|rd_rn_rm(rt,rs,0)|(-offset)); - } -} - -static void emit_writeword_dualindexedx4(int rt, int rs1, int rs2) -{ - assem_debug("str %s,%s,%s lsl #2\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0xe7800000|rd_rn_rm(rt,rs1,rs2)|0x100); -} - -static void emit_writeword_indexed_tlb(int rt, int addr, int rs, int map, int temp) -{ - if(map<0) emit_writeword_indexed(rt, addr, rs); - else { - assert(addr==0); - emit_writeword_dualindexedx4(rt, rs, map); - } -} - -static void emit_writedword_indexed_tlb(int rh, int rl, int addr, int rs, int map, int temp) -{ - if(map<0) { - if(rh>=0) emit_writeword_indexed(rh, addr, rs); - emit_writeword_indexed(rl, addr+4, rs); - }else{ - assert(rh>=0); - if(temp!=rs) emit_addimm(map,1,temp); - emit_writeword_indexed_tlb(rh, addr, rs, map, temp); - if(temp!=rs) emit_writeword_indexed_tlb(rl, addr, rs, temp, temp); - else { - emit_addimm(rs,4,rs); - emit_writeword_indexed_tlb(rl, addr, rs, map, temp); - } - } -} - -static void emit_writehword_indexed(int rt, int offset, int rs) -{ - assert(offset>-256&&offset<256); - assem_debug("strh %s,%s+%d\n",regname[rt],regname[rs],offset); - if(offset>=0) { - output_w32(0xe1c000b0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf)); - }else{ - output_w32(0xe14000b0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf)); - } -} - -static void emit_writebyte_indexed(int rt, int offset, int rs) -{ - assert(offset>-4096&&offset<4096); - assem_debug("strb %s,%s+%d\n",regname[rt],regname[rs],offset); - if(offset>=0) { - output_w32(0xe5c00000|rd_rn_rm(rt,rs,0)|offset); - }else{ - output_w32(0xe5400000|rd_rn_rm(rt,rs,0)|(-offset)); - } -} - -static void emit_writebyte_dualindexedx4(int rt, int rs1, int rs2) -{ - assem_debug("strb %s,%s,%s lsl #2\n",regname[rt],regname[rs1],regname[rs2]); - 
output_w32(0xe7c00000|rd_rn_rm(rt,rs1,rs2)|0x100); -} - -static void emit_writebyte_indexed_tlb(int rt, int addr, int rs, int map, int temp) -{ - if(map<0) emit_writebyte_indexed(rt, addr, rs); - else { - if(addr==0) { - emit_writebyte_dualindexedx4(rt, rs, map); - }else{ - emit_addimm(rs,addr,temp); - emit_writebyte_dualindexedx4(rt, temp, map); - } - } -} - -static void emit_strcc_dualindexed(int rs1, int rs2, int rt) -{ - assem_debug("strcc %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0x37800000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_strccb_dualindexed(int rs1, int rs2, int rt) -{ - assem_debug("strccb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0x37c00000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_strcch_dualindexed(int rs1, int rs2, int rt) -{ - assem_debug("strcch %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0x318000b0|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_writeword(int rt, int addr) -{ - u_int offset = addr-(u_int)&dynarec_local; - assert(offset<4096); - assem_debug("str %s,fp+%d\n",regname[rt],offset); - output_w32(0xe5800000|rd_rn_rm(rt,FP,0)|offset); -} - -static unused void emit_writehword(int rt, int addr) -{ - u_int offset = addr-(u_int)&dynarec_local; - assert(offset<256); - assem_debug("strh %s,fp+%d\n",regname[rt],offset); - output_w32(0xe1c000b0|rd_rn_rm(rt,FP,0)|((offset<<4)&0xf00)|(offset&0xf)); -} - -static unused void emit_writebyte(int rt, int addr) -{ - u_int offset = addr-(u_int)&dynarec_local; - assert(offset<4096); - assem_debug("strb %s,fp+%d\n",regname[rt],offset); - output_w32(0xe5c00000|rd_rn_rm(rt,FP,0)|offset); -} - -static void emit_umull(u_int rs1, u_int rs2, u_int hi, u_int lo) -{ - assem_debug("umull %s, %s, %s, %s\n",regname[lo],regname[hi],regname[rs1],regname[rs2]); - assert(rs1<16); - assert(rs2<16); - assert(hi<16); - assert(lo<16); - output_w32(0xe0800090|(hi<<16)|(lo<<12)|(rs2<<8)|rs1); -} - -static void emit_smull(u_int rs1, u_int rs2, u_int 
hi, u_int lo) -{ - assem_debug("smull %s, %s, %s, %s\n",regname[lo],regname[hi],regname[rs1],regname[rs2]); - assert(rs1<16); - assert(rs2<16); - assert(hi<16); - assert(lo<16); - output_w32(0xe0c00090|(hi<<16)|(lo<<12)|(rs2<<8)|rs1); -} - -static void emit_clz(int rs,int rt) -{ - assem_debug("clz %s,%s\n",regname[rt],regname[rs]); - output_w32(0xe16f0f10|rd_rn_rm(rt,0,rs)); -} - -static void emit_subcs(int rs1,int rs2,int rt) -{ - assem_debug("subcs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0x20400000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_shrcc_imm(int rs,u_int imm,int rt) -{ - assert(imm>0); - assert(imm<32); - assem_debug("lsrcc %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0x31a00000|rd_rn_rm(rt,0,rs)|0x20|(imm<<7)); -} - -static void emit_shrne_imm(int rs,u_int imm,int rt) -{ - assert(imm>0); - assert(imm<32); - assem_debug("lsrne %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0x11a00000|rd_rn_rm(rt,0,rs)|0x20|(imm<<7)); -} - -static void emit_negmi(int rs, int rt) -{ - assem_debug("rsbmi %s,%s,#0\n",regname[rt],regname[rs]); - output_w32(0x42600000|rd_rn_rm(rt,rs,0)); -} - -static void emit_negsmi(int rs, int rt) -{ - assem_debug("rsbsmi %s,%s,#0\n",regname[rt],regname[rs]); - output_w32(0x42700000|rd_rn_rm(rt,rs,0)); -} - -static void emit_orreq(u_int rs1,u_int rs2,u_int rt) -{ - assem_debug("orreq %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0x01800000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_orrne(u_int rs1,u_int rs2,u_int rt) -{ - assem_debug("orrne %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); - output_w32(0x11800000|rd_rn_rm(rt,rs1,rs2)); -} - -static void emit_bic_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt) -{ - assem_debug("bic %s,%s,%s lsl %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]); - output_w32(0xe1C00000|rd_rn_rm(rt,rs1,rs2)|0x10|(shift<<8)); -} - -static void emit_biceq_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt) -{ - assem_debug("biceq %s,%s,%s 
lsl %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]); - output_w32(0x01C00000|rd_rn_rm(rt,rs1,rs2)|0x10|(shift<<8)); -} - -static void emit_bicne_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt) -{ - assem_debug("bicne %s,%s,%s lsl %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]); - output_w32(0x11C00000|rd_rn_rm(rt,rs1,rs2)|0x10|(shift<<8)); -} - -static void emit_bic_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt) -{ - assem_debug("bic %s,%s,%s lsr %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]); - output_w32(0xe1C00000|rd_rn_rm(rt,rs1,rs2)|0x30|(shift<<8)); -} - -static void emit_biceq_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt) -{ - assem_debug("biceq %s,%s,%s lsr %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]); - output_w32(0x01C00000|rd_rn_rm(rt,rs1,rs2)|0x30|(shift<<8)); -} - -static void emit_bicne_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt) -{ - assem_debug("bicne %s,%s,%s lsr %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]); - output_w32(0x11C00000|rd_rn_rm(rt,rs1,rs2)|0x30|(shift<<8)); -} - -static void emit_teq(int rs, int rt) -{ - assem_debug("teq %s,%s\n",regname[rs],regname[rt]); - output_w32(0xe1300000|rd_rn_rm(0,rs,rt)); -} - -static void emit_rsbimm(int rs, int imm, int rt) -{ - u_int armval; - genimm_checked(imm,&armval); - assem_debug("rsb %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0xe2600000|rd_rn_rm(rt,rs,0)|armval); -} - -// Load 2 immediates optimizing for small code size -static void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2) -{ - emit_movimm(imm1,rt1); - u_int armval; - if(genimm(imm2-imm1,&armval)) { - assem_debug("add %s,%s,#%d\n",regname[rt2],regname[rt1],imm2-imm1); - output_w32(0xe2800000|rd_rn_rm(rt2,rt1,0)|armval); - }else if(genimm(imm1-imm2,&armval)) { - assem_debug("sub %s,%s,#%d\n",regname[rt2],regname[rt1],imm1-imm2); - output_w32(0xe2400000|rd_rn_rm(rt2,rt1,0)|armval); - } - else emit_movimm(imm2,rt2); -} - -// Conditionally select one of two 
immediates, optimizing for small code size -// This will only be called if HAVE_CMOV_IMM is defined -static void emit_cmov2imm_e_ne_compact(int imm1,int imm2,u_int rt) -{ - u_int armval; - if(genimm(imm2-imm1,&armval)) { - emit_movimm(imm1,rt); - assem_debug("addne %s,%s,#%d\n",regname[rt],regname[rt],imm2-imm1); - output_w32(0x12800000|rd_rn_rm(rt,rt,0)|armval); - }else if(genimm(imm1-imm2,&armval)) { - emit_movimm(imm1,rt); - assem_debug("subne %s,%s,#%d\n",regname[rt],regname[rt],imm1-imm2); - output_w32(0x12400000|rd_rn_rm(rt,rt,0)|armval); - } - else { - #ifndef HAVE_ARMV7 - emit_movimm(imm1,rt); - add_literal((int)out,imm2); - assem_debug("ldrne %s,pc+? [=%x]\n",regname[rt],imm2); - output_w32(0x15900000|rd_rn_rm(rt,15,0)); - #else - emit_movw(imm1&0x0000FFFF,rt); - if((imm1&0xFFFF)!=(imm2&0xFFFF)) { - assem_debug("movwne %s,#%d (0x%x)\n",regname[rt],imm2&0xFFFF,imm2&0xFFFF); - output_w32(0x13000000|rd_rn_rm(rt,0,0)|(imm2&0xfff)|((imm2<<4)&0xf0000)); - } - emit_movt(imm1&0xFFFF0000,rt); - if((imm1&0xFFFF0000)!=(imm2&0xFFFF0000)) { - assem_debug("movtne %s,#%d (0x%x)\n",regname[rt],imm2&0xffff0000,imm2&0xffff0000); - output_w32(0x13400000|rd_rn_rm(rt,0,0)|((imm2>>16)&0xfff)|((imm2>>12)&0xf0000)); - } - #endif - } -} - -// special case for checking invalid_code -static void emit_cmpmem_indexedsr12_reg(int base,int r,int imm) -{ - assert(imm<128&&imm>=0); - assert(r>=0&&r<16); - assem_debug("ldrb lr,%s,%s lsr #12\n",regname[base],regname[r]); - output_w32(0xe7d00000|rd_rn_rm(HOST_TEMPREG,base,r)|0x620); - emit_cmpimm(HOST_TEMPREG,imm); -} - -static void emit_callne(int a) -{ - assem_debug("blne %x\n",a); - u_int offset=genjmp(a); - output_w32(0x1b000000|offset); -} - -// Used to preload hash table entries -static unused void emit_prefetchreg(int r) -{ - assem_debug("pld %s\n",regname[r]); - output_w32(0xf5d0f000|rd_rn_rm(0,r,0)); -} - -// Special case for mini_ht -static void emit_ldreq_indexed(int rs, u_int offset, int rt) -{ - assert(offset<4096); - 
assem_debug("ldreq %s,[%s, #%d]\n",regname[rt],regname[rs],offset); - output_w32(0x05900000|rd_rn_rm(rt,rs,0)|offset); -} - -static unused void emit_bicne_imm(int rs,int imm,int rt) -{ - u_int armval; - genimm_checked(imm,&armval); - assem_debug("bicne %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0x13c00000|rd_rn_rm(rt,rs,0)|armval); -} - -static unused void emit_biccs_imm(int rs,int imm,int rt) -{ - u_int armval; - genimm_checked(imm,&armval); - assem_debug("biccs %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0x23c00000|rd_rn_rm(rt,rs,0)|armval); -} - -static unused void emit_bicvc_imm(int rs,int imm,int rt) -{ - u_int armval; - genimm_checked(imm,&armval); - assem_debug("bicvc %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0x73c00000|rd_rn_rm(rt,rs,0)|armval); -} - -static unused void emit_bichi_imm(int rs,int imm,int rt) -{ - u_int armval; - genimm_checked(imm,&armval); - assem_debug("bichi %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0x83c00000|rd_rn_rm(rt,rs,0)|armval); -} - -static unused void emit_orrvs_imm(int rs,int imm,int rt) -{ - u_int armval; - genimm_checked(imm,&armval); - assem_debug("orrvs %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0x63800000|rd_rn_rm(rt,rs,0)|armval); -} - -static void emit_orrne_imm(int rs,int imm,int rt) -{ - u_int armval; - genimm_checked(imm,&armval); - assem_debug("orrne %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0x13800000|rd_rn_rm(rt,rs,0)|armval); -} - -static void emit_andne_imm(int rs,int imm,int rt) -{ - u_int armval; - genimm_checked(imm,&armval); - assem_debug("andne %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0x12000000|rd_rn_rm(rt,rs,0)|armval); -} - -static unused void emit_addpl_imm(int rs,int imm,int rt) -{ - u_int armval; - genimm_checked(imm,&armval); - assem_debug("addpl %s,%s,#%d\n",regname[rt],regname[rs],imm); - output_w32(0x52800000|rd_rn_rm(rt,rs,0)|armval); -} - -static void emit_jno_unlikely(int a) -{ - //emit_jno(a); - 
assem_debug("addvc pc,pc,#? (%x)\n",/*a-(int)out-8,*/a); - output_w32(0x72800000|rd_rn_rm(15,15,0)); -} - -static void save_regs_all(u_int reglist) -{ - int i; - if(!reglist) return; - assem_debug("stmia fp,{"); - for(i=0;i<16;i++) - if(reglist&(1<=BASE_ADDR&&addr<(BASE_ADDR+(1<=0x80000000&&target<0x80800000)||(target>0xA4000000&&target<0xA4001000)); -//DEBUG > -#ifdef DEBUG_CYCLE_COUNT - emit_readword((int)&last_count,ECX); - emit_add(HOST_CCREG,ECX,HOST_CCREG); - emit_readword((int)&next_interupt,ECX); - emit_writeword(HOST_CCREG,(int)&Count); - emit_sub(HOST_CCREG,ECX,HOST_CCREG); - emit_writeword(ECX,(int)&last_count); -#endif -//DEBUG < - emit_jmp(linker); -} - -static void emit_extjump(int addr, int target) -{ - emit_extjump2(addr, target, (int)dyna_linker); -} - -static void emit_extjump_ds(int addr, int target) -{ - emit_extjump2(addr, target, (int)dyna_linker_ds); -} - -// put rt_val into rt, potentially making use of rs with value rs_val -static void emit_movimm_from(u_int rs_val,int rs,u_int rt_val,int rt) -{ - u_int armval; - int diff; - if(genimm(rt_val,&armval)) { - assem_debug("mov %s,#%d\n",regname[rt],rt_val); - output_w32(0xe3a00000|rd_rn_rm(rt,0,0)|armval); - return; - } - if(genimm(~rt_val,&armval)) { - assem_debug("mvn %s,#%d\n",regname[rt],rt_val); - output_w32(0xe3e00000|rd_rn_rm(rt,0,0)|armval); - return; - } - diff=rt_val-rs_val; - if(genimm(diff,&armval)) { - assem_debug("add %s,%s,#%d\n",regname[rt],regname[rs],diff); - output_w32(0xe2800000|rd_rn_rm(rt,rs,0)|armval); - return; - }else if(genimm(-diff,&armval)) { - assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],-diff); - output_w32(0xe2400000|rd_rn_rm(rt,rs,0)|armval); - return; - } - emit_movimm(rt_val,rt); -} - -// return 1 if above function can do it's job cheaply -static int is_similar_value(u_int v1,u_int v2) -{ - u_int xs; - int diff; - if(v1==v2) return 1; - diff=v2-v1; - for(xs=diff;xs!=0&&(xs&3)==0;xs>>=2) - ; - if(xs<0x100) return 1; - 
for(xs=-diff;xs!=0&&(xs&3)==0;xs>>=2) - ; - if(xs<0x100) return 1; - return 0; -} - -// trashes r2 -static void pass_args(int a0, int a1) -{ - if(a0==1&&a1==0) { - // must swap - emit_mov(a0,2); emit_mov(a1,1); emit_mov(2,0); - } - else if(a0!=0&&a1==0) { - emit_mov(a1,1); - if (a0>=0) emit_mov(a0,0); - } - else { - if(a0>=0&&a0!=0) emit_mov(a0,0); - if(a1>=0&&a1!=1) emit_mov(a1,1); - } -} - -static void mov_loadtype_adj(int type,int rs,int rt) -{ - switch(type) { - case LOADB_STUB: emit_signextend8(rs,rt); break; - case LOADBU_STUB: emit_andimm(rs,0xff,rt); break; - case LOADH_STUB: emit_signextend16(rs,rt); break; - case LOADHU_STUB: emit_andimm(rs,0xffff,rt); break; - case LOADW_STUB: if(rs!=rt) emit_mov(rs,rt); break; - default: assert(0); - } -} - -#include "../backends/psx/pcsxmem.h" -#include "../backends/psx/pcsxmem_inline.c" - -static void do_readstub(int n) -{ - assem_debug("do_readstub %x\n",start+stubs[n][3]*4); - literal_pool(256); - set_jump_target(stubs[n][1],(int)out); - int type=stubs[n][0]; - int i=stubs[n][3]; - int rs=stubs[n][4]; - struct regstat *i_regs=(struct regstat *)stubs[n][5]; - u_int reglist=stubs[n][7]; - signed char *i_regmap=i_regs->regmap; - int rt; - if(itype[i]==C1LS||itype[i]==C2LS||itype[i]==LOADLR) { - rt=get_reg(i_regmap,FTEMP); - }else{ - rt=get_reg(i_regmap,rt1[i]); - } - assert(rs>=0); - int r,temp=-1,temp2=HOST_TEMPREG,regs_saved=0,restore_jump=0; - reglist|=(1<=0&&rt1[i]!=0) - reglist&=~(1<=0&&rt1[i]!=0)) { - switch(type) { - case LOADB_STUB: emit_ldrccsb_dualindexed(temp2,rs,rt); break; - case LOADBU_STUB: emit_ldrccb_dualindexed(temp2,rs,rt); break; - case LOADH_STUB: emit_ldrccsh_dualindexed(temp2,rs,rt); break; - case LOADHU_STUB: emit_ldrcch_dualindexed(temp2,rs,rt); break; - case LOADW_STUB: emit_ldrcc_dualindexed(temp2,rs,rt); break; - } - } - if(regs_saved) { - restore_jump=(int)out; - emit_jcc(0); // jump to reg restore - } - else - emit_jcc(stubs[n][2]); // return address - - if(!regs_saved) - 
save_regs(reglist); - int handler=0; - if(type==LOADB_STUB||type==LOADBU_STUB) - handler=(int)jump_handler_read8; - if(type==LOADH_STUB||type==LOADHU_STUB) - handler=(int)jump_handler_read16; - if(type==LOADW_STUB) - handler=(int)jump_handler_read32; - assert(handler!=0); - pass_args(rs,temp2); - int cc=get_reg(i_regmap,CCREG); - if(cc<0) - emit_loadreg(CCREG,2); - emit_addimm(cc<0?2:cc,CLOCK_ADJUST((int)stubs[n][6]+1),2); - emit_call(handler); - if(itype[i]==C1LS||itype[i]==C2LS||(rt>=0&&rt1[i]!=0)) { - mov_loadtype_adj(type,0,rt); - } - if(restore_jump) - set_jump_target(restore_jump,(int)out); - restore_regs(reglist); - emit_jmp(stubs[n][2]); // return address -} - -// return memhandler, or get directly accessable address and return 0 -static u_int get_direct_memhandler(void *table,u_int addr,int type,u_int *addr_host) -{ - u_int l1,l2=0; - l1=((u_int *)table)[addr>>12]; - if((l1&(1<<31))==0) { - u_int v=l1<<1; - *addr_host=v+addr; - return 0; - } - else { - l1<<=1; - if(type==LOADB_STUB||type==LOADBU_STUB||type==STOREB_STUB) - l2=((u_int *)l1)[0x1000/4 + 0x1000/2 + (addr&0xfff)]; - else if(type==LOADH_STUB||type==LOADHU_STUB||type==STOREH_STUB) - l2=((u_int *)l1)[0x1000/4 + (addr&0xfff)/2]; - else - l2=((u_int *)l1)[(addr&0xfff)/4]; - if((l2&(1<<31))==0) { - u_int v=l2<<1; - *addr_host=v+(addr&0xfff); - return 0; - } - return l2<<1; - } -} - -static void inline_readstub(int type, int i, u_int addr, signed char regmap[], int target, int adj, u_int reglist) -{ - int rs=get_reg(regmap,target); - int rt=get_reg(regmap,target); - if(rs<0) rs=get_reg(regmap,-1); - assert(rs>=0); - u_int handler,host_addr=0,is_dynamic,far_call=0; - int cc=get_reg(regmap,CCREG); - if(pcsx_direct_read(type,addr,CLOCK_ADJUST(adj+1),cc,target?rs:-1,rt)) - return; - handler=get_direct_memhandler(mem_rtab,addr,type,&host_addr); - if (handler==0) { - if(rt<0||rt1[i]==0) - return; - if(addr!=host_addr) - emit_movimm_from(addr,rs,host_addr,rs); - switch(type) { - case LOADB_STUB: 
emit_movsbl_indexed(0,rs,rt); break; - case LOADBU_STUB: emit_movzbl_indexed(0,rs,rt); break; - case LOADH_STUB: emit_movswl_indexed(0,rs,rt); break; - case LOADHU_STUB: emit_movzwl_indexed(0,rs,rt); break; - case LOADW_STUB: emit_readword_indexed(0,rs,rt); break; - default: assert(0); - } - return; - } - is_dynamic=pcsxmem_is_handler_dynamic(addr); - if(is_dynamic) { - if(type==LOADB_STUB||type==LOADBU_STUB) - handler=(int)jump_handler_read8; - if(type==LOADH_STUB||type==LOADHU_STUB) - handler=(int)jump_handler_read16; - if(type==LOADW_STUB) - handler=(int)jump_handler_read32; - } - - // call a memhandler - if(rt>=0&&rt1[i]!=0) - reglist&=~(1<=33554432) { - // unreachable memhandler, a plugin func perhaps - emit_movimm(handler,12); - far_call=1; - } - if(cc<0) - emit_loadreg(CCREG,2); - if(is_dynamic) { - emit_movimm(((u_int *)mem_rtab)[addr>>12]<<1,1); - emit_addimm(cc<0?2:cc,CLOCK_ADJUST(adj+1),2); - } - else { - emit_readword((int)&last_count,3); - emit_addimm(cc<0?2:cc,CLOCK_ADJUST(adj+1),2); - emit_add(2,3,2); - emit_writeword(2,(int)&Count); - } - - if(far_call) - emit_callreg(12); - else - emit_call(handler); - - if(rt>=0&&rt1[i]!=0) { - switch(type) { - case LOADB_STUB: emit_signextend8(0,rt); break; - case LOADBU_STUB: emit_andimm(0,0xff,rt); break; - case LOADH_STUB: emit_signextend16(0,rt); break; - case LOADHU_STUB: emit_andimm(0,0xffff,rt); break; - case LOADW_STUB: if(rt!=0) emit_mov(0,rt); break; - default: assert(0); - } - } - restore_regs(reglist); -} - -static void do_writestub(int n) -{ - assem_debug("do_writestub %x\n",start+stubs[n][3]*4); - literal_pool(256); - set_jump_target(stubs[n][1],(int)out); - int type=stubs[n][0]; - int i=stubs[n][3]; - int rs=stubs[n][4]; - struct regstat *i_regs=(struct regstat *)stubs[n][5]; - u_int reglist=stubs[n][7]; - signed char *i_regmap=i_regs->regmap; - int rt,r; - if(itype[i]==C1LS||itype[i]==C2LS) { - rt=get_reg(i_regmap,r=FTEMP); - }else{ - rt=get_reg(i_regmap,r=rs2[i]); - } - assert(rs>=0); - 
assert(rt>=0); - int rtmp,temp=-1,temp2=HOST_TEMPREG,regs_saved=0,restore_jump=0,ra; - int reglist2=reglist|(1<=0); - assert(rt>=0); - u_int handler,host_addr=0; - handler=get_direct_memhandler(mem_wtab,addr,type,&host_addr); - if (handler==0) { - if(addr!=host_addr) - emit_movimm_from(addr,rs,host_addr,rs); - switch(type) { - case STOREB_STUB: emit_writebyte_indexed(rt,0,rs); break; - case STOREH_STUB: emit_writehword_indexed(rt,0,rs); break; - case STOREW_STUB: emit_writeword_indexed(rt,0,rs); break; - default: assert(0); - } - return; - } - - // call a memhandler - save_regs(reglist); - pass_args(rs,rt); - int cc=get_reg(regmap,CCREG); - if(cc<0) - emit_loadreg(CCREG,2); - emit_addimm(cc<0?2:cc,CLOCK_ADJUST(adj+1),2); - emit_movimm(handler,3); - // returns new cycle_count - emit_call((int)jump_handler_write_h); - emit_addimm(0,-CLOCK_ADJUST(adj+1),cc<0?2:cc); - if(cc<0) - emit_storereg(CCREG,2); - restore_regs(reglist); -} - -static void do_unalignedwritestub(int n) -{ - assem_debug("do_unalignedwritestub %x\n",start+stubs[n][3]*4); - literal_pool(256); - set_jump_target(stubs[n][1],(int)out); - - int i=stubs[n][3]; - struct regstat *i_regs=(struct regstat *)stubs[n][4]; - int addr=stubs[n][5]; - u_int reglist=stubs[n][7]; - signed char *i_regmap=i_regs->regmap; - int temp2=get_reg(i_regmap,FTEMP); - int rt; - rt=get_reg(i_regmap,rs2[i]); - assert(rt>=0); - assert(addr>=0); - assert(opcode[i]==0x2a||opcode[i]==0x2e); // SWL/SWR only implemented - reglist|=(1<regmap_entry,i_regs->was32,i_regs->wasdirty); - if(regs[i].regmap_entry[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); - emit_movimm(start+(i-ds)*4,EAX); // Get PC - emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right? There should probably be an extra cycle... 
- emit_jmp(ds?(int)fp_exception_ds:(int)fp_exception); -} - -/* Special assem */ - -static void shift_assemble_arm(int i,struct regstat *i_regs) -{ - if(rt1[i]) { - if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV - { - signed char s,t,shift; - t=get_reg(i_regs->regmap,rt1[i]); - s=get_reg(i_regs->regmap,rs1[i]); - shift=get_reg(i_regs->regmap,rs2[i]); - if(t>=0){ - if(rs1[i]==0) - { - emit_zeroreg(t); - } - else if(rs2[i]==0) - { - assert(s>=0); - if(s!=t) emit_mov(s,t); - } - else - { - emit_andimm(shift,31,HOST_TEMPREG); - if(opcode2[i]==4) // SLLV - { - emit_shl(s,HOST_TEMPREG,t); - } - if(opcode2[i]==6) // SRLV - { - emit_shr(s,HOST_TEMPREG,t); - } - if(opcode2[i]==7) // SRAV - { - emit_sar(s,HOST_TEMPREG,t); - } - } - } - } else { // DSLLV/DSRLV/DSRAV - signed char sh,sl,th,tl,shift; - th=get_reg(i_regs->regmap,rt1[i]|64); - tl=get_reg(i_regs->regmap,rt1[i]); - sh=get_reg(i_regs->regmap,rs1[i]|64); - sl=get_reg(i_regs->regmap,rs1[i]); - shift=get_reg(i_regs->regmap,rs2[i]); - if(tl>=0){ - if(rs1[i]==0) - { - emit_zeroreg(tl); - if(th>=0) emit_zeroreg(th); - } - else if(rs2[i]==0) - { - assert(sl>=0); - if(sl!=tl) emit_mov(sl,tl); - if(th>=0&&sh!=th) emit_mov(sh,th); - } - else - { - // FIXME: What if shift==tl ? 
- assert(shift!=tl); - int temp=get_reg(i_regs->regmap,-1); - int real_th=th; - if(th<0&&opcode2[i]!=0x14) {th=temp;} // DSLLV doesn't need a temporary register - assert(sl>=0); - assert(sh>=0); - emit_andimm(shift,31,HOST_TEMPREG); - if(opcode2[i]==0x14) // DSLLV - { - if(th>=0) emit_shl(sh,HOST_TEMPREG,th); - emit_rsbimm(HOST_TEMPREG,32,HOST_TEMPREG); - emit_orrshr(sl,HOST_TEMPREG,th); - emit_andimm(shift,31,HOST_TEMPREG); - emit_testimm(shift,32); - emit_shl(sl,HOST_TEMPREG,tl); - if(th>=0) emit_cmovne_reg(tl,th); - emit_cmovne_imm(0,tl); - } - if(opcode2[i]==0x16) // DSRLV - { - assert(th>=0); - emit_shr(sl,HOST_TEMPREG,tl); - emit_rsbimm(HOST_TEMPREG,32,HOST_TEMPREG); - emit_orrshl(sh,HOST_TEMPREG,tl); - emit_andimm(shift,31,HOST_TEMPREG); - emit_testimm(shift,32); - emit_shr(sh,HOST_TEMPREG,th); - emit_cmovne_reg(th,tl); - if(real_th>=0) emit_cmovne_imm(0,th); - } - if(opcode2[i]==0x17) // DSRAV - { - assert(th>=0); - emit_shr(sl,HOST_TEMPREG,tl); - emit_rsbimm(HOST_TEMPREG,32,HOST_TEMPREG); - if(real_th>=0) { - assert(temp>=0); - emit_sarimm(th,31,temp); - } - emit_orrshl(sh,HOST_TEMPREG,tl); - emit_andimm(shift,31,HOST_TEMPREG); - emit_testimm(shift,32); - emit_sar(sh,HOST_TEMPREG,th); - emit_cmovne_reg(th,tl); - if(real_th>=0) emit_cmovne_reg(temp,th); - } - } - } - } - } -} - -static void speculate_mov(int rs,int rt) -{ - if(rt!=0) { - smrv_strong_next|=1<>rs1[i])&1) speculate_mov(rs1[i],rt1[i]); - else if((smrv_strong>>rs2[i])&1) speculate_mov(rs2[i],rt1[i]); - else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]); - else if((smrv_weak>>rs2[i])&1) speculate_mov_weak(rs2[i],rt1[i]); - else { - smrv_strong_next&=~(1<=0) { - if(get_final_value(hr,i,&value)) - smrv[rt1[i]]=value; - else smrv[rt1[i]]=constmap[i][hr]; - smrv_strong_next|=1<>rs1[i])&1) speculate_mov(rs1[i],rt1[i]); - else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]); - } - break; - case LOAD: - if(start<0x2000&&(rt1[i]==26||(smrv[rt1[i]]>>24)==0xa0)) { - // special 
case for BIOS - smrv[rt1[i]]=0xa0000000; - smrv_strong_next|=1<>r)&1),(smrv_weak>>r)&1,regs[i].isconst,regs[i].wasconst); -#endif -} - -enum { - MTYPE_8000 = 0, - MTYPE_8020, - MTYPE_0000, - MTYPE_A000, - MTYPE_1F80, -}; - -static int get_ptr_mem_type(u_int a) -{ - if(a < 0x00200000) { - if(a<0x1000&&((start>>20)==0xbfc||(start>>24)==0xa0)) - // return wrong, must use memhandler for BIOS self-test to pass - // 007 does similar stuff from a00 mirror, weird stuff - return MTYPE_8000; - return MTYPE_0000; - } - if(0x1f800000 <= a && a < 0x1f801000) - return MTYPE_1F80; - if(0x80200000 <= a && a < 0x80800000) - return MTYPE_8020; - if(0xa0000000 <= a && a < 0xa0200000) - return MTYPE_A000; - return MTYPE_8000; -} - -static int emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override) -{ - int jaddr=0,type=0; - int mr=rs1[i]; - if(((smrv_strong|smrv_weak)>>mr)&1) { - type=get_ptr_mem_type(smrv[mr]); - //printf("set %08x @%08x r%d %d\n", smrv[mr], start+i*4, mr, type); - } - else { - // use the mirror we are running on - type=get_ptr_mem_type(start); - //printf("set nospec @%08x r%d %d\n", start+i*4, mr, type); - } - - if(type==MTYPE_8020) { // RAM 80200000+ mirror - emit_andimm(addr,~0x00e00000,HOST_TEMPREG); - addr=*addr_reg_override=HOST_TEMPREG; - type=0; - } - else if(type==MTYPE_0000) { // RAM 0 mirror - emit_orimm(addr,0x80000000,HOST_TEMPREG); - addr=*addr_reg_override=HOST_TEMPREG; - type=0; - } - else if(type==MTYPE_A000) { // RAM A mirror - emit_andimm(addr,~0x20000000,HOST_TEMPREG); - addr=*addr_reg_override=HOST_TEMPREG; - type=0; - } - else if(type==MTYPE_1F80) { // scratchpad - if (psxH == (void *)0x1f800000) { - emit_addimm(addr,-0x1f800000,HOST_TEMPREG); - emit_cmpimm(HOST_TEMPREG,0x1000); - jaddr=(int)out; - emit_jc(0); - } - else { - // do usual RAM check, jump will go to the right handler - type=0; - } - } - - if(type==0) - { - emit_cmpimm(addr,RAM_SIZE); - jaddr=(int)out; - #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK - // Hint to branch predictor 
that the branch is unlikely to be taken - if(rs1[i]>=28) - emit_jno_unlikely(0); - else - #endif - emit_jno(0); - if(ram_offset!=0) { - emit_addimm(addr,ram_offset,HOST_TEMPREG); - addr=*addr_reg_override=HOST_TEMPREG; - } - } - - return jaddr; -} - -#define shift_assemble shift_assemble_arm - -static void loadlr_assemble_arm(int i,struct regstat *i_regs) -{ - int s,th,tl,temp,temp2,addr,map=-1; - int offset; - int jaddr=0; - int memtarget=0,c=0; - int fastload_reg_override=0; - u_int hr,reglist=0; - th=get_reg(i_regs->regmap,rt1[i]|64); - tl=get_reg(i_regs->regmap,rt1[i]); - s=get_reg(i_regs->regmap,rs1[i]); - temp=get_reg(i_regs->regmap,-1); - temp2=get_reg(i_regs->regmap,FTEMP); - addr=get_reg(i_regs->regmap,AGEN1+(i&1)); - assert(addr<0); - offset=imm[i]; - for(hr=0;hrregmap[hr]>=0) reglist|=1<=0) { - c=(i_regs->wasconst>>s)&1; - if(c) { - memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE; - } - } - if(!c) { - #ifdef RAM_OFFSET - map=get_reg(i_regs->regmap,ROREG); - if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG); - #endif - emit_shlimm(addr,3,temp); - if (opcode[i]==0x22||opcode[i]==0x26) { - emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR - }else{ - emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR - } - jaddr=emit_fastpath_cmp_jump(i,temp2,&fastload_reg_override); - } - else { - if(ram_offset&&memtarget) { - emit_addimm(temp2,ram_offset,HOST_TEMPREG); - fastload_reg_override=HOST_TEMPREG; - } - if (opcode[i]==0x22||opcode[i]==0x26) { - emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR - }else{ - emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR - } - } - if (opcode[i]==0x22||opcode[i]==0x26) { // LWL/LWR - if(!c||memtarget) { - int a=temp2; - if(fastload_reg_override) a=fastload_reg_override; - //emit_readword_indexed((int)rdram-0x80000000,temp2,temp2); - emit_readword_indexed_tlb(0,a,map,temp2); - if(jaddr) add_stub(LOADW_STUB,jaddr,(int)out,i,temp2,(int)i_regs,ccadj[i],reglist); - } - else - 
inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj[i],reglist); - if(rt1[i]) { - assert(tl>=0); - emit_andimm(temp,24,temp); -#ifdef BIG_ENDIAN_MIPS - if (opcode[i]==0x26) // LWR -#else - if (opcode[i]==0x22) // LWL -#endif - emit_xorimm(temp,24,temp); - emit_movimm(-1,HOST_TEMPREG); - if (opcode[i]==0x26) { - emit_shr(temp2,temp,temp2); - emit_bic_lsr(tl,HOST_TEMPREG,temp,tl); - }else{ - emit_shl(temp2,temp,temp2); - emit_bic_lsl(tl,HOST_TEMPREG,temp,tl); - } - emit_or(temp2,tl,tl); - } - //emit_storereg(rt1[i],tl); // DEBUG - } - if (opcode[i]==0x1A||opcode[i]==0x1B) { // LDL/LDR - // FIXME: little endian, fastload_reg_override - int temp2h=get_reg(i_regs->regmap,FTEMP|64); - if(!c||memtarget) { - //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,temp2,temp2h); - //emit_readword_indexed((int)rdram-0x7FFFFFFC,temp2,temp2); - emit_readdword_indexed_tlb(0,temp2,map,temp2h,temp2); - if(jaddr) add_stub(LOADD_STUB,jaddr,(int)out,i,temp2,(int)i_regs,ccadj[i],reglist); - } - else - inline_readstub(LOADD_STUB,i,(constmap[i][s]+offset)&0xFFFFFFF8,i_regs->regmap,FTEMP,ccadj[i],reglist); - if(rt1[i]) { - assert(th>=0); - assert(tl>=0); - emit_testimm(temp,32); - emit_andimm(temp,24,temp); - if (opcode[i]==0x1A) { // LDL - emit_rsbimm(temp,32,HOST_TEMPREG); - emit_shl(temp2h,temp,temp2h); - emit_orrshr(temp2,HOST_TEMPREG,temp2h); - emit_movimm(-1,HOST_TEMPREG); - emit_shl(temp2,temp,temp2); - emit_cmove_reg(temp2h,th); - emit_biceq_lsl(tl,HOST_TEMPREG,temp,tl); - emit_bicne_lsl(th,HOST_TEMPREG,temp,th); - emit_orreq(temp2,tl,tl); - emit_orrne(temp2,th,th); - } - if (opcode[i]==0x1B) { // LDR - emit_xorimm(temp,24,temp); - emit_rsbimm(temp,32,HOST_TEMPREG); - emit_shr(temp2,temp,temp2); - emit_orrshl(temp2h,HOST_TEMPREG,temp2); - emit_movimm(-1,HOST_TEMPREG); - emit_shr(temp2h,temp,temp2h); - emit_cmovne_reg(temp2,tl); - emit_bicne_lsr(th,HOST_TEMPREG,temp,th); - emit_biceq_lsr(tl,HOST_TEMPREG,temp,tl); - emit_orrne(temp2h,th,th); 
- emit_orreq(temp2h,tl,tl); - } - } - } -} -#define loadlr_assemble loadlr_assemble_arm - -static void cop0_assemble(int i,struct regstat *i_regs) -{ - if(opcode2[i]==0) // MFC0 - { - signed char t=get_reg(i_regs->regmap,rt1[i]); - char copr=(source[i]>>11)&0x1f; - //assert(t>=0); // Why does this happen? OOT is weird - if(t>=0&&rt1[i]!=0) { - emit_readword((int)®_cop0+copr*4,t); - } - } - else if(opcode2[i]==4) // MTC0 - { - signed char s=get_reg(i_regs->regmap,rs1[i]); - char copr=(source[i]>>11)&0x1f; - assert(s>=0); - wb_register(rs1[i],i_regs->regmap,i_regs->dirty,i_regs->is32); - if(copr==9||copr==11||copr==12||copr==13) { - emit_readword((int)&last_count,HOST_TEMPREG); - emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc - emit_add(HOST_CCREG,HOST_TEMPREG,HOST_CCREG); - emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); - emit_writeword(HOST_CCREG,(int)&Count); - } - // What a mess. The status register (12) can enable interrupts, - // so needs a special case to handle a pending interrupt. - // The interrupt must be taken immediately, because a subsequent - // instruction might disable interrupts again. - if(copr==12||copr==13) { - if (is_delayslot) { - // burn cycles to cause cc_interrupt, which will - // reschedule next_interupt. Relies on CCREG from above. 
- assem_debug("MTC0 DS %d\n", copr); - emit_writeword(HOST_CCREG,(int)&last_count); - emit_movimm(0,HOST_CCREG); - emit_storereg(CCREG,HOST_CCREG); - emit_loadreg(rs1[i],1); - emit_movimm(copr,0); - emit_call((int)pcsx_mtc0_ds); - emit_loadreg(rs1[i],s); - return; - } - emit_movimm(start+i*4+4,HOST_TEMPREG); - emit_writeword(HOST_TEMPREG,(int)&pcaddr); - emit_movimm(0,HOST_TEMPREG); - emit_writeword(HOST_TEMPREG,(int)&pending_exception); - } - //else if(copr==12&&is_delayslot) emit_call((int)MTC0_R12); - //else - if(s==HOST_CCREG) - emit_loadreg(rs1[i],1); - else if(s!=1) - emit_mov(s,1); - emit_movimm(copr,0); - emit_call((int)pcsx_mtc0); - if(copr==9||copr==11||copr==12||copr==13) { - emit_readword((int)&Count,HOST_CCREG); - emit_readword((int)&next_interupt,HOST_TEMPREG); - emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[i]),HOST_CCREG); - emit_sub(HOST_CCREG,HOST_TEMPREG,HOST_CCREG); - emit_writeword(HOST_TEMPREG,(int)&last_count); - emit_storereg(CCREG,HOST_CCREG); - } - if(copr==12||copr==13) { - assert(!is_delayslot); - emit_readword((int)&pending_exception,14); - emit_test(14,14); - emit_jne((int)&do_interrupt); - } - emit_loadreg(rs1[i],s); - if(get_reg(i_regs->regmap,rs1[i]|64)>=0) - emit_loadreg(rs1[i]|64,get_reg(i_regs->regmap,rs1[i]|64)); - cop1_usable=0; - } - else - { - assert(opcode2[i]==0x10); - if((source[i]&0x3f)==0x10) // RFE - { - emit_readword((int)&Status,0); - emit_andimm(0,0x3c,1); - emit_andimm(0,~0xf,0); - emit_orrshr_imm(1,2,0); - emit_writeword(0,(int)&Status); - } - } -} - -static void cop2_get_dreg(u_int copr,signed char tl,signed char temp) -{ - switch (copr) { - case 1: - case 3: - case 5: - case 8: - case 9: - case 10: - case 11: - emit_readword((int)®_cop2d[copr],tl); - emit_signextend16(tl,tl); - emit_writeword(tl,(int)®_cop2d[copr]); // hmh - break; - case 7: - case 16: - case 17: - case 18: - case 19: - emit_readword((int)®_cop2d[copr],tl); - emit_andimm(tl,0xffff,tl); - emit_writeword(tl,(int)®_cop2d[copr]); - break; - case 15: - 
emit_readword((int)®_cop2d[14],tl); // SXY2 - emit_writeword(tl,(int)®_cop2d[copr]); - break; - case 28: - case 29: - emit_readword((int)®_cop2d[9],temp); - emit_testimm(temp,0x8000); // do we need this? - emit_andimm(temp,0xf80,temp); - emit_andne_imm(temp,0,temp); - emit_shrimm(temp,7,tl); - emit_readword((int)®_cop2d[10],temp); - emit_testimm(temp,0x8000); - emit_andimm(temp,0xf80,temp); - emit_andne_imm(temp,0,temp); - emit_orrshr_imm(temp,2,tl); - emit_readword((int)®_cop2d[11],temp); - emit_testimm(temp,0x8000); - emit_andimm(temp,0xf80,temp); - emit_andne_imm(temp,0,temp); - emit_orrshl_imm(temp,3,tl); - emit_writeword(tl,(int)®_cop2d[copr]); - break; - default: - emit_readword((int)®_cop2d[copr],tl); - break; - } -} - -static void cop2_put_dreg(u_int copr,signed char sl,signed char temp) -{ - switch (copr) { - case 15: - emit_readword((int)®_cop2d[13],temp); // SXY1 - emit_writeword(sl,(int)®_cop2d[copr]); - emit_writeword(temp,(int)®_cop2d[12]); // SXY0 - emit_readword((int)®_cop2d[14],temp); // SXY2 - emit_writeword(sl,(int)®_cop2d[14]); - emit_writeword(temp,(int)®_cop2d[13]); // SXY1 - break; - case 28: - emit_andimm(sl,0x001f,temp); - emit_shlimm(temp,7,temp); - emit_writeword(temp,(int)®_cop2d[9]); - emit_andimm(sl,0x03e0,temp); - emit_shlimm(temp,2,temp); - emit_writeword(temp,(int)®_cop2d[10]); - emit_andimm(sl,0x7c00,temp); - emit_shrimm(temp,3,temp); - emit_writeword(temp,(int)®_cop2d[11]); - emit_writeword(sl,(int)®_cop2d[28]); - break; - case 30: - emit_movs(sl,temp); - emit_mvnmi(temp,temp); -#ifdef HAVE_ARMV5 - emit_clz(temp,temp); -#else - emit_movs(temp,HOST_TEMPREG); - emit_movimm(0,temp); - emit_jeq((int)out+4*4); - emit_addpl_imm(temp,1,temp); - emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG); - emit_jns((int)out-2*4); -#endif - emit_writeword(sl,(int)®_cop2d[30]); - emit_writeword(temp,(int)®_cop2d[31]); - break; - case 31: - break; - default: - emit_writeword(sl,(int)®_cop2d[copr]); - break; - } -} - -static void cop2_assemble(int i,struct 
regstat *i_regs) -{ - u_int copr=(source[i]>>11)&0x1f; - signed char temp=get_reg(i_regs->regmap,-1); - if (opcode2[i]==0) { // MFC2 - signed char tl=get_reg(i_regs->regmap,rt1[i]); - if(tl>=0&&rt1[i]!=0) - cop2_get_dreg(copr,tl,temp); - } - else if (opcode2[i]==4) { // MTC2 - signed char sl=get_reg(i_regs->regmap,rs1[i]); - cop2_put_dreg(copr,sl,temp); - } - else if (opcode2[i]==2) // CFC2 - { - signed char tl=get_reg(i_regs->regmap,rt1[i]); - if(tl>=0&&rt1[i]!=0) - emit_readword((int)®_cop2c[copr],tl); - } - else if (opcode2[i]==6) // CTC2 - { - signed char sl=get_reg(i_regs->regmap,rs1[i]); - switch(copr) { - case 4: - case 12: - case 20: - case 26: - case 27: - case 29: - case 30: - emit_signextend16(sl,temp); - break; - case 31: - //value = value & 0x7ffff000; - //if (value & 0x7f87e000) value |= 0x80000000; - emit_shrimm(sl,12,temp); - emit_shlimm(temp,12,temp); - emit_testimm(temp,0x7f000000); - emit_testeqimm(temp,0x00870000); - emit_testeqimm(temp,0x0000e000); - emit_orrne_imm(temp,0x80000000,temp); - break; - default: - temp=sl; - break; - } - emit_writeword(temp,(int)®_cop2c[copr]); - assert(sl>=0); - } -} - -static void c2op_prologue(u_int op,u_int reglist) -{ - save_regs_all(reglist); -#ifdef PCNT - emit_movimm(op,0); - emit_call((int)pcnt_gte_start); -#endif - emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0); // cop2 regs -} - -static void c2op_epilogue(u_int op,u_int reglist) -{ -#ifdef PCNT - emit_movimm(op,0); - emit_call((int)pcnt_gte_end); -#endif - restore_regs_all(reglist); -} - -static void c2op_call_MACtoIR(int lm,int need_flags) -{ - if(need_flags) - emit_call((int)(lm?gteMACtoIR_lm1:gteMACtoIR_lm0)); - else - emit_call((int)(lm?gteMACtoIR_lm1_nf:gteMACtoIR_lm0_nf)); -} - -static void c2op_call_rgb_func(void *func,int lm,int need_ir,int need_flags) -{ - emit_call((int)func); - // func is C code and trashes r0 - emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0); - if(need_flags||need_ir) - 
c2op_call_MACtoIR(lm,need_flags); - emit_call((int)(need_flags?gteMACtoRGB:gteMACtoRGB_nf)); -} - -static void c2op_assemble(int i,struct regstat *i_regs) -{ - u_int c2op=source[i]&0x3f; - u_int hr,reglist_full=0,reglist; - int need_flags,need_ir; - for(hr=0;hrregmap[hr]>=0) reglist_full|=1<>63); // +1 because of how liveness detection works - need_ir=(gte_unneeded[i+1]&0xe00)!=0xe00; - assem_debug("gte op %08x, unneeded %016llx, need_flags %d, need_ir %d\n", - source[i],gte_unneeded[i+1],need_flags,need_ir); - if(new_dynarec_hacks&NDHACK_GTE_NO_FLAGS) - need_flags=0; - int shift = (source[i] >> 19) & 1; - int lm = (source[i] >> 10) & 1; - switch(c2op) { -#ifndef DRC_DBG - case GTE_MVMVA: { -#ifdef HAVE_ARMV5 - int v = (source[i] >> 15) & 3; - int cv = (source[i] >> 13) & 3; - int mx = (source[i] >> 17) & 3; - reglist=reglist_full&(CALLER_SAVE_REGS|0xf0); // +{r4-r7} - c2op_prologue(c2op,reglist); - /* r4,r5 = VXYZ(v) packed; r6 = &MX11(mx); r7 = &CV1(cv) */ - if(v<3) - emit_ldrd(v*8,0,4); - else { - emit_movzwl_indexed(9*4,0,4); // gteIR - emit_movzwl_indexed(10*4,0,6); - emit_movzwl_indexed(11*4,0,5); - emit_orrshl_imm(6,16,4); - } - if(mx<3) - emit_addimm(0,32*4+mx*8*4,6); - else - emit_readword((int)&zeromem_ptr,6); - if(cv<3) - emit_addimm(0,32*4+(cv*8+5)*4,7); - else - emit_readword((int)&zeromem_ptr,7); -#ifdef __ARM_NEON__ - emit_movimm(source[i],1); // opcode - emit_call((int)gteMVMVA_part_neon); - if(need_flags) { - emit_movimm(lm,1); - emit_call((int)gteMACtoIR_flags_neon); - } -#else - if(cv==3&&shift) - emit_call((int)gteMVMVA_part_cv3sh12_arm); - else { - emit_movimm(shift,1); - emit_call((int)(need_flags?gteMVMVA_part_arm:gteMVMVA_part_nf_arm)); - } - if(need_flags||need_ir) - c2op_call_MACtoIR(lm,need_flags); -#endif -#else /* if not HAVE_ARMV5 */ - c2op_prologue(c2op,reglist); - emit_movimm(source[i],1); // opcode - emit_writeword(1,(int)&psxRegs.code); - emit_call((int)(need_flags?gte_handlers[c2op]:gte_handlers_nf[c2op])); -#endif - break; - } - 
case GTE_OP: - c2op_prologue(c2op,reglist); - emit_call((int)(shift?gteOP_part_shift:gteOP_part_noshift)); - if(need_flags||need_ir) { - emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0); - c2op_call_MACtoIR(lm,need_flags); - } - break; - case GTE_DPCS: - c2op_prologue(c2op,reglist); - c2op_call_rgb_func(shift?gteDPCS_part_shift:gteDPCS_part_noshift,lm,need_ir,need_flags); - break; - case GTE_INTPL: - c2op_prologue(c2op,reglist); - c2op_call_rgb_func(shift?gteINTPL_part_shift:gteINTPL_part_noshift,lm,need_ir,need_flags); - break; - case GTE_SQR: - c2op_prologue(c2op,reglist); - emit_call((int)(shift?gteSQR_part_shift:gteSQR_part_noshift)); - if(need_flags||need_ir) { - emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0); - c2op_call_MACtoIR(lm,need_flags); - } - break; - case GTE_DCPL: - c2op_prologue(c2op,reglist); - c2op_call_rgb_func(gteDCPL_part,lm,need_ir,need_flags); - break; - case GTE_GPF: - c2op_prologue(c2op,reglist); - c2op_call_rgb_func(shift?gteGPF_part_shift:gteGPF_part_noshift,lm,need_ir,need_flags); - break; - case GTE_GPL: - c2op_prologue(c2op,reglist); - c2op_call_rgb_func(shift?gteGPL_part_shift:gteGPL_part_noshift,lm,need_ir,need_flags); - break; -#endif - default: - c2op_prologue(c2op,reglist); -#ifdef DRC_DBG - emit_movimm(source[i],1); // opcode - emit_writeword(1,(int)&psxRegs.code); -#endif - emit_call((int)(need_flags?gte_handlers[c2op]:gte_handlers_nf[c2op])); - break; - } - c2op_epilogue(c2op,reglist); - } -} - -static void cop1_unusable(int i,struct regstat *i_regs) -{ - // XXX: should just just do the exception instead - if(!cop1_usable) { - int jaddr=(int)out; - emit_jmp(0); - add_stub(FP_STUB,jaddr,(int)out,i,0,(int)i_regs,is_delayslot,0); - cop1_usable=1; - } -} - -static void cop1_assemble(int i,struct regstat *i_regs) -{ - cop1_unusable(i, i_regs); -} - -static void fconv_assemble_arm(int i,struct regstat *i_regs) -{ - cop1_unusable(i, i_regs); -} -#define fconv_assemble fconv_assemble_arm - -static void 
fcomp_assemble(int i,struct regstat *i_regs) -{ - cop1_unusable(i, i_regs); -} - -static void float_assemble(int i,struct regstat *i_regs) -{ - cop1_unusable(i, i_regs); -} - -static void multdiv_assemble_arm(int i,struct regstat *i_regs) -{ - // case 0x18: MULT - // case 0x19: MULTU - // case 0x1A: DIV - // case 0x1B: DIVU - // case 0x1C: DMULT - // case 0x1D: DMULTU - // case 0x1E: DDIV - // case 0x1F: DDIVU - if(rs1[i]&&rs2[i]) - { - if((opcode2[i]&4)==0) // 32-bit - { - if(opcode2[i]==0x18) // MULT - { - signed char m1=get_reg(i_regs->regmap,rs1[i]); - signed char m2=get_reg(i_regs->regmap,rs2[i]); - signed char hi=get_reg(i_regs->regmap,HIREG); - signed char lo=get_reg(i_regs->regmap,LOREG); - assert(m1>=0); - assert(m2>=0); - assert(hi>=0); - assert(lo>=0); - emit_smull(m1,m2,hi,lo); - } - if(opcode2[i]==0x19) // MULTU - { - signed char m1=get_reg(i_regs->regmap,rs1[i]); - signed char m2=get_reg(i_regs->regmap,rs2[i]); - signed char hi=get_reg(i_regs->regmap,HIREG); - signed char lo=get_reg(i_regs->regmap,LOREG); - assert(m1>=0); - assert(m2>=0); - assert(hi>=0); - assert(lo>=0); - emit_umull(m1,m2,hi,lo); - } - if(opcode2[i]==0x1A) // DIV - { - signed char d1=get_reg(i_regs->regmap,rs1[i]); - signed char d2=get_reg(i_regs->regmap,rs2[i]); - assert(d1>=0); - assert(d2>=0); - signed char quotient=get_reg(i_regs->regmap,LOREG); - signed char remainder=get_reg(i_regs->regmap,HIREG); - assert(quotient>=0); - assert(remainder>=0); - emit_movs(d1,remainder); - emit_movimm(0xffffffff,quotient); - emit_negmi(quotient,quotient); // .. quotient and .. - emit_negmi(remainder,remainder); // .. 
remainder for div0 case (will be negated back after jump) - emit_movs(d2,HOST_TEMPREG); - emit_jeq((int)out+52); // Division by zero - emit_negsmi(HOST_TEMPREG,HOST_TEMPREG); -#ifdef HAVE_ARMV5 - emit_clz(HOST_TEMPREG,quotient); - emit_shl(HOST_TEMPREG,quotient,HOST_TEMPREG); -#else - emit_movimm(0,quotient); - emit_addpl_imm(quotient,1,quotient); - emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG); - emit_jns((int)out-2*4); -#endif - emit_orimm(quotient,1<<31,quotient); - emit_shr(quotient,quotient,quotient); - emit_cmp(remainder,HOST_TEMPREG); - emit_subcs(remainder,HOST_TEMPREG,remainder); - emit_adcs(quotient,quotient,quotient); - emit_shrimm(HOST_TEMPREG,1,HOST_TEMPREG); - emit_jcc((int)out-16); // -4 - emit_teq(d1,d2); - emit_negmi(quotient,quotient); - emit_test(d1,d1); - emit_negmi(remainder,remainder); - } - if(opcode2[i]==0x1B) // DIVU - { - signed char d1=get_reg(i_regs->regmap,rs1[i]); // dividend - signed char d2=get_reg(i_regs->regmap,rs2[i]); // divisor - assert(d1>=0); - assert(d2>=0); - signed char quotient=get_reg(i_regs->regmap,LOREG); - signed char remainder=get_reg(i_regs->regmap,HIREG); - assert(quotient>=0); - assert(remainder>=0); - emit_mov(d1,remainder); - emit_movimm(0xffffffff,quotient); // div0 case - emit_test(d2,d2); - emit_jeq((int)out+40); // Division by zero -#ifdef HAVE_ARMV5 - emit_clz(d2,HOST_TEMPREG); - emit_movimm(1<<31,quotient); - emit_shl(d2,HOST_TEMPREG,d2); -#else - emit_movimm(0,HOST_TEMPREG); - emit_addpl_imm(HOST_TEMPREG,1,HOST_TEMPREG); - emit_lslpls_imm(d2,1,d2); - emit_jns((int)out-2*4); - emit_movimm(1<<31,quotient); -#endif - emit_shr(quotient,HOST_TEMPREG,quotient); - emit_cmp(remainder,d2); - emit_subcs(remainder,d2,remainder); - emit_adcs(quotient,quotient,quotient); - emit_shrcc_imm(d2,1,d2); - emit_jcc((int)out-16); // -4 - } - } - else // 64-bit - assert(0); - } - else - { - // Multiply by zero is zero. - // MIPS does not have a divide by zero exception. - // The result is undefined, we return zero. 
- signed char hr=get_reg(i_regs->regmap,HIREG); - signed char lr=get_reg(i_regs->regmap,LOREG); - if(hr>=0) emit_zeroreg(hr); - if(lr>=0) emit_zeroreg(lr); - } -} -#define multdiv_assemble multdiv_assemble_arm - -static void do_preload_rhash(int r) { - // Don't need this for ARM. On x86, this puts the value 0xf8 into the - // register. On ARM the hash can be done with a single instruction (below) -} - -static void do_preload_rhtbl(int ht) { - emit_addimm(FP,(int)&mini_ht-(int)&dynarec_local,ht); -} - -static void do_rhash(int rs,int rh) { - emit_andimm(rs,0xf8,rh); -} - -static void do_miniht_load(int ht,int rh) { - assem_debug("ldr %s,[%s,%s]!\n",regname[rh],regname[ht],regname[rh]); - output_w32(0xe7b00000|rd_rn_rm(rh,ht,rh)); -} - -static void do_miniht_jump(int rs,int rh,int ht) { - emit_cmp(rh,rs); - emit_ldreq_indexed(ht,4,15); - #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK - emit_mov(rs,7); - emit_jmp(jump_vaddr_reg[7]); - #else - emit_jmp(jump_vaddr_reg[rs]); - #endif -} - -static void do_miniht_insert(u_int return_address,int rt,int temp) { - #ifndef HAVE_ARMV7 - emit_movimm(return_address,rt); // PC into link register - add_to_linker((int)out,return_address,1); - emit_pcreladdr(temp); - emit_writeword(rt,(int)&mini_ht[(return_address&0xFF)>>3][0]); - emit_writeword(temp,(int)&mini_ht[(return_address&0xFF)>>3][1]); - #else - emit_movw(return_address&0x0000FFFF,rt); - add_to_linker((int)out,return_address,1); - emit_pcreladdr(temp); - emit_writeword(temp,(int)&mini_ht[(return_address&0xFF)>>3][1]); - emit_movt(return_address&0xFFFF0000,rt); - emit_writeword(rt,(int)&mini_ht[(return_address&0xFF)>>3][0]); - #endif -} - -static void wb_valid(signed char pre[],signed char entry[],u_int dirty_pre,u_int dirty,uint64_t is32_pre,uint64_t u,uint64_t uu) -{ - //if(dirty_pre==dirty) return; - int hr,reg; - for(hr=0;hr>(reg&63))&1) { - if(reg>0) { - if(((dirty_pre&~dirty)>>hr)&1) { - if(reg>0&®<34) { - emit_storereg(reg,hr); - if( ((is32_pre&~uu)>>reg)&1 ) { - 
emit_sarimm(hr,31,HOST_TEMPREG); - emit_storereg(reg|64,HOST_TEMPREG); - } - } - else if(reg>=64) { - emit_storereg(reg,hr); - } - } - } - } - } - } -} - - -/* using strd could possibly help but you'd have to allocate registers in pairs -static void wb_invalidate_arm(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,uint64_t u,uint64_t uu) -{ - int hr; - int wrote=-1; - for(hr=HOST_REGS-1;hr>=0;hr--) { - if(hr!=EXCLUDE_REG) { - if(pre[hr]!=entry[hr]) { - if(pre[hr]>=0) { - if((dirty>>hr)&1) { - if(get_reg(entry,pre[hr])<0) { - if(pre[hr]<64) { - if(!((u>>pre[hr])&1)) { - if(hr<10&&(~hr&1)&&(pre[hr+1]<0||wrote==hr+1)) { - if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) { - emit_sarimm(hr,31,hr+1); - emit_strdreg(pre[hr],hr); - } - else - emit_storereg(pre[hr],hr); - }else{ - emit_storereg(pre[hr],hr); - if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) { - emit_sarimm(hr,31,hr); - emit_storereg(pre[hr]|64,hr); - } - } - } - }else{ - if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) { - emit_storereg(pre[hr],hr); - } - } - wrote=hr; - } - } - } - } - } - } - for(hr=0;hr=0) { - int nr; - if((nr=get_reg(entry,pre[hr]))>=0) { - emit_mov(hr,nr); - } - } - } - } - } -} -#define wb_invalidate wb_invalidate_arm -*/ - -static void mark_clear_cache(void *target) -{ - u_long offset = (char *)target - (char *)BASE_ADDR; - u_int mask = 1u << ((offset >> 12) & 31); - if (!(needs_clear_cache[offset >> 17] & mask)) { - char *start = (char *)((u_long)target & ~4095ul); - start_tcache_write(start, start + 4096); - needs_clear_cache[offset >> 17] |= mask; - } -} - -// Clearing the cache is rather slow on ARM Linux, so mark the areas -// that need to be cleared, and then only clear these areas once. 
-static void do_clear_cache() -{ - int i,j; - for (i=0;i<(1<<(TARGET_SIZE_2-17));i++) - { - u_int bitmap=needs_clear_cache[i]; - if(bitmap) { - u_int start,end; - for(j=0;j<32;j++) - { - if(bitmap&(1<>2)|0xF00; + //printf("target=%x addr=%p insn=%x\n",target,addr,*ptr2); + } + else if(ptr[3]==0x72) { + // generated by emit_jno_unlikely + if((target-(u_int)ptr2-8)<1024) { + assert(((uintptr_t)addr&3)==0); + assert((target&3)==0); + *ptr2=(*ptr2&0xFFFFF000)|((target-(u_int)ptr2-8)>>2)|0xF00; + } + else if((target-(u_int)ptr2-8)<4096&&!((target-(u_int)ptr2-8)&15)) { + assert(((uintptr_t)addr&3)==0); + assert((target&3)==0); + *ptr2=(*ptr2&0xFFFFF000)|((target-(u_int)ptr2-8)>>4)|0xE00; + } + else *ptr2=(0x7A000000)|(((target-(u_int)ptr2-8)<<6)>>8); + } + else { + assert((ptr[3]&0x0e)==0xa); + *ptr2=(*ptr2&0xFF000000)|(((target-(u_int)ptr2-8)<<6)>>8); + } +} + +// This optionally copies the instruction from the target of the branch into +// the space before the branch. Works, but the difference in speed is +// usually insignificant. 
+#if 0 +static void set_jump_target_fillslot(int addr,u_int target,int copy) +{ + u_char *ptr=(u_char *)addr; + u_int *ptr2=(u_int *)ptr; + assert(!copy||ptr2[-1]==0xe28dd000); + if(ptr[3]==0xe2) { + assert(!copy); + assert((target-(u_int)ptr2-8)<4096); + *ptr2=(*ptr2&0xFFFFF000)|(target-(u_int)ptr2-8); + } + else { + assert((ptr[3]&0x0e)==0xa); + u_int target_insn=*(u_int *)target; + if((target_insn&0x0e100000)==0) { // ALU, no immediate, no flags + copy=0; + } + if((target_insn&0x0c100000)==0x04100000) { // Load + copy=0; + } + if(target_insn&0x08000000) { + copy=0; + } + if(copy) { + ptr2[-1]=target_insn; + target+=4; + } + *ptr2=(*ptr2&0xFF000000)|(((target-(u_int)ptr2-8)<<6)>>8); + } +} +#endif + +/* Literal pool */ +static void add_literal(int addr,int val) +{ + assert(literalcount>6)+8; +} + +// Find the "clean" entry point from a "dirty" entry point +// by skipping past the call to verify_code +static void *get_clean_addr(void *addr) +{ + signed int *ptr = addr; + #ifndef HAVE_ARMV7 + ptr+=4; + #else + ptr+=6; + #endif + if((*ptr&0xFF000000)!=0xeb000000) ptr++; + assert((*ptr&0xFF000000)==0xeb000000); // bl instruction + ptr++; + if((*ptr&0xFF000000)==0xea000000) { + return (char *)ptr+((*ptr<<8)>>6)+8; // follow jump + } + return ptr; +} + +static int verify_dirty(const u_int *ptr) +{ + #ifndef HAVE_ARMV7 + u_int offset; + // get from literal pool + assert((*ptr&0xFFFF0000)==0xe59f0000); + offset=*ptr&0xfff; + u_int source=*(u_int*)((void *)ptr+offset+8); + ptr++; + assert((*ptr&0xFFFF0000)==0xe59f0000); + offset=*ptr&0xfff; + u_int copy=*(u_int*)((void *)ptr+offset+8); + ptr++; + assert((*ptr&0xFFFF0000)==0xe59f0000); + offset=*ptr&0xfff; + u_int len=*(u_int*)((void *)ptr+offset+8); + ptr++; + ptr++; + #else + // ARMv7 movw/movt + assert((*ptr&0xFFF00000)==0xe3000000); + u_int source=(ptr[0]&0xFFF)+((ptr[0]>>4)&0xF000)+((ptr[2]<<16)&0xFFF0000)+((ptr[2]<<12)&0xF0000000); + u_int 
copy=(ptr[1]&0xFFF)+((ptr[1]>>4)&0xF000)+((ptr[3]<<16)&0xFFF0000)+((ptr[3]<<12)&0xF0000000); + u_int len=(ptr[4]&0xFFF)+((ptr[4]>>4)&0xF000); + ptr+=6; + #endif + if((*ptr&0xFF000000)!=0xeb000000) ptr++; + assert((*ptr&0xFF000000)==0xeb000000); // bl instruction + //printf("verify_dirty: %x %x %x\n",source,copy,len); + return !memcmp((void *)source,(void *)copy,len); +} + +// This doesn't necessarily find all clean entry points, just +// guarantees that it's not dirty +static int isclean(void *addr) +{ + #ifndef HAVE_ARMV7 + u_int *ptr=((u_int *)addr)+4; + #else + u_int *ptr=((u_int *)addr)+6; + #endif + if((*ptr&0xFF000000)!=0xeb000000) ptr++; + if((*ptr&0xFF000000)!=0xeb000000) return 1; // bl instruction + if((int)ptr+((*ptr<<8)>>6)+8==(int)verify_code) return 0; + if((int)ptr+((*ptr<<8)>>6)+8==(int)verify_code_ds) return 0; + return 1; +} + +// get source that block at addr was compiled from (host pointers) +static void get_bounds(void *addr, u_char **start, u_char **end) +{ + u_int *ptr = addr; + #ifndef HAVE_ARMV7 + u_int offset; + // get from literal pool + assert((*ptr&0xFFFF0000)==0xe59f0000); + offset=*ptr&0xfff; + u_int source=*(u_int*)((void *)ptr+offset+8); + ptr++; + //assert((*ptr&0xFFFF0000)==0xe59f0000); + //offset=*ptr&0xfff; + //u_int copy=*(u_int*)((void *)ptr+offset+8); + ptr++; + assert((*ptr&0xFFFF0000)==0xe59f0000); + offset=*ptr&0xfff; + u_int len=*(u_int*)((void *)ptr+offset+8); + ptr++; + ptr++; + #else + // ARMv7 movw/movt + assert((*ptr&0xFFF00000)==0xe3000000); + u_int source=(ptr[0]&0xFFF)+((ptr[0]>>4)&0xF000)+((ptr[2]<<16)&0xFFF0000)+((ptr[2]<<12)&0xF0000000); + //u_int copy=(ptr[1]&0xFFF)+((ptr[1]>>4)&0xF000)+((ptr[3]<<16)&0xFFF0000)+((ptr[3]<<12)&0xF0000000); + u_int len=(ptr[4]&0xFFF)+((ptr[4]>>4)&0xF000); + ptr+=6; + #endif + if((*ptr&0xFF000000)!=0xeb000000) ptr++; + assert((*ptr&0xFF000000)==0xeb000000); // bl instruction + *start=(u_char *)source; + *end=(u_char *)source+len; +} + +// Allocate a specific ARM register. 
+static void alloc_arm_reg(struct regstat *cur,int i,signed char reg,int hr) +{ + int n; + int dirty=0; + + // see if it's already allocated (and dealloc it) + for(n=0;nregmap[n]==reg) { + dirty=(cur->dirty>>n)&1; + cur->regmap[n]=-1; + } + } + + cur->regmap[hr]=reg; + cur->dirty&=~(1<dirty|=dirty<isconst&=~(1<0) + { + if(imm<256) { + *encoded=((i&30)<<7)|imm; + return 1; + } + imm=(imm>>2)|(imm<<30);i-=2; + } + return 0; +} + +static void genimm_checked(u_int imm,u_int *encoded) +{ + u_int ret=genimm(imm,encoded); + assert(ret); + (void)ret; +} + +static u_int genjmp(u_int addr) +{ + if (addr < 3) return 0; // a branch that will be patched later + int offset = addr-(int)out-8; + if (offset < -33554432 || offset >= 33554432) { + SysPrintf("genjmp: out of range: %08x\n", offset); + abort(); + return 0; + } + return ((u_int)offset>>2)&0xffffff; +} + +static unused void emit_breakpoint(void) +{ + assem_debug("bkpt #0\n"); + //output_w32(0xe1200070); + output_w32(0xe7f001f0); +} + +static void emit_mov(int rs,int rt) +{ + assem_debug("mov %s,%s\n",regname[rt],regname[rs]); + output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)); +} + +static void emit_movs(int rs,int rt) +{ + assem_debug("movs %s,%s\n",regname[rt],regname[rs]); + output_w32(0xe1b00000|rd_rn_rm(rt,0,rs)); +} + +static void emit_add(int rs1,int rs2,int rt) +{ + assem_debug("add %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0800000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_adds(int rs1,int rs2,int rt) +{ + assem_debug("adds %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0900000|rd_rn_rm(rt,rs1,rs2)); +} +#define emit_adds_ptr emit_adds + +static void emit_adcs(int rs1,int rs2,int rt) +{ + assem_debug("adcs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0b00000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_neg(int rs, int rt) +{ + assem_debug("rsb %s,%s,#0\n",regname[rt],regname[rs]); + output_w32(0xe2600000|rd_rn_rm(rt,rs,0)); +} + +static void 
emit_sub(int rs1,int rs2,int rt) +{ + assem_debug("sub %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0400000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_zeroreg(int rt) +{ + assem_debug("mov %s,#0\n",regname[rt]); + output_w32(0xe3a00000|rd_rn_rm(rt,0,0)); +} + +static void emit_loadlp(u_int imm,u_int rt) +{ + add_literal((int)out,imm); + assem_debug("ldr %s,pc+? [=%x]\n",regname[rt],imm); + output_w32(0xe5900000|rd_rn_rm(rt,15,0)); +} + +static void emit_movw(u_int imm,u_int rt) +{ + assert(imm<65536); + assem_debug("movw %s,#%d (0x%x)\n",regname[rt],imm,imm); + output_w32(0xe3000000|rd_rn_rm(rt,0,0)|(imm&0xfff)|((imm<<4)&0xf0000)); +} + +static void emit_movt(u_int imm,u_int rt) +{ + assem_debug("movt %s,#%d (0x%x)\n",regname[rt],imm&0xffff0000,imm&0xffff0000); + output_w32(0xe3400000|rd_rn_rm(rt,0,0)|((imm>>16)&0xfff)|((imm>>12)&0xf0000)); +} + +static void emit_movimm(u_int imm,u_int rt) +{ + u_int armval; + if(genimm(imm,&armval)) { + assem_debug("mov %s,#%d\n",regname[rt],imm); + output_w32(0xe3a00000|rd_rn_rm(rt,0,0)|armval); + }else if(genimm(~imm,&armval)) { + assem_debug("mvn %s,#%d\n",regname[rt],imm); + output_w32(0xe3e00000|rd_rn_rm(rt,0,0)|armval); + }else if(imm<65536) { + #ifndef HAVE_ARMV7 + assem_debug("mov %s,#%d\n",regname[rt],imm&0xFF00); + output_w32(0xe3a00000|rd_rn_imm_shift(rt,0,imm>>8,8)); + assem_debug("add %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF); + output_w32(0xe2800000|rd_rn_imm_shift(rt,rt,imm&0xff,0)); + #else + emit_movw(imm,rt); + #endif + }else{ + #ifndef HAVE_ARMV7 + emit_loadlp(imm,rt); + #else + emit_movw(imm&0x0000FFFF,rt); + emit_movt(imm&0xFFFF0000,rt); + #endif + } +} + +static void emit_pcreladdr(u_int rt) +{ + assem_debug("add %s,pc,#?\n",regname[rt]); + output_w32(0xe2800000|rd_rn_rm(rt,15,0)); +} + +static void emit_loadreg(int r, int hr) +{ + if(r&64) { + SysPrintf("64bit load in 32bit mode!\n"); + assert(0); + return; + } + if((r&63)==0) + emit_zeroreg(hr); + else { + int addr = 
(int)&psxRegs.GPR.r[r]; + switch (r) { + //case HIREG: addr = &hi; break; + //case LOREG: addr = &lo; break; + case CCREG: addr = (int)&cycle_count; break; + case CSREG: addr = (int)&Status; break; + case INVCP: addr = (int)&invc_ptr; break; + case ROREG: addr = (int)&ram_offset; break; + default: assert(r < 34); break; + } + u_int offset = addr-(u_int)&dynarec_local; + assert(offset<4096); + assem_debug("ldr %s,fp+%d\n",regname[hr],offset); + output_w32(0xe5900000|rd_rn_rm(hr,FP,0)|offset); + } +} + +static void emit_storereg(int r, int hr) +{ + if(r&64) { + SysPrintf("64bit store in 32bit mode!\n"); + assert(0); + return; + } + int addr = (int)&psxRegs.GPR.r[r]; + switch (r) { + //case HIREG: addr = &hi; break; + //case LOREG: addr = &lo; break; + case CCREG: addr = (int)&cycle_count; break; + default: assert(r < 34); break; + } + u_int offset = addr-(u_int)&dynarec_local; + assert(offset<4096); + assem_debug("str %s,fp+%d\n",regname[hr],offset); + output_w32(0xe5800000|rd_rn_rm(hr,FP,0)|offset); +} + +static void emit_test(int rs, int rt) +{ + assem_debug("tst %s,%s\n",regname[rs],regname[rt]); + output_w32(0xe1100000|rd_rn_rm(0,rs,rt)); +} + +static void emit_testimm(int rs,int imm) +{ + u_int armval; + assem_debug("tst %s,#%d\n",regname[rs],imm); + genimm_checked(imm,&armval); + output_w32(0xe3100000|rd_rn_rm(0,rs,0)|armval); +} + +static void emit_testeqimm(int rs,int imm) +{ + u_int armval; + assem_debug("tsteq %s,$%d\n",regname[rs],imm); + genimm_checked(imm,&armval); + output_w32(0x03100000|rd_rn_rm(0,rs,0)|armval); +} + +static void emit_not(int rs,int rt) +{ + assem_debug("mvn %s,%s\n",regname[rt],regname[rs]); + output_w32(0xe1e00000|rd_rn_rm(rt,0,rs)); +} + +static void emit_and(u_int rs1,u_int rs2,u_int rt) +{ + assem_debug("and %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0000000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_or(u_int rs1,u_int rs2,u_int rt) +{ + assem_debug("orr 
%s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe1800000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_orrshl_imm(u_int rs,u_int imm,u_int rt) +{ + assert(rs<16); + assert(rt<16); + assert(imm<32); + assem_debug("orr %s,%s,%s,lsl #%d\n",regname[rt],regname[rt],regname[rs],imm); + output_w32(0xe1800000|rd_rn_rm(rt,rt,rs)|(imm<<7)); +} + +static void emit_orrshr_imm(u_int rs,u_int imm,u_int rt) +{ + assert(rs<16); + assert(rt<16); + assert(imm<32); + assem_debug("orr %s,%s,%s,lsr #%d\n",regname[rt],regname[rt],regname[rs],imm); + output_w32(0xe1800020|rd_rn_rm(rt,rt,rs)|(imm<<7)); +} + +static void emit_xor(u_int rs1,u_int rs2,u_int rt) +{ + assem_debug("eor %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0200000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_xorsar_imm(u_int rs1,u_int rs2,u_int imm,u_int rt) +{ + assem_debug("eor %s,%s,%s,asr #%d\n",regname[rt],regname[rs1],regname[rs2],imm); + output_w32(0xe0200040|rd_rn_rm(rt,rs1,rs2)|(imm<<7)); +} + +static void emit_addimm(u_int rs,int imm,u_int rt) +{ + assert(rs<16); + assert(rt<16); + if(imm!=0) { + u_int armval; + if(genimm(imm,&armval)) { + assem_debug("add %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe2800000|rd_rn_rm(rt,rs,0)|armval); + }else if(genimm(-imm,&armval)) { + assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],-imm); + output_w32(0xe2400000|rd_rn_rm(rt,rs,0)|armval); + #ifdef HAVE_ARMV7 + }else if(rt!=rs&&(u_int)imm<65536) { + emit_movw(imm&0x0000ffff,rt); + emit_add(rs,rt,rt); + }else if(rt!=rs&&(u_int)-imm<65536) { + emit_movw(-imm&0x0000ffff,rt); + emit_sub(rs,rt,rt); + #endif + }else if((u_int)-imm<65536) { + assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],(-imm)&0xFF00); + assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF); + output_w32(0xe2400000|rd_rn_imm_shift(rt,rs,(-imm)>>8,8)); + output_w32(0xe2400000|rd_rn_imm_shift(rt,rt,(-imm)&0xff,0)); + }else { + do { + int shift = (ffs(imm) - 1) & ~1; + int imm8 = imm & 
(0xff << shift); + genimm_checked(imm8,&armval); + assem_debug("add %s,%s,#0x%x\n",regname[rt],regname[rs],imm8); + output_w32(0xe2800000|rd_rn_rm(rt,rs,0)|armval); + rs = rt; + imm &= ~imm8; + } + while (imm != 0); + } + } + else if(rs!=rt) emit_mov(rs,rt); +} + +static void emit_addimm_and_set_flags(int imm,int rt) +{ /* rt += imm with a flag-setting final instruction; imm must be within +/-65535 (asserted below) */ + assert(imm>-65536&&imm<65536); + u_int armval; + if(genimm(imm,&armval)) { + assem_debug("adds %s,%s,#%d\n",regname[rt],regname[rt],imm); + output_w32(0xe2900000|rd_rn_rm(rt,rt,0)|armval); + }else if(genimm(-imm,&armval)) { + assem_debug("subs %s,%s,#%d\n",regname[rt],regname[rt],-imm); /* armval encodes -imm, so log the operand actually subtracted */ + output_w32(0xe2500000|rd_rn_rm(rt,rt,0)|armval); + }else if(imm<0) { + assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF00); + assem_debug("subs %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF); + output_w32(0xe2400000|rd_rn_imm_shift(rt,rt,(-imm)>>8,8)); + output_w32(0xe2500000|rd_rn_imm_shift(rt,rt,(-imm)&0xff,0)); + }else{ + assem_debug("add %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF00); + assem_debug("adds %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF); + output_w32(0xe2800000|rd_rn_imm_shift(rt,rt,imm>>8,8)); + output_w32(0xe2900000|rd_rn_imm_shift(rt,rt,imm&0xff,0)); + } +} + +static void emit_addnop(u_int r) +{ + assert(r<16); + assem_debug("add %s,%s,#0 (nop)\n",regname[r],regname[r]); + output_w32(0xe2800000|rd_rn_rm(r,r,0)); +} + +static void emit_andimm(int rs,int imm,int rt) +{ + u_int armval; + if(imm==0) { + emit_zeroreg(rt); + }else if(genimm(imm,&armval)) { + assem_debug("and %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe2000000|rd_rn_rm(rt,rs,0)|armval); + }else if(genimm(~imm,&armval)) { + assem_debug("bic %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe3c00000|rd_rn_rm(rt,rs,0)|armval); + }else if(imm==65535) { + #ifndef HAVE_ARMV6 + assem_debug("bic %s,%s,#FF000000\n",regname[rt],regname[rs]); + output_w32(0xe3c00000|rd_rn_rm(rt,rs,0)|0x4FF); + assem_debug("bic
%s,%s,#00FF0000\n",regname[rt],regname[rt]); + output_w32(0xe3c00000|rd_rn_rm(rt,rt,0)|0x8FF); + #else + assem_debug("uxth %s,%s\n",regname[rt],regname[rs]); + output_w32(0xe6ff0070|rd_rn_rm(rt,0,rs)); + #endif + }else{ + assert(imm>0&&imm<65535); + #ifndef HAVE_ARMV7 + assem_debug("mov r14,#%d\n",imm&0xFF00); + output_w32(0xe3a00000|rd_rn_imm_shift(HOST_TEMPREG,0,imm>>8,8)); + assem_debug("add r14,r14,#%d\n",imm&0xFF); + output_w32(0xe2800000|rd_rn_imm_shift(HOST_TEMPREG,HOST_TEMPREG,imm&0xff,0)); + #else + emit_movw(imm,HOST_TEMPREG); + #endif + assem_debug("and %s,%s,r14\n",regname[rt],regname[rs]); + output_w32(0xe0000000|rd_rn_rm(rt,rs,HOST_TEMPREG)); + } +} + +static void emit_orimm(int rs,int imm,int rt) +{ /* rt = rs | imm; one orr when imm is encodable, otherwise two orrs for a 16-bit imm (asserted below) */ + u_int armval; + if(imm==0) { + if(rs!=rt) emit_mov(rs,rt); + }else if(genimm(imm,&armval)) { + assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe3800000|rd_rn_rm(rt,rs,0)|armval); + }else{ + assert(imm>0&&imm<65536); + assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF00); + assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF); /* second orr reads rt, matching the rt,rt encoding below */ + output_w32(0xe3800000|rd_rn_imm_shift(rt,rs,imm>>8,8)); + output_w32(0xe3800000|rd_rn_imm_shift(rt,rt,imm&0xff,0)); + } +} + +static void emit_xorimm(int rs,int imm,int rt) +{ /* rt = rs ^ imm; one eor when imm is encodable, otherwise two eors for a 16-bit imm (asserted below) */ + u_int armval; + if(imm==0) { + if(rs!=rt) emit_mov(rs,rt); + }else if(genimm(imm,&armval)) { + assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe2200000|rd_rn_rm(rt,rs,0)|armval); + }else{ + assert(imm>0&&imm<65536); + assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF00); + assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF); /* second eor reads rt, matching the rt,rt encoding below */ + output_w32(0xe2200000|rd_rn_imm_shift(rt,rs,imm>>8,8)); + output_w32(0xe2200000|rd_rn_imm_shift(rt,rt,imm&0xff,0)); + } +} + +static void emit_shlimm(int rs,u_int imm,int rt) +{ + assert(imm>0); + assert(imm<32); + //if(imm==1) ...
+ assem_debug("lsl %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|(imm<<7)); +} + +static void emit_lsls_imm(int rs,int imm,int rt) +{ + assert(imm>0); + assert(imm<32); + assem_debug("lsls %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe1b00000|rd_rn_rm(rt,0,rs)|(imm<<7)); +} + +static unused void emit_lslpls_imm(int rs,int imm,int rt) +{ + assert(imm>0); + assert(imm<32); + assem_debug("lslpls %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0x51b00000|rd_rn_rm(rt,0,rs)|(imm<<7)); +} + +static void emit_shrimm(int rs,u_int imm,int rt) +{ + assert(imm>0); + assert(imm<32); + assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x20|(imm<<7)); +} + +static void emit_sarimm(int rs,u_int imm,int rt) +{ + assert(imm>0); + assert(imm<32); + assem_debug("asr %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x40|(imm<<7)); +} + +static void emit_rorimm(int rs,u_int imm,int rt) +{ + assert(imm>0); + assert(imm<32); + assem_debug("ror %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x60|(imm<<7)); +} + +static void emit_signextend16(int rs,int rt) +{ + #ifndef HAVE_ARMV6 + emit_shlimm(rs,16,rt); + emit_sarimm(rt,16,rt); + #else + assem_debug("sxth %s,%s\n",regname[rt],regname[rs]); + output_w32(0xe6bf0070|rd_rn_rm(rt,0,rs)); + #endif +} + +static void emit_signextend8(int rs,int rt) +{ + #ifndef HAVE_ARMV6 + emit_shlimm(rs,24,rt); + emit_sarimm(rt,24,rt); + #else + assem_debug("sxtb %s,%s\n",regname[rt],regname[rs]); + output_w32(0xe6af0070|rd_rn_rm(rt,0,rs)); + #endif +} + +static void emit_shl(u_int rs,u_int shift,u_int rt) +{ + assert(rs<16); + assert(rt<16); + assert(shift<16); + //if(imm==1) ... 
+ assem_debug("lsl %s,%s,%s\n",regname[rt],regname[rs],regname[shift]); + output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x10|(shift<<8)); +} + +static void emit_shr(u_int rs,u_int shift,u_int rt) +{ + assert(rs<16); + assert(rt<16); + assert(shift<16); + assem_debug("lsr %s,%s,%s\n",regname[rt],regname[rs],regname[shift]); + output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x30|(shift<<8)); +} + +static void emit_sar(u_int rs,u_int shift,u_int rt) +{ + assert(rs<16); + assert(rt<16); + assert(shift<16); + assem_debug("asr %s,%s,%s\n",regname[rt],regname[rs],regname[shift]); + output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x50|(shift<<8)); +} + +static unused void emit_orrshl(u_int rs,u_int shift,u_int rt) +{ + assert(rs<16); + assert(rt<16); + assert(shift<16); + assem_debug("orr %s,%s,%s,lsl %s\n",regname[rt],regname[rt],regname[rs],regname[shift]); + output_w32(0xe1800000|rd_rn_rm(rt,rt,rs)|0x10|(shift<<8)); +} + +static unused void emit_orrshr(u_int rs,u_int shift,u_int rt) +{ + assert(rs<16); + assert(rt<16); + assert(shift<16); + assem_debug("orr %s,%s,%s,lsr %s\n",regname[rt],regname[rt],regname[rs],regname[shift]); + output_w32(0xe1800000|rd_rn_rm(rt,rt,rs)|0x30|(shift<<8)); +} + +static void emit_cmpimm(int rs,int imm) +{ /* compare rs against imm: cmp/cmn with an encodable immediate, else load |imm| into HOST_TEMPREG and compare by register */ + u_int armval; + if(genimm(imm,&armval)) { + assem_debug("cmp %s,#%d\n",regname[rs],imm); + output_w32(0xe3500000|rd_rn_rm(0,rs,0)|armval); + }else if(genimm(-imm,&armval)) { + assem_debug("cmn %s,#%d\n",regname[rs],-imm); /* armval encodes -imm, so log the operand actually emitted */ + output_w32(0xe3700000|rd_rn_rm(0,rs,0)|armval); + }else if(imm>0) { + assert(imm<65536); + emit_movimm(imm,HOST_TEMPREG); + assem_debug("cmp %s,r14\n",regname[rs]); + output_w32(0xe1500000|rd_rn_rm(0,rs,HOST_TEMPREG)); + }else{ + assert(imm>-65536); + emit_movimm(-imm,HOST_TEMPREG); + assem_debug("cmn %s,r14\n",regname[rs]); + output_w32(0xe1700000|rd_rn_rm(0,rs,HOST_TEMPREG)); + } +} + +static void emit_cmovne_imm(int imm,int rt) +{ + assem_debug("movne %s,#%d\n",regname[rt],imm); + u_int armval; + genimm_checked(imm,&armval); +
output_w32(0x13a00000|rd_rn_rm(rt,0,0)|armval); +} + +static void emit_cmovl_imm(int imm,int rt) +{ + assem_debug("movlt %s,#%d\n",regname[rt],imm); + u_int armval; + genimm_checked(imm,&armval); + output_w32(0xb3a00000|rd_rn_rm(rt,0,0)|armval); +} + +static void emit_cmovb_imm(int imm,int rt) +{ + assem_debug("movcc %s,#%d\n",regname[rt],imm); + u_int armval; + genimm_checked(imm,&armval); + output_w32(0x33a00000|rd_rn_rm(rt,0,0)|armval); +} + +static void emit_cmovae_imm(int imm,int rt) +{ + assem_debug("movcs %s,#%d\n",regname[rt],imm); + u_int armval; + genimm_checked(imm,&armval); + output_w32(0x23a00000|rd_rn_rm(rt,0,0)|armval); +} + +static void emit_cmovs_imm(int imm,int rt) +{ + assem_debug("movmi %s,#%d\n",regname[rt],imm); + u_int armval; + genimm_checked(imm,&armval); + output_w32(0x43a00000|rd_rn_rm(rt,0,0)|armval); +} + +static void emit_cmovne_reg(int rs,int rt) +{ + assem_debug("movne %s,%s\n",regname[rt],regname[rs]); + output_w32(0x11a00000|rd_rn_rm(rt,0,rs)); +} + +static void emit_cmovl_reg(int rs,int rt) +{ + assem_debug("movlt %s,%s\n",regname[rt],regname[rs]); + output_w32(0xb1a00000|rd_rn_rm(rt,0,rs)); +} + +static void emit_cmovb_reg(int rs,int rt) +{ + assem_debug("movcc %s,%s\n",regname[rt],regname[rs]); + output_w32(0x31a00000|rd_rn_rm(rt,0,rs)); +} + +static void emit_cmovs_reg(int rs,int rt) +{ + assem_debug("movmi %s,%s\n",regname[rt],regname[rs]); + output_w32(0x41a00000|rd_rn_rm(rt,0,rs)); +} + +static void emit_slti32(int rs,int imm,int rt) +{ + if(rs!=rt) emit_zeroreg(rt); + emit_cmpimm(rs,imm); + if(rs==rt) emit_movimm(0,rt); + emit_cmovl_imm(1,rt); +} + +static void emit_sltiu32(int rs,int imm,int rt) +{ + if(rs!=rt) emit_zeroreg(rt); + emit_cmpimm(rs,imm); + if(rs==rt) emit_movimm(0,rt); + emit_cmovb_imm(1,rt); +} + +static void emit_cmp(int rs,int rt) +{ + assem_debug("cmp %s,%s\n",regname[rs],regname[rt]); + output_w32(0xe1500000|rd_rn_rm(0,rs,rt)); +} + +static void emit_set_gz32(int rs, int rt) +{ + 
//assem_debug("set_gz32\n"); + emit_cmpimm(rs,1); + emit_movimm(1,rt); + emit_cmovl_imm(0,rt); +} + +static void emit_set_nz32(int rs, int rt) +{ + //assem_debug("set_nz32\n"); + if(rs!=rt) emit_movs(rs,rt); + else emit_test(rs,rs); + emit_cmovne_imm(1,rt); +} + +static void emit_set_if_less32(int rs1, int rs2, int rt) +{ + //assem_debug("set if less (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]); + if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt); + emit_cmp(rs1,rs2); + if(rs1==rt||rs2==rt) emit_movimm(0,rt); + emit_cmovl_imm(1,rt); +} + +static void emit_set_if_carry32(int rs1, int rs2, int rt) +{ + //assem_debug("set if carry (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]); + if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt); + emit_cmp(rs1,rs2); + if(rs1==rt||rs2==rt) emit_movimm(0,rt); + emit_cmovb_imm(1,rt); +} + +static int can_jump_or_call(const void *a) +{ + intptr_t offset = (u_char *)a - out - 8; + return (-33554432 <= offset && offset < 33554432); +} + +static void emit_call(const void *a_) +{ + int a = (int)a_; + assem_debug("bl %x (%x+%x)%s\n",a,(int)out,a-(int)out-8,func_name(a_)); + u_int offset=genjmp(a); + output_w32(0xeb000000|offset); +} + +static void emit_jmp(const void *a_) +{ + int a = (int)a_; + assem_debug("b %x (%x+%x)%s\n",a,(int)out,a-(int)out-8,func_name(a_)); + u_int offset=genjmp(a); + output_w32(0xea000000|offset); +} + +static void emit_jne(const void *a_) +{ + int a = (int)a_; + assem_debug("bne %x\n",a); + u_int offset=genjmp(a); + output_w32(0x1a000000|offset); +} + +static void emit_jeq(const void *a_) +{ + int a = (int)a_; + assem_debug("beq %x\n",a); + u_int offset=genjmp(a); + output_w32(0x0a000000|offset); +} + +static void emit_js(const void *a_) +{ + int a = (int)a_; + assem_debug("bmi %x\n",a); + u_int offset=genjmp(a); + output_w32(0x4a000000|offset); +} + +static void emit_jns(const void *a_) +{ + int a = (int)a_; + assem_debug("bpl %x\n",a); + u_int offset=genjmp(a); + output_w32(0x5a000000|offset); +} + +static void 
emit_jl(const void *a_) +{ + int a = (int)a_; + assem_debug("blt %x\n",a); + u_int offset=genjmp(a); + output_w32(0xba000000|offset); +} + +static void emit_jge(const void *a_) +{ + int a = (int)a_; + assem_debug("bge %x\n",a); + u_int offset=genjmp(a); + output_w32(0xaa000000|offset); +} + +static void emit_jno(const void *a_) +{ + int a = (int)a_; + assem_debug("bvc %x\n",a); + u_int offset=genjmp(a); + output_w32(0x7a000000|offset); +} + +static void emit_jc(const void *a_) +{ + int a = (int)a_; + assem_debug("bcs %x\n",a); + u_int offset=genjmp(a); + output_w32(0x2a000000|offset); +} + +static void emit_jcc(const void *a_) +{ + int a = (int)a_; + assem_debug("bcc %x\n",a); + u_int offset=genjmp(a); + output_w32(0x3a000000|offset); +} + +static unused void emit_callreg(u_int r) +{ + assert(r<15); + assem_debug("blx %s\n",regname[r]); + output_w32(0xe12fff30|r); +} + +static void emit_jmpreg(u_int r) +{ + assem_debug("mov pc,%s\n",regname[r]); + output_w32(0xe1a00000|rd_rn_rm(15,0,r)); +} + +static void emit_ret(void) +{ + emit_jmpreg(14); +} + +static void emit_readword_indexed(int offset, int rs, int rt) +{ + assert(offset>-4096&&offset<4096); + assem_debug("ldr %s,%s+%d\n",regname[rt],regname[rs],offset); + if(offset>=0) { + output_w32(0xe5900000|rd_rn_rm(rt,rs,0)|offset); + }else{ + output_w32(0xe5100000|rd_rn_rm(rt,rs,0)|(-offset)); + } +} + +static void emit_readword_dualindexedx4(int rs1, int rs2, int rt) +{ + assem_debug("ldr %s,%s,%s lsl #2\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe7900000|rd_rn_rm(rt,rs1,rs2)|0x100); +} +#define emit_readptr_dualindexedx_ptrlen emit_readword_dualindexedx4 + +static void emit_ldr_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("ldr %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe7900000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_ldrcc_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("ldrcc %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + 
output_w32(0x37900000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_ldrb_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("ldrb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe7d00000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_ldrccb_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("ldrccb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x37d00000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_ldrsb_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("ldrsb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe19000d0|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_ldrccsb_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("ldrccsb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x319000d0|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_ldrh_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("ldrh %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe19000b0|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_ldrcch_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("ldrcch %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x319000b0|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_ldrsh_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("ldrsh %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe19000f0|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_ldrccsh_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("ldrccsh %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x319000f0|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_str_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("str %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe7800000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_strb_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("strb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe7c00000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_strh_dualindexed(int rs1, int 
rs2, int rt) +{ + assem_debug("strh %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe18000b0|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_movsbl_indexed(int offset, int rs, int rt) +{ + assert(offset>-256&&offset<256); + assem_debug("ldrsb %s,%s+%d\n",regname[rt],regname[rs],offset); + if(offset>=0) { + output_w32(0xe1d000d0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf)); + }else{ + output_w32(0xe15000d0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf)); + } +} + +static void emit_movswl_indexed(int offset, int rs, int rt) +{ + assert(offset>-256&&offset<256); + assem_debug("ldrsh %s,%s+%d\n",regname[rt],regname[rs],offset); + if(offset>=0) { + output_w32(0xe1d000f0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf)); + }else{ + output_w32(0xe15000f0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf)); + } +} + +static void emit_movzbl_indexed(int offset, int rs, int rt) +{ + assert(offset>-4096&&offset<4096); + assem_debug("ldrb %s,%s+%d\n",regname[rt],regname[rs],offset); + if(offset>=0) { + output_w32(0xe5d00000|rd_rn_rm(rt,rs,0)|offset); + }else{ + output_w32(0xe5500000|rd_rn_rm(rt,rs,0)|(-offset)); + } +} + +static void emit_movzwl_indexed(int offset, int rs, int rt) +{ + assert(offset>-256&&offset<256); + assem_debug("ldrh %s,%s+%d\n",regname[rt],regname[rs],offset); + if(offset>=0) { + output_w32(0xe1d000b0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf)); + }else{ + output_w32(0xe15000b0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf)); + } +} + +static void emit_ldrd(int offset, int rs, int rt) +{ + assert(offset>-256&&offset<256); + assem_debug("ldrd %s,%s+%d\n",regname[rt],regname[rs],offset); + if(offset>=0) { + output_w32(0xe1c000d0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf)); + }else{ + output_w32(0xe14000d0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf)); + } +} + +static void emit_readword(void *addr, int rt) +{ + uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local; + 
assert(offset<4096); + assem_debug("ldr %s,fp+%d\n",regname[rt],offset); + output_w32(0xe5900000|rd_rn_rm(rt,FP,0)|offset); +} +#define emit_readptr emit_readword + +static void emit_writeword_indexed(int rt, int offset, int rs) +{ + assert(offset>-4096&&offset<4096); + assem_debug("str %s,%s+%d\n",regname[rt],regname[rs],offset); + if(offset>=0) { + output_w32(0xe5800000|rd_rn_rm(rt,rs,0)|offset); + }else{ + output_w32(0xe5000000|rd_rn_rm(rt,rs,0)|(-offset)); + } +} + +static void emit_writehword_indexed(int rt, int offset, int rs) +{ + assert(offset>-256&&offset<256); + assem_debug("strh %s,%s+%d\n",regname[rt],regname[rs],offset); + if(offset>=0) { + output_w32(0xe1c000b0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf)); + }else{ + output_w32(0xe14000b0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf)); + } +} + +static void emit_writebyte_indexed(int rt, int offset, int rs) +{ + assert(offset>-4096&&offset<4096); + assem_debug("strb %s,%s+%d\n",regname[rt],regname[rs],offset); + if(offset>=0) { + output_w32(0xe5c00000|rd_rn_rm(rt,rs,0)|offset); + }else{ + output_w32(0xe5400000|rd_rn_rm(rt,rs,0)|(-offset)); + } +} + +static void emit_strcc_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("strcc %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x37800000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_strccb_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("strccb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x37c00000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_strcch_dualindexed(int rs1, int rs2, int rt) +{ + assem_debug("strcch %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x318000b0|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_writeword(int rt, void *addr) +{ + uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local; + assert(offset<4096); + assem_debug("str %s,fp+%d\n",regname[rt],offset); + output_w32(0xe5800000|rd_rn_rm(rt,FP,0)|offset); +} + +static void emit_umull(u_int 
rs1, u_int rs2, u_int hi, u_int lo) +{ + assem_debug("umull %s, %s, %s, %s\n",regname[lo],regname[hi],regname[rs1],regname[rs2]); + assert(rs1<16); + assert(rs2<16); + assert(hi<16); + assert(lo<16); + output_w32(0xe0800090|(hi<<16)|(lo<<12)|(rs2<<8)|rs1); +} + +static void emit_smull(u_int rs1, u_int rs2, u_int hi, u_int lo) +{ + assem_debug("smull %s, %s, %s, %s\n",regname[lo],regname[hi],regname[rs1],regname[rs2]); + assert(rs1<16); + assert(rs2<16); + assert(hi<16); + assert(lo<16); + output_w32(0xe0c00090|(hi<<16)|(lo<<12)|(rs2<<8)|rs1); +} + +static void emit_clz(int rs,int rt) +{ + assem_debug("clz %s,%s\n",regname[rt],regname[rs]); + output_w32(0xe16f0f10|rd_rn_rm(rt,0,rs)); +} + +static void emit_subcs(int rs1,int rs2,int rt) +{ + assem_debug("subcs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x20400000|rd_rn_rm(rt,rs1,rs2)); +} + +static void emit_shrcc_imm(int rs,u_int imm,int rt) +{ + assert(imm>0); + assert(imm<32); + assem_debug("lsrcc %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0x31a00000|rd_rn_rm(rt,0,rs)|0x20|(imm<<7)); +} + +static void emit_shrne_imm(int rs,u_int imm,int rt) +{ + assert(imm>0); + assert(imm<32); + assem_debug("lsrne %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0x11a00000|rd_rn_rm(rt,0,rs)|0x20|(imm<<7)); +} + +static void emit_negmi(int rs, int rt) +{ + assem_debug("rsbmi %s,%s,#0\n",regname[rt],regname[rs]); + output_w32(0x42600000|rd_rn_rm(rt,rs,0)); +} + +static void emit_negsmi(int rs, int rt) +{ + assem_debug("rsbsmi %s,%s,#0\n",regname[rt],regname[rs]); + output_w32(0x42700000|rd_rn_rm(rt,rs,0)); +} + +static void emit_bic_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt) +{ + assem_debug("bic %s,%s,%s lsl %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]); + output_w32(0xe1C00000|rd_rn_rm(rt,rs1,rs2)|0x10|(shift<<8)); +} + +static void emit_bic_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt) +{ + assem_debug("bic %s,%s,%s lsr 
%s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]); + output_w32(0xe1C00000|rd_rn_rm(rt,rs1,rs2)|0x30|(shift<<8)); +} + +static void emit_teq(int rs, int rt) +{ + assem_debug("teq %s,%s\n",regname[rs],regname[rt]); + output_w32(0xe1300000|rd_rn_rm(0,rs,rt)); +} + +static unused void emit_rsbimm(int rs, int imm, int rt) +{ + u_int armval; + genimm_checked(imm,&armval); + assem_debug("rsb %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe2600000|rd_rn_rm(rt,rs,0)|armval); +} + +// Conditionally select one of two immediates, optimizing for small code size +// This will only be called if HAVE_CMOV_IMM is defined +static void emit_cmov2imm_e_ne_compact(int imm1,int imm2,u_int rt) +{ + u_int armval; + if(genimm(imm2-imm1,&armval)) { + emit_movimm(imm1,rt); + assem_debug("addne %s,%s,#%d\n",regname[rt],regname[rt],imm2-imm1); + output_w32(0x12800000|rd_rn_rm(rt,rt,0)|armval); + }else if(genimm(imm1-imm2,&armval)) { + emit_movimm(imm1,rt); + assem_debug("subne %s,%s,#%d\n",regname[rt],regname[rt],imm1-imm2); + output_w32(0x12400000|rd_rn_rm(rt,rt,0)|armval); + } + else { + #ifndef HAVE_ARMV7 + emit_movimm(imm1,rt); + add_literal((int)out,imm2); + assem_debug("ldrne %s,pc+? 
[=%x]\n",regname[rt],imm2); + output_w32(0x15900000|rd_rn_rm(rt,15,0)); + #else + emit_movw(imm1&0x0000FFFF,rt); + if((imm1&0xFFFF)!=(imm2&0xFFFF)) { + assem_debug("movwne %s,#%d (0x%x)\n",regname[rt],imm2&0xFFFF,imm2&0xFFFF); + output_w32(0x13000000|rd_rn_rm(rt,0,0)|(imm2&0xfff)|((imm2<<4)&0xf0000)); + } + emit_movt(imm1&0xFFFF0000,rt); + if((imm1&0xFFFF0000)!=(imm2&0xFFFF0000)) { + assem_debug("movtne %s,#%d (0x%x)\n",regname[rt],imm2&0xffff0000,imm2&0xffff0000); + output_w32(0x13400000|rd_rn_rm(rt,0,0)|((imm2>>16)&0xfff)|((imm2>>12)&0xf0000)); + } + #endif + } +} + +// special case for checking invalid_code +static void emit_cmpmem_indexedsr12_reg(int base,int r,int imm) +{ + assert(imm<128&&imm>=0); + assert(r>=0&&r<16); + assem_debug("ldrb lr,%s,%s lsr #12\n",regname[base],regname[r]); + output_w32(0xe7d00000|rd_rn_rm(HOST_TEMPREG,base,r)|0x620); + emit_cmpimm(HOST_TEMPREG,imm); +} + +static void emit_callne(int a) +{ + assem_debug("blne %x\n",a); + u_int offset=genjmp(a); + output_w32(0x1b000000|offset); +} + +// Used to preload hash table entries +static unused void emit_prefetchreg(int r) +{ + assem_debug("pld %s\n",regname[r]); + output_w32(0xf5d0f000|rd_rn_rm(0,r,0)); +} + +// Special case for mini_ht +static void emit_ldreq_indexed(int rs, u_int offset, int rt) +{ + assert(offset<4096); + assem_debug("ldreq %s,[%s, #%d]\n",regname[rt],regname[rs],offset); + output_w32(0x05900000|rd_rn_rm(rt,rs,0)|offset); +} + +static void emit_orrne_imm(int rs,int imm,int rt) +{ + u_int armval; + genimm_checked(imm,&armval); + assem_debug("orrne %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0x13800000|rd_rn_rm(rt,rs,0)|armval); +} + +static unused void emit_addpl_imm(int rs,int imm,int rt) +{ + u_int armval; + genimm_checked(imm,&armval); + assem_debug("addpl %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0x52800000|rd_rn_rm(rt,rs,0)|armval); +} + +static void emit_jno_unlikely(int a) +{ + //emit_jno(a); + assem_debug("addvc pc,pc,#? 
(%x)\n",/*a-(int)out-8,*/a); + output_w32(0x72800000|rd_rn_rm(15,15,0)); +} + +static void save_regs_all(u_int reglist) +{ + int i; + if(!reglist) return; + assem_debug("stmia fp,{"); + for(i=0;i<16;i++) + if(reglist&(1<=ndrc->translation_cache&&addr<(ndrc->translation_cache+(1<=0x80000000&&target<0x80800000)||(target>0xA4000000&&target<0xA4001000)); +//DEBUG > +#ifdef DEBUG_CYCLE_COUNT + emit_readword(&last_count,ECX); + emit_add(HOST_CCREG,ECX,HOST_CCREG); + emit_readword(&next_interupt,ECX); + emit_writeword(HOST_CCREG,&Count); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,&last_count); +#endif +//DEBUG < + emit_far_jump(linker); +} + +static void check_extjump2(void *src) +{ + u_int *ptr = src; + assert((ptr[1] & 0x0fff0000) == 0x059f0000); // ldr rx, [pc, #ofs] + (void)ptr; +} + +// put rt_val into rt, potentially making use of rs with value rs_val +static void emit_movimm_from(u_int rs_val,int rs,u_int rt_val,int rt) +{ + u_int armval; + int diff; + if(genimm(rt_val,&armval)) { + assem_debug("mov %s,#%d\n",regname[rt],rt_val); + output_w32(0xe3a00000|rd_rn_rm(rt,0,0)|armval); + return; + } + if(genimm(~rt_val,&armval)) { + assem_debug("mvn %s,#%d\n",regname[rt],rt_val); + output_w32(0xe3e00000|rd_rn_rm(rt,0,0)|armval); + return; + } + diff=rt_val-rs_val; + if(genimm(diff,&armval)) { + assem_debug("add %s,%s,#%d\n",regname[rt],regname[rs],diff); + output_w32(0xe2800000|rd_rn_rm(rt,rs,0)|armval); + return; + }else if(genimm(-diff,&armval)) { + assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],-diff); + output_w32(0xe2400000|rd_rn_rm(rt,rs,0)|armval); + return; + } + emit_movimm(rt_val,rt); +} + +// return 1 if above function can do it's job cheaply +static int is_similar_value(u_int v1,u_int v2) +{ + u_int xs; + int diff; + if(v1==v2) return 1; + diff=v2-v1; + for(xs=diff;xs!=0&&(xs&3)==0;xs>>=2) + ; + if(xs<0x100) return 1; + for(xs=-diff;xs!=0&&(xs&3)==0;xs>>=2) + ; + if(xs<0x100) return 1; + return 0; +} + +static void mov_loadtype_adj(enum 
stub_type type,int rs,int rt) +{ + switch(type) { + case LOADB_STUB: emit_signextend8(rs,rt); break; + case LOADBU_STUB: emit_andimm(rs,0xff,rt); break; + case LOADH_STUB: emit_signextend16(rs,rt); break; + case LOADHU_STUB: emit_andimm(rs,0xffff,rt); break; + case LOADW_STUB: if(rs!=rt) emit_mov(rs,rt); break; + default: assert(0); + } +} + +#include "pcsxmem.h" +#include "pcsxmem_inline.c" + +static void do_readstub(int n) +{ + assem_debug("do_readstub %x\n",start+stubs[n].a*4); + literal_pool(256); + set_jump_target(stubs[n].addr, out); + enum stub_type type=stubs[n].type; + int i=stubs[n].a; + int rs=stubs[n].b; + const struct regstat *i_regs=(struct regstat *)stubs[n].c; + u_int reglist=stubs[n].e; + const signed char *i_regmap=i_regs->regmap; + int rt; + if(dops[i].itype==C1LS||dops[i].itype==C2LS||dops[i].itype==LOADLR) { + rt=get_reg(i_regmap,FTEMP); + }else{ + rt=get_reg(i_regmap,dops[i].rt1); + } + assert(rs>=0); + int r,temp=-1,temp2=HOST_TEMPREG,regs_saved=0; + void *restore_jump = NULL; + reglist|=(1<=0&&dops[i].rt1!=0) + reglist&=~(1<=0&&dops[i].rt1!=0)) { + switch(type) { + case LOADB_STUB: emit_ldrccsb_dualindexed(temp2,rs,rt); break; + case LOADBU_STUB: emit_ldrccb_dualindexed(temp2,rs,rt); break; + case LOADH_STUB: emit_ldrccsh_dualindexed(temp2,rs,rt); break; + case LOADHU_STUB: emit_ldrcch_dualindexed(temp2,rs,rt); break; + case LOADW_STUB: emit_ldrcc_dualindexed(temp2,rs,rt); break; + default: assert(0); + } + } + if(regs_saved) { + restore_jump=out; + emit_jcc(0); // jump to reg restore + } + else + emit_jcc(stubs[n].retaddr); // return address + + if(!regs_saved) + save_regs(reglist); + void *handler=NULL; + if(type==LOADB_STUB||type==LOADBU_STUB) + handler=jump_handler_read8; + if(type==LOADH_STUB||type==LOADHU_STUB) + handler=jump_handler_read16; + if(type==LOADW_STUB) + handler=jump_handler_read32; + assert(handler); + pass_args(rs,temp2); + int cc=get_reg(i_regmap,CCREG); + if(cc<0) + emit_loadreg(CCREG,2); + 
emit_addimm(cc<0?2:cc,(int)stubs[n].d,2); + emit_far_call(handler); + if(dops[i].itype==C1LS||dops[i].itype==C2LS||(rt>=0&&dops[i].rt1!=0)) { + mov_loadtype_adj(type,0,rt); + } + if(restore_jump) + set_jump_target(restore_jump, out); + restore_regs(reglist); + emit_jmp(stubs[n].retaddr); // return address +} + +static void inline_readstub(enum stub_type type, int i, u_int addr, + const signed char regmap[], int target, int adj, u_int reglist) +{ + int rs=get_reg(regmap,target); + int rt=get_reg(regmap,target); + if(rs<0) rs=get_reg(regmap,-1); + assert(rs>=0); + u_int is_dynamic; + uintptr_t host_addr = 0; + void *handler; + int cc=get_reg(regmap,CCREG); + if(pcsx_direct_read(type,addr,adj,cc,target?rs:-1,rt)) + return; + handler = get_direct_memhandler(mem_rtab, addr, type, &host_addr); + if (handler == NULL) { + if(rt<0||dops[i].rt1==0) + return; + if(addr!=host_addr) + emit_movimm_from(addr,rs,host_addr,rs); + switch(type) { + case LOADB_STUB: emit_movsbl_indexed(0,rs,rt); break; + case LOADBU_STUB: emit_movzbl_indexed(0,rs,rt); break; + case LOADH_STUB: emit_movswl_indexed(0,rs,rt); break; + case LOADHU_STUB: emit_movzwl_indexed(0,rs,rt); break; + case LOADW_STUB: emit_readword_indexed(0,rs,rt); break; + default: assert(0); + } + return; + } + is_dynamic=pcsxmem_is_handler_dynamic(addr); + if(is_dynamic) { + if(type==LOADB_STUB||type==LOADBU_STUB) + handler=jump_handler_read8; + if(type==LOADH_STUB||type==LOADHU_STUB) + handler=jump_handler_read16; + if(type==LOADW_STUB) + handler=jump_handler_read32; + } + + // call a memhandler + if(rt>=0&&dops[i].rt1!=0) + reglist&=~(1<>12]<<1,1); + emit_addimm(cc<0?2:cc,adj,2); + } + else { + emit_readword(&last_count,3); + emit_addimm(cc<0?2:cc,adj,2); + emit_add(2,3,2); + emit_writeword(2,&Count); + } + + emit_far_call(handler); + + if(rt>=0&&dops[i].rt1!=0) { + switch(type) { + case LOADB_STUB: emit_signextend8(0,rt); break; + case LOADBU_STUB: emit_andimm(0,0xff,rt); break; + case LOADH_STUB: emit_signextend16(0,rt); 
break; + case LOADHU_STUB: emit_andimm(0,0xffff,rt); break; + case LOADW_STUB: if(rt!=0) emit_mov(0,rt); break; + default: assert(0); + } + } + restore_regs(reglist); +} + +static void do_writestub(int n) +{ + assem_debug("do_writestub %x\n",start+stubs[n].a*4); + literal_pool(256); + set_jump_target(stubs[n].addr, out); + enum stub_type type=stubs[n].type; + int i=stubs[n].a; + int rs=stubs[n].b; + const struct regstat *i_regs=(struct regstat *)stubs[n].c; + u_int reglist=stubs[n].e; + const signed char *i_regmap=i_regs->regmap; + int rt,r; + if(dops[i].itype==C1LS||dops[i].itype==C2LS) { + rt=get_reg(i_regmap,r=FTEMP); + }else{ + rt=get_reg(i_regmap,r=dops[i].rs2); + } + assert(rs>=0); + assert(rt>=0); + int rtmp,temp=-1,temp2=HOST_TEMPREG,regs_saved=0; + void *restore_jump = NULL; + int reglist2=reglist|(1<=0); + assert(rt>=0); + uintptr_t host_addr = 0; + void *handler = get_direct_memhandler(mem_wtab, addr, type, &host_addr); + if (handler == NULL) { + if(addr!=host_addr) + emit_movimm_from(addr,rs,host_addr,rs); + switch(type) { + case STOREB_STUB: emit_writebyte_indexed(rt,0,rs); break; + case STOREH_STUB: emit_writehword_indexed(rt,0,rs); break; + case STOREW_STUB: emit_writeword_indexed(rt,0,rs); break; + default: assert(0); + } + return; + } + + // call a memhandler + save_regs(reglist); + pass_args(rs,rt); + int cc=get_reg(regmap,CCREG); + if(cc<0) + emit_loadreg(CCREG,2); + emit_addimm(cc<0?2:cc,adj,2); + emit_movimm((u_int)handler,3); + // returns new cycle_count + emit_far_call(jump_handler_write_h); + emit_addimm(0,-adj,cc<0?2:cc); + if(cc<0) + emit_storereg(CCREG,2); + restore_regs(reglist); +} + +// this output is parsed by verify_dirty, get_bounds, isclean, get_clean_addr +static void do_dirty_stub_emit_args(u_int arg0, u_int source_len) +{ + #ifndef HAVE_ARMV7 + emit_loadlp((int)source, 1); + emit_loadlp((int)copy, 2); + emit_loadlp(source_len, 3); + #else + emit_movw(((u_int)source)&0x0000FFFF, 1); + emit_movw(((u_int)copy)&0x0000FFFF, 2); + 
emit_movt(((u_int)source)&0xFFFF0000, 1); + emit_movt(((u_int)copy)&0xFFFF0000, 2); + emit_movw(source_len, 3); + #endif + emit_movimm(arg0, 0); +} + +static void *do_dirty_stub(int i, u_int source_len) +{ + assem_debug("do_dirty_stub %x\n",start+i*4); + do_dirty_stub_emit_args(start + i*4, source_len); + emit_far_call(verify_code); + void *entry = out; + load_regs_entry(i); + if (entry == out) + entry = instr_addr[i]; + emit_jmp(instr_addr[i]); + return entry; +} + +static void do_dirty_stub_ds(u_int source_len) +{ + do_dirty_stub_emit_args(start + 1, source_len); + emit_far_call(verify_code_ds); +} + +/* Special assem */ + +static void c2op_prologue(u_int op, int i, const struct regstat *i_regs, u_int reglist) +{ + save_regs_all(reglist); + cop2_do_stall_check(op, i, i_regs, 0); +#ifdef PCNT + emit_movimm(op, 0); + emit_far_call(pcnt_gte_start); +#endif + emit_addimm(FP, (u_char *)&psxRegs.CP2D.r[0] - (u_char *)&dynarec_local, 0); // cop2 regs +} + +static void c2op_epilogue(u_int op,u_int reglist) +{ +#ifdef PCNT + emit_movimm(op,0); + emit_far_call(pcnt_gte_end); +#endif + restore_regs_all(reglist); +} + +static void c2op_call_MACtoIR(int lm,int need_flags) +{ + if(need_flags) + emit_far_call(lm?gteMACtoIR_lm1:gteMACtoIR_lm0); + else + emit_far_call(lm?gteMACtoIR_lm1_nf:gteMACtoIR_lm0_nf); +} + +static void c2op_call_rgb_func(void *func,int lm,int need_ir,int need_flags) +{ + emit_far_call(func); + // func is C code and trashes r0 + emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0); + if(need_flags||need_ir) + c2op_call_MACtoIR(lm,need_flags); + emit_far_call(need_flags?gteMACtoRGB:gteMACtoRGB_nf); +} + +static void c2op_assemble(int i, const struct regstat *i_regs) +{ + u_int c2op = source[i] & 0x3f; + u_int reglist_full = get_host_reglist(i_regs->regmap); + u_int reglist = reglist_full & CALLER_SAVE_REGS; + int need_flags, need_ir; + + if (gte_handlers[c2op]!=NULL) { + need_flags=!(gte_unneeded[i+1]>>63); // +1 because of how liveness detection 
works + need_ir=(gte_unneeded[i+1]&0xe00)!=0xe00; + assem_debug("gte op %08x, unneeded %016llx, need_flags %d, need_ir %d\n", + source[i],gte_unneeded[i+1],need_flags,need_ir); + if(HACK_ENABLED(NDHACK_GTE_NO_FLAGS)) + need_flags=0; + int shift = (source[i] >> 19) & 1; + int lm = (source[i] >> 10) & 1; + switch(c2op) { +#ifndef DRC_DBG + case GTE_MVMVA: { +#ifdef HAVE_ARMV5 + int v = (source[i] >> 15) & 3; + int cv = (source[i] >> 13) & 3; + int mx = (source[i] >> 17) & 3; + reglist=reglist_full&(CALLER_SAVE_REGS|0xf0); // +{r4-r7} + c2op_prologue(c2op,i,i_regs,reglist); + /* r4,r5 = VXYZ(v) packed; r6 = &MX11(mx); r7 = &CV1(cv) */ + if(v<3) + emit_ldrd(v*8,0,4); + else { + emit_movzwl_indexed(9*4,0,4); // gteIR + emit_movzwl_indexed(10*4,0,6); + emit_movzwl_indexed(11*4,0,5); + emit_orrshl_imm(6,16,4); + } + if(mx<3) + emit_addimm(0,32*4+mx*8*4,6); + else + emit_readword(&zeromem_ptr,6); + if(cv<3) + emit_addimm(0,32*4+(cv*8+5)*4,7); + else + emit_readword(&zeromem_ptr,7); +#ifdef __ARM_NEON__ + emit_movimm(source[i],1); // opcode + emit_far_call(gteMVMVA_part_neon); + if(need_flags) { + emit_movimm(lm,1); + emit_far_call(gteMACtoIR_flags_neon); + } +#else + if(cv==3&&shift) + emit_far_call((int)gteMVMVA_part_cv3sh12_arm); + else { + emit_movimm(shift,1); + emit_far_call((int)(need_flags?gteMVMVA_part_arm:gteMVMVA_part_nf_arm)); + } + if(need_flags||need_ir) + c2op_call_MACtoIR(lm,need_flags); +#endif +#else /* if not HAVE_ARMV5 */ + c2op_prologue(c2op,i,i_regs,reglist); + emit_movimm(source[i],1); // opcode + emit_writeword(1,&psxRegs.code); + emit_far_call(need_flags?gte_handlers[c2op]:gte_handlers_nf[c2op]); +#endif + break; + } + case GTE_OP: + c2op_prologue(c2op,i,i_regs,reglist); + emit_far_call(shift?gteOP_part_shift:gteOP_part_noshift); + if(need_flags||need_ir) { + emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0); + c2op_call_MACtoIR(lm,need_flags); + } + break; + case GTE_DPCS: + c2op_prologue(c2op,i,i_regs,reglist); + 
c2op_call_rgb_func(shift?gteDPCS_part_shift:gteDPCS_part_noshift,lm,need_ir,need_flags); + break; + case GTE_INTPL: + c2op_prologue(c2op,i,i_regs,reglist); + c2op_call_rgb_func(shift?gteINTPL_part_shift:gteINTPL_part_noshift,lm,need_ir,need_flags); + break; + case GTE_SQR: + c2op_prologue(c2op,i,i_regs,reglist); + emit_far_call(shift?gteSQR_part_shift:gteSQR_part_noshift); + if(need_flags||need_ir) { + emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0); + c2op_call_MACtoIR(lm,need_flags); + } + break; + case GTE_DCPL: + c2op_prologue(c2op,i,i_regs,reglist); + c2op_call_rgb_func(gteDCPL_part,lm,need_ir,need_flags); + break; + case GTE_GPF: + c2op_prologue(c2op,i,i_regs,reglist); + c2op_call_rgb_func(shift?gteGPF_part_shift:gteGPF_part_noshift,lm,need_ir,need_flags); + break; + case GTE_GPL: + c2op_prologue(c2op,i,i_regs,reglist); + c2op_call_rgb_func(shift?gteGPL_part_shift:gteGPL_part_noshift,lm,need_ir,need_flags); + break; +#endif + default: + c2op_prologue(c2op,i,i_regs,reglist); +#ifdef DRC_DBG + emit_movimm(source[i],1); // opcode + emit_writeword(1,&psxRegs.code); +#endif + emit_far_call(need_flags?gte_handlers[c2op]:gte_handlers_nf[c2op]); + break; + } + c2op_epilogue(c2op,reglist); + } +} + +static void c2op_ctc2_31_assemble(signed char sl, signed char temp) +{ + //value = value & 0x7ffff000; + //if (value & 0x7f87e000) value |= 0x80000000; + emit_shrimm(sl,12,temp); + emit_shlimm(temp,12,temp); + emit_testimm(temp,0x7f000000); + emit_testeqimm(temp,0x00870000); + emit_testeqimm(temp,0x0000e000); + emit_orrne_imm(temp,0x80000000,temp); +} + +static void do_mfc2_31_one(u_int copr,signed char temp) +{ + emit_readword(&reg_cop2d[copr],temp); + emit_lsls_imm(temp,16,temp); + emit_cmovs_imm(0,temp); + emit_cmpimm(temp,0xf80<<16); + emit_andimm(temp,0xf80<<16,temp); + emit_cmovae_imm(0xf80<<16,temp); +} + +static void c2op_mfc2_29_assemble(signed char tl, signed char temp) +{ + if (temp < 0) { + host_tempreg_acquire(); + temp = HOST_TEMPREG; + } +
do_mfc2_31_one(9,temp); + emit_shrimm(temp,7+16,tl); + do_mfc2_31_one(10,temp); + emit_orrshr_imm(temp,2+16,tl); + do_mfc2_31_one(11,temp); + emit_orrshr_imm(temp,-3+16,tl); + emit_writeword(tl,&reg_cop2d[29]); + if (temp == HOST_TEMPREG) + host_tempreg_release(); +} + +static void multdiv_assemble_arm(int i, const struct regstat *i_regs) +{ + // case 0x18: MULT + // case 0x19: MULTU + // case 0x1A: DIV + // case 0x1B: DIVU + // case 0x1C: DMULT + // case 0x1D: DMULTU + // case 0x1E: DDIV + // case 0x1F: DDIVU + if(dops[i].rs1&&dops[i].rs2) + { + if((dops[i].opcode2&4)==0) // 32-bit + { + if(dops[i].opcode2==0x18) // MULT + { + signed char m1=get_reg(i_regs->regmap,dops[i].rs1); + signed char m2=get_reg(i_regs->regmap,dops[i].rs2); + signed char hi=get_reg(i_regs->regmap,HIREG); + signed char lo=get_reg(i_regs->regmap,LOREG); + assert(m1>=0); + assert(m2>=0); + assert(hi>=0); + assert(lo>=0); + emit_smull(m1,m2,hi,lo); + } + if(dops[i].opcode2==0x19) // MULTU + { + signed char m1=get_reg(i_regs->regmap,dops[i].rs1); + signed char m2=get_reg(i_regs->regmap,dops[i].rs2); + signed char hi=get_reg(i_regs->regmap,HIREG); + signed char lo=get_reg(i_regs->regmap,LOREG); + assert(m1>=0); + assert(m2>=0); + assert(hi>=0); + assert(lo>=0); + emit_umull(m1,m2,hi,lo); + } + if(dops[i].opcode2==0x1A) // DIV + { + signed char d1=get_reg(i_regs->regmap,dops[i].rs1); + signed char d2=get_reg(i_regs->regmap,dops[i].rs2); + assert(d1>=0); + assert(d2>=0); + signed char quotient=get_reg(i_regs->regmap,LOREG); + signed char remainder=get_reg(i_regs->regmap,HIREG); + assert(quotient>=0); + assert(remainder>=0); + emit_movs(d1,remainder); + emit_movimm(0xffffffff,quotient); + emit_negmi(quotient,quotient); // .. quotient and .. + emit_negmi(remainder,remainder); // ..
remainder for div0 case (will be negated back after jump) + emit_movs(d2,HOST_TEMPREG); + emit_jeq(out+52); // Division by zero + emit_negsmi(HOST_TEMPREG,HOST_TEMPREG); +#ifdef HAVE_ARMV5 + emit_clz(HOST_TEMPREG,quotient); + emit_shl(HOST_TEMPREG,quotient,HOST_TEMPREG); +#else + emit_movimm(0,quotient); + emit_addpl_imm(quotient,1,quotient); + emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG); + emit_jns(out-2*4); +#endif + emit_orimm(quotient,1<<31,quotient); + emit_shr(quotient,quotient,quotient); + emit_cmp(remainder,HOST_TEMPREG); + emit_subcs(remainder,HOST_TEMPREG,remainder); + emit_adcs(quotient,quotient,quotient); + emit_shrimm(HOST_TEMPREG,1,HOST_TEMPREG); + emit_jcc(out-16); // -4 + emit_teq(d1,d2); + emit_negmi(quotient,quotient); + emit_test(d1,d1); + emit_negmi(remainder,remainder); + } + if(dops[i].opcode2==0x1B) // DIVU + { + signed char d1=get_reg(i_regs->regmap,dops[i].rs1); // dividend + signed char d2=get_reg(i_regs->regmap,dops[i].rs2); // divisor + assert(d1>=0); + assert(d2>=0); + signed char quotient=get_reg(i_regs->regmap,LOREG); + signed char remainder=get_reg(i_regs->regmap,HIREG); + assert(quotient>=0); + assert(remainder>=0); + emit_mov(d1,remainder); + emit_movimm(0xffffffff,quotient); // div0 case + emit_test(d2,d2); + emit_jeq(out+40); // Division by zero +#ifdef HAVE_ARMV5 + emit_clz(d2,HOST_TEMPREG); + emit_movimm(1<<31,quotient); + emit_shl(d2,HOST_TEMPREG,d2); +#else + emit_movimm(0,HOST_TEMPREG); + emit_addpl_imm(HOST_TEMPREG,1,HOST_TEMPREG); + emit_lslpls_imm(d2,1,d2); + emit_jns(out-2*4); + emit_movimm(1<<31,quotient); +#endif + emit_shr(quotient,HOST_TEMPREG,quotient); + emit_cmp(remainder,d2); + emit_subcs(remainder,d2,remainder); + emit_adcs(quotient,quotient,quotient); + emit_shrcc_imm(d2,1,d2); + emit_jcc(out-16); // -4 + } + } + else // 64-bit + assert(0); + } + else + { + // Multiply by zero is zero. + // MIPS does not have a divide by zero exception. + // The result is undefined, we return zero. 
+ signed char hr=get_reg(i_regs->regmap,HIREG); + signed char lr=get_reg(i_regs->regmap,LOREG); + if(hr>=0) emit_zeroreg(hr); + if(lr>=0) emit_zeroreg(lr); + } +} +#define multdiv_assemble multdiv_assemble_arm + +static void do_jump_vaddr(int rs) +{ + emit_far_jump(jump_vaddr_reg[rs]); +} + +static void do_preload_rhash(int r) { + // Don't need this for ARM. On x86, this puts the value 0xf8 into the + // register. On ARM the hash can be done with a single instruction (below) +} + +static void do_preload_rhtbl(int ht) { + emit_addimm(FP,(int)&mini_ht-(int)&dynarec_local,ht); +} + +static void do_rhash(int rs,int rh) { + emit_andimm(rs,0xf8,rh); +} + +static void do_miniht_load(int ht,int rh) { + assem_debug("ldr %s,[%s,%s]!\n",regname[rh],regname[ht],regname[rh]); + output_w32(0xe7b00000|rd_rn_rm(rh,ht,rh)); +} + +static void do_miniht_jump(int rs,int rh,int ht) { + emit_cmp(rh,rs); + emit_ldreq_indexed(ht,4,15); + #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK + if(rs!=7) + emit_mov(rs,7); + rs=7; + #endif + do_jump_vaddr(rs); +} + +static void do_miniht_insert(u_int return_address,int rt,int temp) { + #ifndef HAVE_ARMV7 + emit_movimm(return_address,rt); // PC into link register + add_to_linker(out,return_address,1); + emit_pcreladdr(temp); + emit_writeword(rt,&mini_ht[(return_address&0xFF)>>3][0]); + emit_writeword(temp,&mini_ht[(return_address&0xFF)>>3][1]); + #else + emit_movw(return_address&0x0000FFFF,rt); + add_to_linker(out,return_address,1); + emit_pcreladdr(temp); + emit_writeword(temp,&mini_ht[(return_address&0xFF)>>3][1]); + emit_movt(return_address&0xFFFF0000,rt); + emit_writeword(rt,&mini_ht[(return_address&0xFF)>>3][0]); + #endif +} + +// CPU-architecture-specific initialization +static void arch_init(void) +{ + uintptr_t diff = (u_char *)&ndrc->tramp.f - (u_char *)&ndrc->tramp.ops - 8; + struct tramp_insns *ops = ndrc->tramp.ops; + size_t i; + assert(!(diff & 3)); + assert(diff < 0x1000); + start_tcache_write(ops, (u_char *)ops + sizeof(ndrc->tramp.ops)); + 
for (i = 0; i < ARRAY_SIZE(ndrc->tramp.ops); i++) + ops[i].ldrpc = 0xe5900000 | rd_rn_rm(15,15,0) | diff; // ldr pc, [=val] + end_tcache_write(ops, (u_char *)ops + sizeof(ndrc->tramp.ops)); +} + +// vim:shiftwidth=2:expandtab diff --git a/libpcsxcore/new_dynarec/assem_arm.h b/libpcsxcore/new_dynarec/assem_arm.h new file mode 100644 index 00000000..75273aa8 --- /dev/null +++ b/libpcsxcore/new_dynarec/assem_arm.h @@ -0,0 +1,44 @@ +#define HOST_IMM8 1 +#define HAVE_CMOV_IMM 1 +#define HAVE_CONDITIONAL_CALL 1 + +/* ARM calling convention: + r0-r3, r12: caller-save + r4-r11: callee-save */ + +/* GCC register naming convention: + r10 = sl (base) + r11 = fp (frame pointer) + r12 = ip (scratch) + r13 = sp (stack pointer) + r14 = lr (link register) + r15 = pc (program counter) */ + +#define HOST_REGS 13 +#define HOST_CCREG 10 +#define HOST_BTREG 8 +#define EXCLUDE_REG 11 + +// Note: FP is set to &dynarec_local when executing generated code. +// Thus the local variables are actually global and not on the stack. 
+#define FP 11 +#define LR 14 +#define HOST_TEMPREG 14 + +#ifndef __MACH__ +#define CALLER_SAVE_REGS 0x100f +#else +#define CALLER_SAVE_REGS 0x120f +#endif +#define PREFERRED_REG_FIRST 4 +#define PREFERRED_REG_LAST 9 + +extern char *invc_ptr; + +#define TARGET_SIZE_2 24 // 2^24 = 16 megabytes + +struct tramp_insns +{ + u_int ldrpc; +}; + diff --git a/libpcsxcore/new_dynarec/assem_arm64.c b/libpcsxcore/new_dynarec/assem_arm64.c new file mode 100644 index 00000000..0b492211 --- /dev/null +++ b/libpcsxcore/new_dynarec/assem_arm64.c @@ -0,0 +1,2093 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Mupen64plus/PCSX - assem_arm64.c * + * Copyright (C) 2009-2011 Ari64 * + * Copyright (C) 2009-2018 Gillou68310 * + * Copyright (C) 2021 notaz * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include "pcnt.h" +#include "arm_features.h" + +#define unused __attribute__((unused)) + +void do_memhandler_pre(); +void do_memhandler_post(); + +/* Linker */ +static void set_jump_target(void *addr, void *target) +{ + u_int *ptr = addr; + intptr_t offset = (u_char *)target - (u_char *)addr; + + if ((*ptr&0xFC000000) == 0x14000000) { // b + assert(offset>=-134217728LL&&offset<134217728LL); + *ptr=(*ptr&0xFC000000)|((offset>>2)&0x3ffffff); + } + else if ((*ptr&0xff000000) == 0x54000000 // b.cond + || (*ptr&0x7e000000) == 0x34000000) { // cbz/cbnz + // Conditional branches are limited to +/- 1MB + // block max size is 256k so branching beyond the +/- 1MB limit + // should only happen when jumping to an already compiled block (see add_jump_out) + // a workaround would be to do a trampoline jump via a stub at the end of the block + assert(-1048576 <= offset && offset < 1048576); + *ptr=(*ptr&0xFF00000F)|(((offset>>2)&0x7ffff)<<5); + } + else if((*ptr&0x9f000000)==0x10000000) { // adr + // generated by do_miniht_insert + assert(offset>=-1048576LL&&offset<1048576LL); + *ptr=(*ptr&0x9F00001F)|(offset&0x3)<<29|((offset>>2)&0x7ffff)<<5; + } + else + abort(); // should not happen +} + +// from a pointer to external jump stub (which was produced by emit_extjump2) +// find where the jumping insn is +static void *find_extjump_insn(void *stub) +{ + int *ptr = (int *)stub + 2; + assert((*ptr&0x9f000000) == 0x10000000); // adr + int offset = (((signed int)(*ptr<<8)>>13)<<2)|((*ptr>>29)&0x3); + return ptr + offset / 4; +} + +// find where external branch is linked to using addr of its stub: +// get address that the stub loads (dyna_linker arg1), +// treat it as a pointer to branch insn, +// return addr where that branch jumps to +static void *get_pointer(void *stub) +{ + int *i_ptr = find_extjump_insn(stub); + if ((*i_ptr&0xfc000000) == 0x14000000) // b + return i_ptr + ((signed int)(*i_ptr<<6)>>6);
+ if ((*i_ptr&0xff000000) == 0x54000000 // b.cond + || (*i_ptr&0x7e000000) == 0x34000000) // cbz/cbnz + return i_ptr + ((signed int)(*i_ptr<<8)>>13); + assert(0); + return NULL; +} + +// Allocate a specific ARM register. +static void alloc_arm_reg(struct regstat *cur,int i,signed char reg,int hr) +{ + int n; + int dirty=0; + + // see if it's already allocated (and dealloc it) + for(n=0;nregmap[n]==reg) { + dirty=(cur->dirty>>n)&1; + cur->regmap[n]=-1; + } + } + + cur->regmap[hr]=reg; + cur->dirty&=~(1<dirty|=dirty<isconst&=~(1< 134217727) { + SysPrintf("%s: out of range: %p %lx\n", __func__, addr, offset); + abort(); + return 0; + } + return ((u_int)offset >> 2) & 0x03ffffff; +} + +static u_int genjmpcc(const u_char *addr) +{ + intptr_t offset = addr - out; + if ((uintptr_t)addr < 3) return 0; + if (offset < -1048576 || offset > 1048572) { + SysPrintf("%s: out of range: %p %lx\n", __func__, addr, offset); + abort(); + return 0; + } + return ((u_int)offset >> 2) & 0x7ffff; +} + +static uint32_t is_mask(u_int value) +{ + return value && ((value + 1) & value) == 0; +} + +// This function returns true if the argument contains a +// non-empty sequence of ones (possibly rotated) with the remainder zero. 
+static uint32_t is_rotated_mask(u_int value) +{ + if (value == 0 || value == ~0) + return 0; + if (is_mask((value - 1) | value)) + return 1; + return is_mask((~value - 1) | ~value); +} + +static void gen_logical_imm(u_int value, u_int *immr, u_int *imms) +{ + int lzeros, tzeros, ones; + assert(value != 0); + if (is_mask((value - 1) | value)) { + lzeros = __builtin_clz(value); + tzeros = __builtin_ctz(value); + ones = 32 - lzeros - tzeros; + *immr = (32 - tzeros) & 31; + *imms = ones - 1; + return; + } + value = ~value; + if (is_mask((value - 1) | value)) { + lzeros = __builtin_clz(value); + tzeros = __builtin_ctz(value); + ones = 32 - lzeros - tzeros; + *immr = lzeros; + *imms = 31 - ones; + return; + } + abort(); +} + +static void emit_mov(u_int rs, u_int rt) +{ + assem_debug("mov %s,%s\n", regname[rt], regname[rs]); + output_w32(0x2a000000 | rm_rn_rd(rs, WZR, rt)); +} + +static void emit_mov64(u_int rs, u_int rt) +{ + assem_debug("mov %s,%s\n", regname64[rt], regname64[rs]); + output_w32(0xaa000000 | rm_rn_rd(rs, WZR, rt)); +} + +static void emit_add(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("add %s,%s,%s\n", regname[rt], regname[rs1], regname[rs2]); + output_w32(0x0b000000 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_add64(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("add %s,%s,%s\n", regname64[rt], regname64[rs1], regname64[rs2]); + output_w32(0x8b000000 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_adds64(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("adds %s,%s,%s\n",regname64[rt],regname64[rs1],regname64[rs2]); + output_w32(0xab000000 | rm_rn_rd(rs2, rs1, rt)); +} +#define emit_adds_ptr emit_adds64 + +static void emit_neg(u_int rs, u_int rt) +{ + assem_debug("neg %s,%s\n",regname[rt],regname[rs]); + output_w32(0x4b000000 | rm_rn_rd(rs, WZR, rt)); +} + +static void emit_sub(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("sub %s,%s,%s\n", regname[rt], regname[rs1], regname[rs2]); + output_w32(0x4b000000 | rm_imm6_rn_rd(rs2, 0, rs1, 
rt)); +} + +static void emit_sub_asrimm(u_int rs1, u_int rs2, u_int shift, u_int rt) +{ + assem_debug("sub %s,%s,%s,asr #%u\n",regname[rt],regname[rs1],regname[rs2],shift); + output_w32(0x4b800000 | rm_imm6_rn_rd(rs2, shift, rs1, rt)); +} + +static void emit_movz(u_int imm, u_int rt) +{ + assem_debug("movz %s,#%#x\n", regname[rt], imm); + output_w32(0x52800000 | imm16_rd(imm, rt)); +} + +static void emit_movz_lsl16(u_int imm, u_int rt) +{ + assem_debug("movz %s,#%#x,lsl #16\n", regname[rt], imm); + output_w32(0x52a00000 | imm16_rd(imm, rt)); +} + +static void emit_movn(u_int imm, u_int rt) +{ + assem_debug("movn %s,#%#x\n", regname[rt], imm); + output_w32(0x12800000 | imm16_rd(imm, rt)); +} + +static void emit_movn_lsl16(u_int imm,u_int rt) +{ + assem_debug("movn %s,#%#x,lsl #16\n", regname[rt], imm); + output_w32(0x12a00000 | imm16_rd(imm, rt)); +} + +static void emit_movk(u_int imm,u_int rt) +{ + assem_debug("movk %s,#%#x\n", regname[rt], imm); + output_w32(0x72800000 | imm16_rd(imm, rt)); +} + +static void emit_movk_lsl16(u_int imm,u_int rt) +{ + assert(imm<65536); + assem_debug("movk %s,#%#x,lsl #16\n", regname[rt], imm); + output_w32(0x72a00000 | imm16_rd(imm, rt)); +} + +static void emit_zeroreg(u_int rt) +{ + emit_movz(0, rt); +} + +static void emit_movimm(u_int imm, u_int rt) +{ + if (imm < 65536) + emit_movz(imm, rt); + else if ((~imm) < 65536) + emit_movn(~imm, rt); + else if ((imm&0xffff) == 0) + emit_movz_lsl16(imm >> 16, rt); + else if (((~imm)&0xffff) == 0) + emit_movn_lsl16(~imm >> 16, rt); + else if (is_rotated_mask(imm)) { + u_int immr, imms; + gen_logical_imm(imm, &immr, &imms); + assem_debug("orr %s,wzr,#%#x\n", regname[rt], imm); + output_w32(0x32000000 | n_immr_imms_rn_rd(0, immr, imms, WZR, rt)); + } + else { + emit_movz(imm & 0xffff, rt); + emit_movk_lsl16(imm >> 16, rt); + } +} + +static void emit_readword(void *addr, u_int rt) +{ + uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local; + if (!(offset & 3) && offset <= 16380) { + 
assem_debug("ldr %s,[x%d+%#lx]\n", regname[rt], FP, offset); + output_w32(0xb9400000 | imm12_rn_rd(offset >> 2, FP, rt)); + } + else + abort(); +} + +static void emit_readdword(void *addr, u_int rt) +{ + uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local; + if (!(offset & 7) && offset <= 32760) { + assem_debug("ldr %s,[x%d+%#lx]\n", regname64[rt], FP, offset); + output_w32(0xf9400000 | imm12_rn_rd(offset >> 3, FP, rt)); + } + else + abort(); +} +#define emit_readptr emit_readdword + +static void emit_readshword(void *addr, u_int rt) +{ + uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local; + if (!(offset & 1) && offset <= 8190) { + assem_debug("ldrsh %s,[x%d+%#lx]\n", regname[rt], FP, offset); + output_w32(0x79c00000 | imm12_rn_rd(offset >> 1, FP, rt)); + } + else + assert(0); +} + +static void emit_loadreg(u_int r, u_int hr) +{ + int is64 = 0; + assert(r < 64); + if (r == 0) + emit_zeroreg(hr); + else { + void *addr = &psxRegs.GPR.r[r]; + switch (r) { + //case HIREG: addr = &hi; break; + //case LOREG: addr = &lo; break; + case CCREG: addr = &cycle_count; break; + case CSREG: addr = &Status; break; + case INVCP: addr = &invc_ptr; is64 = 1; break; + case ROREG: addr = &ram_offset; is64 = 1; break; + default: assert(r < 34); break; + } + if (is64) + emit_readdword(addr, hr); + else + emit_readword(addr, hr); + } +} + +static void emit_writeword(u_int rt, void *addr) +{ + uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local; + if (!(offset & 3) && offset <= 16380) { + assem_debug("str %s,[x%d+%#lx]\n", regname[rt], FP, offset); + output_w32(0xb9000000 | imm12_rn_rd(offset >> 2, FP, rt)); + } + else + assert(0); +} + +static void emit_writedword(u_int rt, void *addr) +{ + uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local; + if (!(offset & 7) && offset <= 32760) { + assem_debug("str %s,[x%d+%#lx]\n", regname64[rt], FP, offset); + output_w32(0xf9000000 | imm12_rn_rd(offset >> 3, FP, rt)); + } + else + abort(); +} + +static void 
emit_storereg(u_int r, u_int hr) +{ + assert(r < 64); + void *addr = &psxRegs.GPR.r[r]; + switch (r) { + //case HIREG: addr = &hi; break; + //case LOREG: addr = &lo; break; + case CCREG: addr = &cycle_count; break; + default: assert(r < 34); break; + } + emit_writeword(hr, addr); +} + +static void emit_test(u_int rs, u_int rt) +{ + assem_debug("tst %s,%s\n", regname[rs], regname[rt]); + output_w32(0x6a000000 | rm_rn_rd(rt, rs, WZR)); +} + +static void emit_testimm(u_int rs, u_int imm) +{ + u_int immr, imms; + assem_debug("tst %s,#%#x\n", regname[rs], imm); + assert(is_rotated_mask(imm)); // good enough for PCSX + gen_logical_imm(imm, &immr, &imms); + output_w32(0x72000000 | n_immr_imms_rn_rd(0, immr, imms, rs, WZR)); +} + +static void emit_not(u_int rs,u_int rt) +{ + assem_debug("mvn %s,%s\n",regname[rt],regname[rs]); + output_w32(0x2a200000 | rm_rn_rd(rs, WZR, rt)); +} + +static void emit_and(u_int rs1,u_int rs2,u_int rt) +{ + assem_debug("and %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x0a000000 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_or(u_int rs1,u_int rs2,u_int rt) +{ + assem_debug("orr %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x2a000000 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_bic(u_int rs1,u_int rs2,u_int rt) +{ + assem_debug("bic %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x0a200000 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_orrshl_imm(u_int rs,u_int imm,u_int rt) +{ + assem_debug("orr %s,%s,%s,lsl #%d\n",regname[rt],regname[rt],regname[rs],imm); + output_w32(0x2a000000 | rm_imm6_rn_rd(rs, imm, rt, rt)); +} + +static void emit_orrshr_imm(u_int rs,u_int imm,u_int rt) +{ + assem_debug("orr %s,%s,%s,lsr #%d\n",regname[rt],regname[rt],regname[rs],imm); + output_w32(0x2a400000 | rm_imm6_rn_rd(rs, imm, rt, rt)); +} + +static void emit_bicsar_imm(u_int rs,u_int imm,u_int rt) +{ + assem_debug("bic %s,%s,%s,asr #%d\n",regname[rt],regname[rt],regname[rs],imm); + 
output_w32(0x0aa00000 | rm_imm6_rn_rd(rs, imm, rt, rt)); +} + +static void emit_xor(u_int rs1,u_int rs2,u_int rt) +{ + assem_debug("eor %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x4a000000 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_xorsar_imm(u_int rs1, u_int rs2, u_int imm, u_int rt) +{ + assem_debug("eor %s,%s,%s,asr #%d\n",regname[rt],regname[rs1],regname[rs2],imm); + output_w32(0x4a800000 | rm_imm6_rn_rd(rs2, imm, rs1, rt)); +} + +static void emit_addimm_s(u_int s, u_int is64, u_int rs, uintptr_t imm, u_int rt) +{ + unused const char *st = s ? "s" : ""; + s = s ? 0x20000000 : 0; + is64 = is64 ? 0x80000000 : 0; + if (imm < 4096) { + assem_debug("add%s %s,%s,%#lx\n", st, regname[rt], regname[rs], imm); + output_w32(0x11000000 | is64 | s | imm12_rn_rd(imm, rs, rt)); + } + else if (-imm < 4096) { + assem_debug("sub%s %s,%s,%#lx\n", st, regname[rt], regname[rs], -imm); + output_w32(0x51000000 | is64 | s | imm12_rn_rd(-imm, rs, rt)); + } + else if (imm < 16777216) { + assem_debug("add %s,%s,#%#lx\n",regname[rt],regname[rt],imm&0xfff000); + output_w32(0x11400000 | is64 | imm12_rn_rd(imm >> 12, rs, rt)); + if ((imm & 0xfff) || s) { + assem_debug("add%s %s,%s,#%#lx\n",st,regname[rt],regname[rs],imm&0xfff); + output_w32(0x11000000 | is64 | s | imm12_rn_rd(imm & 0xfff, rt, rt)); + } + } + else if (-imm < 16777216) { + assem_debug("sub %s,%s,#%#lx\n",regname[rt],regname[rt],-imm&0xfff000); + output_w32(0x51400000 | is64 | imm12_rn_rd(-imm >> 12, rs, rt)); + if ((imm & 0xfff) || s) { + assem_debug("sub%s %s,%s,#%#lx\n",st,regname[rt],regname[rs],-imm&0xfff); + output_w32(0x51000000 | is64 | s | imm12_rn_rd(-imm & 0xfff, rt, rt)); + } + } + else + abort(); +} + +static void emit_addimm(u_int rs, uintptr_t imm, u_int rt) +{ + emit_addimm_s(0, 0, rs, imm, rt); +} + +static void emit_addimm64(u_int rs, uintptr_t imm, u_int rt) +{ + emit_addimm_s(0, 1, rs, imm, rt); +} + +static void emit_addimm_and_set_flags(int imm, u_int rt) +{ + emit_addimm_s(1, 
0, rt, imm, rt); +} + +static void emit_logicop_imm(u_int op, u_int rs, u_int imm, u_int rt) +{ + const char *names[] = { "and", "orr", "eor", "ands" }; + const char *name = names[op]; + u_int immr, imms; + op = op << 29; + if (is_rotated_mask(imm)) { + gen_logical_imm(imm, &immr, &imms); + assem_debug("%s %s,%s,#%#x\n", name, regname[rt], regname[rs], imm); + output_w32(op | 0x12000000 | n_immr_imms_rn_rd(0, immr, imms, rs, rt)); + } + else { + if (rs == HOST_TEMPREG || rt != HOST_TEMPREG) + host_tempreg_acquire(); + emit_movimm(imm, HOST_TEMPREG); + assem_debug("%s %s,%s,%s\n", name, regname[rt], regname[rs], regname[HOST_TEMPREG]); + output_w32(op | 0x0a000000 | rm_rn_rd(HOST_TEMPREG, rs, rt)); + if (rs == HOST_TEMPREG || rt != HOST_TEMPREG) + host_tempreg_release(); + } + (void)name; +} + +static void emit_andimm(u_int rs, u_int imm, u_int rt) +{ + if (imm == 0) + emit_zeroreg(rt); + else + emit_logicop_imm(0, rs, imm, rt); +} + +static void emit_orimm(u_int rs, u_int imm, u_int rt) +{ + if (imm == 0) { + if (rs != rt) + emit_mov(rs, rt); + } + else + emit_logicop_imm(1, rs, imm, rt); +} + +static void emit_xorimm(u_int rs, u_int imm, u_int rt) +{ + if (imm == 0) { + if (rs != rt) + emit_mov(rs, rt); + } + else + emit_logicop_imm(2, rs, imm, rt); +} + +static void emit_sbfm(u_int rs,u_int imm,u_int rt) +{ + assem_debug("sbfm %s,%s,#0,#%d\n",regname[rt],regname[rs],imm); + output_w32(0x13000000 | n_immr_imms_rn_rd(0, 0, imm, rs, rt)); +} + +static void emit_ubfm(u_int rs,u_int imm,u_int rt) +{ + assem_debug("ubfm %s,%s,#0,#%d\n",regname[rt],regname[rs],imm); + output_w32(0x53000000 | n_immr_imms_rn_rd(0, 0, imm, rs, rt)); +} + +static void emit_shlimm(u_int rs,u_int imm,u_int rt) +{ + assem_debug("lsl %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0x53000000 | n_immr_imms_rn_rd(0, (31-imm)+1, 31-imm, rs, rt)); +} + +static void emit_shrimm(u_int rs,u_int imm,u_int rt) +{ + assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm); + 
output_w32(0x53000000 | n_immr_imms_rn_rd(0, imm, 31, rs, rt)); +} + +static void emit_shrimm64(u_int rs,u_int imm,u_int rt) +{ + assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xd3400000 | n_immr_imms_rn_rd(0, imm, 63, rs, rt)); +} + +static void emit_sarimm(u_int rs,u_int imm,u_int rt) +{ + assem_debug("asr %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0x13000000 | n_immr_imms_rn_rd(0, imm, 31, rs, rt)); +} + +static void emit_rorimm(u_int rs,u_int imm,u_int rt) +{ + assem_debug("ror %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0x13800000 | rm_imm6_rn_rd(rs, imm, rs, rt)); +} + +static void emit_signextend16(u_int rs, u_int rt) +{ + assem_debug("sxth %s,%s\n", regname[rt], regname[rs]); + output_w32(0x13000000 | n_immr_imms_rn_rd(0, 0, 15, rs, rt)); +} + +static void emit_shl(u_int rs,u_int rshift,u_int rt) +{ + assem_debug("lsl %s,%s,%s\n",regname[rt],regname[rs],regname[rshift]); + output_w32(0x1ac02000 | rm_rn_rd(rshift, rs, rt)); +} + +static void emit_shr(u_int rs,u_int rshift,u_int rt) +{ + assem_debug("lsr %s,%s,%s\n",regname[rt],regname[rs],regname[rshift]); + output_w32(0x1ac02400 | rm_rn_rd(rshift, rs, rt)); +} + +static void emit_sar(u_int rs,u_int rshift,u_int rt) +{ + assem_debug("asr %s,%s,%s\n",regname[rt],regname[rs],regname[rshift]); + output_w32(0x1ac02800 | rm_rn_rd(rshift, rs, rt)); +} + +static void emit_cmpimm(u_int rs, u_int imm) +{ + if (imm < 4096) { + assem_debug("cmp %s,%#x\n", regname[rs], imm); + output_w32(0x71000000 | imm12_rn_rd(imm, rs, WZR)); + } + else if (-imm < 4096) { + assem_debug("cmn %s,%#x\n", regname[rs], imm); + output_w32(0x31000000 | imm12_rn_rd(-imm, rs, WZR)); + } + else if (imm < 16777216 && !(imm & 0xfff)) { + assem_debug("cmp %s,#%#x\n", regname[rs], imm); + output_w32(0x71400000 | imm12_rn_rd(imm >> 12, rs, WZR)); + } + else { + host_tempreg_acquire(); + emit_movimm(imm, HOST_TEMPREG); + assem_debug("cmp %s,%s\n", regname[rs], regname[HOST_TEMPREG]); + 
output_w32(0x6b000000 | rm_rn_rd(HOST_TEMPREG, rs, WZR)); + host_tempreg_release(); + } +} + +static void emit_cmov_imm(u_int cond0, u_int cond1, u_int imm, u_int rt) +{ + assert(imm == 0 || imm == 1); + assert(cond0 < 0x10); + assert(cond1 < 0x10); + if (imm) { + assem_debug("csinc %s,%s,%s,%s\n",regname[rt],regname[rt],regname[WZR],condname[cond1]); + output_w32(0x1a800400 | (cond1 << 12) | rm_rn_rd(WZR, rt, rt)); + } else { + assem_debug("csel %s,%s,%s,%s\n",regname[rt],regname[WZR],regname[rt],condname[cond0]); + output_w32(0x1a800000 | (cond0 << 12) | rm_rn_rd(rt, WZR, rt)); + } +} + +static void emit_cmovne_imm(u_int imm,u_int rt) +{ + emit_cmov_imm(COND_NE, COND_EQ, imm, rt); +} + +static void emit_cmovl_imm(u_int imm,u_int rt) +{ + emit_cmov_imm(COND_LT, COND_GE, imm, rt); +} + +static void emit_cmovb_imm(int imm,u_int rt) +{ + emit_cmov_imm(COND_CC, COND_CS, imm, rt); +} + +static void emit_cmoveq_reg(u_int rs,u_int rt) +{ + assem_debug("csel %s,%s,%s,eq\n",regname[rt],regname[rs],regname[rt]); + output_w32(0x1a800000 | (COND_EQ << 12) | rm_rn_rd(rt, rs, rt)); +} + +static void emit_cmovne_reg(u_int rs,u_int rt) +{ + assem_debug("csel %s,%s,%s,ne\n",regname[rt],regname[rs],regname[rt]); + output_w32(0x1a800000 | (COND_NE << 12) | rm_rn_rd(rt, rs, rt)); +} + +static void emit_cmovl_reg(u_int rs,u_int rt) +{ + assem_debug("csel %s,%s,%s,lt\n",regname[rt],regname[rs],regname[rt]); + output_w32(0x1a800000 | (COND_LT << 12) | rm_rn_rd(rt, rs, rt)); +} + +static void emit_cmovb_reg(u_int rs,u_int rt) +{ + assem_debug("csel %s,%s,%s,cc\n",regname[rt],regname[rs],regname[rt]); + output_w32(0x1a800000 | (COND_CC << 12) | rm_rn_rd(rt, rs, rt)); +} + +static void emit_cmovs_reg(u_int rs,u_int rt) +{ + assem_debug("csel %s,%s,%s,mi\n",regname[rt],regname[rs],regname[rt]); + output_w32(0x1a800000 | (COND_MI << 12) | rm_rn_rd(rt, rs, rt)); +} + +static void emit_csinvle_reg(u_int rs1,u_int rs2,u_int rt) +{ + assem_debug("csinv 
%s,%s,%s,le\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x5a800000 | (COND_LE << 12) | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_slti32(u_int rs,int imm,u_int rt) +{ + if(rs!=rt) emit_zeroreg(rt); + emit_cmpimm(rs,imm); + if(rs==rt) emit_movimm(0,rt); + emit_cmovl_imm(1,rt); +} + +static void emit_sltiu32(u_int rs,int imm,u_int rt) +{ + if(rs!=rt) emit_zeroreg(rt); + emit_cmpimm(rs,imm); + if(rs==rt) emit_movimm(0,rt); + emit_cmovb_imm(1,rt); +} + +static void emit_cmp(u_int rs,u_int rt) +{ + assem_debug("cmp %s,%s\n",regname[rs],regname[rt]); + output_w32(0x6b000000 | rm_rn_rd(rt, rs, WZR)); +} + +static void emit_set_gz32(u_int rs, u_int rt) +{ + //assem_debug("set_gz32\n"); + emit_cmpimm(rs,1); + emit_movimm(1,rt); + emit_cmovl_imm(0,rt); +} + +static void emit_set_nz32(u_int rs, u_int rt) +{ + //assem_debug("set_nz32\n"); + if(rs!=rt) emit_mov(rs,rt); + emit_test(rs,rs); + emit_cmovne_imm(1,rt); +} + +static void emit_set_if_less32(u_int rs1, u_int rs2, u_int rt) +{ + //assem_debug("set if less (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]); + if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt); + emit_cmp(rs1,rs2); + if(rs1==rt||rs2==rt) emit_movimm(0,rt); + emit_cmovl_imm(1,rt); +} + +static void emit_set_if_carry32(u_int rs1, u_int rs2, u_int rt) +{ + //assem_debug("set if carry (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]); + if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt); + emit_cmp(rs1,rs2); + if(rs1==rt||rs2==rt) emit_movimm(0,rt); + emit_cmovb_imm(1,rt); +} + +static int can_jump_or_call(const void *a) +{ + intptr_t diff = (u_char *)a - out; + return (-134217728 <= diff && diff <= 134217727); +} + +static void emit_call(const void *a) +{ + intptr_t diff = (u_char *)a - out; + assem_debug("bl %p (%p+%lx)%s\n", a, out, diff, func_name(a)); + assert(!(diff & 3)); + if (-134217728 <= diff && diff <= 134217727) + output_w32(0x94000000 | ((diff >> 2) & 0x03ffffff)); + else + abort(); +} + +static void emit_jmp(const void *a) +{ + 
assem_debug("b %p (%p+%lx)%s\n", a, out, (u_char *)a - out, func_name(a)); + u_int offset = genjmp(a); + output_w32(0x14000000 | offset); +} + +static void emit_jne(const void *a) +{ + assem_debug("bne %p\n", a); + u_int offset = genjmpcc(a); + output_w32(0x54000000 | (offset << 5) | COND_NE); +} + +static void emit_jeq(const void *a) +{ + assem_debug("beq %p\n", a); + u_int offset = genjmpcc(a); + output_w32(0x54000000 | (offset << 5) | COND_EQ); +} + +static void emit_js(const void *a) +{ + assem_debug("bmi %p\n", a); + u_int offset = genjmpcc(a); + output_w32(0x54000000 | (offset << 5) | COND_MI); +} + +static void emit_jns(const void *a) +{ + assem_debug("bpl %p\n", a); + u_int offset = genjmpcc(a); + output_w32(0x54000000 | (offset << 5) | COND_PL); +} + +static void emit_jl(const void *a) +{ + assem_debug("blt %p\n", a); + u_int offset = genjmpcc(a); + output_w32(0x54000000 | (offset << 5) | COND_LT); +} + +static void emit_jge(const void *a) +{ + assem_debug("bge %p\n", a); + u_int offset = genjmpcc(a); + output_w32(0x54000000 | (offset << 5) | COND_GE); +} + +static void emit_jno(const void *a) +{ + assem_debug("bvc %p\n", a); + u_int offset = genjmpcc(a); + output_w32(0x54000000 | (offset << 5) | COND_VC); +} + +static void emit_jc(const void *a) +{ + assem_debug("bcs %p\n", a); + u_int offset = genjmpcc(a); + output_w32(0x54000000 | (offset << 5) | COND_CS); +} + +static void emit_cb(u_int isnz, u_int is64, const void *a, u_int r) +{ + assem_debug("cb%sz %s,%p\n", isnz?"n":"", is64?regname64[r]:regname[r], a); + u_int offset = genjmpcc(a); + is64 = is64 ? 0x80000000 : 0; + isnz = isnz ? 0x01000000 : 0; + output_w32(0x34000000 | is64 | isnz | imm19_rt(offset, r)); +} + +static void emit_cbz(const void *a, u_int r) +{ + emit_cb(0, 0, a, r); +} + +static void emit_jmpreg(u_int r) +{ + assem_debug("br %s\n", regname64[r]); + output_w32(0xd61f0000 | rm_rn_rd(0, r, 0)); +} + +static void emit_retreg(u_int r) +{ + assem_debug("ret %s\n", r == LR ? 
"" : regname64[r]); + output_w32(0xd65f0000 | rm_rn_rd(0, r, 0)); +} + +static void emit_ret(void) +{ + emit_retreg(LR); +} + +static void emit_adr(void *addr, u_int rt) +{ + intptr_t offset = (u_char *)addr - out; + assert(-1048576 <= offset && offset < 1048576); + assert(rt < 31); + assem_debug("adr x%d,#%#lx\n", rt, offset); + output_w32(0x10000000 | ((offset&0x3) << 29) | (((offset>>2)&0x7ffff) << 5) | rt); +} + +static void emit_adrp(void *addr, u_int rt) +{ + intptr_t offset = ((intptr_t)addr & ~0xfffl) - ((intptr_t)out & ~0xfffl); + assert(-4294967296l <= offset && offset < 4294967296l); + assert(rt < 31); + offset >>= 12; + assem_debug("adrp %s,#%#lx(000)\n",regname64[rt],offset); + output_w32(0x90000000 | ((offset&0x3)<<29) | (((offset>>2)&0x7ffff)<<5) | rt); +} + +static void emit_readword_indexed(int offset, u_int rs, u_int rt) +{ + assem_debug("ldur %s,[%s+%#x]\n",regname[rt],regname64[rs],offset); + assert(-256 <= offset && offset < 256); + output_w32(0xb8400000 | imm9_rn_rt(offset&0x1ff, rs, rt)); +} + +static void emit_strb_dualindexed(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("strb %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]); + output_w32(0x38204800 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_strh_dualindexed(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("strh %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]); + output_w32(0x78204800 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_str_dualindexed(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("str %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]); + output_w32(0xb8204800 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_readdword_dualindexedx8(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("ldr %s, [%s,%s, uxtw #3]\n",regname64[rt],regname64[rs1],regname[rs2]); + output_w32(0xf8605800 | rm_rn_rd(rs2, rs1, rt)); +} +#define emit_readptr_dualindexedx_ptrlen emit_readdword_dualindexedx8 + +static void emit_ldrb_dualindexed(u_int rs1, u_int rs2, u_int rt) +{ 
+ assem_debug("ldrb %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]); + output_w32(0x38604800 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_ldrsb_dualindexed(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("ldrsb %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]); + output_w32(0x38a04800 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_ldrh_dualindexed(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("ldrh %s, [%s,%s, uxtw]\n",regname[rt],regname64[rs1],regname[rs2]); + output_w32(0x78604800 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_ldrsh_dualindexed(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("ldrsh %s, [%s,%s, uxtw]\n",regname[rt],regname64[rs1],regname[rs2]); + output_w32(0x78a04800 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_ldr_dualindexed(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("ldr %s, [%s,%s, uxtw]\n",regname[rt],regname64[rs1],regname[rs2]); + output_w32(0xb8604800 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_movsbl_indexed(int offset, u_int rs, u_int rt) +{ + assem_debug("ldursb %s,[%s+%#x]\n",regname[rt],regname64[rs],offset); + assert(-256 <= offset && offset < 256); + output_w32(0x38c00000 | imm9_rn_rt(offset&0x1ff, rs, rt)); +} + +static void emit_movswl_indexed(int offset, u_int rs, u_int rt) +{ + assem_debug("ldursh %s,[%s+%#x]\n",regname[rt],regname64[rs],offset); + assert(-256 <= offset && offset < 256); + output_w32(0x78c00000 | imm9_rn_rt(offset&0x1ff, rs, rt)); +} + +static void emit_movzbl_indexed(int offset, u_int rs, u_int rt) +{ + assem_debug("ldurb %s,[%s+%#x]\n",regname[rt],regname64[rs],offset); + assert(-256 <= offset && offset < 256); + output_w32(0x38400000 | imm9_rn_rt(offset&0x1ff, rs, rt)); +} + +static void emit_movzwl_indexed(int offset, u_int rs, u_int rt) +{ + assem_debug("ldurh %s,[%s+%#x]\n",regname[rt],regname64[rs],offset); + assert(-256 <= offset && offset < 256); + output_w32(0x78400000 | imm9_rn_rt(offset&0x1ff, rs, rt)); +} + +static void emit_writeword_indexed(u_int 
rt, int offset, u_int rs) +{ + if (!(offset & 3) && (u_int)offset <= 16380) { + assem_debug("str %s,[%s+%#x]\n", regname[rt], regname[rs], offset); + output_w32(0xb9000000 | imm12_rn_rd(offset >> 2, rs, rt)); + } + else if (-256 <= offset && offset < 256) { + assem_debug("stur %s,[%s+%#x]\n", regname[rt], regname[rs], offset); + output_w32(0xb8000000 | imm9_rn_rt(offset & 0x1ff, rs, rt)); + } + else + assert(0); +} + +static void emit_writehword_indexed(u_int rt, int offset, u_int rs) +{ + if (!(offset & 1) && (u_int)offset <= 8190) { + assem_debug("strh %s,[%s+%#x]\n", regname[rt], regname64[rs], offset); + output_w32(0x79000000 | imm12_rn_rd(offset >> 1, rs, rt)); + } + else if (-256 <= offset && offset < 256) { + assem_debug("sturh %s,[%s+%#x]\n", regname[rt], regname64[rs], offset); + output_w32(0x78000000 | imm9_rn_rt(offset & 0x1ff, rs, rt)); + } + else + assert(0); +} + +static void emit_writebyte_indexed(u_int rt, int offset, u_int rs) +{ + if ((u_int)offset < 4096) { + assem_debug("strb %s,[%s+%#x]\n", regname[rt], regname64[rs], offset); + output_w32(0x39000000 | imm12_rn_rd(offset, rs, rt)); + } + else if (-256 <= offset && offset < 256) { + assem_debug("sturb %s,[%s+%#x]\n", regname[rt], regname64[rs], offset); + output_w32(0x38000000 | imm9_rn_rt(offset & 0x1ff, rs, rt)); + } + else + assert(0); +} + +static void emit_umull(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("umull %s,%s,%s\n",regname64[rt],regname[rs1],regname[rs2]); + output_w32(0x9ba00000 | rm_ra_rn_rd(rs2, WZR, rs1, rt)); +} + +static void emit_smull(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("smull %s,%s,%s\n",regname64[rt],regname[rs1],regname[rs2]); + output_w32(0x9b200000 | rm_ra_rn_rd(rs2, WZR, rs1, rt)); +} + +static void emit_msub(u_int rs1, u_int rs2, u_int rs3, u_int rt) +{ + assem_debug("msub %s,%s,%s,%s\n",regname[rt],regname[rs1],regname[rs2],regname[rs3]); + output_w32(0x1b008000 | rm_ra_rn_rd(rs2, rs3, rs1, rt)); +} + +static void emit_sdiv(u_int rs1, u_int rs2, 
u_int rt) +{ + assem_debug("sdiv %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x1ac00c00 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_udiv(u_int rs1, u_int rs2, u_int rt) +{ + assem_debug("udiv %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x1ac00800 | rm_rn_rd(rs2, rs1, rt)); +} + +static void emit_clz(u_int rs, u_int rt) +{ + assem_debug("clz %s,%s\n",regname[rt],regname[rs]); + output_w32(0x5ac01000 | rn_rd(rs, rt)); +} + +// special case for checking invalid_code +static void emit_cmpmem_indexedsr12_reg(u_int rbase, u_int r, u_int imm) +{ + host_tempreg_acquire(); + emit_shrimm(r, 12, HOST_TEMPREG); + assem_debug("ldrb %s,[%s,%s,uxtw]\n",regname[HOST_TEMPREG],regname64[rbase],regname[HOST_TEMPREG]); + output_w32(0x38604800 | rm_rn_rd(HOST_TEMPREG, rbase, HOST_TEMPREG)); + emit_cmpimm(HOST_TEMPREG, imm); + host_tempreg_release(); +} + +// special for loadlr_assemble, rs2 is destroyed +static void emit_bic_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt) +{ + emit_shl(rs2, shift, rs2); + emit_bic(rs1, rs2, rt); +} + +static void emit_bic_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt) +{ + emit_shr(rs2, shift, rs2); + emit_bic(rs1, rs2, rt); +} + +static void emit_loadlp_ofs(u_int ofs, u_int rt) +{ + output_w32(0x58000000 | imm19_rt(ofs, rt)); +} + +static void emit_ldst(int is_st, int is64, u_int rt, u_int rn, u_int ofs) +{ + u_int op = 0xb9000000; + unused const char *ldst = is_st ? "st" : "ld"; + unused char rp = is64 ? 'x' : 'w'; + assem_debug("%sr %c%d,[x%d,#%#x]\n", ldst, rp, rt, rn, ofs); + is64 = is64 ? 1 : 0; + assert((ofs & ((1 << (2+is64)) - 1)) == 0); + ofs = (ofs >> (2+is64)); + if (!is_st) op |= 0x00400000; + if (is64) op |= 0x40000000; + output_w32(op | imm12_rn_rd(ofs, rn, rt)); +} + +static void emit_ldstp(int is_st, int is64, u_int rt1, u_int rt2, u_int rn, int ofs) +{ + u_int op = 0x29000000; + unused const char *ldst = is_st ? "st" : "ld"; + unused char rp = is64 ? 
'x' : 'w'; + assem_debug("%sp %c%d,%c%d,[x%d,#%#x]\n", ldst, rp, rt1, rp, rt2, rn, ofs); + is64 = is64 ? 1 : 0; + assert((ofs & ((1 << (2+is64)) - 1)) == 0); + ofs = (ofs >> (2+is64)); + assert(-64 <= ofs && ofs <= 63); + ofs &= 0x7f; + if (!is_st) op |= 0x00400000; + if (is64) op |= 0x80000000; + output_w32(op | imm7_rt2_rn_rt(ofs, rt2, rn, rt1)); +} + +static void save_load_regs_all(int is_store, u_int reglist) +{ + int ofs = 0, c = 0; + u_int r, pair[2]; + for (r = 0; reglist; r++, reglist >>= 1) { + if (reglist & 1) + pair[c++] = r; + if (c == 2) { + emit_ldstp(is_store, 1, pair[0], pair[1], SP, SSP_CALLEE_REGS + ofs); + ofs += 8 * 2; + c = 0; + } + } + if (c) { + emit_ldst(is_store, 1, pair[0], SP, SSP_CALLEE_REGS + ofs); + ofs += 8; + } + assert(ofs <= SSP_CALLER_REGS); +} + +// Save registers before function call +static void save_regs(u_int reglist) +{ + reglist &= CALLER_SAVE_REGS; // only save the caller-save registers + save_load_regs_all(1, reglist); +} + +// Restore registers after function call +static void restore_regs(u_int reglist) +{ + reglist &= CALLER_SAVE_REGS; + save_load_regs_all(0, reglist); +} + +/* Stubs/epilogue */ + +static void literal_pool(int n) +{ + (void)literals; +} + +static void literal_pool_jumpover(int n) +{ +} + +// parsed by get_pointer, find_extjump_insn +static void emit_extjump2(u_char *addr, u_int target, void *linker) +{ + assert(((addr[3]&0xfc)==0x14) || ((addr[3]&0xff)==0x54)); // b or b.cond + + emit_movz(target & 0xffff, 0); + emit_movk_lsl16(target >> 16, 0); + + // addr is in the current recompiled block (max 256k) + // offset shouldn't exceed +/-1MB + emit_adr(addr, 1); + emit_far_jump(linker); +} + +static void check_extjump2(void *src) +{ + u_int *ptr = src; + assert((ptr[0] & 0xffe0001f) == 0x52800000); // movz r0, #val + (void)ptr; +} + +// put rt_val into rt, potentially making use of rs with value rs_val +static void emit_movimm_from(u_int rs_val, u_int rs, u_int rt_val, u_int rt) +{ + int diff = rt_val - 
rs_val; + if ((-4096 < diff && diff < 4096) + || (-16777216 < diff && diff < 16777216 && !(diff & 0xfff))) + emit_addimm(rs, diff, rt); + else if (rt_val == ~rs_val) + emit_not(rs, rt); + else if (is_rotated_mask(rs_val ^ rt_val)) + emit_xorimm(rs, rs_val ^ rt_val, rt); + else + emit_movimm(rt_val, rt); +} + +// return 1 if the above function can do its job cheaply +static int is_similar_value(u_int v1, u_int v2) +{ + int diff = v1 - v2; + return (-4096 < diff && diff < 4096) + || (-16777216 < diff && diff < 16777216 && !(diff & 0xfff)) + || v1 == ~v2 + || is_rotated_mask(v1 ^ v2); +} + +static void emit_movimm_from64(u_int rs_val, u_int rs, uintptr_t rt_val, u_int rt) +{ + if (rt_val < 0x100000000ull) { + emit_movimm_from(rs_val, rs, rt_val, rt); + return; + } + // just move the whole thing. At least on Linux all addresses + // seem to be 48bit, so 3 insns - not great not terrible + assem_debug("movz %s,#%#lx\n", regname64[rt], rt_val & 0xffff); + output_w32(0xd2800000 | imm16_rd(rt_val & 0xffff, rt)); + assem_debug("movk %s,#%#lx,lsl #16\n", regname64[rt], (rt_val >> 16) & 0xffff); + output_w32(0xf2a00000 | imm16_rd((rt_val >> 16) & 0xffff, rt)); + assem_debug("movk %s,#%#lx,lsl #32\n", regname64[rt], (rt_val >> 32) & 0xffff); + output_w32(0xf2c00000 | imm16_rd((rt_val >> 32) & 0xffff, rt)); + if (rt_val >> 48) { + assem_debug("movk %s,#%#lx,lsl #48\n", regname64[rt], (rt_val >> 48) & 0xffff); + output_w32(0xf2e00000 | imm16_rd((rt_val >> 48) & 0xffff, rt)); + } +} + +// trashes x2 +static void pass_args64(u_int a0, u_int a1) +{ + if(a0==1&&a1==0) { + // must swap + emit_mov64(a0,2); emit_mov64(a1,1); emit_mov64(2,0); + } + else if(a0!=0&&a1==0) { + emit_mov64(a1,1); + if (a0>=0) emit_mov64(a0,0); + } + else { + if(a0>=0&&a0!=0) emit_mov64(a0,0); + if(a1>=0&&a1!=1) emit_mov64(a1,1); + } +} + +static void loadstore_extend(enum stub_type type, u_int rs, u_int rt) +{ + switch(type) { + case LOADB_STUB: emit_sbfm(rs, 7, rt); break; + case LOADBU_STUB: + case 
STOREB_STUB: emit_ubfm(rs, 7, rt); break; + case LOADH_STUB: emit_sbfm(rs, 15, rt); break; + case LOADHU_STUB: + case STOREH_STUB: emit_ubfm(rs, 15, rt); break; + case LOADW_STUB: + case STOREW_STUB: if (rs != rt) emit_mov(rs, rt); break; + default: assert(0); + } +} + +#include "pcsxmem.h" +//#include "pcsxmem_inline.c" + +static void do_readstub(int n) +{ + assem_debug("do_readstub %x\n",start+stubs[n].a*4); + set_jump_target(stubs[n].addr, out); + enum stub_type type = stubs[n].type; + int i = stubs[n].a; + int rs = stubs[n].b; + const struct regstat *i_regs = (void *)stubs[n].c; + u_int reglist = stubs[n].e; + const signed char *i_regmap = i_regs->regmap; + int rt; + if(dops[i].itype==C1LS||dops[i].itype==C2LS||dops[i].itype==LOADLR) { + rt=get_reg(i_regmap,FTEMP); + }else{ + rt=get_reg(i_regmap,dops[i].rt1); + } + assert(rs>=0); + int r,temp=-1,temp2=HOST_TEMPREG,regs_saved=0; + void *restore_jump = NULL, *handler_jump = NULL; + reglist|=(1<=0&&dops[i].rt1!=0) + reglist&=~(1<=0&&dops[i].rt1!=0)) { + switch(type) { + case LOADB_STUB: emit_ldrsb_dualindexed(temp2,rs,rt); break; + case LOADBU_STUB: emit_ldrb_dualindexed(temp2,rs,rt); break; + case LOADH_STUB: emit_ldrsh_dualindexed(temp2,rs,rt); break; + case LOADHU_STUB: emit_ldrh_dualindexed(temp2,rs,rt); break; + case LOADW_STUB: emit_ldr_dualindexed(temp2,rs,rt); break; + default: assert(0); + } + } + if(regs_saved) { + restore_jump=out; + emit_jmp(0); // jump to reg restore + } + else + emit_jmp(stubs[n].retaddr); // return address + set_jump_target(handler_jump, out); + + if(!regs_saved) + save_regs(reglist); + void *handler=NULL; + if(type==LOADB_STUB||type==LOADBU_STUB) + handler=jump_handler_read8; + if(type==LOADH_STUB||type==LOADHU_STUB) + handler=jump_handler_read16; + if(type==LOADW_STUB) + handler=jump_handler_read32; + assert(handler); + pass_args64(rs,temp2); + int cc=get_reg(i_regmap,CCREG); + if(cc<0) + emit_loadreg(CCREG,2); + emit_addimm(cc<0?2:cc,(int)stubs[n].d,2); + emit_far_call(handler); 
+ // (no cycle reload after read) + if(dops[i].itype==C1LS||dops[i].itype==C2LS||(rt>=0&&dops[i].rt1!=0)) { + loadstore_extend(type,0,rt); + } + if(restore_jump) + set_jump_target(restore_jump, out); + restore_regs(reglist); + emit_jmp(stubs[n].retaddr); +} + +static void inline_readstub(enum stub_type type, int i, u_int addr, + const signed char regmap[], int target, int adj, u_int reglist) +{ + int rs=get_reg(regmap,target); + int rt=get_reg(regmap,target); + if(rs<0) rs=get_reg(regmap,-1); + assert(rs>=0); + u_int is_dynamic=0; + uintptr_t host_addr = 0; + void *handler; + int cc=get_reg(regmap,CCREG); + //if(pcsx_direct_read(type,addr,adj,cc,target?rs:-1,rt)) + // return; + handler = get_direct_memhandler(mem_rtab, addr, type, &host_addr); + if (handler == NULL) { + if(rt<0||dops[i].rt1==0) + return; + if (addr != host_addr) + emit_movimm_from64(addr, rs, host_addr, rs); + switch(type) { + case LOADB_STUB: emit_movsbl_indexed(0,rs,rt); break; + case LOADBU_STUB: emit_movzbl_indexed(0,rs,rt); break; + case LOADH_STUB: emit_movswl_indexed(0,rs,rt); break; + case LOADHU_STUB: emit_movzwl_indexed(0,rs,rt); break; + case LOADW_STUB: emit_readword_indexed(0,rs,rt); break; + default: assert(0); + } + return; + } + is_dynamic = pcsxmem_is_handler_dynamic(addr); + if (is_dynamic) { + if(type==LOADB_STUB||type==LOADBU_STUB) + handler=jump_handler_read8; + if(type==LOADH_STUB||type==LOADHU_STUB) + handler=jump_handler_read16; + if(type==LOADW_STUB) + handler=jump_handler_read32; + } + + // call a memhandler + if(rt>=0&&dops[i].rt1!=0) + reglist&=~(1<>12] << 1; + emit_adrp((void *)l1, 1); + emit_addimm64(1, l1 & 0xfff, 1); + } + else + emit_far_call(do_memhandler_pre); + + emit_far_call(handler); + + // (no cycle reload after read) + if(rt>=0&&dops[i].rt1!=0) + loadstore_extend(type, 0, rt); + restore_regs(reglist); +} + +static void do_writestub(int n) +{ + assem_debug("do_writestub %x\n",start+stubs[n].a*4); + set_jump_target(stubs[n].addr, out); + enum stub_type 
type=stubs[n].type; + int i=stubs[n].a; + int rs=stubs[n].b; + struct regstat *i_regs=(struct regstat *)stubs[n].c; + u_int reglist=stubs[n].e; + signed char *i_regmap=i_regs->regmap; + int rt,r; + if(dops[i].itype==C1LS||dops[i].itype==C2LS) { + rt=get_reg(i_regmap,r=FTEMP); + }else{ + rt=get_reg(i_regmap,r=dops[i].rs2); + } + assert(rs>=0); + assert(rt>=0); + int rtmp,temp=-1,temp2,regs_saved=0; + void *restore_jump = NULL, *handler_jump = NULL; + int reglist2=reglist|(1<= 0); + assert(rt >= 0); + uintptr_t host_addr = 0; + void *handler = get_direct_memhandler(mem_wtab, addr, type, &host_addr); + if (handler == NULL) { + if (addr != host_addr) + emit_movimm_from64(addr, rs, host_addr, rs); + switch (type) { + case STOREB_STUB: emit_writebyte_indexed(rt, 0, rs); break; + case STOREH_STUB: emit_writehword_indexed(rt, 0, rs); break; + case STOREW_STUB: emit_writeword_indexed(rt, 0, rs); break; + default: assert(0); + } + return; + } + + // call a memhandler + save_regs(reglist); + emit_writeword(rs, &address); // some handlers still need it + loadstore_extend(type, rt, 0); + int cc, cc_use; + cc = cc_use = get_reg(regmap, CCREG); + if (cc < 0) + emit_loadreg(CCREG, (cc_use = 2)); + emit_addimm(cc_use, adj, 2); + + emit_far_call(do_memhandler_pre); + emit_far_call(handler); + emit_far_call(do_memhandler_post); + emit_addimm(0, -adj, cc_use); + if (cc < 0) + emit_storereg(CCREG, cc_use); + restore_regs(reglist); +} + +static int verify_code_arm64(const void *source, const void *copy, u_int size) +{ + int ret = memcmp(source, copy, size); + //printf("%s %p,%#x = %d\n", __func__, source, size, ret); + return ret; +} + +// this output is parsed by verify_dirty, get_bounds, isclean, get_clean_addr +static void do_dirty_stub_base(u_int vaddr, u_int source_len) +{ + assert(source_len <= MAXBLOCK*4); + emit_loadlp_ofs(0, 0); // ldr x0, source (literal offset patched later by set_loadlp) + emit_loadlp_ofs(0, 1); // ldr x1, copy (literal offset patched later by set_loadlp) + emit_movz(source_len, 2); + emit_far_call(verify_code_arm64); + void *jmp = out; + 
emit_cbz(0, 0); + emit_movz(vaddr & 0xffff, 0); + emit_movk_lsl16(vaddr >> 16, 0); + emit_far_call(get_addr); + emit_jmpreg(0); + set_jump_target(jmp, out); +} + +static void assert_dirty_stub(const u_int *ptr) +{ + assert((ptr[0] & 0xff00001f) == 0x58000000); // ldr x0, source + assert((ptr[1] & 0xff00001f) == 0x58000001); // ldr x1, copy + assert((ptr[2] & 0xffe0001f) == 0x52800002); // movz w2, #source_len + assert( ptr[8] == 0xd61f0000); // br x0 +} + +static void set_loadlp(u_int *loadl, void *lit) +{ + uintptr_t ofs = (u_char *)lit - (u_char *)loadl; + assert((*loadl & ~0x1f) == 0x58000000); + assert((ofs & 3) == 0); + assert(ofs < 0x100000); + *loadl |= (ofs >> 2) << 5; +} + +static void do_dirty_stub_emit_literals(u_int *loadlps) +{ + set_loadlp(&loadlps[0], out); + output_w64((uintptr_t)source); + set_loadlp(&loadlps[1], out); + output_w64((uintptr_t)copy); +} + +static void *do_dirty_stub(int i, u_int source_len) +{ + assem_debug("do_dirty_stub %x\n",start+i*4); + u_int *loadlps = (void *)out; + do_dirty_stub_base(start + i*4, source_len); + void *entry = out; + load_regs_entry(i); + if (entry == out) + entry = instr_addr[i]; + emit_jmp(instr_addr[i]); + do_dirty_stub_emit_literals(loadlps); + return entry; +} + +static void do_dirty_stub_ds(u_int source_len) +{ + u_int *loadlps = (void *)out; + do_dirty_stub_base(start + 1, source_len); + void *lit_jumpover = out; + emit_jmp(out + 8*2); + do_dirty_stub_emit_literals(loadlps); + set_jump_target(lit_jumpover, out); +} + +static uint64_t get_from_ldr_literal(const u_int *i) +{ + signed int ofs; + assert((i[0] & 0xff000000) == 0x58000000); + ofs = i[0] << 8; + ofs >>= 5+8; + return *(uint64_t *)(i + ofs); +} + +static uint64_t get_from_movz(const u_int *i) +{ + assert((i[0] & 0x7fe00000) == 0x52800000); + return (i[0] >> 5) & 0xffff; +} + +// Find the "clean" entry point from a "dirty" entry point +// by skipping past the call to verify_code +static void *get_clean_addr(u_int *addr) +{ + 
assert_dirty_stub(addr); + return addr + 9; +} + +static int verify_dirty(const u_int *ptr) +{ + const void *source, *copy; + u_int len; + assert_dirty_stub(ptr); + source = (void *)get_from_ldr_literal(&ptr[0]); // ldr x0, source + copy = (void *)get_from_ldr_literal(&ptr[1]); // ldr x1, copy + len = get_from_movz(&ptr[2]); // movz w2, #source_len + return !memcmp(source, copy, len); +} + +static int isclean(void *addr) +{ + const u_int *ptr = addr; + if ((*ptr >> 24) == 0x58) { // the only place ldr (literal) is used + assert_dirty_stub(ptr); + return 0; + } + return 1; +} + +// get source that block at addr was compiled from (host pointers) +static void get_bounds(void *addr, u_char **start, u_char **end) +{ + const u_int *ptr = addr; + assert_dirty_stub(ptr); + *start = (u_char *)get_from_ldr_literal(&ptr[0]); // ldr x0, source + *end = *start + get_from_movz(&ptr[2]); // movz w2, #source_len +} + +/* Special assem */ + +static void c2op_prologue(u_int op, int i, const struct regstat *i_regs, u_int reglist) +{ + save_load_regs_all(1, reglist); + cop2_do_stall_check(op, i, i_regs, 0); +#ifdef PCNT + emit_movimm(op, 0); + emit_far_call(pcnt_gte_start); +#endif + // pointer to cop2 regs + emit_addimm64(FP, (u_char *)&psxRegs.CP2D.r[0] - (u_char *)&dynarec_local, 0); +} + +static void c2op_epilogue(u_int op,u_int reglist) +{ +#ifdef PCNT + emit_movimm(op, 0); + emit_far_call(pcnt_gte_end); +#endif + save_load_regs_all(0, reglist); +} + +static void c2op_assemble(int i, const struct regstat *i_regs) +{ + u_int c2op=source[i]&0x3f; + u_int hr,reglist_full=0,reglist; + int need_flags,need_ir; + for(hr=0;hrregmap[hr]>=0) reglist_full|=1<>63); // +1 because of how liveness detection works + need_ir=(gte_unneeded[i+1]&0xe00)!=0xe00; + assem_debug("gte op %08x, unneeded %016lx, need_flags %d, need_ir %d\n", + source[i],gte_unneeded[i+1],need_flags,need_ir); + if(HACK_ENABLED(NDHACK_GTE_NO_FLAGS)) + need_flags=0; + //int shift = (source[i] >> 19) & 1; + //int lm = 
(source[i] >> 10) & 1; + switch(c2op) { + default: + (void)need_ir; + c2op_prologue(c2op, i, i_regs, reglist); + emit_movimm(source[i],1); // opcode + emit_writeword(1,&psxRegs.code); + emit_far_call(need_flags?gte_handlers[c2op]:gte_handlers_nf[c2op]); + break; + } + c2op_epilogue(c2op,reglist); + } +} + +static void c2op_ctc2_31_assemble(signed char sl, signed char temp) +{ + //value = value & 0x7ffff000; + //if (value & 0x7f87e000) value |= 0x80000000; + emit_andimm(sl, 0x7fffe000, temp); + emit_testimm(temp, 0xff87ffff); + emit_andimm(sl, 0x7ffff000, temp); + host_tempreg_acquire(); + emit_orimm(temp, 0x80000000, HOST_TEMPREG); + emit_cmovne_reg(HOST_TEMPREG, temp); + host_tempreg_release(); + assert(0); // testing needed +} + +static void do_mfc2_31_one(u_int copr,signed char temp) +{ + emit_readshword(®_cop2d[copr],temp); + emit_bicsar_imm(temp,31,temp); + emit_cmpimm(temp,0xf80); + emit_csinvle_reg(temp,WZR,temp); // if (temp > 0xf80) temp = ~0; + emit_andimm(temp,0xf80,temp); +} + +static void c2op_mfc2_29_assemble(signed char tl, signed char temp) +{ + if (temp < 0) { + host_tempreg_acquire(); + temp = HOST_TEMPREG; + } + do_mfc2_31_one(9,temp); + emit_shrimm(temp,7,tl); + do_mfc2_31_one(10,temp); + emit_orrshr_imm(temp,2,tl); + do_mfc2_31_one(11,temp); + emit_orrshl_imm(temp,3,tl); + emit_writeword(tl,®_cop2d[29]); + + if (temp == HOST_TEMPREG) + host_tempreg_release(); +} + +static void multdiv_assemble_arm64(int i, const struct regstat *i_regs) +{ + // case 0x18: MULT + // case 0x19: MULTU + // case 0x1A: DIV + // case 0x1B: DIVU + if(dops[i].rs1&&dops[i].rs2) + { + switch(dops[i].opcode2) + { + case 0x18: // MULT + case 0x19: // MULTU + { + signed char m1=get_reg(i_regs->regmap,dops[i].rs1); + signed char m2=get_reg(i_regs->regmap,dops[i].rs2); + signed char hi=get_reg(i_regs->regmap,HIREG); + signed char lo=get_reg(i_regs->regmap,LOREG); + assert(m1>=0); + assert(m2>=0); + assert(hi>=0); + assert(lo>=0); + + if(dops[i].opcode2==0x18) // MULT + 
emit_smull(m1,m2,hi); + else // MULTU + emit_umull(m1,m2,hi); + + emit_mov(hi,lo); + emit_shrimm64(hi,32,hi); + break; + } + case 0x1A: // DIV + case 0x1B: // DIVU + { + signed char numerator=get_reg(i_regs->regmap,dops[i].rs1); + signed char denominator=get_reg(i_regs->regmap,dops[i].rs2); + signed char quotient=get_reg(i_regs->regmap,LOREG); + signed char remainder=get_reg(i_regs->regmap,HIREG); + assert(numerator>=0); + assert(denominator>=0); + assert(quotient>=0); + assert(remainder>=0); + + if (dops[i].opcode2 == 0x1A) // DIV + emit_sdiv(numerator,denominator,quotient); + else // DIVU + emit_udiv(numerator,denominator,quotient); + emit_msub(quotient,denominator,numerator,remainder); + + // div 0 quotient (remainder is already correct) + host_tempreg_acquire(); + if (dops[i].opcode2 == 0x1A) // DIV + emit_sub_asrimm(0,numerator,31,HOST_TEMPREG); + else + emit_movimm(~0,HOST_TEMPREG); + emit_test(denominator,denominator); + emit_cmoveq_reg(HOST_TEMPREG,quotient); + host_tempreg_release(); + break; + } + default: + assert(0); + } + } + else + { + signed char hr=get_reg(i_regs->regmap,HIREG); + signed char lr=get_reg(i_regs->regmap,LOREG); + if ((dops[i].opcode2==0x1A || dops[i].opcode2==0x1B) && dops[i].rs2==0) // div 0 + { + if (dops[i].rs1) { + signed char numerator = get_reg(i_regs->regmap, dops[i].rs1); + assert(numerator >= 0); + if (hr >= 0) + emit_mov(numerator,hr); + if (lr >= 0) { + if (dops[i].opcode2 == 0x1A) // DIV + emit_sub_asrimm(0,numerator,31,lr); + else + emit_movimm(~0,lr); + } + } + else { + if (hr >= 0) emit_zeroreg(hr); + if (lr >= 0) emit_movimm(~0,lr); + } + } + else + { + // Multiply by zero is zero. + if (hr >= 0) emit_zeroreg(hr); + if (lr >= 0) emit_zeroreg(lr); + } + } +} +#define multdiv_assemble multdiv_assemble_arm64 + +static void do_jump_vaddr(u_int rs) +{ + if (rs != 0) + emit_mov(rs, 0); + emit_far_call(get_addr_ht); + emit_jmpreg(0); +} + +static void do_preload_rhash(u_int r) { + // Don't need this for ARM. 
On x86, this puts the value 0xf8 into the + // register. On ARM the hash can be done with a single instruction (below) +} + +static void do_preload_rhtbl(u_int ht) { + emit_addimm64(FP, (u_char *)&mini_ht - (u_char *)&dynarec_local, ht); +} + +static void do_rhash(u_int rs,u_int rh) { + emit_andimm(rs, 0xf8, rh); +} + +static void do_miniht_load(int ht, u_int rh) { + emit_add64(ht, rh, ht); + emit_ldst(0, 0, rh, ht, 0); +} + +static void do_miniht_jump(u_int rs, u_int rh, u_int ht) { + emit_cmp(rh, rs); + void *jaddr = out; + emit_jeq(0); + do_jump_vaddr(rs); + + set_jump_target(jaddr, out); + assem_debug("ldr %s,[%s,#8]\n",regname64[ht], regname64[ht]); + output_w32(0xf9400000 | imm12_rn_rd(8 >> 3, ht, ht)); + emit_jmpreg(ht); +} + +// parsed by set_jump_target? +static void do_miniht_insert(u_int return_address,u_int rt,int temp) { + emit_movz_lsl16((return_address>>16)&0xffff,rt); + emit_movk(return_address&0xffff,rt); + add_to_linker(out,return_address,1); + emit_adr(out,temp); + emit_writedword(temp,&mini_ht[(return_address&0xFF)>>3][1]); + emit_writeword(rt,&mini_ht[(return_address&0xFF)>>3][0]); +} + +static void clear_cache_arm64(char *start, char *end) +{ + // Don't rely on GCC's __clear_cache implementation, as it caches + // icache/dcache cache line sizes, that can vary between cores on + // big.LITTLE architectures. + uint64_t addr, ctr_el0; + static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff; + size_t isize, dsize; + + __asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0)); + isize = 4 << ((ctr_el0 >> 0) & 0xf); + dsize = 4 << ((ctr_el0 >> 16) & 0xf); + + // use the global minimum cache line size + icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize; + dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize; + + /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification is + not required for instruction to data coherence. 
*/ + if ((ctr_el0 & (1 << 28)) == 0x0) { + addr = (uint64_t)start & ~(uint64_t)(dsize - 1); + for (; addr < (uint64_t)end; addr += dsize) + // use "civac" instead of "cvau", as this is the suggested workaround for + // Cortex-A53 errata 819472, 826319, 827319 and 824069. + __asm__ volatile("dc civac, %0" : : "r"(addr) : "memory"); + } + __asm__ volatile("dsb ish" : : : "memory"); + + /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point of + Unification is not required for instruction to data coherence. */ + if ((ctr_el0 & (1 << 29)) == 0x0) { + addr = (uint64_t)start & ~(uint64_t)(isize - 1); + for (; addr < (uint64_t)end; addr += isize) + __asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory"); + + __asm__ volatile("dsb ish" : : : "memory"); + } + + __asm__ volatile("isb" : : : "memory"); +} + +// CPU-architecture-specific initialization +static void arch_init(void) +{ + uintptr_t diff = (u_char *)&ndrc->tramp.f - (u_char *)&ndrc->tramp.ops; + struct tramp_insns *ops = ndrc->tramp.ops; + size_t i; + assert(!(diff & 3)); + start_tcache_write(ops, (u_char *)ops + sizeof(ndrc->tramp.ops)); + for (i = 0; i < ARRAY_SIZE(ndrc->tramp.ops); i++) { + ops[i].ldr = 0x58000000 | imm19_rt(diff >> 2, 17); // ldr x17, [=val] + ops[i].br = 0xd61f0000 | rm_rn_rd(0, 17, 0); // br x17 + } + end_tcache_write(ops, (u_char *)ops + sizeof(ndrc->tramp.ops)); +} + +// vim:shiftwidth=2:expandtab diff --git a/libpcsxcore/new_dynarec/assem_arm64.h b/libpcsxcore/new_dynarec/assem_arm64.h new file mode 100644 index 00000000..c5fcadf3 --- /dev/null +++ b/libpcsxcore/new_dynarec/assem_arm64.h @@ -0,0 +1,49 @@ +#define HOST_IMM8 1 + +/* calling convention: + r0 -r17: caller-save + r19-r29: callee-save */ + +#define HOST_REGS 29 +#define HOST_BTREG 27 +#define EXCLUDE_REG -1 + +#define SP 31 +#define WZR SP +#define XZR SP + +#define LR 30 +#define HOST_TEMPREG LR + +// Note: FP is set to &dynarec_local when executing generated code. 
+// Thus the local variables are actually global and not on the stack. +#define FP 29 +#define rFP x29 + +#define HOST_CCREG 28 +#define rCC w28 + +#define CALLER_SAVE_REGS 0x0007ffff +#define PREFERRED_REG_FIRST 19 +#define PREFERRED_REG_LAST 27 + +// stack space +#define SSP_CALLEE_REGS (8*12) +#define SSP_CALLER_REGS (8*20) +#define SSP_ALL (SSP_CALLEE_REGS+SSP_CALLER_REGS) + +#define TARGET_SIZE_2 24 // 2^24 = 16 megabytes + +#ifndef __ASSEMBLER__ + +extern char *invc_ptr; + +struct tramp_insns +{ + u_int ldr; + u_int br; +}; + +static void clear_cache_arm64(char *start, char *end); + +#endif // !__ASSEMBLER__ diff --git a/libpcsxcore/new_dynarec/backends/psx/emu_if.c b/libpcsxcore/new_dynarec/emu_if.c similarity index 87% rename from libpcsxcore/new_dynarec/backends/psx/emu_if.c rename to libpcsxcore/new_dynarec/emu_if.c index e9fa6071..bbcd756e 100644 --- a/libpcsxcore/new_dynarec/backends/psx/emu_if.c +++ b/libpcsxcore/new_dynarec/emu_if.c @@ -9,19 +9,16 @@ #include "emu_if.h" #include "pcsxmem.h" -#include "../../../psxhle.h" -#include "../../../r3000a.h" -#include "../../../cdrom.h" -#include "../../../psxdma.h" -#include "../../../mdec.h" -#include "../../../gte_arm.h" -#include "../../../gte_neon.h" - -#include "../../../gte.h" - +#include "../psxhle.h" +#include "../psxinterpreter.h" +#include "../r3000a.h" +#include "../cdrom.h" +#include "../psxdma.h" +#include "../mdec.h" +#include "../gte_arm.h" +#include "../gte_neon.h" #define FLAGLESS -#include "../../../gte.h" -#undef FLAGLESS +#include "../gte.h" #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) @@ -29,7 +26,6 @@ #define evprintf(...) 
char invalid_code[0x100000]; -static u32 scratch_buf[8*8*2] __attribute__((aligned(64))); u32 event_cycles[PSXINT_COUNT]; static void schedule_timeslice(void) @@ -189,12 +185,15 @@ void new_dyna_freeze(void *f, int mode) if (bytes != size) return; - new_dynarec_load_blocks(addrs, size); + if (psxCpu != &psxInt) + new_dynarec_load_blocks(addrs, size); } //printf("drc: %d block info entries %s\n", size/8, mode ? "saved" : "loaded"); } +#if !defined(DRC_DISABLE) && !defined(LIGHTREC) + /* GTE stuff */ void *gte_handlers[64]; @@ -220,15 +219,6 @@ const char *gte_regnames[64] = { NULL , NULL , NULL , NULL , NULL , "GPF" , "GPL" , "NCCT", // 38 }; -/* from gte.txt.. not sure if this is any good. */ -const char gte_cycletab[64] = { - /* 1 2 3 4 5 6 7 8 9 a b c d e f */ - 0, 15, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 6, 0, 0, 0, - 8, 8, 8, 19, 13, 0, 44, 0, 0, 0, 0, 17, 11, 0, 14, 0, - 30, 0, 0, 0, 0, 0, 0, 0, 5, 8, 17, 0, 0, 5, 6, 0, - 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 39, -}; - #define GCBIT(x) \ (1ll << (32+x)) #define GDBIT(x) \ @@ -307,6 +297,7 @@ const uint64_t gte_reg_writes[64] = { static int ari64_init() { + static u32 scratch_buf[8*8*2] __attribute__((aligned(64))); extern void (*psxCP2[64])(); extern void psxNULL(); extern unsigned char *out; @@ -335,13 +326,12 @@ static int ari64_init() #ifdef DRC_DBG memcpy(gte_handlers_nf, gte_handlers, sizeof(gte_handlers_nf)); #endif - psxH_ptr = psxH; zeromem_ptr = zero_mem; scratch_buf_ptr = scratch_buf; SysPrintf("Mapped (RAM/scrp/ROM/LUTs/TC):\n"); - SysPrintf("%08x/%08x/%08x/%08x/%08x\n", + SysPrintf("%p/%p/%p/%p/%p\n", psxM, psxH, psxR, mem_rtab, out); return 0; @@ -365,7 +355,7 @@ static void ari64_execute_until() evprintf("ari64_execute %08x, %u->%u (%d)\n", psxRegs.pc, psxRegs.cycle, next_interupt, next_interupt - psxRegs.cycle); - new_dyna_start(); + new_dyna_start(dynarec_local); evprintf("ari64_execute end %08x, %u->%u (%d)\n", psxRegs.pc, psxRegs.cycle, next_interupt, next_interupt - psxRegs.cycle); @@ 
-398,23 +388,37 @@ static void ari64_clear(u32 addr, u32 size) invalidate_block(start); } -#ifdef ICACHE_EMULATION static void ari64_notify(int note, void *data) { /* - To change once we have proper icache emulation + Should be fixed when ARM dynarec has proper icache emulation. switch (note) { case R3000ACPU_NOTIFY_CACHE_UNISOLATED: - ari64_clear(0, 0x200000/4); break; case R3000ACPU_NOTIFY_CACHE_ISOLATED: - // Sent from psxDma3(). + Sent from psxDma3(). case R3000ACPU_NOTIFY_DMA3_EXE_LOAD: default: break; - }*/ + } + */ +} + +static void ari64_apply_config() +{ + intApplyConfig(); + + if (Config.DisableStalls) + new_dynarec_hacks |= NDHACK_NO_STALLS; + else + new_dynarec_hacks &= ~NDHACK_NO_STALLS; + + if (cycle_multiplier != cycle_multiplier_old + || new_dynarec_hacks != new_dynarec_hacks_old) + { + new_dynarec_clear_full(); + } } -#endif static void ari64_shutdown() { @@ -422,44 +426,28 @@ static void ari64_shutdown() new_dyna_pcsx_mem_shutdown(); } -extern void intExecute(); -extern void intExecuteT(); -extern void intExecuteBlock(); -extern void intExecuteBlockT(); -#ifndef DRC_DBG -#define intExecuteT intExecute -#define intExecuteBlockT intExecuteBlock -#endif - R3000Acpu psxRec = { ari64_init, ari64_reset, -#ifndef DRC_DISABLE ari64_execute, ari64_execute_until, -#else - intExecuteT, - intExecuteBlockT, -#endif ari64_clear, -#ifdef ICACHE_EMULATION ari64_notify, -#endif + ari64_apply_config, ari64_shutdown }; -// TODO: rm -#ifndef DRC_DBG -void do_insn_trace() {} -void do_insn_cmp() {} -#endif +#else // if DRC_DISABLE -#ifdef DRC_DISABLE unsigned int address; int pending_exception, stop; -u32 next_interupt; +unsigned int next_interupt; int new_dynarec_did_compile; int cycle_multiplier; +int cycle_multiplier_override; +int cycle_multiplier_old; +int new_dynarec_hacks_pergame; +int new_dynarec_hacks_old; int new_dynarec_hacks; void *psxH_ptr; void *zeromem_ptr; @@ -467,8 +455,8 @@ u8 zero_mem[0x1000]; unsigned char *out; void *mem_rtab; void 
*scratch_buf_ptr; -void new_dynarec_init() { (void)ari64_execute; } -void new_dyna_start() {} +void new_dynarec_init() {} +void new_dyna_start(void *context) {} void new_dynarec_cleanup() {} void new_dynarec_clear_full() {} void invalidate_all_pages() {} @@ -485,7 +473,9 @@ void new_dynarec_load_blocks(const void *save, int size) {} #include static FILE *f; -extern u32 last_io_addr; +u32 irq_test_cycle; +u32 handler_cycle; +u32 last_io_addr; static void dump_mem(const char *fname, void *mem, size_t size) { @@ -511,11 +501,10 @@ static u32 memcheck_read(u32 a) return *(u32 *)(psxM + (a & 0x1ffffc)); } +#if 0 void do_insn_trace(void) { static psxRegisters oldregs; - static u32 old_io_addr = (u32)-1; - static u32 old_io_data = 0xbad0c0de; static u32 event_cycles_o[PSXINT_COUNT]; u32 *allregs_p = (void *)&psxRegs; u32 *allregs_o = (void *)&oldregs; @@ -539,27 +528,27 @@ void do_insn_trace(void) // log event changes for (i = 0; i < PSXINT_COUNT; i++) { if (event_cycles[i] != event_cycles_o[i]) { - byte = 0xfc; + byte = 0xf8; fwrite(&byte, 1, 1, f); fwrite(&i, 1, 1, f); fwrite(&event_cycles[i], 1, 4, f); event_cycles_o[i] = event_cycles[i]; } } - // log last io - if (old_io_addr != last_io_addr) { - byte = 0xfd; - fwrite(&byte, 1, 1, f); - fwrite(&last_io_addr, 1, 4, f); - old_io_addr = last_io_addr; + #define SAVE_IF_CHANGED(code_, name_) { \ + static u32 old_##name_ = 0xbad0c0de; \ + if (old_##name_ != name_) { \ + byte = code_; \ + fwrite(&byte, 1, 1, f); \ + fwrite(&name_, 1, 4, f); \ + old_##name_ = name_; \ + } \ } + SAVE_IF_CHANGED(0xfb, irq_test_cycle); + SAVE_IF_CHANGED(0xfc, handler_cycle); + SAVE_IF_CHANGED(0xfd, last_io_addr); io_data = memcheck_read(last_io_addr); - if (old_io_data != io_data) { - byte = 0xfe; - fwrite(&byte, 1, 1, f); - fwrite(&io_data, 1, 4, f); - old_io_data = io_data; - } + SAVE_IF_CHANGED(0xfe, io_data); byte = 0xff; fwrite(&byte, 1, 1, f); @@ -572,6 +561,7 @@ void do_insn_trace(void) } #endif } +#endif static const char 
*regnames[offsetof(psxRegisters, intCycle) / 4] = { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", @@ -620,12 +610,15 @@ void breakme() {} void do_insn_cmp(void) { + extern int last_count; static psxRegisters rregs; static u32 mem_addr, mem_val; + static u32 irq_test_cycle_intr; + static u32 handler_cycle_intr; u32 *allregs_p = (void *)&psxRegs; u32 *allregs_e = (void *)&rregs; static u32 ppc, failcount; - int i, ret, bad = 0, which_event = -1; + int i, ret, bad = 0, fatal = 0, which_event = -1; u32 ev_cycles = 0; u8 code; @@ -640,11 +633,17 @@ void do_insn_cmp(void) if (code == 0xff) break; switch (code) { - case 0xfc: + case 0xf8: which_event = 0; fread(&which_event, 1, 1, f); fread(&ev_cycles, 1, 4, f); continue; + case 0xfb: + fread(&irq_test_cycle_intr, 1, 4, f); + continue; + case 0xfc: + fread(&handler_cycle_intr, 1, 4, f); + continue; case 0xfd: fread(&mem_addr, 1, 4, f); continue; @@ -652,23 +651,43 @@ void do_insn_cmp(void) fread(&mem_val, 1, 4, f); continue; } + assert(code < offsetof(psxRegisters, intCycle) / 4); fread(&allregs_e[code], 1, 4, f); } if (ret <= 0) { printf("EOF?\n"); - goto end; + exit(1); } psxRegs.code = rregs.code; // don't care - psxRegs.cycle = rregs.cycle; + psxRegs.cycle += last_count; + //psxRegs.cycle = rregs.cycle; psxRegs.CP0.r[9] = rregs.CP0.r[9]; // Count //if (psxRegs.cycle == 166172) breakme(); - if (memcmp(&psxRegs, &rregs, offsetof(psxRegisters, intCycle)) == 0 && - mem_val == memcheck_read(mem_addr) - ) { + if (which_event >= 0 && event_cycles[which_event] != ev_cycles) { + printf("bad ev_cycles #%d: %08x %08x\n", which_event, event_cycles[which_event], ev_cycles); + fatal = 1; + } + + if (irq_test_cycle > irq_test_cycle_intr) { + printf("bad irq_test_cycle: %u %u\n", irq_test_cycle, irq_test_cycle_intr); + fatal = 1; + } + + if (handler_cycle != handler_cycle_intr) { + printf("bad handler_cycle: %u %u\n", handler_cycle, handler_cycle_intr); + fatal = 1; + } + + if (mem_val != memcheck_read(mem_addr)) { + printf("bad 
mem @%08x: %08x %08x\n", mem_addr, memcheck_read(mem_addr), mem_val); + fatal = 1; + } + + if (!fatal && !memcmp(&psxRegs, &rregs, offsetof(psxRegisters, intCycle))) { failcount = 0; goto ok; } @@ -677,20 +696,12 @@ void do_insn_cmp(void) if (allregs_p[i] != allregs_e[i]) { miss_log_add(i, allregs_p[i], allregs_e[i], psxRegs.pc, psxRegs.cycle); bad++; + if (i > 32+2) + fatal = 1; } } - if (mem_val != memcheck_read(mem_addr)) { - printf("bad mem @%08x: %08x %08x\n", mem_addr, memcheck_read(mem_addr), mem_val); - goto end; - } - - if (which_event >= 0 && event_cycles[which_event] != ev_cycles) { - printf("bad ev_cycles #%d: %08x %08x\n", which_event, event_cycles[which_event], ev_cycles); - goto end; - } - - if (psxRegs.pc == rregs.pc && bad < 6 && failcount < 32) { + if (!fatal && psxRegs.pc == rregs.pc && bad < 6 && failcount < 32) { static int last_mcycle; if (last_mcycle != psxRegs.cycle >> 20) { printf("%u\n", psxRegs.cycle); @@ -700,7 +711,6 @@ void do_insn_cmp(void) goto ok; } -end: for (i = 0; i < miss_log_len; i++, miss_log_i = (miss_log_i + 1) & miss_log_mask) printf("bad %5s: %08x %08x, pc=%08x, cycle %u\n", regnames[miss_log[miss_log_i].reg], miss_log[miss_log_i].val, @@ -714,7 +724,7 @@ end: dump_mem("/mnt/ntz/dev/pnd/tmp/psxregs.dump", psxH, 0x10000); exit(1); ok: - psxRegs.cycle = rregs.cycle + 2; // sync timing + //psxRegs.cycle = rregs.cycle + 2; // sync timing ppc = psxRegs.pc; } diff --git a/libpcsxcore/new_dynarec/backends/psx/emu_if.h b/libpcsxcore/new_dynarec/emu_if.h similarity index 86% rename from libpcsxcore/new_dynarec/backends/psx/emu_if.h rename to libpcsxcore/new_dynarec/emu_if.h index e5396ef3..30cb9ef6 100644 --- a/libpcsxcore/new_dynarec/backends/psx/emu_if.h +++ b/libpcsxcore/new_dynarec/emu_if.h @@ -1,8 +1,5 @@ -#ifndef __EMU_IF_H__ -#define __EMU_IF_H__ - -#include "../../new_dynarec.h" -#include "../../../r3000a.h" +#include "new_dynarec.h" +#include "../r3000a.h" extern char invalid_code[0x100000]; @@ -10,8 +7,7 @@ extern char 
invalid_code[0x100000]; #define EAX 0 #define ECX 1 -/* same as psxRegs */ -extern int reg[]; +extern int dynarec_local[]; /* same as psxRegs.GPR.n.* */ extern int hi, lo; @@ -56,13 +52,9 @@ extern int reg_cop2d[], reg_cop2c[]; extern void *gte_handlers[64]; extern void *gte_handlers_nf[64]; extern const char *gte_regnames[64]; -extern const char gte_cycletab[64]; extern const uint64_t gte_reg_reads[64]; extern const uint64_t gte_reg_writes[64]; -/* dummy */ -extern int FCR0, FCR31; - /* mem */ extern void *mem_rtab; extern void *mem_wtab; @@ -89,10 +81,11 @@ extern void *zeromem_ptr; extern void *scratch_buf_ptr; // same as invalid_code, just a region for ram write checks (inclusive) +// (psx/guest address range) extern u32 inv_code_start, inv_code_end; /* cycles/irqs */ -extern u32 next_interupt; +extern unsigned int next_interupt; extern int pending_exception; /* called by drc */ @@ -100,14 +93,6 @@ void pcsx_mtc0(u32 reg, u32 val); void pcsx_mtc0_ds(u32 reg, u32 val); /* misc */ -extern const void (*psxHLEt[8])(); - extern void SysPrintf(const char *fmt, ...); -#ifdef RAM_FIXED -#define rdram ((u_int)0x80000000) -#else -#define rdram ((u_int)psxM) -#endif - -#endif /* __EMU_IF_H__ */ +#define rdram ((u_char *)psxM) diff --git a/libpcsxcore/new_dynarec/arm/linkage_arm.S b/libpcsxcore/new_dynarec/linkage_arm.S similarity index 91% rename from libpcsxcore/new_dynarec/arm/linkage_arm.S rename to libpcsxcore/new_dynarec/linkage_arm.S index 269eb995..63717315 100644 --- a/libpcsxcore/new_dynarec/arm/linkage_arm.S +++ b/libpcsxcore/new_dynarec/linkage_arm.S @@ -20,21 +20,20 @@ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #include "arm_features.h" -#include "../new_dynarec_config.h" +#include "new_dynarec_config.h" #include "linkage_offsets.h" #ifdef __MACH__ #define dynarec_local ESYM(dynarec_local) -#define add_link ESYM(add_link) +#define add_jump_out ESYM(add_jump_out) #define new_recompile_block ESYM(new_recompile_block) #define 
get_addr ESYM(get_addr) #define get_addr_ht ESYM(get_addr_ht) #define clean_blocks ESYM(clean_blocks) #define gen_interupt ESYM(gen_interupt) -#define psxException ESYM(psxException) -#define execI ESYM(execI) #define invalidate_addr ESYM(invalidate_addr) +#define gteCheckStallRaw ESYM(gteCheckStallRaw) #endif .bss @@ -59,12 +58,13 @@ DRC_VAR(cycle_count, 4) DRC_VAR(last_count, 4) DRC_VAR(pending_exception, 4) DRC_VAR(stop, 4) -DRC_VAR(invc_ptr, 4) +DRC_VAR(branch_target, 4) DRC_VAR(address, 4) +@DRC_VAR(align0, 4) /* unused/alignment */ DRC_VAR(psxRegs, LO_psxRegs_end - LO_psxRegs) /* psxRegs */ -DRC_VAR(reg, 128) +@DRC_VAR(reg, 128) DRC_VAR(lo, 4) DRC_VAR(hi, 4) DRC_VAR(reg_cop0, 128) @@ -77,21 +77,18 @@ DRC_VAR(pcaddr, 4) @DRC_VAR(intCycle, 256) DRC_VAR(rcnts, 7*4*4) +DRC_VAR(inv_code_start, 4) +DRC_VAR(inv_code_end, 4) DRC_VAR(mem_rtab, 4) DRC_VAR(mem_wtab, 4) DRC_VAR(psxH_ptr, 4) DRC_VAR(zeromem_ptr, 4) -DRC_VAR(inv_code_start, 4) -DRC_VAR(inv_code_end, 4) -DRC_VAR(branch_target, 4) +DRC_VAR(invc_ptr, 4) DRC_VAR(scratch_buf_ptr, 4) -@DRC_VAR(align0, 12) /* unused/alignment */ +DRC_VAR(ram_offset, 4) DRC_VAR(mini_ht, 256) DRC_VAR(restore_candidate, 512) -/* unused */ -DRC_VAR(FCR0, 4) -DRC_VAR(FCR31, 4) #ifdef TEXRELS_FORBIDDEN .data @@ -180,7 +177,7 @@ ptr_hash_table: orrcs r2, r6, #2048 ldr r5, [r3, r2, lsl #2] lsl r12, r12, #8 - add r6, r1, r12, asr #6 + add r6, r1, r12, asr #6 /* old target */ mov r8, #0 /* jump_in lookup */ 1: @@ -200,7 +197,7 @@ ptr_hash_table: mov r5, r1 mov r1, r6 - bl add_link + bl add_jump_out sub r2, r8, r5 and r1, r7, #0xff000000 lsl r2, r2, #6 @@ -221,8 +218,8 @@ ptr_hash_table: ldr r5, [r3, r2, lsl #2] ldr r7, [r6, r4]! 
teq r7, r0 - ldreq pc, [r6, #4] - ldr r7, [r6, #8] + ldreq pc, [r6, #8] + ldr r7, [r6, #4] teq r7, r0 ldreq pc, [r6, #12] /* jump_dirty lookup */ @@ -237,10 +234,10 @@ ptr_hash_table: ldr r1, [r4, #8] /* hash_table insert */ ldr r2, [r6] - ldr r3, [r6, #4] + ldr r3, [r6, #8] str r0, [r6] - str r1, [r6, #4] - str r2, [r6, #8] + str r1, [r6, #8] + str r2, [r6, #4] str r3, [r6, #12] mov pc, r1 8: @@ -382,8 +379,8 @@ FUNCTION(jump_vaddr): and r2, r3, r2, lsr #12 ldr r2, [r1, r2]! teq r2, r0 - ldreq pc, [r1, #4] - ldr r2, [r1, #8] + ldreq pc, [r1, #8] + ldr r2, [r1, #4] teq r2, r0 ldreq pc, [r1, #12] str r10, [fp, #LO_cycle_count] @@ -395,8 +392,7 @@ FUNCTION(jump_vaddr): .align 2 FUNCTION(verify_code_ds): - str r8, [fp, #LO_branch_target] -FUNCTION(verify_code_vm): + str r8, [fp, #LO_branch_target] @ preserve HOST_BTREG? FUNCTION(verify_code): /* r1 = source */ /* r2 = target */ @@ -431,7 +427,7 @@ FUNCTION(verify_code): bl get_addr mov pc, r0 .size verify_code, .-verify_code - .size verify_code_vm, .-verify_code_vm + .size verify_code_ds, .-verify_code_ds .align 2 FUNCTION(cc_interrupt): @@ -479,14 +475,6 @@ FUNCTION(cc_interrupt): b .E1 .size cc_interrupt, .-cc_interrupt - .align 2 -FUNCTION(do_interrupt): - ldr r0, [fp, #LO_pcaddr] - bl get_addr_ht - add r10, r10, #2 - mov pc, r0 - .size do_interrupt, .-do_interrupt - .align 2 FUNCTION(fp_exception): mov r2, #0x10000000 @@ -523,19 +511,9 @@ FUNCTION(jump_syscall): .size jump_syscall, .-jump_syscall .align 2 - .align 2 -FUNCTION(jump_syscall_hle): - str r0, [fp, #LO_pcaddr] /* PC must be set to EPC for psxException */ - ldr r2, [fp, #LO_last_count] - mov r1, #0 /* in delay slot */ - add r2, r2, r10 - mov r0, #0x20 /* cause */ - str r2, [fp, #LO_cycle] /* PCSX cycle counter */ - bl psxException - /* note: psxException might do recursive recompiler call from it's HLE code, * so be ready for this */ -pcsx_return: +FUNCTION(jump_to_new_pc): ldr r1, [fp, #LO_next_interupt] ldr r10, [fp, #LO_cycle] ldr r0, [fp, #LO_pcaddr] 
@@ -543,27 +521,7 @@ pcsx_return: str r1, [fp, #LO_last_count] bl get_addr_ht mov pc, r0 - .size jump_syscall_hle, .-jump_syscall_hle - - .align 2 -FUNCTION(jump_hlecall): - ldr r2, [fp, #LO_last_count] - str r0, [fp, #LO_pcaddr] - add r2, r2, r10 - adr lr, pcsx_return - str r2, [fp, #LO_cycle] /* PCSX cycle counter */ - bx r1 - .size jump_hlecall, .-jump_hlecall - - .align 2 -FUNCTION(jump_intcall): - ldr r2, [fp, #LO_last_count] - str r0, [fp, #LO_pcaddr] - add r2, r2, r10 - adr lr, pcsx_return - str r2, [fp, #LO_cycle] /* PCSX cycle counter */ - b execI - .size jump_hlecall, .-jump_hlecall + .size jump_to_new_pc, .-jump_to_new_pc .align 2 FUNCTION(new_dyna_leave): @@ -658,7 +616,7 @@ invalidate_addr_call: FUNCTION(new_dyna_start): /* ip is stored to conform EABI alignment */ stmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, lr} - load_varadr fp, dynarec_local + mov fp, r0 /* dynarec_local */ ldr r0, [fp, #LO_pcaddr] bl get_addr_ht ldr r1, [fp, #LO_next_interupt] @@ -703,6 +661,13 @@ FUNCTION(jump_handler_read32): pcsx_read_mem ldrcc, 2 +.macro memhandler_post + ldr r0, [fp, #LO_next_interupt] + ldr r2, [fp, #LO_cycle] @ memhandlers can modify cc, like dma + str r0, [fp, #LO_last_count] + sub r0, r2, r0 +.endm + .macro pcsx_write_mem wrtop tab_shift /* r0 = address, r1 = data, r2 = cycles, r3 = handler_tab */ lsl r12,r0, #20 @@ -710,7 +675,7 @@ FUNCTION(jump_handler_read32): ldr r3, [r3, r12, lsl #2] str r0, [fp, #LO_address] @ some handlers still need it.. 
lsls r3, #1 - mov r0, r2 @ cycle return in case of direct store + mov r0, r2 @ cycle return in case of direct store .if \tab_shift == 1 lsl r12, #1 \wrtop r1, [r3, r12] @@ -721,15 +686,14 @@ FUNCTION(jump_handler_read32): ldr r12, [fp, #LO_last_count] mov r0, r1 add r2, r2, r12 - push {r2, lr} str r2, [fp, #LO_cycle] + + str lr, [fp, #LO_saved_lr] blx r3 + ldr lr, [fp, #LO_saved_lr] - ldr r0, [fp, #LO_next_interupt] - pop {r2, r3} - str r0, [fp, #LO_last_count] - sub r0, r2, r0 - bx r3 + memhandler_post + bx lr .endm FUNCTION(jump_handler_write8): @@ -749,15 +713,14 @@ FUNCTION(jump_handler_write_h): str r0, [fp, #LO_address] @ some handlers still need it.. add r2, r2, r12 mov r0, r1 - push {r2, lr} str r2, [fp, #LO_cycle] + + str lr, [fp, #LO_saved_lr] blx r3 + ldr lr, [fp, #LO_saved_lr] - ldr r0, [fp, #LO_next_interupt] - pop {r2, r3} - str r0, [fp, #LO_last_count] - sub r0, r2, r0 - bx r3 + memhandler_post + bx lr FUNCTION(jump_handle_swl): /* r0 = address, r1 = data, r2 = cycles */ @@ -863,4 +826,16 @@ FUNCTION(rcnt2_read_count_m1): lsr r0, #16 @ /= 8 bx lr +FUNCTION(call_gteStall): + /* r0 = op_cycles, r1 = cycles */ + ldr r2, [fp, #LO_last_count] + str lr, [fp, #LO_saved_lr] + add r1, r1, r2 + str r1, [fp, #LO_cycle] + add r1, fp, #LO_psxRegs + bl gteCheckStallRaw + ldr lr, [fp, #LO_saved_lr] + add r10, r10, r0 + bx lr + @ vim:filetype=armasm diff --git a/libpcsxcore/new_dynarec/linkage_arm64.S b/libpcsxcore/new_dynarec/linkage_arm64.S new file mode 100644 index 00000000..5e9626f5 --- /dev/null +++ b/libpcsxcore/new_dynarec/linkage_arm64.S @@ -0,0 +1,414 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * linkage_arm.s for PCSX * + * Copyright (C) 2009-2011 Ari64 * + * Copyright (C) 2021 notaz * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at 
your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include "arm_features.h" +#include "new_dynarec_config.h" +#include "assem_arm64.h" +#include "linkage_offsets.h" + +#if (LO_mem_wtab & 7) +#error misligned pointers +#endif + +.bss + .align 4 + .global dynarec_local + .type dynarec_local, %object + .size dynarec_local, LO_dynarec_local_size +dynarec_local: + .space LO_dynarec_local_size + +#define DRC_VAR_(name, vname, size_) \ + vname = dynarec_local + LO_##name; \ + .global vname; \ + .type vname, %object; \ + .size vname, size_ + +#define DRC_VAR(name, size_) \ + DRC_VAR_(name, ESYM(name), size_) + +DRC_VAR(next_interupt, 4) +DRC_VAR(cycle_count, 4) +DRC_VAR(last_count, 4) +DRC_VAR(pending_exception, 4) +DRC_VAR(stop, 4) +DRC_VAR(branch_target, 4) +DRC_VAR(address, 4) +#DRC_VAR(align0, 16) /* unused/alignment */ +DRC_VAR(psxRegs, LO_psxRegs_end - LO_psxRegs) + +/* psxRegs */ +#DRC_VAR(reg, 128) +DRC_VAR(lo, 4) +DRC_VAR(hi, 4) +DRC_VAR(reg_cop0, 128) +DRC_VAR(reg_cop2d, 128) +DRC_VAR(reg_cop2c, 128) +DRC_VAR(pcaddr, 4) +#DRC_VAR(code, 4) +#DRC_VAR(cycle, 4) +#DRC_VAR(interrupt, 4) +#DRC_VAR(intCycle, 256) + +DRC_VAR(rcnts, 7*4*4) +DRC_VAR(inv_code_start, 4) +DRC_VAR(inv_code_end, 4) +DRC_VAR(mem_rtab, 8) +DRC_VAR(mem_wtab, 8) +DRC_VAR(psxH_ptr, 8) +DRC_VAR(invc_ptr, 8) +DRC_VAR(zeromem_ptr, 8) +DRC_VAR(scratch_buf_ptr, 8) +DRC_VAR(ram_offset, 8) +DRC_VAR(mini_ht, 256) +DRC_VAR(restore_candidate, 512) + + + 
.text + .align 2 + +/* r0 = virtual target address */ +/* r1 = instruction to patch */ +.macro dyna_linker_main + /* XXX TODO: should be able to do better than this... */ + bl get_addr_ht + br x0 +.endm + + +FUNCTION(dyna_linker): + /* r0 = virtual target address */ + /* r1 = instruction to patch */ + dyna_linker_main + .size dyna_linker, .-dyna_linker + +FUNCTION(exec_pagefault): + /* r0 = instruction pointer */ + /* r1 = fault address */ + /* r2 = cause */ + bl abort + .size exec_pagefault, .-exec_pagefault + +/* Special dynamic linker for the case where a page fault + may occur in a branch delay slot */ +FUNCTION(dyna_linker_ds): + /* r0 = virtual target address */ + /* r1 = instruction to patch */ + dyna_linker_main + .size dyna_linker_ds, .-dyna_linker_ds + + .align 2 +FUNCTION(cc_interrupt): + ldr w0, [rFP, #LO_last_count] + mov w2, #0x1fc + add rCC, w0, rCC + str wzr, [rFP, #LO_pending_exception] + and w2, w2, rCC, lsr #17 + add x3, rFP, #LO_restore_candidate + str rCC, [rFP, #LO_cycle] /* PCSX cycles */ +# str rCC, [rFP, #LO_reg_cop0+36] /* Count */ + ldr w19, [x3, w2, uxtw] + mov x21, lr + cbnz w19, 4f +1: + bl gen_interupt + mov lr, x21 + ldr rCC, [rFP, #LO_cycle] + ldr w0, [rFP, #LO_next_interupt] + ldr w1, [rFP, #LO_pending_exception] + ldr w2, [rFP, #LO_stop] + str w0, [rFP, #LO_last_count] + sub rCC, rCC, w0 + cbnz w2, new_dyna_leave + cbnz w1, 2f + ret +2: + ldr w0, [rFP, #LO_pcaddr] + bl get_addr_ht + br x0 +4: + /* Move 'dirty' blocks to the 'clean' list */ + lsl w20, w2, #3 + str wzr, [x3, w2, uxtw] +5: + mov w0, w20 + add w20, w20, #1 + tbz w19, #0, 6f + bl clean_blocks +6: + lsr w19, w19, #1 + tst w20, #31 + bne 5b + b 1b + .size cc_interrupt, .-cc_interrupt + + .align 2 +FUNCTION(fp_exception): + mov w2, #0x10000000 +0: + ldr w1, [rFP, #LO_reg_cop0+48] /* Status */ + mov w3, #0x80000000 + str w0, [rFP, #LO_reg_cop0+56] /* EPC */ + orr w1, w1, #2 + add w2, w2, #0x2c + str w1, [rFP, #LO_reg_cop0+48] /* Status */ + str w2, [rFP, #LO_reg_cop0+52] 
/* Cause */ + add w0, w3, #0x80 + bl get_addr_ht + br x0 + .size fp_exception, .-fp_exception + .align 2 +FUNCTION(fp_exception_ds): + mov w2, #0x90000000 /* Set high bit if delay slot */ + b 0b + .size fp_exception_ds, .-fp_exception_ds + + .align 2 +FUNCTION(jump_syscall): + ldr w1, [rFP, #LO_reg_cop0+48] /* Status */ + mov w3, #0x80000000 + str w0, [rFP, #LO_reg_cop0+56] /* EPC */ + orr w1, w1, #2 + mov w2, #0x20 + str w1, [rFP, #LO_reg_cop0+48] /* Status */ + str w2, [rFP, #LO_reg_cop0+52] /* Cause */ + add w0, w3, #0x80 + bl get_addr_ht + br x0 + .size jump_syscall, .-jump_syscall + .align 2 + + /* note: psxException might do recursive recompiler call from its HLE code, + * so be ready for this */ +FUNCTION(jump_to_new_pc): + ldr w1, [rFP, #LO_next_interupt] + ldr rCC, [rFP, #LO_cycle] + ldr w0, [rFP, #LO_pcaddr] + sub rCC, rCC, w1 + str w1, [rFP, #LO_last_count] + bl get_addr_ht + br x0 + .size jump_to_new_pc, .-jump_to_new_pc + + /* stack must be aligned by 16, and include space for save_regs() use */ + .align 2 +FUNCTION(new_dyna_start): + stp x29, x30, [sp, #-SSP_ALL]! 
+ ldr w1, [x0, #LO_next_interupt] + ldr w2, [x0, #LO_cycle] + stp x19, x20, [sp, #16*1] + stp x21, x22, [sp, #16*2] + stp x23, x24, [sp, #16*3] + stp x25, x26, [sp, #16*4] + stp x27, x28, [sp, #16*5] + mov rFP, x0 + ldr w0, [rFP, #LO_pcaddr] + str w1, [rFP, #LO_last_count] + sub rCC, w2, w1 + bl get_addr_ht + br x0 + .size new_dyna_start, .-new_dyna_start + + .align 2 +FUNCTION(new_dyna_leave): + ldr w0, [rFP, #LO_last_count] + add rCC, rCC, w0 + str rCC, [rFP, #LO_cycle] + ldp x19, x20, [sp, #16*1] + ldp x21, x22, [sp, #16*2] + ldp x23, x24, [sp, #16*3] + ldp x25, x26, [sp, #16*4] + ldp x27, x28, [sp, #16*5] + ldp x29, x30, [sp], #SSP_ALL + ret + .size new_dyna_leave, .-new_dyna_leave + +/* --------------------------------------- */ + +.align 2 + +.macro memhandler_pre + /* w0 = addr/data, x1 = rhandler, w2 = cycles, x3 = whandler */ + ldr w4, [rFP, #LO_last_count] + add w4, w4, w2 + str w4, [rFP, #LO_cycle] +.endm + +.macro memhandler_post + ldr w0, [rFP, #LO_next_interupt] + ldr w2, [rFP, #LO_cycle] // memhandlers can modify cc, like dma + str w0, [rFP, #LO_last_count] + sub w0, w2, w0 +.endm + +FUNCTION(do_memhandler_pre): + memhandler_pre + ret + +FUNCTION(do_memhandler_post): + memhandler_post + ret + +.macro pcsx_read_mem readop tab_shift + /* w0 = address, x1 = handler_tab, w2 = cycles */ + ubfm w4, w0, #\tab_shift, #11 + ldr x3, [x1, w4, uxtw #3] + adds x3, x3, x3 + bcs 0f + \readop w0, [x3, w4, uxtw #\tab_shift] + ret +0: + stp xzr, x30, [sp, #-16]! 
+ memhandler_pre + blr x3 +.endm + +FUNCTION(jump_handler_read8): + add x1, x1, #0x1000/4*8 + 0x1000/2*8 /* shift to r8 part */ + pcsx_read_mem ldrb, 0 + b handler_read_end + +FUNCTION(jump_handler_read16): + add x1, x1, #0x1000/4*8 /* shift to r16 part */ + pcsx_read_mem ldrh, 1 + b handler_read_end + +FUNCTION(jump_handler_read32): + pcsx_read_mem ldr, 2 + +handler_read_end: + ldp xzr, x30, [sp], #16 + ret + +.macro pcsx_write_mem wrtop movop tab_shift + /* w0 = address, w1 = data, w2 = cycles, x3 = handler_tab */ + ubfm w4, w0, #\tab_shift, #11 + ldr x3, [x3, w4, uxtw #3] + adds x3, x3, x3 + bcs 0f + mov w0, w2 /* cycle return */ + \wrtop w1, [x3, w4, uxtw #\tab_shift] + ret +0: + stp xzr, x30, [sp, #-16]! + str w0, [rFP, #LO_address] /* some handlers still need it... */ + \movop w0, w1 + memhandler_pre + blr x3 +.endm + +FUNCTION(jump_handler_write8): + add x3, x3, #0x1000/4*8 + 0x1000/2*8 /* shift to r8 part */ + pcsx_write_mem strb uxtb 0 + b handler_write_end + +FUNCTION(jump_handler_write16): + add x3, x3, #0x1000/4*8 /* shift to r16 part */ + pcsx_write_mem strh uxth 1 + b handler_write_end + +FUNCTION(jump_handler_write32): + pcsx_write_mem str mov 2 + +handler_write_end: + memhandler_post + ldp xzr, x30, [sp], #16 + ret + +FUNCTION(jump_handle_swl): + /* w0 = address, w1 = data, w2 = cycles */ + ldr x3, [rFP, #LO_mem_wtab] + orr w4, wzr, w0, lsr #12 + ldr x3, [x3, w4, uxtw #3] + adds x3, x3, x3 + bcs 4f + add x3, x0, x3 + mov w0, w2 + tbz x3, #1, 10f // & 2 + tbz x3, #0, 2f // & 1 +3: + stur w1, [x3, #-3] + ret +2: + lsr w2, w1, #8 + lsr w1, w1, #24 + sturh w2, [x3, #-2] + strb w1, [x3] + ret +10: + tbz x3, #0, 0f // & 1 +1: + lsr w1, w1, #16 + sturh w1, [x3, #-1] + ret +0: + lsr w2, w1, #24 + strb w2, [x3] + ret +4: + mov w0, w2 // todo + bl abort + ret + +FUNCTION(jump_handle_swr): + /* w0 = address, w1 = data, w2 = cycles */ + ldr x3, [rFP, #LO_mem_wtab] + orr w4, wzr, w0, lsr #12 + ldr x3, [x3, w4, uxtw #3] + adds x3, x3, x3 + bcs 4f + add x3, x0, x3 
+ mov w0, w2 + tbz x3, #1, 10f // & 2 + tbz x3, #0, 2f // & 1 +3: + strb w1, [x3] + ret +2: + strh w1, [x3] + ret +10: + tbz x3, #0, 0f // & 1 +1: + lsr w2, w1, #8 + strb w1, [x3] + sturh w2, [x3, #1] + ret +0: + str w1, [x3] + ret +4: + mov w0, w2 // todo + bl abort + ret + +FUNCTION(call_gteStall): + /* w0 = op_cycles, w1 = cycles */ + ldr w2, [rFP, #LO_last_count] + str lr, [rFP, #LO_saved_lr] + add w1, w1, w2 + str w1, [rFP, #LO_cycle] + add x1, rFP, #LO_psxRegs + bl gteCheckStallRaw + ldr lr, [rFP, #LO_saved_lr] + add rCC, rCC, w0 + ret + diff --git a/libpcsxcore/new_dynarec/linkage_offsets.h b/libpcsxcore/new_dynarec/linkage_offsets.h new file mode 100644 index 00000000..e9bb3abd --- /dev/null +++ b/libpcsxcore/new_dynarec/linkage_offsets.h @@ -0,0 +1,45 @@ + +#define PTRSZ __SIZEOF_POINTER__ + +#define LO_next_interupt 64 +#define LO_cycle_count (LO_next_interupt + 4) +#define LO_last_count (LO_cycle_count + 4) +#define LO_pending_exception (LO_last_count + 4) +#define LO_stop (LO_pending_exception + 4) +#define LO_branch_target (LO_stop + 4) +#define LO_address (LO_branch_target + 4) +#define LO_align0 (LO_address + 4) +#define LO_psxRegs (LO_align0 + 4) +#define LO_reg (LO_psxRegs) +#define LO_lo (LO_reg + 128) +#define LO_hi (LO_lo + 4) +#define LO_reg_cop0 (LO_hi + 4) +#define LO_reg_cop2d (LO_reg_cop0 + 128) +#define LO_reg_cop2c (LO_reg_cop2d + 128) +#define LO_PC (LO_reg_cop2c + 128) +#define LO_pcaddr (LO_PC) +#define LO_code (LO_PC + 4) +#define LO_cycle (LO_code + 4) +#define LO_interrupt (LO_cycle + 4) +#define LO_intCycle (LO_interrupt + 4) +#define LO_gteBusyCycle (LO_intCycle + 256) +#define LO_muldivBusyCycle (LO_gteBusyCycle + 4) +#define LO_psxRegs_reserved (LO_muldivBusyCycle + 4) +#define LO_psxRegs_end (LO_psxRegs_reserved + 4*2) +#define LO_rcnts (LO_psxRegs_end) +#define LO_rcnts_end (LO_rcnts + 7*4*4) +#define LO_inv_code_start (LO_rcnts_end) +#define LO_inv_code_end (LO_inv_code_start + 4) +#define LO_mem_rtab (LO_inv_code_end + 4) 
+#define LO_mem_wtab (LO_mem_rtab + PTRSZ) +#define LO_psxH_ptr (LO_mem_wtab + PTRSZ) +#define LO_zeromem_ptr (LO_psxH_ptr + PTRSZ) +#define LO_invc_ptr (LO_zeromem_ptr + PTRSZ) +#define LO_scratch_buf_ptr (LO_invc_ptr + PTRSZ) +#define LO_saved_lr (LO_scratch_buf_ptr + PTRSZ) +#define LO_ram_offset (LO_saved_lr + PTRSZ) +#define LO_mini_ht (LO_ram_offset + PTRSZ) +#define LO_restore_candidate (LO_mini_ht + PTRSZ*32*2) +#define LO_dynarec_local_size (LO_restore_candidate + 512) + +#define LO_cop2_to_scratch_buf (LO_scratch_buf_ptr - LO_reg_cop2d) diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index 52deb854..27d9d469 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -32,47 +32,97 @@ #ifdef VITA #include static int sceBlock; -int getVMBlock(); #endif #include "new_dynarec_config.h" -#include "backends/psx/emu_if.h" //emulator interface +#include "../psxhle.h" +#include "../psxinterpreter.h" +#include "../gte.h" +#include "emu_if.h" // emulator interface + +#define noinline __attribute__((noinline,noclone)) +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) +#endif +#ifndef min +#define min(a, b) ((b) < (a) ? (b) : (a)) +#endif +#ifndef max +#define max(a, b) ((b) > (a) ? (b) : (a)) +#endif //#define DISASM -//#define assem_debug printf -//#define inv_debug printf +//#define ASSEM_PRINT + +#ifdef ASSEM_PRINT +#define assem_debug printf +#else #define assem_debug(...) +#endif +//#define inv_debug printf #define inv_debug(...) 
#ifdef __i386__ -#include "x86/assem_x86.h" +#include "assem_x86.h" #endif #ifdef __x86_64__ -#include "x64/assem_x64.h" +#include "assem_x64.h" #endif #ifdef __arm__ -#include "arm/assem_arm.h" +#include "assem_arm.h" #endif - -#ifdef VITA -int _newlib_vm_size_user = 1 << TARGET_SIZE_2; +#ifdef __aarch64__ +#include "assem_arm64.h" #endif +#define RAM_SIZE 0x200000 #define MAXBLOCK 4096 #define MAX_OUTPUT_BLOCK_SIZE 262144 +struct ndrc_mem +{ + u_char translation_cache[1 << TARGET_SIZE_2]; + struct + { + struct tramp_insns ops[2048 / sizeof(struct tramp_insns)]; + const void *f[2048 / sizeof(void *)]; + } tramp; +}; + +#ifdef BASE_ADDR_DYNAMIC +static struct ndrc_mem *ndrc; +#else +static struct ndrc_mem ndrc_ __attribute__((aligned(4096))); +static struct ndrc_mem *ndrc = &ndrc_; +#endif + +// stubs +enum stub_type { + CC_STUB = 1, + FP_STUB = 2, + LOADB_STUB = 3, + LOADH_STUB = 4, + LOADW_STUB = 5, + LOADD_STUB = 6, + LOADBU_STUB = 7, + LOADHU_STUB = 8, + STOREB_STUB = 9, + STOREH_STUB = 10, + STOREW_STUB = 11, + STORED_STUB = 12, + STORELR_STUB = 13, + INVCODE_STUB = 14, +}; + struct regstat { - signed char regmap_entry[HOST_REGS]; + signed char regmap_entry[HOST_REGS]; // pre-insn + loop preloaded regs? signed char regmap[HOST_REGS]; - uint64_t was32; - uint64_t is32; uint64_t wasdirty; uint64_t dirty; uint64_t u; - uint64_t uu; - u_int wasconst; - u_int isconst; + u_int wasconst; // before; for example 'lw r2, (r2)' wasconst is true + u_int isconst; // ... 
but isconst is false when r2 is known u_int loadedconst; // host regs that have constants loaded u_int waswritten; // MIPS regs that were used as store base before }; @@ -86,9 +136,53 @@ struct ll_entry struct ll_entry *next; }; +struct ht_entry +{ + u_int vaddr[2]; + void *tcaddr[2]; +}; + +struct code_stub +{ + enum stub_type type; + void *addr; + void *retaddr; + u_int a; + uintptr_t b; + uintptr_t c; + u_int d; + u_int e; +}; + +struct link_entry +{ + void *addr; + u_int target; + u_int ext; +}; + +static struct decoded_insn +{ + u_char itype; + u_char opcode; + u_char opcode2; + u_char rs1; + u_char rs2; + u_char rt1; + u_char rt2; + u_char lt1; + u_char bt:1; + u_char ooo:1; + u_char is_ds:1; + u_char is_jump:1; + u_char is_ujump:1; + u_char is_load:1; + u_char is_store:1; +} dops[MAXBLOCK]; + // used by asm: u_char *out; - u_int hash_table[65536][4] __attribute__((aligned(16))); + struct ht_entry hash_table[65536] __attribute__((aligned(16))); struct ll_entry *jump_in[4096] __attribute__((aligned(16))); struct ll_entry *jump_dirty[4096]; @@ -96,19 +190,6 @@ struct ll_entry static u_int start; static u_int *source; static char insn[MAXBLOCK][10]; - static u_char itype[MAXBLOCK]; - static u_char opcode[MAXBLOCK]; - static u_char opcode2[MAXBLOCK]; - static u_char bt[MAXBLOCK]; - static u_char rs1[MAXBLOCK]; - static u_char rs2[MAXBLOCK]; - static u_char rt1[MAXBLOCK]; - static u_char rt2[MAXBLOCK]; - static u_char us1[MAXBLOCK]; - static u_char us2[MAXBLOCK]; - static u_char dep1[MAXBLOCK]; - static u_char dep2[MAXBLOCK]; - static u_char lt1[MAXBLOCK]; static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs static uint64_t gte_rt[MAXBLOCK]; static uint64_t gte_unneeded[MAXBLOCK]; @@ -119,16 +200,14 @@ struct ll_entry static u_int smrv_weak_next; static int imm[MAXBLOCK]; static u_int ba[MAXBLOCK]; - static char likely[MAXBLOCK]; - static char is_ds[MAXBLOCK]; - static char ooo[MAXBLOCK]; static uint64_t unneeded_reg[MAXBLOCK]; - static uint64_t 
unneeded_reg_upper[MAXBLOCK]; static uint64_t branch_unneeded_reg[MAXBLOCK]; - static uint64_t branch_unneeded_reg_upper[MAXBLOCK]; + // pre-instruction [i], excluding loop-preload regs? static signed char regmap_pre[MAXBLOCK][HOST_REGS]; - static uint64_t current_constmap[HOST_REGS]; - static uint64_t constmap[MAXBLOCK][HOST_REGS]; + // contains 'real' consts at [i] insn, but may differ from what's actually + // loaded in host reg as 'final' value is always loaded, see get_final_value() + static uint32_t current_constmap[HOST_REGS]; + static uint32_t constmap[MAXBLOCK][HOST_REGS]; static struct regstat regs[MAXBLOCK]; static struct regstat branch_regs[MAXBLOCK]; static signed char minimum_free_regs[MAXBLOCK]; @@ -137,35 +216,41 @@ struct ll_entry static u_int will_dirty[MAXBLOCK]; static int ccadj[MAXBLOCK]; static int slen; - static u_int instr_addr[MAXBLOCK]; - static u_int link_addr[MAXBLOCK][3]; + static void *instr_addr[MAXBLOCK]; + static struct link_entry link_addr[MAXBLOCK]; static int linkcount; - static u_int stubs[MAXBLOCK*3][8]; + static struct code_stub stubs[MAXBLOCK*3]; static int stubcount; static u_int literals[1024][2]; static int literalcount; static int is_delayslot; - static int cop1_usable; static char shadow[1048576] __attribute__((aligned(16))); static void *copy; static int expirep; static u_int stop_after_jal; -#ifndef RAM_FIXED - static u_int ram_offset; -#else - static const u_int ram_offset=0; -#endif + static u_int f1_hack; // 0 - off, ~0 - capture address, else addr int new_dynarec_hacks; + int new_dynarec_hacks_pergame; + int new_dynarec_hacks_old; int new_dynarec_did_compile; + + #define HACK_ENABLED(x) ((new_dynarec_hacks | new_dynarec_hacks_pergame) & (x)) + + extern int cycle_count; // ... 
until end of the timeslice, counts -N -> 0 + extern int last_count; // last absolute target, often = next_interupt + extern int pcaddr; + extern int pending_exception; + extern int branch_target; + extern uintptr_t ram_offset; + extern uintptr_t mini_ht[32][2]; extern u_char restore_candidate[512]; - extern int cycle_count; /* registers that may be allocated */ /* 1-31 gpr */ -#define HIREG 32 // hi -#define LOREG 33 // lo -#define FSREG 34 // FPU status (FCSR) +#define LOREG 32 // lo +#define HIREG 33 // hi +//#define FSREG 34 // FPU status (FCSR) #define CSREG 35 // Coprocessor status #define CCREG 36 // Cycle count #define INVCP 37 // Pointer to invalid_code @@ -204,10 +289,10 @@ struct ll_entry #define COP0 15 // Coprocessor 0 #define COP1 16 // Coprocessor 1 #define C1LS 17 // Coprocessor 1 load/store -#define FJUMP 18 // Conditional branch (floating point) -#define FLOAT 19 // Floating point unit -#define FCONV 20 // Convert integer to float -#define FCOMP 21 // Floating point compare (sets FSREG) +//#define FJUMP 18 // Conditional branch (floating point) +//#define FLOAT 19 // Floating point unit +//#define FCONV 20 // Convert integer to float +//#define FCOMP 21 // Floating point compare (sets FSREG) #define SYSCALL 22// SYSCALL #define OTHER 23 // Other #define SPAN 24 // Branch/delay slot spans 2 pages @@ -218,29 +303,16 @@ struct ll_entry #define C2OP 29 // Coprocessor 2 operation #define INTCALL 30// Call interpreter to handle rare corner cases - /* stubs */ -#define CC_STUB 1 -#define FP_STUB 2 -#define LOADB_STUB 3 -#define LOADH_STUB 4 -#define LOADW_STUB 5 -#define LOADD_STUB 6 -#define LOADBU_STUB 7 -#define LOADHU_STUB 8 -#define STOREB_STUB 9 -#define STOREH_STUB 10 -#define STOREW_STUB 11 -#define STORED_STUB 12 -#define STORELR_STUB 13 -#define INVCODE_STUB 14 - /* branch codes */ #define TAKEN 1 #define NOTTAKEN 2 #define NULLDS 3 +#define DJT_1 (void *)1l // no function, just a label in assem_debug log +#define DJT_2 (void *)2l + // asm 
linkage -int new_recompile_block(int addr); +int new_recompile_block(u_int addr); void *get_addr_ht(u_int vaddr); void invalidate_block(u_int block); void invalidate_addr(u_int addr); @@ -248,31 +320,39 @@ void remove_hash(int vaddr); void dyna_linker(); void dyna_linker_ds(); void verify_code(); -void verify_code_vm(); void verify_code_ds(); void cc_interrupt(); void fp_exception(); void fp_exception_ds(); -void jump_syscall_hle(); -void jump_hlecall(); -void jump_intcall(); +void jump_to_new_pc(); +void call_gteStall(); void new_dyna_leave(); // Needed by assembler -static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32); -static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty); -static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr); -static void load_all_regs(signed char i_regmap[]); -static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]); +static void wb_register(signed char r, const signed char regmap[], uint64_t dirty); +static void wb_dirtys(const signed char i_regmap[], uint64_t i_dirty); +static void wb_needed_dirtys(const signed char i_regmap[], uint64_t i_dirty, int addr); +static void load_all_regs(const signed char i_regmap[]); +static void load_needed_regs(const signed char i_regmap[], const signed char next_regmap[]); static void load_regs_entry(int t); -static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i); +static void load_all_consts(const signed char regmap[], u_int dirty, int i); +static u_int get_host_reglist(const signed char *regmap); -static int verify_dirty(u_int *ptr); +static int verify_dirty(const u_int *ptr); static int get_final_value(int hr, int i, int *value); -static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e); -static void add_to_linker(int addr,int target,int ext); - -static int tracedebug=0; +static void add_stub(enum stub_type type, void *addr, void 
*retaddr, + u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e); +static void add_stub_r(enum stub_type type, void *addr, void *retaddr, + int i, int addr_reg, const struct regstat *i_regs, int ccadj, u_int reglist); +static void add_to_linker(void *addr, u_int target, int ext); +static void *emit_fastpath_cmp_jump(int i, const struct regstat *i_regs, + int addr, int *offset_reg, int *addr_reg_override); +static void *get_direct_memhandler(void *table, u_int addr, + enum stub_type type, uintptr_t *addr_host); +static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u_int reglist); +static void pass_args(int a0, int a1); +static void emit_far_jump(const void *f); +static void emit_far_call(const void *f); static void mprotect_w_x(void *start, void *end, int is_x) { @@ -301,7 +381,7 @@ static void start_tcache_write(void *start, void *end) static void end_tcache_write(void *start, void *end) { -#ifdef __arm__ +#if defined(__arm__) || defined(__aarch64__) size_t len = (char *)end - (char *)start; #if defined(__BLACKBERRY_QNX__) msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE); @@ -311,6 +391,10 @@ static void end_tcache_write(void *start, void *end) sceKernelSyncVMDomain(sceBlock, start, len); #elif defined(_3DS) ctr_flush_invalidate_cache(); + #elif defined(__aarch64__) + // as of 2021, __clear_cache() is still broken on arm64 + // so here is a custom one :( + clear_cache_arm64(start, end); #else __clear_cache(start, end); #endif @@ -323,8 +407,8 @@ static void end_tcache_write(void *start, void *end) static void *start_block(void) { u_char *end = out + MAX_OUTPUT_BLOCK_SIZE; - if (end > (u_char *)BASE_ADDR + (1< ndrc->translation_cache + sizeof(ndrc->translation_cache)) + end = ndrc->translation_cache + sizeof(ndrc->translation_cache); start_tcache_write(out, end); return out; } @@ -334,16 +418,68 @@ static void end_block(void *start) end_tcache_write(start, out); } +// also takes care of w^x mappings when patching code 
+static u_int needs_clear_cache[1<<(TARGET_SIZE_2-17)]; + +static void mark_clear_cache(void *target) +{ + uintptr_t offset = (u_char *)target - ndrc->translation_cache; + u_int mask = 1u << ((offset >> 12) & 31); + if (!(needs_clear_cache[offset >> 17] & mask)) { + char *start = (char *)((uintptr_t)target & ~4095l); + start_tcache_write(start, start + 4095); + needs_clear_cache[offset >> 17] |= mask; + } +} + +// Clearing the cache is rather slow on ARM Linux, so mark the areas +// that need to be cleared, and then only clear these areas once. +static void do_clear_cache(void) +{ + int i, j; + for (i = 0; i < (1<<(TARGET_SIZE_2-17)); i++) + { + u_int bitmap = needs_clear_cache[i]; + if (!bitmap) + continue; + for (j = 0; j < 32; j++) + { + u_char *start, *end; + if (!(bitmap & (1<translation_cache + i*131072 + j*4096; + end = start + 4095; + for (j++; j < 32; j++) { + if (!(bitmap & (1<>31)|1; - return (x * cycle_multiplier + s * 50) / 100; + int m = cycle_multiplier_active; + int s = (x >> 31) | 1; + return (x * m + s * 50) / 100; +} + +static int ds_writes_rjump_rs(int i) +{ + return dops[i].rs1 != 0 && (dops[i].rs1 == dops[i+1].rt1 || dops[i].rs1 == dops[i+1].rt2); } static u_int get_page(u_int vaddr) @@ -362,71 +498,72 @@ static u_int get_vpage(u_int vaddr) return get_page(vaddr); } +static struct ht_entry *hash_table_get(u_int vaddr) +{ + return &hash_table[((vaddr>>16)^vaddr)&0xFFFF]; +} + +static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr) +{ + ht_bin->vaddr[1] = ht_bin->vaddr[0]; + ht_bin->tcaddr[1] = ht_bin->tcaddr[0]; + ht_bin->vaddr[0] = vaddr; + ht_bin->tcaddr[0] = tcaddr; +} + +// some messy ari64's code, seems to rely on unsigned 32bit overflow +static int doesnt_expire_soon(void *tcaddr) +{ + u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2); + return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2))); +} + // Get address from virtual address // This is called from the recompiled 
JR/JALR instructions -void *get_addr(u_int vaddr) +void noinline *get_addr(u_int vaddr) { - struct ll_entry *head = NULL; - u_int page = get_page(vaddr); - u_int vpage = get_vpage(vaddr); + u_int page=get_page(vaddr); + u_int vpage=get_vpage(vaddr); + struct ll_entry *head; //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page); head=jump_in[page]; - while(head!=NULL) - { - if(head->vaddr==vaddr) - { - //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr); - u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF]; - ht_bin[3]=ht_bin[1]; - ht_bin[2]=ht_bin[0]; - ht_bin[1]=(u_int)head->addr; - ht_bin[0]=vaddr; + while(head!=NULL) { + if(head->vaddr==vaddr) { + //printf("TRACE: count=%d next=%d (get_addr match %x: %p)\n",Count,next_interupt,vaddr,head->addr); + hash_table_add(hash_table_get(vaddr), vaddr, head->addr); return head->addr; } head=head->next; } head=jump_dirty[vpage]; - while(head!=NULL) - { - if(head->vaddr==vaddr) - { - //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr); + while(head!=NULL) { + if(head->vaddr==vaddr) { + //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %p)\n",Count,next_interupt,vaddr,head->addr); // Don't restore blocks which are about to expire from the cache - if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) - if(verify_dirty(head->addr)) - { - //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]); - invalid_code[vaddr>>12]=0; - inv_code_start=inv_code_end=~0; - if(vpage<2048) - { - restore_candidate[vpage>>3]|=1<<(vpage&7); - } - else - { - restore_candidate[page>>3]|=1<<(page&7); - } - u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF]; + if (doesnt_expire_soon(head->addr)) + if (verify_dirty(head->addr)) { + //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]); + 
invalid_code[vaddr>>12]=0; + inv_code_start=inv_code_end=~0; + if(vpage<2048) { + restore_candidate[vpage>>3]|=1<<(vpage&7); + } + else restore_candidate[page>>3]|=1<<(page&7); + struct ht_entry *ht_bin = hash_table_get(vaddr); + if (ht_bin->vaddr[0] == vaddr) + ht_bin->tcaddr[0] = head->addr; // Replace existing entry + else + hash_table_add(ht_bin, vaddr, head->addr); - if(ht_bin[0]==vaddr) - ht_bin[1]=(u_int)head->addr; // Replace existing entry - else - { - ht_bin[3]=ht_bin[1]; - ht_bin[2]=ht_bin[0]; - ht_bin[1]=(int)head->addr; - ht_bin[0]=vaddr; - } - return head->addr; - } + return head->addr; + } } head=head->next; } //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr); int r=new_recompile_block(vaddr); - if(r==0) - return get_addr(vaddr); - // Execute in unmapped page, generate pagefault exception + if(r==0) return get_addr(vaddr); + // Execute in unmapped page, generate pagefault exception Status|=2; Cause=(vaddr<<31)|0x8; EPC=(vaddr&1)?vaddr-5:vaddr; @@ -435,14 +572,13 @@ void *get_addr(u_int vaddr) EntryHi=BadVAddr&0xFFFFE000; return get_addr_ht(0x80000000); } - // Look up address in hash table first void *get_addr_ht(u_int vaddr) { //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr); - u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF]; - if(ht_bin[0]==vaddr) return (void *)ht_bin[1]; - if(ht_bin[2]==vaddr) return (void *)ht_bin[3]; + const struct ht_entry *ht_bin = hash_table_get(vaddr); + if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0]; + if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1]; return get_addr(vaddr); } @@ -452,7 +588,7 @@ void clear_all_regs(signed char regmap[]) for (hr=0;hrdirty>>hr)&1) { - reg=cur->regmap[hr]; - if(reg>=64) - if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1; - } - } -} - -void set_const(struct regstat *cur,signed char reg,uint64_t value) +static void set_const(struct regstat *cur, signed char reg, uint32_t value) { int hr; if(!reg) return; @@
-517,14 +636,10 @@ void set_const(struct regstat *cur,signed char reg,uint64_t value) cur->isconst|=1<regmap[hr]^64)==reg) { - cur->isconst|=1<>32; - } } } -void clear_const(struct regstat *cur,signed char reg) +static void clear_const(struct regstat *cur, signed char reg) { int hr; if(!reg) return; @@ -535,7 +650,7 @@ void clear_const(struct regstat *cur,signed char reg) } } -int is_const(struct regstat *cur,signed char reg) +static int is_const(struct regstat *cur, signed char reg) { int hr; if(reg<0) return 0; @@ -547,7 +662,8 @@ int is_const(struct regstat *cur,signed char reg) } return 0; } -uint64_t get_const(struct regstat *cur,signed char reg) + +static uint32_t get_const(struct regstat *cur, signed char reg) { int hr; if(!reg) return 0; @@ -557,7 +673,7 @@ uint64_t get_const(struct regstat *cur,signed char reg) } } SysPrintf("Unknown constant in r%d\n",reg); - exit(1); + abort(); } // Least soon needed registers @@ -573,7 +689,7 @@ void lsn(u_char hsn[], int i, int *preferred_reg) j=slen-i-1; break; } - if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000) + if (dops[i+j].is_ujump) { // Don't go past an unconditonal jump j++; @@ -582,22 +698,23 @@ void lsn(u_char hsn[], int i, int *preferred_reg) } for(;j>=0;j--) { - if(rs1[i+j]) hsn[rs1[i+j]]=j; - if(rs2[i+j]) hsn[rs2[i+j]]=j; - if(rt1[i+j]) hsn[rt1[i+j]]=j; - if(rt2[i+j]) hsn[rt2[i+j]]=j; - if(itype[i+j]==STORE || itype[i+j]==STORELR) { + if(dops[i+j].rs1) hsn[dops[i+j].rs1]=j; + if(dops[i+j].rs2) hsn[dops[i+j].rs2]=j; + if(dops[i+j].rt1) hsn[dops[i+j].rt1]=j; + if(dops[i+j].rt2) hsn[dops[i+j].rt2]=j; + if(dops[i+j].itype==STORE || dops[i+j].itype==STORELR) { // Stores can allocate zero - hsn[rs1[i+j]]=j; - hsn[rs2[i+j]]=j; + hsn[dops[i+j].rs1]=j; + hsn[dops[i+j].rs2]=j; } + if (ram_offset && (dops[i+j].is_load || dops[i+j].is_store)) + hsn[ROREG] = j; // On some architectures stores need invc_ptr #if defined(HOST_IMM8) - if(itype[i+j]==STORE || itype[i+j]==STORELR || 
(opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) { - hsn[INVCP]=j; - } + if (dops[i+j].is_store) + hsn[INVCP] = j; #endif - if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP)) + if(i+j>=0&&(dops[i+j].itype==UJUMP||dops[i+j].itype==CJUMP||dops[i+j].itype==SJUMP)) { hsn[CCREG]=j; b=j; @@ -612,37 +729,37 @@ void lsn(u_char hsn[], int i, int *preferred_reg) j=7-b;if(t+j>=slen) j=slen-t-1; for(;j>=0;j--) { - if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2; - if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2; - //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2; - //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2; + if(dops[t+j].rs1) if(hsn[dops[t+j].rs1]>j+b+2) hsn[dops[t+j].rs1]=j+b+2; + if(dops[t+j].rs2) if(hsn[dops[t+j].rs2]>j+b+2) hsn[dops[t+j].rs2]=j+b+2; + //if(dops[t+j].rt1) if(hsn[dops[t+j].rt1]>j+b+2) hsn[dops[t+j].rt1]=j+b+2; + //if(dops[t+j].rt2) if(hsn[dops[t+j].rt2]>j+b+2) hsn[dops[t+j].rt2]=j+b+2; } } // TODO: preferred register based on backward branch } // Delay slot should preferably not overwrite branch conditions or cycle count - if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) { - if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1; - if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1; + if (i > 0 && dops[i-1].is_jump) { + if(dops[i-1].rs1) if(hsn[dops[i-1].rs1]>1) hsn[dops[i-1].rs1]=1; + if(dops[i-1].rs2) if(hsn[dops[i-1].rs2]>1) hsn[dops[i-1].rs2]=1; hsn[CCREG]=1; // ...or hash tables hsn[RHASH]=1; hsn[RHTBL]=1; } // Coprocessor load/store needs FTEMP, even if not declared - if(itype[i]==C1LS||itype[i]==C2LS) { + if(dops[i].itype==C2LS) { hsn[FTEMP]=0; } // Load L/R also uses FTEMP as a temporary register - if(itype[i]==LOADLR) { + if(dops[i].itype==LOADLR) { hsn[FTEMP]=0; } // Also SWL/SWR/SDL/SDR - if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { + 
if(dops[i].opcode==0x2a||dops[i].opcode==0x2e||dops[i].opcode==0x2c||dops[i].opcode==0x2d) { hsn[FTEMP]=0; } // Don't remove the miniht registers - if(itype[i]==UJUMP||itype[i]==RJUMP) + if(dops[i].itype==UJUMP||dops[i].itype==RJUMP) { hsn[RHASH]=0; hsn[RHTBL]=0; @@ -656,7 +773,7 @@ int needed_again(int r, int i) int b=-1; int rn=10; - if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) + if (i > 0 && dops[i-1].is_ujump) { if(ba[i-1]start+slen*4-4) return 0; // Don't need any registers if exiting the block @@ -667,46 +784,27 @@ int needed_again(int r, int i) j=slen-i-1; break; } - if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000) + if (dops[i+j].is_ujump) { // Don't go past an unconditonal jump j++; break; } - if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d)) + if(dops[i+j].itype==SYSCALL||dops[i+j].itype==HLECALL||dops[i+j].itype==INTCALL||((source[i+j]&0xfc00003f)==0x0d)) { break; } } for(;j>=1;j--) { - if(rs1[i+j]==r) rn=j; - if(rs2[i+j]==r) rn=j; + if(dops[i+j].rs1==r) rn=j; + if(dops[i+j].rs2==r) rn=j; if((unneeded_reg[i+j]>>r)&1) rn=10; - if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP)) + if(i+j>=0&&(dops[i+j].itype==UJUMP||dops[i+j].itype==CJUMP||dops[i+j].itype==SJUMP)) { b=j; } } - /* - if(b>=0) - { - if(ba[i+b]>=start && ba[i+b]<(start+slen*4)) - { - // Follow first branch - int o=rn; - int t=(ba[i+b]-start)>>2; - j=7-b;if(t+j>=slen) j=slen-t-1; - for(;j>=0;j--) - { - if(!((unneeded_reg[t+j]>>r)&1)) { - if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2; - if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2; - } - else rn=o; - } - } - }*/ if(rn<10) return 1; (void)b; return 0; @@ -723,7 +821,7 @@ int loop_reg(int i, int r, int hr) j=slen-i-1; break; } - if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000) + if (dops[i+j].is_ujump) { // Don't go past an unconditonal jump j++; @@ -732,14 +830,14 @@ int loop_reg(int i, int r, int hr) } k=0; 
if(i>0){ - if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) + if(dops[i-1].itype==UJUMP||dops[i-1].itype==CJUMP||dops[i-1].itype==SJUMP) k--; } for(;k>r)&1)) return hr; - if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr; - if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP)) + assert(r < 64); + if((unneeded_reg[i+k]>>r)&1) return hr; + if(i+k>=0&&(dops[i+k].itype==UJUMP||dops[i+k].itype==CJUMP||dops[i+k].itype==SJUMP)) { if(ba[i+k]>=start && ba[i+k]<(start+i*4)) { @@ -762,8 +860,8 @@ void alloc_all(struct regstat *cur,int i) for(hr=0;hrregmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&& - ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i])) + if(((cur->regmap[hr]&63)!=dops[i].rs1)&&((cur->regmap[hr]&63)!=dops[i].rs2)&& + ((cur->regmap[hr]&63)!=dops[i].rt1)&&((cur->regmap[hr]&63)!=dops[i].rt2)) { cur->regmap[hr]=-1; cur->dirty&=~(1<tramp.f); i++) { + if (ndrc->tramp.f[i] == f || ndrc->tramp.f[i] == NULL) + break; + } + if (i == ARRAY_SIZE(ndrc->tramp.f)) { + SysPrintf("trampoline table is full, last func %p\n", f); + abort(); + } + if (ndrc->tramp.f[i] == NULL) { + start_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]); + ndrc->tramp.f[i] = f; + end_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]); + } + return &ndrc->tramp.ops[i]; +} + +static void emit_far_jump(const void *f) +{ + if (can_jump_or_call(f)) { + emit_jmp(f); + return; + } + + f = get_trampoline(f); + emit_jmp(f); +} + +static void emit_far_call(const void *f) +{ + if (can_jump_or_call(f)) { + emit_call(f); + return; + } + + f = get_trampoline(f); + emit_call(f); +} + // Add virtual address mapping to linked list void ll_add(struct ll_entry **head,int vaddr,void *addr) { @@ -811,39 +1016,39 @@ void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr // but don't return addresses which are about to expire from the cache void *check_addr(u_int vaddr) { - u_int 
*ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF]; - if(ht_bin[0]==vaddr) { - if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) - if(isclean(ht_bin[1])) return (void *)ht_bin[1]; - } - if(ht_bin[2]==vaddr) { - if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) - if(isclean(ht_bin[3])) return (void *)ht_bin[3]; + struct ht_entry *ht_bin = hash_table_get(vaddr); + size_t i; + for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) { + if (ht_bin->vaddr[i] == vaddr) + if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE)) + if (isclean(ht_bin->tcaddr[i])) + return ht_bin->tcaddr[i]; } u_int page=get_page(vaddr); struct ll_entry *head; head=jump_in[page]; - while(head!=NULL) { - if(head->vaddr==vaddr) { - if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) { + while (head != NULL) { + if (head->vaddr == vaddr) { + if (doesnt_expire_soon(head->addr)) { // Update existing entry with current address - if(ht_bin[0]==vaddr) { - ht_bin[1]=(int)head->addr; + if (ht_bin->vaddr[0] == vaddr) { + ht_bin->tcaddr[0] = head->addr; return head->addr; } - if(ht_bin[2]==vaddr) { - ht_bin[3]=(int)head->addr; + if (ht_bin->vaddr[1] == vaddr) { + ht_bin->tcaddr[1] = head->addr; return head->addr; } // Insert into hash table with low priority. // Don't evict existing entries, as they are probably // addresses that are being accessed frequently. 
- if(ht_bin[0]==-1) { - ht_bin[1]=(int)head->addr; - ht_bin[0]=vaddr; - }else if(ht_bin[2]==-1) { - ht_bin[3]=(int)head->addr; - ht_bin[2]=vaddr; + if (ht_bin->vaddr[0] == -1) { + ht_bin->vaddr[0] = vaddr; + ht_bin->tcaddr[0] = head->addr; + } + else if (ht_bin->vaddr[1] == -1) { + ht_bin->vaddr[1] = vaddr; + ht_bin->tcaddr[1] = head->addr; } return head->addr; } @@ -856,25 +1061,29 @@ void *check_addr(u_int vaddr) void remove_hash(int vaddr) { //printf("remove hash: %x\n",vaddr); - u_int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF]; - if(ht_bin[2]==vaddr) { - ht_bin[2]=ht_bin[3]=-1; + struct ht_entry *ht_bin = hash_table_get(vaddr); + if (ht_bin->vaddr[1] == vaddr) { + ht_bin->vaddr[1] = -1; + ht_bin->tcaddr[1] = NULL; } - if(ht_bin[0]==vaddr) { - ht_bin[0]=ht_bin[2]; - ht_bin[1]=ht_bin[3]; - ht_bin[2]=ht_bin[3]=-1; + if (ht_bin->vaddr[0] == vaddr) { + ht_bin->vaddr[0] = ht_bin->vaddr[1]; + ht_bin->tcaddr[0] = ht_bin->tcaddr[1]; + ht_bin->vaddr[1] = -1; + ht_bin->tcaddr[1] = NULL; } } -void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift) +static void ll_remove_matching_addrs(struct ll_entry **head, + uintptr_t base_offs_s, int shift) { struct ll_entry *next; while(*head) { - if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || - ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)) + uintptr_t o1 = (u_char *)(*head)->addr - ndrc->translation_cache; + uintptr_t o2 = o1 - MAX_OUTPUT_BLOCK_SIZE; + if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s) { - inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr); + inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr); remove_hash((*head)->vaddr); next=(*head)->next; free(*head); @@ -903,27 +1112,27 @@ void ll_clear(struct ll_entry **head) } // Dereference the pointers and remove if it matches -static void ll_kill_pointers(struct ll_entry *head,int addr,int shift) +static void ll_kill_pointers(struct ll_entry *head, + 
uintptr_t base_offs_s, int shift) { while(head) { - int ptr=get_pointer(head->addr); - inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr); - if(((ptr>>shift)==(addr>>shift)) || - (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))) + u_char *ptr = get_pointer(head->addr); + uintptr_t o1 = ptr - ndrc->translation_cache; + uintptr_t o2 = o1 - MAX_OUTPUT_BLOCK_SIZE; + inv_debug("EXP: Lookup pointer to %p at %p (%x)\n",ptr,head->addr,head->vaddr); + if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s) { - inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr); + inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr); void *host_addr=find_extjump_insn(head->addr); - #ifdef __arm__ - mark_clear_cache(host_addr); - #endif - set_jump_target((int)host_addr,(int)head->addr); + mark_clear_cache(host_addr); + set_jump_target(host_addr, head->addr); } head=head->next; } } // This is called when we write to a compiled block (see do_invstub) -void invalidate_page(u_int page) +static void invalidate_page(u_int page) { struct ll_entry *head; struct ll_entry *next; @@ -939,12 +1148,10 @@ void invalidate_page(u_int page) head=jump_out[page]; jump_out[page]=0; while(head!=NULL) { - inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr); + inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr); void *host_addr=find_extjump_insn(head->addr); - #ifdef __arm__ - mark_clear_cache(host_addr); - #endif - set_jump_target((int)host_addr,(int)head->addr); + mark_clear_cache(host_addr); + set_jump_target(host_addr, head->addr); // point back to dyna_linker next=head->next; free(head); head=next; @@ -959,26 +1166,21 @@ static void invalidate_block_range(u_int block, u_int first, u_int last) assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages) assert(last2047||(head->vaddr>>12)==block) - { // Ignore vaddr hash collision - 
get_bounds((int)head->addr,&start,&end); - //printf("start: %x end: %x\n",start,end); - if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) - { - if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) - { - if((((start-(u_int)rdram)>>12)&2047)>12)&2047; - if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047; + while(head!=NULL) { + if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision + u_char *start, *end; + get_bounds(head->addr, &start, &end); + //printf("start: %p end: %p\n", start, end); + if (page < 2048 && start >= rdram && end < rdram+RAM_SIZE) { + if (((start-rdram)>>12) <= page && ((end-1-rdram)>>12) >= page) { + if ((((start-rdram)>>12)&2047) < first) first = ((start-rdram)>>12)&2047; + if ((((end-1-rdram)>>12)&2047) > last) last = ((end-1-rdram)>>12)&2047; } } } @@ -1034,12 +1233,11 @@ void invalidate_addr(u_int addr) } for(;pg1<=page;pg1++) { for(head=jump_dirty[pg1];head!=NULL;head=head->next) { - u_int start,end; - get_bounds((int)head->addr,&start,&end); - if(ram_offset) { - start-=ram_offset; - end-=ram_offset; - } + u_char *start_h, *end_h; + u_int start, end; + get_bounds(head->addr, &start_h, &end_h); + start = (uintptr_t)start_h - ram_offset; + end = (uintptr_t)end_h - ram_offset; if(start<=addr_main&&addr_mainaddr_max) addr_max=end; @@ -1078,30 +1276,37 @@ void invalidate_all_pages(void) for(page=0;page<4096;page++) invalidate_page(page); for(page=0;page<1048576;page++) - { - if(!invalid_code[page]) - { + if(!invalid_code[page]) { restore_candidate[(page&2047)>>3]|=1<<(page&7); restore_candidate[((page&2047)>>3)+256]|=1<<(page&7); } - } - -#ifdef USE_MINI_HT + #ifdef USE_MINI_HT memset(mini_ht,-1,sizeof(mini_ht)); -#endif + #endif + do_clear_cache(); +} + +static void do_invstub(int n) +{ + literal_pool(20); + u_int reglist=stubs[n].a; + set_jump_target(stubs[n].addr, out); + save_regs(reglist); + if(stubs[n].b!=0) emit_mov(stubs[n].b,0); + emit_far_call(invalidate_addr); 
+ restore_regs(reglist); + emit_jmp(stubs[n].retaddr); // return address } // Add an entry to jump_out after making a link -void add_link(u_int vaddr,void *src) +// src should point to code by emit_extjump2() +void add_jump_out(u_int vaddr,void *src) { u_int page=get_page(vaddr); - inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page); - int *ptr=(int *)(src+4); - assert((*ptr&0x0fff0000)==0x059f0000); - (void)ptr; + inv_debug("add_jump_out: %p -> %x (%d)\n",src,vaddr,page); + check_extjump2(src); ll_add(jump_out+page,vaddr,src); - //int ptr=get_pointer(src); - //inv_debug("add_link: Pointer is to %x\n",(int)ptr); + //inv_debug("add_jump_out: to %p\n",get_pointer(src)); } // If a code block was found to be unmodified (bit was set in @@ -1113,50 +1318,37 @@ void clean_blocks(u_int page) struct ll_entry *head; inv_debug("INV: clean_blocks page=%d\n",page); head=jump_dirty[page]; - while(head!=NULL) - { - if(!invalid_code[head->vaddr>>12]) - { + while(head!=NULL) { + if(!invalid_code[head->vaddr>>12]) { // Don't restore blocks which are about to expire from the cache - if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) - { - u_int start,end; - if(verify_dirty(head->addr)) - { - //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr); + if (doesnt_expire_soon(head->addr)) { + if(verify_dirty(head->addr)) { + u_char *start, *end; + //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr); u_int i; u_int inv=0; - get_bounds((int)head->addr,&start,&end); - if(start-(u_int)rdram>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) - { + get_bounds(head->addr, &start, &end); + if (start - rdram < RAM_SIZE) { + for (i = (start-rdram+0x80000000)>>12; i <= (end-1-rdram+0x80000000)>>12; i++) { inv|=invalid_code[i]; } } - else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) - { + else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) { inv=1; } - if(!inv) - { - void * 
clean_addr=(void *)get_clean_addr((int)head->addr); - if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) - { + if(!inv) { + void *clean_addr = get_clean_addr(head->addr); + if (doesnt_expire_soon(clean_addr)) { u_int ppage=page; - inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr); + inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr); //printf("page=%x, addr=%x\n",page,head->vaddr); //assert(head->vaddr>>12==(page|0x80000)); ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr); - u_int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF]; - if(ht_bin[0]==head->vaddr) - { - ht_bin[1]=(u_int)clean_addr; // Replace existing entry - } - if(ht_bin[2]==head->vaddr) - { - ht_bin[3]=(u_int)clean_addr; // Replace existing entry - } + struct ht_entry *ht_bin = hash_table_get(head->vaddr); + if (ht_bin->vaddr[0] == head->vaddr) + ht_bin->tcaddr[0] = clean_addr; // Replace existing entry + if (ht_bin->vaddr[1] == head->vaddr) + ht_bin->tcaddr[1] = clean_addr; // Replace existing entry } } } @@ -1166,326 +1358,443 @@ void clean_blocks(u_int page) } } -static void mov_alloc(struct regstat *current,int i) +/* Register allocation */ + +// Note: registers are allocated clean (unmodified state) +// if you intend to modify the register, you must call dirty_reg(). 
+static void alloc_reg(struct regstat *cur,int i,signed char reg) { - // Note: Don't need to actually alloc the source registers - if((~current->is32>>rs1[i])&1) + int r,hr; + int preferred_reg = PREFERRED_REG_FIRST + + reg % (PREFERRED_REG_LAST - PREFERRED_REG_FIRST + 1); + if (reg == CCREG) preferred_reg = HOST_CCREG; + if (reg == PTEMP || reg == FTEMP) preferred_reg = 12; + assert(PREFERRED_REG_FIRST != EXCLUDE_REG && EXCLUDE_REG != HOST_REGS); + + // Don't allocate unused registers + if((cur->u>>reg)&1) return; + + // see if it's already allocated + for(hr=0;hris32&=~(1LL<regmap[hr]==reg) return; + } + + // Keep the same mapping if the register was already allocated in a loop + preferred_reg = loop_reg(i,reg,preferred_reg); + + // Try to allocate the preferred register + if(cur->regmap[preferred_reg]==-1) { + cur->regmap[preferred_reg]=reg; + cur->dirty&=~(1<isconst&=~(1<regmap[preferred_reg]; + assert(r < 64); + if((cur->u>>r)&1) { + cur->regmap[preferred_reg]=reg; + cur->dirty&=~(1<isconst&=~(1<is32|=(1LL<regmap[hr]; + if(r>=0) { + assert(r < 64); + if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;} + } } - clear_const(current,rs1[i]); - clear_const(current,rt1[i]); - dirty_reg(current,rt1[i]); -} -void shiftimm_alloc(struct regstat *current,int i) -{ - if(opcode2[i]<=0x3) // SLL/SRL/SRA - { - if(rt1[i]) { - if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]); - else lt1[i]=rs1[i]; - alloc_reg(current,i,rt1[i]); - current->is32|=1LL<>imm[i]); - if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]); + // Try to allocate any available register, but prefer + // registers that have not been used recently. 
+ if (i > 0) { + for (hr = PREFERRED_REG_FIRST; ; ) { + if (cur->regmap[hr] < 0) { + int oldreg = regs[i-1].regmap[hr]; + if (oldreg < 0 || (oldreg != dops[i-1].rs1 && oldreg != dops[i-1].rs2 + && oldreg != dops[i-1].rt1 && oldreg != dops[i-1].rt2)) + { + cur->regmap[hr]=reg; + cur->dirty&=~(1<isconst&=~(1<=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA - { - if(rt1[i]) { - if(rs1[i]) alloc_reg64(current,i,rs1[i]); - alloc_reg64(current,i,rt1[i]); - current->is32&=~(1LL<regmap[hr] < 0) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<isconst&=~(1<is32&=~(1LL<regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]); + //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]); + if(i>0) { + // Don't evict the cycle count at entry points, otherwise the entry + // stub will have to write it. + if(dops[i].bt&&hsn[CCREG]>2) hsn[CCREG]=2; + if (i>1 && hsn[CCREG] > 2 && dops[i-2].is_jump) hsn[CCREG]=2; + for(j=10;j>=3;j--) + { + // Alloc preferred register if available + if(hsn[r=cur->regmap[preferred_reg]&63]==j) { + for(hr=0;hrregmap[hr]&63)==r) { + cur->regmap[hr]=-1; + cur->dirty&=~(1<isconst&=~(1<regmap[preferred_reg]=reg; + return; + } + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j&&r!=dops[i-1].rs1&&r!=dops[i-1].rs2&&r!=dops[i-1].rt1&&r!=dops[i-1].rt2) { + for(hr=0;hrregmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<isconst&=~(1<=0;j--) { - if(rt1[i]) { - alloc_reg64(current,i,rs1[i]); - if(imm[i]==32) { - alloc_reg64(current,i,rt1[i]); - current->is32&=~(1LL<is32|=1LL<regmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<isconst&=~(1<is32|=1LL<regmap[hr]==reg) return; + } + + // Try to allocate any available register + for(hr=HOST_REGS-1;hr>=0;hr--) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<isconst&=~(1<=0;hr--) + { + 
r=cur->regmap[hr]; + if(r>=0) { + assert(r < 64); + if((cur->u>>r)&1) { + if(i==0||((unneeded_reg[i-1]>>r)&1)) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<isconst&=~(1<regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]); + if(i>0) { + // Don't evict the cycle count at entry points, otherwise the entry + // stub will have to write it. + if(dops[i].bt&&hsn[CCREG]>2) hsn[CCREG]=2; + if (i>1 && hsn[CCREG] > 2 && dops[i-2].is_jump) hsn[CCREG]=2; + for(j=10;j>=3;j--) + { + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j&&r!=dops[i-1].rs1&&r!=dops[i-1].rs2&&r!=dops[i-1].rt1&&r!=dops[i-1].rt2) { + for(hr=0;hr2) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<isconst&=~(1<=0;j--) + { + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j) { + for(hr=0;hrregmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<isconst&=~(1<>imm[i]); + if(dops[i].opcode2==0x03) set_const(current,dops[i].rt1,v>>imm[i]); + } + else clear_const(current,dops[i].rt1); + } + } + else + { + clear_const(current,dops[i].rs1); + clear_const(current,dops[i].rt1); + } + + if(dops[i].opcode2>=0x38&&dops[i].opcode2<=0x3b) // DSLL/DSRL/DSRA + { + assert(0); + } + if(dops[i].opcode2==0x3c) // DSLL32 + { + assert(0); + } + if(dops[i].opcode2==0x3e) // DSRL32 + { + assert(0); + } + if(dops[i].opcode2==0x3f) // DSRA32 + { + assert(0); } } -void shift_alloc(struct regstat *current,int i) +static void shift_alloc(struct regstat *current,int i) { - if(rt1[i]) { - if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV + if(dops[i].rt1) { + if(dops[i].opcode2<=0x07) // SLLV/SRLV/SRAV { - if(rs1[i]) alloc_reg(current,i,rs1[i]); - if(rs2[i]) alloc_reg(current,i,rs2[i]); - alloc_reg(current,i,rt1[i]); - if(rt1[i]==rs2[i]) { + if(dops[i].rs1) alloc_reg(current,i,dops[i].rs1); + if(dops[i].rs2) alloc_reg(current,i,dops[i].rs2); + alloc_reg(current,i,dops[i].rt1); + if(dops[i].rt1==dops[i].rs2) { alloc_reg_temp(current,i,-1); 
minimum_free_regs[i]=1; } - current->is32|=1LL<is32&=~(1LL<=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU - if(rt1[i]) { - if(rs1[i]&&rs2[i]) { - alloc_reg(current,i,rs1[i]); - alloc_reg(current,i,rs2[i]); + if(dops[i].opcode2>=0x20&&dops[i].opcode2<=0x23) { // ADD/ADDU/SUB/SUBU + if(dops[i].rt1) { + if(dops[i].rs1&&dops[i].rs2) { + alloc_reg(current,i,dops[i].rs1); + alloc_reg(current,i,dops[i].rs2); } else { - if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]); - if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]); + if(dops[i].rs1&&needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1); + if(dops[i].rs2&&needed_again(dops[i].rs2,i)) alloc_reg(current,i,dops[i].rs2); } - alloc_reg(current,i,rt1[i]); + alloc_reg(current,i,dops[i].rt1); } - current->is32|=1LL<is32>>rs1[i])&(current->is32>>rs2[i])&1)) - { - alloc_reg64(current,i,rs1[i]); - alloc_reg64(current,i,rs2[i]); - alloc_reg(current,i,rt1[i]); - } else { - alloc_reg(current,i,rs1[i]); - alloc_reg(current,i,rs2[i]); - alloc_reg(current,i,rt1[i]); - } + if(dops[i].opcode2==0x2a||dops[i].opcode2==0x2b) { // SLT/SLTU + if(dops[i].rt1) { + alloc_reg(current,i,dops[i].rs1); + alloc_reg(current,i,dops[i].rs2); + alloc_reg(current,i,dops[i].rt1); } - current->is32|=1LL<=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR - if(rt1[i]) { - if(rs1[i]&&rs2[i]) { - alloc_reg(current,i,rs1[i]); - alloc_reg(current,i,rs2[i]); + if(dops[i].opcode2>=0x24&&dops[i].opcode2<=0x27) { // AND/OR/XOR/NOR + if(dops[i].rt1) { + if(dops[i].rs1&&dops[i].rs2) { + alloc_reg(current,i,dops[i].rs1); + alloc_reg(current,i,dops[i].rs2); } else { - if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]); - if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]); - } - alloc_reg(current,i,rt1[i]); - if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1)) - { - if(!((current->uu>>rt1[i])&1)) { - alloc_reg64(current,i,rt1[i]); - } - if(get_reg(current->regmap,rt1[i]|64)>=0) { - if(rs1[i]&&rs2[i]) { - 
alloc_reg64(current,i,rs1[i]); - alloc_reg64(current,i,rs2[i]); - } - else - { - // Is is really worth it to keep 64-bit values in registers? - #ifdef NATIVE_64BIT - if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]); - if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]); - #endif - } - } - current->is32&=~(1LL<is32|=1LL<=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU - if(rt1[i]) { - if(rs1[i]&&rs2[i]) { - if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) { - alloc_reg64(current,i,rs1[i]); - alloc_reg64(current,i,rs2[i]); - alloc_reg64(current,i,rt1[i]); - } else { - alloc_reg(current,i,rs1[i]); - alloc_reg(current,i,rs2[i]); - alloc_reg(current,i,rt1[i]); - } - } - else { - alloc_reg(current,i,rt1[i]); - if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) { - // DADD used as move, or zeroing - // If we have a 64-bit source, then make the target 64 bits too - if(rs1[i]&&!((current->is32>>rs1[i])&1)) { - if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]); - alloc_reg64(current,i,rt1[i]); - } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) { - if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]); - alloc_reg64(current,i,rt1[i]); - } - if(opcode2[i]>=0x2e&&rs2[i]) { - // DSUB used as negation - 64-bit result - // If we have a 32-bit register, extend it to 64 bits - if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]); - alloc_reg64(current,i,rt1[i]); - } - } - } - if(rs1[i]&&rs2[i]) { - current->is32&=~(1LL<is32&=~(1LL<is32>>rs1[i])&1) - current->is32|=1LL<is32&=~(1LL<is32>>rs2[i])&1) - current->is32|=1LL<is32|=1LL<=0x2c&&dops[i].opcode2<=0x2f) { // DADD/DADDU/DSUB/DSUBU + assert(0); } - clear_const(current,rs1[i]); - clear_const(current,rs2[i]); - clear_const(current,rt1[i]); - dirty_reg(current,rt1[i]); + clear_const(current,dops[i].rs1); + clear_const(current,dops[i].rs2); + clear_const(current,dops[i].rt1); + dirty_reg(current,dops[i].rt1); } -void 
imm16_alloc(struct regstat *current,int i) +static void imm16_alloc(struct regstat *current,int i) { - if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]); - else lt1[i]=rs1[i]; - if(rt1[i]) alloc_reg(current,i,rt1[i]); - if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU - current->is32&=~(1LL<uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) { - // TODO: Could preserve the 32-bit flag if the immediate is zero - alloc_reg64(current,i,rt1[i]); - alloc_reg64(current,i,rs1[i]); - } - clear_const(current,rs1[i]); - clear_const(current,rt1[i]); + if(dops[i].rs1&&needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1); + else dops[i].lt1=dops[i].rs1; + if(dops[i].rt1) alloc_reg(current,i,dops[i].rt1); + if(dops[i].opcode==0x18||dops[i].opcode==0x19) { // DADDI/DADDIU + assert(0); } - else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU - if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]); - current->is32|=1LL<=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI - if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) { - if(rs1[i]!=rt1[i]) { - if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]); - alloc_reg64(current,i,rt1[i]); - current->is32&=~(1LL<=0x0c&&dops[i].opcode<=0x0e) { // ANDI/ORI/XORI + if(is_const(current,dops[i].rs1)) { + int v=get_const(current,dops[i].rs1); + if(dops[i].opcode==0x0c) set_const(current,dops[i].rt1,v&imm[i]); + if(dops[i].opcode==0x0d) set_const(current,dops[i].rt1,v|imm[i]); + if(dops[i].opcode==0x0e) set_const(current,dops[i].rt1,v^imm[i]); } - else current->is32|=1LL<is32|=1LL<is32|=1LL<u&=~1LL; // Allow allocating r0 if it's the source register - if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]); - if(rt1[i]&&!((current->u>>rt1[i])&1)) { - alloc_reg(current,i,rt1[i]); - assert(get_reg(current->regmap,rt1[i])>=0); - if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD + clear_const(current,dops[i].rt1); + //if(dops[i].rs1!=dops[i].rt1&&needed_again(dops[i].rs1,i)) clear_const(current,dops[i].rs1); // Does 
this help or hurt? + if(!dops[i].rs1) current->u&=~1LL; // Allow allocating r0 if it's the source register + if (needed_again(dops[i].rs1, i)) + alloc_reg(current, i, dops[i].rs1); + if (ram_offset) + alloc_reg(current, i, ROREG); + if(dops[i].rt1&&!((current->u>>dops[i].rt1)&1)) { + alloc_reg(current,i,dops[i].rt1); + assert(get_reg(current->regmap,dops[i].rt1)>=0); + if(dops[i].opcode==0x27||dops[i].opcode==0x37) // LWU/LD { - current->is32&=~(1LL<is32&=~(1LL<is32|=1LL<u&=~1LL; // Allow allocating r0 if necessary - if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]); - alloc_reg(current,i,rs2[i]); - if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD - alloc_reg64(current,i,rs2[i]); - if(rs2[i]) alloc_reg(current,i,FTEMP); - } + clear_const(current,dops[i].rs2); + if(!(dops[i].rs2)) current->u&=~1LL; // Allow allocating r0 if necessary + if(needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1); + alloc_reg(current,i,dops[i].rs2); + if(dops[i].opcode==0x2c||dops[i].opcode==0x2d||dops[i].opcode==0x3f) { // 64-bit SDL/SDR/SD + assert(0); + } + if (ram_offset) + alloc_reg(current, i, ROREG); #if defined(HOST_IMM8) // On CPUs without 32-bit immediates we need a pointer to invalid_code - else alloc_reg(current,i,INVCP); + alloc_reg(current, i, INVCP); #endif - if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWL/SDL/SDR + if(dops[i].opcode==0x2a||dops[i].opcode==0x2e||dops[i].opcode==0x2c||dops[i].opcode==0x2d) { // SWL/SWL/SDL/SDR alloc_reg(current,i,FTEMP); } // We need a temporary register for address generation @@ -1535,31 +1843,20 @@ void store_alloc(struct regstat *current,int i) void c1ls_alloc(struct regstat *current,int i) { - //clear_const(current,rs1[i]); // FIXME - clear_const(current,rt1[i]); - if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]); + clear_const(current,dops[i].rt1); alloc_reg(current,i,CSREG); // Status - alloc_reg(current,i,FTEMP); - if(opcode[i]==0x35||opcode[i]==0x3d) 
{ // 64-bit LDC1/SDC1 - alloc_reg64(current,i,FTEMP); - } - #if defined(HOST_IMM8) - // On CPUs without 32-bit immediates we need a pointer to invalid_code - else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1 - alloc_reg(current,i,INVCP); - #endif - // We need a temporary register for address generation - alloc_reg_temp(current,i,-1); } void c2ls_alloc(struct regstat *current,int i) { - clear_const(current,rt1[i]); - if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]); + clear_const(current,dops[i].rt1); + if(needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1); alloc_reg(current,i,FTEMP); + if (ram_offset) + alloc_reg(current, i, ROREG); #if defined(HOST_IMM8) // On CPUs without 32-bit immediates we need a pointer to invalid_code - if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2 + if (dops[i].opcode == 0x3a) // SWC2 alloc_reg(current,i,INVCP); #endif // We need a temporary register for address generation @@ -1578,39 +1875,25 @@ void multdiv_alloc(struct regstat *current,int i) // case 0x1D: DMULTU // case 0x1E: DDIV // case 0x1F: DDIVU - clear_const(current,rs1[i]); - clear_const(current,rs2[i]); - if(rs1[i]&&rs2[i]) + clear_const(current,dops[i].rs1); + clear_const(current,dops[i].rs2); + alloc_cc(current,i); // for stalls + if(dops[i].rs1&&dops[i].rs2) { - if((opcode2[i]&4)==0) // 32-bit + if((dops[i].opcode2&4)==0) // 32-bit { current->u&=~(1LL<u&=~(1LL<is32|=1LL<is32|=1LL<u&=~(1LL<u&=~(1LL<uu&=~(1LL<uu&=~(1LL<10) alloc_reg64(current,i,LOREG); - alloc_reg64(current,i,rs1[i]); - alloc_reg64(current,i,rs2[i]); - alloc_all(current,i); - current->is32&=~(1LL<is32&=~(1LL<is32|=1LL<is32|=1LL<is32|=1LL<is32&=~(1LL<is32|=1LL<3) // MTC1/DMTC1/CTC1 + else if (dops[i].opcode2 > 3) // MTC2/CTC2 { - if(rs1[i]){ - clear_const(current,rs1[i]); - if(opcode2[i]==5) - alloc_reg64(current,i,rs1[i]); // DMTC1 - else - alloc_reg(current,i,rs1[i]); // MTC1/CTC1 - alloc_reg_temp(current,i,-1); + if(dops[i].rs1){ + clear_const(current,dops[i].rs1); + alloc_reg(current,i,dops[i].rs1); } 
else { current->u&=~1LL; alloc_reg(current,i,0); - alloc_reg_temp(current,i,-1); } } - minimum_free_regs[i]=1; -} -void fconv_alloc(struct regstat *current,int i) -{ - alloc_reg(current,i,CSREG); // Load status - alloc_reg_temp(current,i,-1); - minimum_free_regs[i]=1; -} -void float_alloc(struct regstat *current,int i) -{ - alloc_reg(current,i,CSREG); // Load status alloc_reg_temp(current,i,-1); minimum_free_regs[i]=1; } + void c2op_alloc(struct regstat *current,int i) { + alloc_cc(current,i); // for stalls + dirty_reg(current,CCREG); alloc_reg_temp(current,i,-1); } -void fcomp_alloc(struct regstat *current,int i) -{ - alloc_reg(current,i,CSREG); // Load status - alloc_reg(current,i,FSREG); // Load flags - dirty_reg(current,FSREG); // Flag will be modified - alloc_reg_temp(current,i,-1); - minimum_free_regs[i]=1; -} void syscall_alloc(struct regstat *current,int i) { @@ -1734,17 +1987,15 @@ void syscall_alloc(struct regstat *current,int i) void delayslot_alloc(struct regstat *current,int i) { - switch(itype[i]) - { + switch(dops[i].itype) { case UJUMP: case CJUMP: case SJUMP: case RJUMP: - case FJUMP: case SYSCALL: case HLECALL: case SPAN: - assem_debug("jump in the delay slot. this shouldn't happen.\n");//exit(1); + assem_debug("jump in the delay slot. 
this shouldn't happen.\n");//abort(); SysPrintf("Disabled speculative precompilation\n"); stop_after_jal=1; break; @@ -1778,8 +2029,9 @@ void delayslot_alloc(struct regstat *current,int i) cop0_alloc(current,i); break; case COP1: + break; case COP2: - cop1_alloc(current,i); + cop2_alloc(current,i); break; case C1LS: c1ls_alloc(current,i); @@ -1787,15 +2039,6 @@ void delayslot_alloc(struct regstat *current,int i) case C2LS: c2ls_alloc(current,i); break; - case FCONV: - fconv_alloc(current,i); - break; - case FLOAT: - float_alloc(current,i); - break; - case FCOMP: - fcomp_alloc(current,i); - break; case C2OP: c2op_alloc(current,i); break; @@ -1812,309 +2055,174 @@ static void pagespan_alloc(struct regstat *current,int i) alloc_all(current,i); alloc_cc(current,i); dirty_reg(current,CCREG); - if(opcode[i]==3) // JAL + if(dops[i].opcode==3) // JAL { alloc_reg(current,i,31); dirty_reg(current,31); } - if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR - { - alloc_reg(current,i,rs1[i]); - if (rt1[i]!=0) { - alloc_reg(current,i,rt1[i]); - dirty_reg(current,rt1[i]); - } - } - if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL + if(dops[i].opcode==0&&(dops[i].opcode2&0x3E)==8) // JR/JALR { - if(rs1[i]) alloc_reg(current,i,rs1[i]); - if(rs2[i]) alloc_reg(current,i,rs2[i]); - if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1)) - { - if(rs1[i]) alloc_reg64(current,i,rs1[i]); - if(rs2[i]) alloc_reg64(current,i,rs2[i]); + alloc_reg(current,i,dops[i].rs1); + if (dops[i].rt1!=0) { + alloc_reg(current,i,dops[i].rt1); + dirty_reg(current,dops[i].rt1); } } - else - if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL + if((dops[i].opcode&0x2E)==4) // BEQ/BNE/BEQL/BNEL { - if(rs1[i]) alloc_reg(current,i,rs1[i]); - if(!((current->is32>>rs1[i])&1)) - { - if(rs1[i]) alloc_reg64(current,i,rs1[i]); - } + if(dops[i].rs1) alloc_reg(current,i,dops[i].rs1); + if(dops[i].rs2) alloc_reg(current,i,dops[i].rs2); } else - if(opcode[i]==0x11) // BC1 + if((dops[i].opcode&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL { 
- alloc_reg(current,i,FSREG); - alloc_reg(current,i,CSREG); + if(dops[i].rs1) alloc_reg(current,i,dops[i].rs1); } //else ... } -static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e) +static void add_stub(enum stub_type type, void *addr, void *retaddr, + u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e) { - stubs[stubcount][0]=type; - stubs[stubcount][1]=addr; - stubs[stubcount][2]=retaddr; - stubs[stubcount][3]=a; - stubs[stubcount][4]=b; - stubs[stubcount][5]=c; - stubs[stubcount][6]=d; - stubs[stubcount][7]=e; + assert(stubcount < ARRAY_SIZE(stubs)); + stubs[stubcount].type = type; + stubs[stubcount].addr = addr; + stubs[stubcount].retaddr = retaddr; + stubs[stubcount].a = a; + stubs[stubcount].b = b; + stubs[stubcount].c = c; + stubs[stubcount].d = d; + stubs[stubcount].e = e; stubcount++; } +static void add_stub_r(enum stub_type type, void *addr, void *retaddr, + int i, int addr_reg, const struct regstat *i_regs, int ccadj, u_int reglist) +{ + add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist); +} + // Write out a single register -void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32) +static void wb_register(signed char r, const signed char regmap[], uint64_t dirty) { int hr; for(hr=0;hr>hr)&1) { - if(regmap[hr]<64) { - emit_storereg(r,hr); - }else{ - emit_storereg(r|64,hr); - } + assert(regmap[hr]<64); + emit_storereg(r,hr); } } } } } -#if 0 -static int mchecksum(void) +static void wb_valid(signed char pre[],signed char entry[],u_int dirty_pre,u_int dirty,uint64_t u) { - //if(!tracedebug) return 0; - int i; - int sum=0; - for(i=0;i<2097152;i++) { - unsigned int temp=sum; - sum<<=1; - sum|=(~temp)>>31; - sum^=((u_int *)rdram)[i]; + //if(dirty_pre==dirty) return; + int hr,reg; + for(hr=0;hr>(reg&63))&1) { + if(reg>0) { + if(((dirty_pre&~dirty)>>hr)&1) { + if(reg>0&®<34) { + emit_storereg(reg,hr); + } + else if(reg>=64) { + assert(0); + } + } + } + } + } } - return sum; -} - 
-static int rchecksum(void) -{ - int i; - int sum=0; - for(i=0;i<64;i++) - sum^=((u_int *)reg)[i]; - return sum; -} - -static void rlist(void) -{ - int i; - printf("TRACE: "); - for(i=0;i<32;i++) - printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]); - printf("\n"); -} - -static void enabletrace(void) -{ - tracedebug=1; } -static void memdebug(int i) +// trashes r2 +static void pass_args(int a0, int a1) { - //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]); - //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum()); - //rlist(); - //if(tracedebug) { - //if(Count>=-2084597794) { - if((signed int)Count>=-2084597794&&(signed int)Count<0) { - //if(0) { - printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum()); - //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status); - //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]); - rlist(); - #ifdef __i386__ - printf("TRACE: %x\n",(&i)[-1]); - #endif - #ifdef __arm__ - int j; - printf("TRACE: %x \n",(&j)[10]); - printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]); - #endif - //fflush(stdout); + if(a0==1&&a1==0) { + // must swap + emit_mov(a0,2); emit_mov(a1,1); emit_mov(2,0); + } + else if(a0!=0&&a1==0) { + emit_mov(a1,1); + if (a0>=0) emit_mov(a0,0); + } + else { + if(a0>=0&&a0!=0) emit_mov(a0,0); + if(a1>=0&&a1!=1) emit_mov(a1,1); } - //printf("TRACE: %x\n",(&i)[-1]); } -#endif -void alu_assemble(int i,struct regstat *i_regs) +static void alu_assemble(int i, const struct regstat *i_regs) { - if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU - 
if(rt1[i]) { + if(dops[i].opcode2>=0x20&&dops[i].opcode2<=0x23) { // ADD/ADDU/SUB/SUBU + if(dops[i].rt1) { signed char s1,s2,t; - t=get_reg(i_regs->regmap,rt1[i]); + t=get_reg(i_regs->regmap,dops[i].rt1); if(t>=0) { - s1=get_reg(i_regs->regmap,rs1[i]); - s2=get_reg(i_regs->regmap,rs2[i]); - if(rs1[i]&&rs2[i]) { + s1=get_reg(i_regs->regmap,dops[i].rs1); + s2=get_reg(i_regs->regmap,dops[i].rs2); + if(dops[i].rs1&&dops[i].rs2) { assert(s1>=0); assert(s2>=0); - if(opcode2[i]&2) emit_sub(s1,s2,t); + if(dops[i].opcode2&2) emit_sub(s1,s2,t); else emit_add(s1,s2,t); } - else if(rs1[i]) { + else if(dops[i].rs1) { if(s1>=0) emit_mov(s1,t); - else emit_loadreg(rs1[i],t); + else emit_loadreg(dops[i].rs1,t); } - else if(rs2[i]) { + else if(dops[i].rs2) { if(s2>=0) { - if(opcode2[i]&2) emit_neg(s2,t); + if(dops[i].opcode2&2) emit_neg(s2,t); else emit_mov(s2,t); } else { - emit_loadreg(rs2[i],t); - if(opcode2[i]&2) emit_neg(t,t); + emit_loadreg(dops[i].rs2,t); + if(dops[i].opcode2&2) emit_neg(t,t); } } else emit_zeroreg(t); } } } - if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU - if(rt1[i]) { - signed char s1l,s2l,s1h,s2h,tl,th; - tl=get_reg(i_regs->regmap,rt1[i]); - th=get_reg(i_regs->regmap,rt1[i]|64); - if(tl>=0) { - s1l=get_reg(i_regs->regmap,rs1[i]); - s2l=get_reg(i_regs->regmap,rs2[i]); - s1h=get_reg(i_regs->regmap,rs1[i]|64); - s2h=get_reg(i_regs->regmap,rs2[i]|64); - if(rs1[i]&&rs2[i]) { - assert(s1l>=0); - assert(s2l>=0); - if(opcode2[i]&2) emit_subs(s1l,s2l,tl); - else emit_adds(s1l,s2l,tl); - if(th>=0) { - #ifdef INVERTED_CARRY - if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);} - #else - if(opcode2[i]&2) emit_sbc(s1h,s2h,th); - #endif - else emit_add(s1h,s2h,th); - } - } - else if(rs1[i]) { - if(s1l>=0) emit_mov(s1l,tl); - else emit_loadreg(rs1[i],tl); - if(th>=0) { - if(s1h>=0) emit_mov(s1h,th); - else emit_loadreg(rs1[i]|64,th); - } - } - else if(rs2[i]) { - if(s2l>=0) { - if(opcode2[i]&2) emit_negs(s2l,tl); - else 
emit_mov(s2l,tl); - } - else { - emit_loadreg(rs2[i],tl); - if(opcode2[i]&2) emit_negs(tl,tl); - } - if(th>=0) { - #ifdef INVERTED_CARRY - if(s2h>=0) emit_mov(s2h,th); - else emit_loadreg(rs2[i]|64,th); - if(opcode2[i]&2) { - emit_adcimm(-1,th); // x86 has inverted carry flag - emit_not(th,th); - } - #else - if(opcode2[i]&2) { - if(s2h>=0) emit_rscimm(s2h,0,th); - else { - emit_loadreg(rs2[i]|64,th); - emit_rscimm(th,0,th); - } - }else{ - if(s2h>=0) emit_mov(s2h,th); - else emit_loadreg(rs2[i]|64,th); - } - #endif - } - } - else { - emit_zeroreg(tl); - if(th>=0) emit_zeroreg(th); - } - } - } + if(dops[i].opcode2>=0x2c&&dops[i].opcode2<=0x2f) { // DADD/DADDU/DSUB/DSUBU + assert(0); } - if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU - if(rt1[i]) { - signed char s1l,s1h,s2l,s2h,t; - if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)) + if(dops[i].opcode2==0x2a||dops[i].opcode2==0x2b) { // SLT/SLTU + if(dops[i].rt1) { + signed char s1l,s2l,t; { - t=get_reg(i_regs->regmap,rt1[i]); - //assert(t>=0); - if(t>=0) { - s1l=get_reg(i_regs->regmap,rs1[i]); - s1h=get_reg(i_regs->regmap,rs1[i]|64); - s2l=get_reg(i_regs->regmap,rs2[i]); - s2h=get_reg(i_regs->regmap,rs2[i]|64); - if(rs2[i]==0) // rx=0); - if(opcode2[i]==0x2a) // SLT - emit_shrimm(s1h,31,t); - else // SLTU (unsigned can not be less than zero) - emit_zeroreg(t); - } - else if(rs1[i]==0) // r0=0); - if(opcode2[i]==0x2a) // SLT - emit_set_gz64_32(s2h,s2l,t); - else // SLTU (set if not zero) - emit_set_nz64_32(s2h,s2l,t); - } - else { - assert(s1l>=0);assert(s1h>=0); - assert(s2l>=0);assert(s2h>=0); - if(opcode2[i]==0x2a) // SLT - emit_set_if_less64_32(s1h,s1l,s2h,s2l,t); - else // SLTU - emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t); - } - } - } else { - t=get_reg(i_regs->regmap,rt1[i]); + t=get_reg(i_regs->regmap,dops[i].rt1); //assert(t>=0); if(t>=0) { - s1l=get_reg(i_regs->regmap,rs1[i]); - s2l=get_reg(i_regs->regmap,rs2[i]); - if(rs2[i]==0) // rxregmap,dops[i].rs1); + s2l=get_reg(i_regs->regmap,dops[i].rs2); 
+ if(dops[i].rs2==0) // rx=0); - if(opcode2[i]==0x2a) // SLT + if(dops[i].opcode2==0x2a&&dops[i].rs1!=0) { // SLT + assert(s1l>=0); emit_shrimm(s1l,31,t); - else // SLTU (unsigned can not be less than zero) + } + else // SLTU (unsigned can not be less than zero, 0<0) emit_zeroreg(t); } - else if(rs1[i]==0) // r0=0); - if(opcode2[i]==0x2a) // SLT + if(dops[i].opcode2==0x2a) // SLT emit_set_gz32(s2l,t); else // SLTU (set if not zero) emit_set_nz32(s2l,t); } else{ assert(s1l>=0);assert(s2l>=0); - if(opcode2[i]==0x2a) // SLT + if(dops[i].opcode2==0x2a) // SLT emit_set_if_less32(s1l,s2l,t); else // SLTU emit_set_if_carry32(s1l,s2l,t); @@ -2123,153 +2231,61 @@ void alu_assemble(int i,struct regstat *i_regs) } } } - if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR - if(rt1[i]) { - signed char s1l,s1h,s2l,s2h,th,tl; - tl=get_reg(i_regs->regmap,rt1[i]); - th=get_reg(i_regs->regmap,rt1[i]|64); - if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0) - { - assert(tl>=0); - if(tl>=0) { - s1l=get_reg(i_regs->regmap,rs1[i]); - s1h=get_reg(i_regs->regmap,rs1[i]|64); - s2l=get_reg(i_regs->regmap,rs2[i]); - s2h=get_reg(i_regs->regmap,rs2[i]|64); - if(rs1[i]&&rs2[i]) { - assert(s1l>=0);assert(s1h>=0); - assert(s2l>=0);assert(s2h>=0); - if(opcode2[i]==0x24) { // AND - emit_and(s1l,s2l,tl); - emit_and(s1h,s2h,th); - } else - if(opcode2[i]==0x25) { // OR - emit_or(s1l,s2l,tl); - emit_or(s1h,s2h,th); - } else - if(opcode2[i]==0x26) { // XOR - emit_xor(s1l,s2l,tl); - emit_xor(s1h,s2h,th); - } else - if(opcode2[i]==0x27) { // NOR - emit_or(s1l,s2l,tl); - emit_or(s1h,s2h,th); - emit_not(tl,tl); - emit_not(th,th); - } - } - else - { - if(opcode2[i]==0x24) { // AND - emit_zeroreg(tl); - emit_zeroreg(th); - } else - if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR - if(rs1[i]){ - if(s1l>=0) emit_mov(s1l,tl); - else emit_loadreg(rs1[i],tl); - if(s1h>=0) emit_mov(s1h,th); - else emit_loadreg(rs1[i]|64,th); - } - else - if(rs2[i]){ - if(s2l>=0) emit_mov(s2l,tl); - else 
emit_loadreg(rs2[i],tl); - if(s2h>=0) emit_mov(s2h,th); - else emit_loadreg(rs2[i]|64,th); - } - else{ - emit_zeroreg(tl); - emit_zeroreg(th); - } - } else - if(opcode2[i]==0x27) { // NOR - if(rs1[i]){ - if(s1l>=0) emit_not(s1l,tl); - else{ - emit_loadreg(rs1[i],tl); - emit_not(tl,tl); - } - if(s1h>=0) emit_not(s1h,th); - else{ - emit_loadreg(rs1[i]|64,th); - emit_not(th,th); - } - } - else - if(rs2[i]){ - if(s2l>=0) emit_not(s2l,tl); - else{ - emit_loadreg(rs2[i],tl); - emit_not(tl,tl); - } - if(s2h>=0) emit_not(s2h,th); - else{ - emit_loadreg(rs2[i]|64,th); - emit_not(th,th); - } - } - else { - emit_movimm(-1,tl); - emit_movimm(-1,th); - } - } - } - } - } - else + if(dops[i].opcode2>=0x24&&dops[i].opcode2<=0x27) { // AND/OR/XOR/NOR + if(dops[i].rt1) { + signed char s1l,s2l,tl; + tl=get_reg(i_regs->regmap,dops[i].rt1); { - // 32 bit if(tl>=0) { - s1l=get_reg(i_regs->regmap,rs1[i]); - s2l=get_reg(i_regs->regmap,rs2[i]); - if(rs1[i]&&rs2[i]) { + s1l=get_reg(i_regs->regmap,dops[i].rs1); + s2l=get_reg(i_regs->regmap,dops[i].rs2); + if(dops[i].rs1&&dops[i].rs2) { assert(s1l>=0); assert(s2l>=0); - if(opcode2[i]==0x24) { // AND + if(dops[i].opcode2==0x24) { // AND emit_and(s1l,s2l,tl); } else - if(opcode2[i]==0x25) { // OR + if(dops[i].opcode2==0x25) { // OR emit_or(s1l,s2l,tl); } else - if(opcode2[i]==0x26) { // XOR + if(dops[i].opcode2==0x26) { // XOR emit_xor(s1l,s2l,tl); } else - if(opcode2[i]==0x27) { // NOR + if(dops[i].opcode2==0x27) { // NOR emit_or(s1l,s2l,tl); emit_not(tl,tl); } } else { - if(opcode2[i]==0x24) { // AND + if(dops[i].opcode2==0x24) { // AND emit_zeroreg(tl); } else - if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR - if(rs1[i]){ + if(dops[i].opcode2==0x25||dops[i].opcode2==0x26) { // OR/XOR + if(dops[i].rs1){ if(s1l>=0) emit_mov(s1l,tl); - else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry? + else emit_loadreg(dops[i].rs1,tl); // CHECK: regmap_entry? 
} else - if(rs2[i]){ + if(dops[i].rs2){ if(s2l>=0) emit_mov(s2l,tl); - else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry? + else emit_loadreg(dops[i].rs2,tl); // CHECK: regmap_entry? } else emit_zeroreg(tl); } else - if(opcode2[i]==0x27) { // NOR - if(rs1[i]){ + if(dops[i].opcode2==0x27) { // NOR + if(dops[i].rs1){ if(s1l>=0) emit_not(s1l,tl); else { - emit_loadreg(rs1[i],tl); + emit_loadreg(dops[i].rs1,tl); emit_not(tl,tl); } } else - if(rs2[i]){ + if(dops[i].rs2){ if(s2l>=0) emit_not(s2l,tl); else { - emit_loadreg(rs2[i],tl); + emit_loadreg(dops[i].rs2,tl); emit_not(tl,tl); } } @@ -2282,12 +2298,12 @@ void alu_assemble(int i,struct regstat *i_regs) } } -void imm16_assemble(int i,struct regstat *i_regs) +static void imm16_assemble(int i, const struct regstat *i_regs) { - if (opcode[i]==0x0f) { // LUI - if(rt1[i]) { + if (dops[i].opcode==0x0f) { // LUI + if(dops[i].rt1) { signed char t; - t=get_reg(i_regs->regmap,rt1[i]); + t=get_reg(i_regs->regmap,dops[i].rt1); //assert(t>=0); if(t>=0) { if(!((i_regs->isconst>>t)&1)) @@ -2295,18 +2311,18 @@ void imm16_assemble(int i,struct regstat *i_regs) } } } - if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU - if(rt1[i]) { + if(dops[i].opcode==0x08||dops[i].opcode==0x09) { // ADDI/ADDIU + if(dops[i].rt1) { signed char s,t; - t=get_reg(i_regs->regmap,rt1[i]); - s=get_reg(i_regs->regmap,rs1[i]); - if(rs1[i]) { + t=get_reg(i_regs->regmap,dops[i].rt1); + s=get_reg(i_regs->regmap,dops[i].rs1); + if(dops[i].rs1) { //assert(t>=0); //assert(s>=0); if(t>=0) { if(!((i_regs->isconst>>t)&1)) { if(s<0) { - if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t); + if(i_regs->regmap_entry[t]!=dops[i].rs1) emit_loadreg(dops[i].rs1,t); emit_addimm(t,imm[i],t); }else{ if(!((i_regs->wasconst>>s)&1)) @@ -2324,45 +2340,33 @@ void imm16_assemble(int i,struct regstat *i_regs) } } } - if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU - if(rt1[i]) { - signed char sh,sl,th,tl; - th=get_reg(i_regs->regmap,rt1[i]|64); - 
tl=get_reg(i_regs->regmap,rt1[i]); - sh=get_reg(i_regs->regmap,rs1[i]|64); - sl=get_reg(i_regs->regmap,rs1[i]); + if(dops[i].opcode==0x18||dops[i].opcode==0x19) { // DADDI/DADDIU + if(dops[i].rt1) { + signed char sl,tl; + tl=get_reg(i_regs->regmap,dops[i].rt1); + sl=get_reg(i_regs->regmap,dops[i].rs1); if(tl>=0) { - if(rs1[i]) { - assert(sh>=0); + if(dops[i].rs1) { assert(sl>=0); - if(th>=0) { - emit_addimm64_32(sh,sl,imm[i],th,tl); - } - else { - emit_addimm(sl,imm[i],tl); - } + emit_addimm(sl,imm[i],tl); } else { emit_movimm(imm[i],tl); - if(th>=0) emit_movimm(((signed int)imm[i])>>31,th); } } } } - else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU - if(rt1[i]) { - //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug - signed char sh,sl,t; - t=get_reg(i_regs->regmap,rt1[i]); - sh=get_reg(i_regs->regmap,rs1[i]|64); - sl=get_reg(i_regs->regmap,rs1[i]); + else if(dops[i].opcode==0x0a||dops[i].opcode==0x0b) { // SLTI/SLTIU + if(dops[i].rt1) { + //assert(dops[i].rs1!=0); // r0 might be valid, but it's probably a bug + signed char sl,t; + t=get_reg(i_regs->regmap,dops[i].rt1); + sl=get_reg(i_regs->regmap,dops[i].rs1); //assert(t>=0); if(t>=0) { - if(rs1[i]>0) { - if(sh<0) assert((i_regs->was32>>rs1[i])&1); - if(sh<0||((i_regs->was32>>rs1[i])&1)) { - if(opcode[i]==0x0a) { // SLTI + if(dops[i].rs1>0) { + if(dops[i].opcode==0x0a) { // SLTI if(sl<0) { - if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t); + if(i_regs->regmap_entry[t]!=dops[i].rs1) emit_loadreg(dops[i].rs1,t); emit_slti32(t,imm[i],t); }else{ emit_slti32(sl,imm[i],t); @@ -2370,23 +2374,16 @@ void imm16_assemble(int i,struct regstat *i_regs) } else { // SLTIU if(sl<0) { - if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t); + if(i_regs->regmap_entry[t]!=dops[i].rs1) emit_loadreg(dops[i].rs1,t); emit_sltiu32(t,imm[i],t); }else{ emit_sltiu32(sl,imm[i],t); } } - }else{ // 64-bit - assert(sl>=0); - if(opcode[i]==0x0a) // SLTI - emit_slti64_32(sh,sl,imm[i],t); - else // SLTIU - 
emit_sltiu64_32(sh,sl,imm[i],t); - } }else{ // SLTI(U) with r0 is just stupid, // nonetheless examples can be found - if(opcode[i]==0x0a) // SLTI + if(dops[i].opcode==0x0a) // SLTI if(0=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI - if(rt1[i]) { - signed char sh,sl,th,tl; - th=get_reg(i_regs->regmap,rt1[i]|64); - tl=get_reg(i_regs->regmap,rt1[i]); - sh=get_reg(i_regs->regmap,rs1[i]|64); - sl=get_reg(i_regs->regmap,rs1[i]); + else if(dops[i].opcode>=0x0c&&dops[i].opcode<=0x0e) { // ANDI/ORI/XORI + if(dops[i].rt1) { + signed char sl,tl; + tl=get_reg(i_regs->regmap,dops[i].rt1); + sl=get_reg(i_regs->regmap,dops[i].rs1); if(tl>=0 && !((i_regs->isconst>>tl)&1)) { - if(opcode[i]==0x0c) //ANDI + if(dops[i].opcode==0x0c) //ANDI { - if(rs1[i]) { + if(dops[i].rs1) { if(sl<0) { - if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl); + if(i_regs->regmap_entry[tl]!=dops[i].rs1) emit_loadreg(dops[i].rs1,tl); emit_andimm(tl,imm[i],tl); }else{ if(!((i_regs->wasconst>>sl)&1)) @@ -2421,22 +2416,14 @@ void imm16_assemble(int i,struct regstat *i_regs) } else emit_zeroreg(tl); - if(th>=0) emit_zeroreg(th); } else { - if(rs1[i]) { + if(dops[i].rs1) { if(sl<0) { - if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl); + if(i_regs->regmap_entry[tl]!=dops[i].rs1) emit_loadreg(dops[i].rs1,tl); } - if(th>=0) { - if(sh<0) { - emit_loadreg(rs1[i]|64,th); - }else{ - emit_mov(sh,th); - } - } - if(opcode[i]==0x0d) { // ORI + if(dops[i].opcode==0x0d) { // ORI if(sl<0) { emit_orimm(tl,imm[i],tl); }else{ @@ -2446,7 +2433,7 @@ void imm16_assemble(int i,struct regstat *i_regs) emit_movimm(constmap[i][sl]|imm[i],tl); } } - if(opcode[i]==0x0e) { // XORI + if(dops[i].opcode==0x0e) { // XORI if(sl<0) { emit_xorimm(tl,imm[i],tl); }else{ @@ -2459,7 +2446,6 @@ void imm16_assemble(int i,struct regstat *i_regs) } else { emit_movimm(imm[i],tl); - if(th>=0) emit_zeroreg(th); } } } @@ -2467,33 +2453,33 @@ void imm16_assemble(int i,struct regstat *i_regs) } } -void shiftimm_assemble(int i,struct 
regstat *i_regs) +static void shiftimm_assemble(int i, const struct regstat *i_regs) { - if(opcode2[i]<=0x3) // SLL/SRL/SRA + if(dops[i].opcode2<=0x3) // SLL/SRL/SRA { - if(rt1[i]) { + if(dops[i].rt1) { signed char s,t; - t=get_reg(i_regs->regmap,rt1[i]); - s=get_reg(i_regs->regmap,rs1[i]); + t=get_reg(i_regs->regmap,dops[i].rt1); + s=get_reg(i_regs->regmap,dops[i].rs1); //assert(t>=0); if(t>=0&&!((i_regs->isconst>>t)&1)){ - if(rs1[i]==0) + if(dops[i].rs1==0) { emit_zeroreg(t); } else { - if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t); + if(s<0&&i_regs->regmap_entry[t]!=dops[i].rs1) emit_loadreg(dops[i].rs1,t); if(imm[i]) { - if(opcode2[i]==0) // SLL + if(dops[i].opcode2==0) // SLL { emit_shlimm(s<0?t:s,imm[i],t); } - if(opcode2[i]==2) // SRL + if(dops[i].opcode2==2) // SRL { emit_shrimm(s<0?t:s,imm[i],t); } - if(opcode2[i]==3) // SRA + if(dops[i].opcode2==3) // SRA { emit_sarimm(s<0?t:s,imm[i],t); } @@ -2503,131 +2489,285 @@ void shiftimm_assemble(int i,struct regstat *i_regs) } } } - //emit_storereg(rt1[i],t); //DEBUG + //emit_storereg(dops[i].rt1,t); //DEBUG } } - if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA + if(dops[i].opcode2>=0x38&&dops[i].opcode2<=0x3b) // DSLL/DSRL/DSRA { - if(rt1[i]) { - signed char sh,sl,th,tl; - th=get_reg(i_regs->regmap,rt1[i]|64); - tl=get_reg(i_regs->regmap,rt1[i]); - sh=get_reg(i_regs->regmap,rs1[i]|64); - sl=get_reg(i_regs->regmap,rs1[i]); - if(tl>=0) { - if(rs1[i]==0) - { - emit_zeroreg(tl); - if(th>=0) emit_zeroreg(th); - } - else - { - assert(sl>=0); - assert(sh>=0); - if(imm[i]) { - if(opcode2[i]==0x38) // DSLL - { - if(th>=0) emit_shldimm(sh,sl,imm[i],th); - emit_shlimm(sl,imm[i],tl); - } - if(opcode2[i]==0x3a) // DSRL - { - emit_shrdimm(sl,sh,imm[i],tl); - if(th>=0) emit_shrimm(sh,imm[i],th); - } - if(opcode2[i]==0x3b) // DSRA - { - emit_shrdimm(sl,sh,imm[i],tl); - if(th>=0) emit_sarimm(sh,imm[i],th); - } - }else{ - // Shift by zero - if(sl!=tl) emit_mov(sl,tl); - if(th>=0&&sh!=th) 
emit_mov(sh,th); - } - } - } - } + assert(0); } - if(opcode2[i]==0x3c) // DSLL32 + if(dops[i].opcode2==0x3c) // DSLL32 { - if(rt1[i]) { - signed char sl,tl,th; - tl=get_reg(i_regs->regmap,rt1[i]); - th=get_reg(i_regs->regmap,rt1[i]|64); - sl=get_reg(i_regs->regmap,rs1[i]); - if(th>=0||tl>=0){ - assert(tl>=0); - assert(th>=0); - assert(sl>=0); - emit_mov(sl,th); - emit_zeroreg(tl); - if(imm[i]>32) - { - emit_shlimm(th,imm[i]&31,th); - } - } - } + assert(0); } - if(opcode2[i]==0x3e) // DSRL32 + if(dops[i].opcode2==0x3e) // DSRL32 { - if(rt1[i]) { - signed char sh,tl,th; - tl=get_reg(i_regs->regmap,rt1[i]); - th=get_reg(i_regs->regmap,rt1[i]|64); - sh=get_reg(i_regs->regmap,rs1[i]|64); - if(tl>=0){ - assert(sh>=0); - emit_mov(sh,tl); - if(th>=0) emit_zeroreg(th); - if(imm[i]>32) - { - emit_shrimm(tl,imm[i]&31,tl); - } - } - } + assert(0); } - if(opcode2[i]==0x3f) // DSRA32 + if(dops[i].opcode2==0x3f) // DSRA32 { - if(rt1[i]) { - signed char sh,tl; - tl=get_reg(i_regs->regmap,rt1[i]); - sh=get_reg(i_regs->regmap,rs1[i]|64); - if(tl>=0){ - assert(sh>=0); - emit_mov(sh,tl); - if(imm[i]>32) - { - emit_sarimm(tl,imm[i]&31,tl); - } - } - } + assert(0); } } #ifndef shift_assemble -void shift_assemble(int i,struct regstat *i_regs) +static void shift_assemble(int i, const struct regstat *i_regs) { - printf("Need shift_assemble for this architecture.\n"); - exit(1); + signed char s,t,shift; + if (dops[i].rt1 == 0) + return; + assert(dops[i].opcode2<=0x07); // SLLV/SRLV/SRAV + t = get_reg(i_regs->regmap, dops[i].rt1); + s = get_reg(i_regs->regmap, dops[i].rs1); + shift = get_reg(i_regs->regmap, dops[i].rs2); + if (t < 0) + return; + + if(dops[i].rs1==0) + emit_zeroreg(t); + else if(dops[i].rs2==0) { + assert(s>=0); + if(s!=t) emit_mov(s,t); + } + else { + host_tempreg_acquire(); + emit_andimm(shift,31,HOST_TEMPREG); + switch(dops[i].opcode2) { + case 4: // SLLV + emit_shl(s,HOST_TEMPREG,t); + break; + case 6: // SRLV + emit_shr(s,HOST_TEMPREG,t); + break; + case 7: // SRAV + 
emit_sar(s,HOST_TEMPREG,t); + break; + default: + assert(0); + } + host_tempreg_release(); + } } + #endif -void load_assemble(int i,struct regstat *i_regs) +enum { + MTYPE_8000 = 0, + MTYPE_8020, + MTYPE_0000, + MTYPE_A000, + MTYPE_1F80, +}; + +static int get_ptr_mem_type(u_int a) +{ + if(a < 0x00200000) { + if(a<0x1000&&((start>>20)==0xbfc||(start>>24)==0xa0)) + // return wrong, must use memhandler for BIOS self-test to pass + // 007 does similar stuff from a00 mirror, weird stuff + return MTYPE_8000; + return MTYPE_0000; + } + if(0x1f800000 <= a && a < 0x1f801000) + return MTYPE_1F80; + if(0x80200000 <= a && a < 0x80800000) + return MTYPE_8020; + if(0xa0000000 <= a && a < 0xa0200000) + return MTYPE_A000; + return MTYPE_8000; +} + +static int get_ro_reg(const struct regstat *i_regs, int host_tempreg_free) +{ + int r = get_reg(i_regs->regmap, ROREG); + if (r < 0 && host_tempreg_free) { + host_tempreg_acquire(); + emit_loadreg(ROREG, r = HOST_TEMPREG); + } + if (r < 0) + abort(); + return r; +} + +static void *emit_fastpath_cmp_jump(int i, const struct regstat *i_regs, + int addr, int *offset_reg, int *addr_reg_override) +{ + void *jaddr = NULL; + int type = 0; + int mr = dops[i].rs1; + *offset_reg = -1; + if(((smrv_strong|smrv_weak)>>mr)&1) { + type=get_ptr_mem_type(smrv[mr]); + //printf("set %08x @%08x r%d %d\n", smrv[mr], start+i*4, mr, type); + } + else { + // use the mirror we are running on + type=get_ptr_mem_type(start); + //printf("set nospec @%08x r%d %d\n", start+i*4, mr, type); + } + + if(type==MTYPE_8020) { // RAM 80200000+ mirror + host_tempreg_acquire(); + emit_andimm(addr,~0x00e00000,HOST_TEMPREG); + addr=*addr_reg_override=HOST_TEMPREG; + type=0; + } + else if(type==MTYPE_0000) { // RAM 0 mirror + host_tempreg_acquire(); + emit_orimm(addr,0x80000000,HOST_TEMPREG); + addr=*addr_reg_override=HOST_TEMPREG; + type=0; + } + else if(type==MTYPE_A000) { // RAM A mirror + host_tempreg_acquire(); + emit_andimm(addr,~0x20000000,HOST_TEMPREG); + 
addr=*addr_reg_override=HOST_TEMPREG; + type=0; + } + else if(type==MTYPE_1F80) { // scratchpad + if (psxH == (void *)0x1f800000) { + host_tempreg_acquire(); + emit_xorimm(addr,0x1f800000,HOST_TEMPREG); + emit_cmpimm(HOST_TEMPREG,0x1000); + host_tempreg_release(); + jaddr=out; + emit_jc(0); + } + else { + // do the usual RAM check, jump will go to the right handler + type=0; + } + } + + if (type == 0) // need ram check + { + emit_cmpimm(addr,RAM_SIZE); + jaddr = out; + #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK + // Hint to branch predictor that the branch is unlikely to be taken + if (dops[i].rs1 >= 28) + emit_jno_unlikely(0); + else + #endif + emit_jno(0); + if (ram_offset != 0) + *offset_reg = get_ro_reg(i_regs, 0); + } + + return jaddr; +} + +// return memhandler, or get directly accessable address and return 0 +static void *get_direct_memhandler(void *table, u_int addr, + enum stub_type type, uintptr_t *addr_host) +{ + uintptr_t msb = 1ull << (sizeof(uintptr_t)*8 - 1); + uintptr_t l1, l2 = 0; + l1 = ((uintptr_t *)table)[addr>>12]; + if (!(l1 & msb)) { + uintptr_t v = l1 << 1; + *addr_host = v + addr; + return NULL; + } + else { + l1 <<= 1; + if (type == LOADB_STUB || type == LOADBU_STUB || type == STOREB_STUB) + l2 = ((uintptr_t *)l1)[0x1000/4 + 0x1000/2 + (addr&0xfff)]; + else if (type == LOADH_STUB || type == LOADHU_STUB || type == STOREH_STUB) + l2 = ((uintptr_t *)l1)[0x1000/4 + (addr&0xfff)/2]; + else + l2 = ((uintptr_t *)l1)[(addr&0xfff)/4]; + if (!(l2 & msb)) { + uintptr_t v = l2 << 1; + *addr_host = v + (addr&0xfff); + return NULL; + } + return (void *)(l2 << 1); + } +} + +static u_int get_host_reglist(const signed char *regmap) +{ + u_int reglist = 0, hr; + for (hr = 0; hr < HOST_REGS; hr++) { + if (hr != EXCLUDE_REG && regmap[hr] >= 0) + reglist |= 1 << hr; + } + return reglist; +} + +static u_int reglist_exclude(u_int reglist, int r1, int r2) +{ + if (r1 >= 0) + reglist &= ~(1u << r1); + if (r2 >= 0) + reglist &= ~(1u << r2); + return reglist; +} + +// 
find a temp caller-saved register not in reglist (so assumed to be free) +static int reglist_find_free(u_int reglist) +{ + u_int free_regs = ~reglist & CALLER_SAVE_REGS; + if (free_regs == 0) + return -1; + return __builtin_ctz(free_regs); +} + +static void do_load_word(int a, int rt, int offset_reg) +{ + if (offset_reg >= 0) + emit_ldr_dualindexed(offset_reg, a, rt); + else + emit_readword_indexed(0, a, rt); +} + +static void do_store_word(int a, int ofs, int rt, int offset_reg, int preseve_a) +{ + if (offset_reg < 0) { + emit_writeword_indexed(rt, ofs, a); + return; + } + if (ofs != 0) + emit_addimm(a, ofs, a); + emit_str_dualindexed(offset_reg, a, rt); + if (ofs != 0 && preseve_a) + emit_addimm(a, -ofs, a); +} + +static void do_store_hword(int a, int ofs, int rt, int offset_reg, int preseve_a) +{ + if (offset_reg < 0) { + emit_writehword_indexed(rt, ofs, a); + return; + } + if (ofs != 0) + emit_addimm(a, ofs, a); + emit_strh_dualindexed(offset_reg, a, rt); + if (ofs != 0 && preseve_a) + emit_addimm(a, -ofs, a); +} + +static void do_store_byte(int a, int rt, int offset_reg) +{ + if (offset_reg >= 0) + emit_strb_dualindexed(offset_reg, a, rt); + else + emit_writebyte_indexed(rt, 0, a); +} + +static void load_assemble(int i, const struct regstat *i_regs, int ccadj_) { - int s,th,tl,addr,map=-1; + int s,tl,addr; int offset; - int jaddr=0; + void *jaddr=0; int memtarget=0,c=0; - int fastload_reg_override=0; - u_int hr,reglist=0; - th=get_reg(i_regs->regmap,rt1[i]|64); - tl=get_reg(i_regs->regmap,rt1[i]); - s=get_reg(i_regs->regmap,rs1[i]); + int offset_reg = -1; + int fastio_reg_override = -1; + u_int reglist=get_host_reglist(i_regs->regmap); + tl=get_reg(i_regs->regmap,dops[i].rt1); + s=get_reg(i_regs->regmap,dops[i].rs1); offset=imm[i]; - for(hr=0;hrregmap[hr]>=0) reglist|=1<regmap[HOST_CCREG]==CCREG) reglist&=~(1<=0) { c=(i_regs->wasconst>>s)&1; @@ -2636,10 +2776,10 @@ void load_assemble(int i,struct regstat *i_regs) } } //printf("load_assemble: c=%d\n",c); - 
//if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset); + //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset); // FIXME: Even if the load is a NOP, we should check for pagefaults... if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)) - ||rt1[i]==0) { + ||dops[i].rt1==0) { // could be FIFO, must perform the read // ||dummy read assem_debug("(forced read)\n"); @@ -2651,274 +2791,219 @@ void load_assemble(int i,struct regstat *i_regs) //if(tl<0) tl=get_reg(i_regs->regmap,-1); if(tl>=0) { //printf("load_assemble: c=%d\n",c); - //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset); + //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset); assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O reglist&=~(1<=0) reglist&=~(1<regmap,ROREG); - if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG); - #endif #ifdef R29_HACK // Strmnnrmn's speed hack - if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE) + if(dops[i].rs1!=29||start<0x80001000||start>=0x80000000+RAM_SIZE) #endif { - jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override); + jaddr = emit_fastpath_cmp_jump(i, i_regs, addr, + &offset_reg, &fastio_reg_override); } } - else if(ram_offset&&memtarget) { - emit_addimm(addr,ram_offset,HOST_TEMPREG); - fastload_reg_override=HOST_TEMPREG; + else if (ram_offset && memtarget) { + offset_reg = get_ro_reg(i_regs, 0); } - int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg - if (opcode[i]==0x20) { // LB + int dummy=(dops[i].rt1==0)||(tl!=get_reg(i_regs->regmap,dops[i].rt1)); // ignore loads to r0 and unneeded reg + switch (dops[i].opcode) { + case 0x20: // LB if(!c||memtarget) { if(!dummy) { - #ifdef HOST_IMM_ADDR32 - if(c) - emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl); - else - #endif - { - //emit_xorimm(addr,3,tl); - //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl); - int x=0,a=tl; -#ifdef BIG_ENDIAN_MIPS - 
if(!c) emit_xorimm(addr,3,tl); - else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset); -#else - if(!c) a=addr; -#endif - if(fastload_reg_override) a=fastload_reg_override; + int a = tl; + if (!c) a = addr; + if (fastio_reg_override >= 0) + a = fastio_reg_override; - emit_movsbl_indexed_tlb(x,a,map,tl); - } + if (offset_reg >= 0) + emit_ldrsb_dualindexed(offset_reg, a, tl); + else + emit_movsbl_indexed(0, a, tl); } if(jaddr) - add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist); + add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj_,reglist); } else - inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist); - } - if (opcode[i]==0x21) { // LH + inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj_,reglist); + break; + case 0x21: // LH if(!c||memtarget) { if(!dummy) { - #ifdef HOST_IMM_ADDR32 - if(c) - emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl); + int a = tl; + if (!c) a = addr; + if (fastio_reg_override >= 0) + a = fastio_reg_override; + if (offset_reg >= 0) + emit_ldrsh_dualindexed(offset_reg, a, tl); else - #endif - { - int x=0,a=tl; -#ifdef BIG_ENDIAN_MIPS - if(!c) emit_xorimm(addr,2,tl); - else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset); -#else - if(!c) a=addr; -#endif - if(fastload_reg_override) a=fastload_reg_override; - //#ifdef - //emit_movswl_indexed_tlb(x,tl,map,tl); - //else - if(map>=0) { - emit_movswl_indexed(x,a,tl); - }else{ - #if 1 //def RAM_OFFSET - emit_movswl_indexed(x,a,tl); - #else - emit_movswl_indexed((int)rdram-0x80000000+x,a,tl); - #endif - } - } + emit_movswl_indexed(0, a, tl); } if(jaddr) - add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist); + add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj_,reglist); } else - inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist); - } - if (opcode[i]==0x23) { // LW + 
inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj_,reglist); + break; + case 0x23: // LW if(!c||memtarget) { if(!dummy) { - int a=addr; - if(fastload_reg_override) a=fastload_reg_override; - //emit_readword_indexed((int)rdram-0x80000000,addr,tl); - #ifdef HOST_IMM_ADDR32 - if(c) - emit_readword_tlb(constmap[i][s]+offset,map,tl); - else - #endif - emit_readword_indexed_tlb(0,a,map,tl); + int a = addr; + if (fastio_reg_override >= 0) + a = fastio_reg_override; + do_load_word(a, tl, offset_reg); } if(jaddr) - add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist); + add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj_,reglist); } else - inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist); - } - if (opcode[i]==0x24) { // LBU + inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj_,reglist); + break; + case 0x24: // LBU if(!c||memtarget) { if(!dummy) { - #ifdef HOST_IMM_ADDR32 - if(c) - emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl); - else - #endif - { - //emit_xorimm(addr,3,tl); - //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl); - int x=0,a=tl; -#ifdef BIG_ENDIAN_MIPS - if(!c) emit_xorimm(addr,3,tl); - else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset); -#else - if(!c) a=addr; -#endif - if(fastload_reg_override) a=fastload_reg_override; + int a = tl; + if (!c) a = addr; + if (fastio_reg_override >= 0) + a = fastio_reg_override; - emit_movzbl_indexed_tlb(x,a,map,tl); - } + if (offset_reg >= 0) + emit_ldrb_dualindexed(offset_reg, a, tl); + else + emit_movzbl_indexed(0, a, tl); } if(jaddr) - add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist); + add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj_,reglist); } else - inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist); - } - if (opcode[i]==0x25) { // LHU + 
inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj_,reglist); + break; + case 0x25: // LHU if(!c||memtarget) { if(!dummy) { - #ifdef HOST_IMM_ADDR32 - if(c) - emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl); + int a = tl; + if(!c) a = addr; + if (fastio_reg_override >= 0) + a = fastio_reg_override; + if (offset_reg >= 0) + emit_ldrh_dualindexed(offset_reg, a, tl); else - #endif - { - int x=0,a=tl; -#ifdef BIG_ENDIAN_MIPS - if(!c) emit_xorimm(addr,2,tl); - else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset); -#else - if(!c) a=addr; -#endif - if(fastload_reg_override) a=fastload_reg_override; - //#ifdef - //emit_movzwl_indexed_tlb(x,tl,map,tl); - //#else - if(map>=0) { - emit_movzwl_indexed(x,a,tl); - }else{ - #if 1 //def RAM_OFFSET - emit_movzwl_indexed(x,a,tl); - #else - emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl); - #endif - } - } + emit_movzwl_indexed(0, a, tl); } if(jaddr) - add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist); + add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj_,reglist); } else - inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist); + inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj_,reglist); + break; + case 0x27: // LWU + case 0x37: // LD + default: + assert(0); } - if (opcode[i]==0x27) { // LWU - assert(th>=0); - if(!c||memtarget) { - if(!dummy) { - int a=addr; - if(fastload_reg_override) a=fastload_reg_override; - //emit_readword_indexed((int)rdram-0x80000000,addr,tl); - #ifdef HOST_IMM_ADDR32 - if(c) - emit_readword_tlb(constmap[i][s]+offset,map,tl); - else - #endif - emit_readword_indexed_tlb(0,a,map,tl); - } - if(jaddr) - add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist); + } + if (fastio_reg_override == HOST_TEMPREG || offset_reg == HOST_TEMPREG) + host_tempreg_release(); +} + +#ifndef loadlr_assemble +static void loadlr_assemble(int i, const struct regstat *i_regs, int 
ccadj_) +{ + int s,tl,temp,temp2,addr; + int offset; + void *jaddr=0; + int memtarget=0,c=0; + int offset_reg = -1; + int fastio_reg_override = -1; + u_int reglist=get_host_reglist(i_regs->regmap); + tl=get_reg(i_regs->regmap,dops[i].rt1); + s=get_reg(i_regs->regmap,dops[i].rs1); + temp=get_reg(i_regs->regmap,-1); + temp2=get_reg(i_regs->regmap,FTEMP); + addr=get_reg(i_regs->regmap,AGEN1+(i&1)); + assert(addr<0); + offset=imm[i]; + reglist|=1<=0) { + c=(i_regs->wasconst>>s)&1; + if(c) { + memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE; } - else { - inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist); + } + if(!c) { + emit_shlimm(addr,3,temp); + if (dops[i].opcode==0x22||dops[i].opcode==0x26) { + emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR + }else{ + emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR + } + jaddr = emit_fastpath_cmp_jump(i, i_regs, temp2, + &offset_reg, &fastio_reg_override); + } + else { + if (ram_offset && memtarget) { + offset_reg = get_ro_reg(i_regs, 0); + } + if (dops[i].opcode==0x22||dops[i].opcode==0x26) { + emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR + }else{ + emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR } - emit_zeroreg(th); } - if (opcode[i]==0x37) { // LD + if (dops[i].opcode==0x22||dops[i].opcode==0x26) { // LWL/LWR if(!c||memtarget) { - if(!dummy) { - int a=addr; - if(fastload_reg_override) a=fastload_reg_override; - //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th); - //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl); - #ifdef HOST_IMM_ADDR32 - if(c) - emit_readdword_tlb(constmap[i][s]+offset,map,th,tl); - else - #endif - emit_readdword_indexed_tlb(0,a,map,th,tl); - } - if(jaddr) - add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist); + int a = temp2; + if (fastio_reg_override >= 0) + a = fastio_reg_override; + do_load_word(a, temp2, offset_reg); + if (fastio_reg_override == HOST_TEMPREG || 
offset_reg == HOST_TEMPREG) + host_tempreg_release(); + if(jaddr) add_stub_r(LOADW_STUB,jaddr,out,i,temp2,i_regs,ccadj_,reglist); } else - inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist); + inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj_,reglist); + if(dops[i].rt1) { + assert(tl>=0); + emit_andimm(temp,24,temp); + if (dops[i].opcode==0x22) // LWL + emit_xorimm(temp,24,temp); + host_tempreg_acquire(); + emit_movimm(-1,HOST_TEMPREG); + if (dops[i].opcode==0x26) { + emit_shr(temp2,temp,temp2); + emit_bic_lsr(tl,HOST_TEMPREG,temp,tl); + }else{ + emit_shl(temp2,temp,temp2); + emit_bic_lsl(tl,HOST_TEMPREG,temp,tl); + } + host_tempreg_release(); + emit_or(temp2,tl,tl); + } + //emit_storereg(dops[i].rt1,tl); // DEBUG + } + if (dops[i].opcode==0x1A||dops[i].opcode==0x1B) { // LDL/LDR + assert(0); } - } - //emit_storereg(rt1[i],tl); // DEBUG - //if(opcode[i]==0x23) - //if(opcode[i]==0x24) - //if(opcode[i]==0x23||opcode[i]==0x24) - /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24) - { - //emit_pusha(); - save_regs(0x100f); - emit_readword((int)&last_count,ECX); - #ifdef __i386__ - if(get_reg(i_regs->regmap,CCREG)<0) - emit_loadreg(CCREG,HOST_CCREG); - emit_add(HOST_CCREG,ECX,HOST_CCREG); - emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG); - emit_writeword(HOST_CCREG,(int)&Count); - #endif - #ifdef __arm__ - if(get_reg(i_regs->regmap,CCREG)<0) - emit_loadreg(CCREG,0); - else - emit_mov(HOST_CCREG,0); - emit_add(0,ECX,0); - emit_addimm(0,2*ccadj[i],0); - emit_writeword(0,(int)&Count); - #endif - emit_call((int)memdebug); - //emit_popa(); - restore_regs(0x100f); - }*/ -} - -#ifndef loadlr_assemble -void loadlr_assemble(int i,struct regstat *i_regs) -{ - printf("Need loadlr_assemble for this architecture.\n"); - exit(1); } #endif -void store_assemble(int i,struct regstat *i_regs) +static void store_assemble(int i, const struct regstat *i_regs, int ccadj_) { - int s,th,tl,map=-1; + int s,tl; 
int addr,temp; int offset; - int jaddr=0,type; + void *jaddr=0; + enum stub_type type=0; int memtarget=0,c=0; int agr=AGEN1+(i&1); - int faststore_reg_override=0; - u_int hr,reglist=0; - th=get_reg(i_regs->regmap,rs2[i]|64); - tl=get_reg(i_regs->regmap,rs2[i]); - s=get_reg(i_regs->regmap,rs1[i]); + int offset_reg = -1; + int fastio_reg_override = -1; + u_int reglist=get_host_reglist(i_regs->regmap); + tl=get_reg(i_regs->regmap,dops[i].rs2); + s=get_reg(i_regs->regmap,dops[i].rs1); temp=get_reg(i_regs->regmap,agr); if(temp<0) temp=get_reg(i_regs->regmap,-1); offset=imm[i]; @@ -2930,90 +3015,60 @@ void store_assemble(int i,struct regstat *i_regs) } assert(tl>=0); assert(temp>=0); - for(hr=0;hrregmap[hr]>=0) reglist|=1<regmap[HOST_CCREG]==CCREG) reglist&=~(1<=0) { - emit_writehword_indexed(tl,x,a); - }else - //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a); - emit_writehword_indexed(tl,x,a); - } - type=STOREH_STUB; - } - if (opcode[i]==0x2B) { // SW + int a = temp; + if (!c) a = addr; + if (fastio_reg_override >= 0) + a = fastio_reg_override; + do_store_byte(a, tl, offset_reg); + } + type = STOREB_STUB; + break; + case 0x29: // SH if(!c||memtarget) { - int a=addr; - if(faststore_reg_override) a=faststore_reg_override; - //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr); - emit_writeword_indexed_tlb(tl,0,a,map,temp); - } - type=STOREW_STUB; - } - if (opcode[i]==0x3F) { // SD + int a = temp; + if (!c) a = addr; + if (fastio_reg_override >= 0) + a = fastio_reg_override; + do_store_hword(a, 0, tl, offset_reg, 1); + } + type = STOREH_STUB; + break; + case 0x2B: // SW if(!c||memtarget) { - int a=addr; - if(faststore_reg_override) a=faststore_reg_override; - if(rs2[i]) { - assert(th>=0); - //emit_writeword_indexed(th,(int)rdram-0x80000000,addr); - //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr); - emit_writedword_indexed_tlb(th,tl,0,a,map,temp); - }else{ - // Store zero - //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp); - 
//emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp); - emit_writedword_indexed_tlb(tl,tl,0,a,map,temp); - } - } - type=STORED_STUB; - } + int a = addr; + if (fastio_reg_override >= 0) + a = fastio_reg_override; + do_store_word(a, 0, tl, offset_reg, 1); + } + type = STOREW_STUB; + break; + case 0x3F: // SD + default: + assert(0); + } + if (fastio_reg_override == HOST_TEMPREG || offset_reg == HOST_TEMPREG) + host_tempreg_release(); if(jaddr) { // PCSX store handlers don't check invcode again reglist|=1<waswritten&(1<waswritten&(1<=0); emit_cmpmem_indexedsr12_reg(ir,addr,1); #else - emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1); + emit_cmpmem_indexedsr12_imm(invalid_code,addr,1); #endif #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT) emit_callne(invalidate_addr_reg[addr]); #else - int jaddr2=(int)out; + void *jaddr2 = out; emit_jne(0); - add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<regmap,rs2[i],ccadj[i],reglist); + inline_writestub(type,i,addr_val,i_regs->regmap,dops[i].rs2,ccadj_,reglist); } // basic current block modification detection.. 
// not looking back as that should be in mips cache already + // (see Spyro2 title->attract mode) if(c&&start+i*4regmap==regs[i].regmap); // not delay slot if(i_regs->regmap==regs[i].regmap) { - load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i); - wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty); + load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i); + wb_dirtys(regs[i].regmap_entry,regs[i].wasdirty); emit_movimm(start+i*4+4,0); - emit_writeword(0,(int)&pcaddr); - emit_jmp((int)do_interrupt); + emit_writeword(0,&pcaddr); + emit_addimm(HOST_CCREG,2,HOST_CCREG); + emit_far_call(get_addr_ht); + emit_jmpreg(0); } } - //if(opcode[i]==0x2B || opcode[i]==0x3F) - //if(opcode[i]==0x2B || opcode[i]==0x28) - //if(opcode[i]==0x2B || opcode[i]==0x29) - //if(opcode[i]==0x2B) - /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F) - { - #ifdef __i386__ - emit_pusha(); - #endif - #ifdef __arm__ - save_regs(0x100f); - #endif - emit_readword((int)&last_count,ECX); - #ifdef __i386__ - if(get_reg(i_regs->regmap,CCREG)<0) - emit_loadreg(CCREG,HOST_CCREG); - emit_add(HOST_CCREG,ECX,HOST_CCREG); - emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG); - emit_writeword(HOST_CCREG,(int)&Count); - #endif - #ifdef __arm__ - if(get_reg(i_regs->regmap,CCREG)<0) - emit_loadreg(CCREG,0); - else - emit_mov(HOST_CCREG,0); - emit_add(0,ECX,0); - emit_addimm(0,2*ccadj[i],0); - emit_writeword(0,(int)&Count); - #endif - emit_call((int)memdebug); - #ifdef __i386__ - emit_popa(); - #endif - #ifdef __arm__ - restore_regs(0x100f); - #endif - }*/ } -void storelr_assemble(int i,struct regstat *i_regs) +static void storelr_assemble(int i, const struct regstat *i_regs, int ccadj_) { - int s,th,tl; + int s,tl; int temp; - int temp2=-1; int offset; - int jaddr=0; - int case1,case2,case3; - int done0,done1,done2; + void *jaddr=0; + void *case1, *case23, *case3; + void *done0, *done1, *done2; int memtarget=0,c=0; int agr=AGEN1+(i&1); - u_int hr,reglist=0; - 
th=get_reg(i_regs->regmap,rs2[i]|64); - tl=get_reg(i_regs->regmap,rs2[i]); - s=get_reg(i_regs->regmap,rs1[i]); + int offset_reg = -1; + u_int reglist=get_host_reglist(i_regs->regmap); + tl=get_reg(i_regs->regmap,dops[i].rs2); + s=get_reg(i_regs->regmap,dops[i].rs1); temp=get_reg(i_regs->regmap,agr); if(temp<0) temp=get_reg(i_regs->regmap,-1); offset=imm[i]; @@ -3119,240 +3139,574 @@ void storelr_assemble(int i,struct regstat *i_regs) } } assert(tl>=0); - for(hr=0;hrregmap[hr]>=0) reglist|=1<=0); if(!c) { emit_cmpimm(s<0||offset?temp:s,RAM_SIZE); if(!offset&&s!=temp) emit_mov(s,temp); - jaddr=(int)out; + jaddr=out; emit_jno(0); } else { - if(!memtarget||!rs1[i]) { - jaddr=(int)out; + if(!memtarget||!dops[i].rs1) { + jaddr=out; emit_jmp(0); } } - #ifdef RAM_OFFSET - int map=get_reg(i_regs->regmap,ROREG); - if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG); - #else - if((u_int)rdram!=0x80000000) - emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp); - #endif + if (ram_offset) + offset_reg = get_ro_reg(i_regs, 0); - if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR - temp2=get_reg(i_regs->regmap,FTEMP); - if(!rs2[i]) temp2=th=tl; + if (dops[i].opcode==0x2C||dops[i].opcode==0x2D) { // SDL/SDR + assert(0); } -#ifndef BIG_ENDIAN_MIPS - emit_xorimm(temp,3,temp); -#endif emit_testimm(temp,2); - case2=(int)out; + case23=out; emit_jne(0); emit_testimm(temp,1); - case1=(int)out; + case1=out; emit_jne(0); // 0 - if (opcode[i]==0x2A) { // SWL - emit_writeword_indexed(tl,0,temp); - } - if (opcode[i]==0x2E) { // SWR - emit_writebyte_indexed(tl,3,temp); - } - if (opcode[i]==0x2C) { // SDL - emit_writeword_indexed(th,0,temp); - if(rs2[i]) emit_mov(tl,temp2); + if (dops[i].opcode == 0x2A) { // SWL + // Write msb into least significant byte + if (dops[i].rs2) emit_rorimm(tl, 24, tl); + do_store_byte(temp, tl, offset_reg); + if (dops[i].rs2) emit_rorimm(tl, 8, tl); } - if (opcode[i]==0x2D) { // SDR - emit_writebyte_indexed(tl,3,temp); - if(rs2[i]) emit_shldimm(th,tl,24,temp2); + 
else if (dops[i].opcode == 0x2E) { // SWR + // Write entire word + do_store_word(temp, 0, tl, offset_reg, 1); } - done0=(int)out; + done0 = out; emit_jmp(0); // 1 - set_jump_target(case1,(int)out); - if (opcode[i]==0x2A) { // SWL - // Write 3 msb into three least significant bytes - if(rs2[i]) emit_rorimm(tl,8,tl); - emit_writehword_indexed(tl,-1,temp); - if(rs2[i]) emit_rorimm(tl,16,tl); - emit_writebyte_indexed(tl,1,temp); - if(rs2[i]) emit_rorimm(tl,8,tl); - } - if (opcode[i]==0x2E) { // SWR - // Write two lsb into two most significant bytes - emit_writehword_indexed(tl,1,temp); + set_jump_target(case1, out); + if (dops[i].opcode == 0x2A) { // SWL + // Write two msb into two least significant bytes + if (dops[i].rs2) emit_rorimm(tl, 16, tl); + do_store_hword(temp, -1, tl, offset_reg, 0); + if (dops[i].rs2) emit_rorimm(tl, 16, tl); } - if (opcode[i]==0x2C) { // SDL - if(rs2[i]) emit_shrdimm(tl,th,8,temp2); - // Write 3 msb into three least significant bytes - if(rs2[i]) emit_rorimm(th,8,th); - emit_writehword_indexed(th,-1,temp); - if(rs2[i]) emit_rorimm(th,16,th); - emit_writebyte_indexed(th,1,temp); - if(rs2[i]) emit_rorimm(th,8,th); - } - if (opcode[i]==0x2D) { // SDR - if(rs2[i]) emit_shldimm(th,tl,16,temp2); - // Write two lsb into two most significant bytes - emit_writehword_indexed(tl,1,temp); + else if (dops[i].opcode == 0x2E) { // SWR + // Write 3 lsb into three most significant bytes + do_store_byte(temp, tl, offset_reg); + if (dops[i].rs2) emit_rorimm(tl, 8, tl); + do_store_hword(temp, 1, tl, offset_reg, 0); + if (dops[i].rs2) emit_rorimm(tl, 24, tl); } - done1=(int)out; + done1=out; emit_jmp(0); - // 2 - set_jump_target(case2,(int)out); + // 2,3 + set_jump_target(case23, out); emit_testimm(temp,1); - case3=(int)out; + case3 = out; emit_jne(0); - if (opcode[i]==0x2A) { // SWL - // Write two msb into two least significant bytes - if(rs2[i]) emit_rorimm(tl,16,tl); - emit_writehword_indexed(tl,-2,temp); - if(rs2[i]) emit_rorimm(tl,16,tl); - } - if 
(opcode[i]==0x2E) { // SWR - // Write 3 lsb into three most significant bytes - emit_writebyte_indexed(tl,-1,temp); - if(rs2[i]) emit_rorimm(tl,8,tl); - emit_writehword_indexed(tl,0,temp); - if(rs2[i]) emit_rorimm(tl,24,tl); - } - if (opcode[i]==0x2C) { // SDL - if(rs2[i]) emit_shrdimm(tl,th,16,temp2); - // Write two msb into two least significant bytes - if(rs2[i]) emit_rorimm(th,16,th); - emit_writehword_indexed(th,-2,temp); - if(rs2[i]) emit_rorimm(th,16,th); + // 2 + if (dops[i].opcode==0x2A) { // SWL + // Write 3 msb into three least significant bytes + if (dops[i].rs2) emit_rorimm(tl, 8, tl); + do_store_hword(temp, -2, tl, offset_reg, 1); + if (dops[i].rs2) emit_rorimm(tl, 16, tl); + do_store_byte(temp, tl, offset_reg); + if (dops[i].rs2) emit_rorimm(tl, 8, tl); } - if (opcode[i]==0x2D) { // SDR - if(rs2[i]) emit_shldimm(th,tl,8,temp2); - // Write 3 lsb into three most significant bytes - emit_writebyte_indexed(tl,-1,temp); - if(rs2[i]) emit_rorimm(tl,8,tl); - emit_writehword_indexed(tl,0,temp); - if(rs2[i]) emit_rorimm(tl,24,tl); + else if (dops[i].opcode == 0x2E) { // SWR + // Write two lsb into two most significant bytes + do_store_hword(temp, 0, tl, offset_reg, 1); } - done2=(int)out; + done2 = out; emit_jmp(0); // 3 - set_jump_target(case3,(int)out); - if (opcode[i]==0x2A) { // SWL - // Write msb into least significant byte - if(rs2[i]) emit_rorimm(tl,24,tl); - emit_writebyte_indexed(tl,-3,temp); - if(rs2[i]) emit_rorimm(tl,8,tl); - } - if (opcode[i]==0x2E) { // SWR - // Write entire word - emit_writeword_indexed(tl,-3,temp); - } - if (opcode[i]==0x2C) { // SDL - if(rs2[i]) emit_shrdimm(tl,th,24,temp2); - // Write msb into least significant byte - if(rs2[i]) emit_rorimm(th,24,th); - emit_writebyte_indexed(th,-3,temp); - if(rs2[i]) emit_rorimm(th,8,th); - } - if (opcode[i]==0x2D) { // SDR - if(rs2[i]) emit_mov(th,temp2); - // Write entire word - emit_writeword_indexed(tl,-3,temp); - } - set_jump_target(done0,(int)out); - set_jump_target(done1,(int)out); - 
set_jump_target(done2,(int)out); - if (opcode[i]==0x2C) { // SDL - emit_testimm(temp,4); - done0=(int)out; - emit_jne(0); - emit_andimm(temp,~3,temp); - emit_writeword_indexed(temp2,4,temp); - set_jump_target(done0,(int)out); - } - if (opcode[i]==0x2D) { // SDR - emit_testimm(temp,4); - done0=(int)out; - emit_jeq(0); - emit_andimm(temp,~3,temp); - emit_writeword_indexed(temp2,-4,temp); - set_jump_target(done0,(int)out); - } + set_jump_target(case3, out); + if (dops[i].opcode == 0x2A) { // SWL + do_store_word(temp, -3, tl, offset_reg, 0); + } + else if (dops[i].opcode == 0x2E) { // SWR + do_store_byte(temp, tl, offset_reg); + } + set_jump_target(done0, out); + set_jump_target(done1, out); + set_jump_target(done2, out); + if (offset_reg == HOST_TEMPREG) + host_tempreg_release(); if(!c||!memtarget) - add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist); - if(!(i_regs->waswritten&(1<regmap,ROREG); - if(map<0) map=HOST_TEMPREG; - gen_orig_addr_w(temp,map); - #else - emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp); - #endif + add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj_,reglist); + if(!(i_regs->waswritten&(1<regmap,INVCP); assert(ir>=0); emit_cmpmem_indexedsr12_reg(ir,temp,1); #else - emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1); + emit_cmpmem_indexedsr12_imm(invalid_code,temp,1); #endif #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT) emit_callne(invalidate_addr_reg[temp]); #else - int jaddr2=(int)out; + void *jaddr2 = out; emit_jne(0); - add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<regmap,CCREG)<0) - emit_loadreg(CCREG,HOST_CCREG); - emit_add(HOST_CCREG,ECX,HOST_CCREG); - emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG); - emit_writeword(HOST_CCREG,(int)&Count); - emit_call((int)memdebug); - emit_popa(); - //restore_regs(0x100f); - */ } -void c1ls_assemble(int i,struct regstat *i_regs) +static void cop0_assemble(int i, const struct regstat *i_regs, int ccadj_) +{ + if(dops[i].opcode2==0) // MFC0 + { + 
signed char t=get_reg(i_regs->regmap,dops[i].rt1); + u_int copr=(source[i]>>11)&0x1f; + //assert(t>=0); // Why does this happen? OOT is weird + if(t>=0&&dops[i].rt1!=0) { + emit_readword(&reg_cop0[copr],t); + } + } + else if(dops[i].opcode2==4) // MTC0 + { + signed char s=get_reg(i_regs->regmap,dops[i].rs1); + char copr=(source[i]>>11)&0x1f; + assert(s>=0); + wb_register(dops[i].rs1,i_regs->regmap,i_regs->dirty); + if(copr==9||copr==11||copr==12||copr==13) { + emit_readword(&last_count,HOST_TEMPREG); + emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc + emit_add(HOST_CCREG,HOST_TEMPREG,HOST_CCREG); + emit_addimm(HOST_CCREG,ccadj_,HOST_CCREG); + emit_writeword(HOST_CCREG,&Count); + } + // What a mess. The status register (12) can enable interrupts, + // so needs a special case to handle a pending interrupt. + // The interrupt must be taken immediately, because a subsequent + // instruction might disable interrupts again. + if(copr==12||copr==13) { + if (is_delayslot) { + // burn cycles to cause cc_interrupt, which will + // reschedule next_interupt. Relies on CCREG from above.
+ assem_debug("MTC0 DS %d\n", copr); + emit_writeword(HOST_CCREG,&last_count); + emit_movimm(0,HOST_CCREG); + emit_storereg(CCREG,HOST_CCREG); + emit_loadreg(dops[i].rs1,1); + emit_movimm(copr,0); + emit_far_call(pcsx_mtc0_ds); + emit_loadreg(dops[i].rs1,s); + return; + } + emit_movimm(start+i*4+4,HOST_TEMPREG); + emit_writeword(HOST_TEMPREG,&pcaddr); + emit_movimm(0,HOST_TEMPREG); + emit_writeword(HOST_TEMPREG,&pending_exception); + } + if(s==HOST_CCREG) + emit_loadreg(dops[i].rs1,1); + else if(s!=1) + emit_mov(s,1); + emit_movimm(copr,0); + emit_far_call(pcsx_mtc0); + if(copr==9||copr==11||copr==12||copr==13) { + emit_readword(&Count,HOST_CCREG); + emit_readword(&next_interupt,HOST_TEMPREG); + emit_addimm(HOST_CCREG,-ccadj_,HOST_CCREG); + emit_sub(HOST_CCREG,HOST_TEMPREG,HOST_CCREG); + emit_writeword(HOST_TEMPREG,&last_count); + emit_storereg(CCREG,HOST_CCREG); + } + if(copr==12||copr==13) { + assert(!is_delayslot); + emit_readword(&pending_exception,14); + emit_test(14,14); + void *jaddr = out; + emit_jeq(0); + emit_readword(&pcaddr, 0); + emit_addimm(HOST_CCREG,2,HOST_CCREG); + emit_far_call(get_addr_ht); + emit_jmpreg(0); + set_jump_target(jaddr, out); + } + emit_loadreg(dops[i].rs1,s); + } + else + { + assert(dops[i].opcode2==0x10); + //if((source[i]&0x3f)==0x10) // RFE + { + emit_readword(&Status,0); + emit_andimm(0,0x3c,1); + emit_andimm(0,~0xf,0); + emit_orrshr_imm(1,2,0); + emit_writeword(0,&Status); + } + } +} + +static void cop1_unusable(int i, const struct regstat *i_regs) +{ + // XXX: should just just do the exception instead + //if(!cop1_usable) + { + void *jaddr=out; + emit_jmp(0); + add_stub_r(FP_STUB,jaddr,out,i,0,i_regs,is_delayslot,0); + } +} + +static void cop1_assemble(int i, const struct regstat *i_regs) { cop1_unusable(i, i_regs); } -void c2ls_assemble(int i,struct regstat *i_regs) +static void c1ls_assemble(int i, const struct regstat *i_regs) +{ + cop1_unusable(i, i_regs); +} + +// FP_STUB +static void do_cop1stub(int n) +{ + 
literal_pool(256); + assem_debug("do_cop1stub %x\n",start+stubs[n].a*4); + set_jump_target(stubs[n].addr, out); + int i=stubs[n].a; +// int rs=stubs[n].b; + struct regstat *i_regs=(struct regstat *)stubs[n].c; + int ds=stubs[n].d; + if(!ds) { + load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i); + //if(i_regs!=&regs[i]) printf("oops: regs[i]=%x i_regs=%x",(int)&regs[i],(int)i_regs); + } + //else {printf("fp exception in delay slot\n");} + wb_dirtys(i_regs->regmap_entry,i_regs->wasdirty); + if(regs[i].regmap_entry[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); + emit_movimm(start+(i-ds)*4,EAX); // Get PC + emit_addimm(HOST_CCREG,ccadj[i],HOST_CCREG); // CHECK: is this right? There should probably be an extra cycle... + emit_far_jump(ds?fp_exception_ds:fp_exception); +} + +static int cop2_is_stalling_op(int i, int *cycles) +{ + if (dops[i].opcode == 0x3a) { // SWC2 + *cycles = 0; + return 1; + } + if (dops[i].itype == COP2 && (dops[i].opcode2 == 0 || dops[i].opcode2 == 2)) { // MFC2/CFC2 + *cycles = 0; + return 1; + } + if (dops[i].itype == C2OP) { + *cycles = gte_cycletab[source[i] & 0x3f]; + return 1; + } + // ... what about MTC2/CTC2/LWC2? + return 0; +} + +#if 0 +static void log_gte_stall(int stall, u_int cycle) +{ + if ((u_int)stall <= 44) + printf("x stall %2d %u\n", stall, cycle + last_count); +} + +static void emit_log_gte_stall(int i, int stall, u_int reglist) +{ + save_regs(reglist); + if (stall > 0) + emit_movimm(stall, 0); + else + emit_mov(HOST_TEMPREG, 0); + emit_addimm(HOST_CCREG, ccadj[i], 1); + emit_far_call(log_gte_stall); + restore_regs(reglist); +} +#endif + +static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u_int reglist) +{ + int j = i, other_gte_op_cycles = -1, stall = -MAXBLOCK, cycles_passed; + int rtmp = reglist_find_free(reglist); + + if (HACK_ENABLED(NDHACK_NO_STALLS)) + return; + if (get_reg(i_regs->regmap, CCREG) != HOST_CCREG) { + // happens occasionally... cc evicted?
Don't bother then + //printf("no cc %08x\n", start + i*4); + return; + } + if (!dops[i].bt) { + for (j = i - 1; j >= 0; j--) { + //if (dops[j].is_ds) break; + if (cop2_is_stalling_op(j, &other_gte_op_cycles) || dops[j].bt) + break; + if (j > 0 && ccadj[j - 1] > ccadj[j]) + break; + } + j = max(j, 0); + } + cycles_passed = ccadj[i] - ccadj[j]; + if (other_gte_op_cycles >= 0) + stall = other_gte_op_cycles - cycles_passed; + else if (cycles_passed >= 44) + stall = 0; // can't stall + if (stall == -MAXBLOCK && rtmp >= 0) { + // unknown stall, do the expensive runtime check + assem_debug("; cop2_do_stall_check\n"); +#if 0 // too slow + save_regs(reglist); + emit_movimm(gte_cycletab[op], 0); + emit_addimm(HOST_CCREG, ccadj[i], 1); + emit_far_call(call_gteStall); + restore_regs(reglist); +#else + host_tempreg_acquire(); + emit_readword(&psxRegs.gteBusyCycle, rtmp); + emit_addimm(rtmp, -ccadj[i], rtmp); + emit_sub(rtmp, HOST_CCREG, HOST_TEMPREG); + emit_cmpimm(HOST_TEMPREG, 44); + emit_cmovb_reg(rtmp, HOST_CCREG); + //emit_log_gte_stall(i, 0, reglist); + host_tempreg_release(); +#endif + } + else if (stall > 0) { + //emit_log_gte_stall(i, stall, reglist); + emit_addimm(HOST_CCREG, stall, HOST_CCREG); + } + + // save gteBusyCycle, if needed + if (gte_cycletab[op] == 0) + return; + other_gte_op_cycles = -1; + for (j = i + 1; j < slen; j++) { + if (cop2_is_stalling_op(j, &other_gte_op_cycles)) + break; + if (dops[j].is_jump) { + // check ds + if (j + 1 < slen && cop2_is_stalling_op(j + 1, &other_gte_op_cycles)) + j++; + break; + } + } + if (other_gte_op_cycles >= 0) + // will handle stall when assembling that op + return; + cycles_passed = ccadj[min(j, slen -1)] - ccadj[i]; + if (cycles_passed >= 44) + return; + assem_debug("; save gteBusyCycle\n"); + host_tempreg_acquire(); +#if 0 + emit_readword(&last_count, HOST_TEMPREG); + emit_add(HOST_TEMPREG, HOST_CCREG, HOST_TEMPREG); + emit_addimm(HOST_TEMPREG, ccadj[i], HOST_TEMPREG); + emit_addimm(HOST_TEMPREG, gte_cycletab[op]), 
HOST_TEMPREG); + emit_writeword(HOST_TEMPREG, &psxRegs.gteBusyCycle); +#else + emit_addimm(HOST_CCREG, ccadj[i] + gte_cycletab[op], HOST_TEMPREG); + emit_writeword(HOST_TEMPREG, &psxRegs.gteBusyCycle); +#endif + host_tempreg_release(); +} + +static int is_mflohi(int i) +{ + return (dops[i].itype == MOV && (dops[i].rs1 == HIREG || dops[i].rs1 == LOREG)); +} + +static int check_multdiv(int i, int *cycles) +{ + if (dops[i].itype != MULTDIV) + return 0; + if (dops[i].opcode2 == 0x18 || dops[i].opcode2 == 0x19) // MULT(U) + *cycles = 11; // approx from 7 11 14 + else + *cycles = 37; + return 1; +} + +static void multdiv_prepare_stall(int i, const struct regstat *i_regs, int ccadj_) +{ + int j, found = 0, c = 0; + if (HACK_ENABLED(NDHACK_NO_STALLS)) + return; + if (get_reg(i_regs->regmap, CCREG) != HOST_CCREG) { + // happens occasionally... cc evicted? Don't bother then + return; + } + for (j = i + 1; j < slen; j++) { + if (dops[j].bt) + break; + if ((found = is_mflohi(j))) + break; + if (dops[j].is_jump) { + // check ds + if (j + 1 < slen && (found = is_mflohi(j + 1))) + j++; + break; + } + } + if (found) + // handle all in multdiv_do_stall() + return; + check_multdiv(i, &c); + assert(c > 0); + assem_debug("; muldiv prepare stall %d\n", c); + host_tempreg_acquire(); + emit_addimm(HOST_CCREG, ccadj_ + c, HOST_TEMPREG); + emit_writeword(HOST_TEMPREG, &psxRegs.muldivBusyCycle); + host_tempreg_release(); +} + +static void multdiv_do_stall(int i, const struct regstat *i_regs) +{ + int j, known_cycles = 0; + u_int reglist = get_host_reglist(i_regs->regmap); + int rtmp = get_reg(i_regs->regmap, -1); + if (rtmp < 0) + rtmp = reglist_find_free(reglist); + if (HACK_ENABLED(NDHACK_NO_STALLS)) + return; + if (get_reg(i_regs->regmap, CCREG) != HOST_CCREG || rtmp < 0) { + // happens occasionally... cc evicted? 
Don't bother then + //printf("no cc/rtmp %08x\n", start + i*4); + return; + } + if (!dops[i].bt) { + for (j = i - 1; j >= 0; j--) { + if (dops[j].is_ds) break; + if (check_multdiv(j, &known_cycles)) + break; + if (is_mflohi(j)) + // already handled by this op + return; + if (dops[j].bt || (j > 0 && ccadj[j - 1] > ccadj[j])) + break; + } + j = max(j, 0); + } + if (known_cycles > 0) { + known_cycles -= ccadj[i] - ccadj[j]; + assem_debug("; muldiv stall resolved %d\n", known_cycles); + if (known_cycles > 0) + emit_addimm(HOST_CCREG, known_cycles, HOST_CCREG); + return; + } + assem_debug("; muldiv stall unresolved\n"); + host_tempreg_acquire(); + emit_readword(&psxRegs.muldivBusyCycle, rtmp); + emit_addimm(rtmp, -ccadj[i], rtmp); + emit_sub(rtmp, HOST_CCREG, HOST_TEMPREG); + emit_cmpimm(HOST_TEMPREG, 37); + emit_cmovb_reg(rtmp, HOST_CCREG); + //emit_log_gte_stall(i, 0, reglist); + host_tempreg_release(); +} + +static void cop2_get_dreg(u_int copr,signed char tl,signed char temp) +{ + switch (copr) { + case 1: + case 3: + case 5: + case 8: + case 9: + case 10: + case 11: + emit_readword(&reg_cop2d[copr],tl); + emit_signextend16(tl,tl); + emit_writeword(tl,&reg_cop2d[copr]); // hmh + break; + case 7: + case 16: + case 17: + case 18: + case 19: + emit_readword(&reg_cop2d[copr],tl); + emit_andimm(tl,0xffff,tl); + emit_writeword(tl,&reg_cop2d[copr]); + break; + case 15: + emit_readword(&reg_cop2d[14],tl); // SXY2 + emit_writeword(tl,&reg_cop2d[copr]); + break; + case 28: + case 29: + c2op_mfc2_29_assemble(tl,temp); + break; + default: + emit_readword(&reg_cop2d[copr],tl); + break; + } +} + +static void cop2_put_dreg(u_int copr,signed char sl,signed char temp) +{ + switch (copr) { + case 15: + emit_readword(&reg_cop2d[13],temp); // SXY1 + emit_writeword(sl,&reg_cop2d[copr]); + emit_writeword(temp,&reg_cop2d[12]); // SXY0 + emit_readword(&reg_cop2d[14],temp); // SXY2 + emit_writeword(sl,&reg_cop2d[14]); + emit_writeword(temp,&reg_cop2d[13]); // SXY1 + break; + case 28: + emit_andimm(sl,0x001f,temp); +
emit_shlimm(temp,7,temp); + emit_writeword(temp,&reg_cop2d[9]); + emit_andimm(sl,0x03e0,temp); + emit_shlimm(temp,2,temp); + emit_writeword(temp,&reg_cop2d[10]); + emit_andimm(sl,0x7c00,temp); + emit_shrimm(temp,3,temp); + emit_writeword(temp,&reg_cop2d[11]); + emit_writeword(sl,&reg_cop2d[28]); + break; + case 30: + emit_xorsar_imm(sl,sl,31,temp); +#if defined(HAVE_ARMV5) || defined(__aarch64__) + emit_clz(temp,temp); +#else + emit_movs(temp,HOST_TEMPREG); + emit_movimm(0,temp); + emit_jeq((int)out+4*4); + emit_addpl_imm(temp,1,temp); + emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG); + emit_jns((int)out-2*4); +#endif + emit_writeword(sl,&reg_cop2d[30]); + emit_writeword(temp,&reg_cop2d[31]); + break; + case 31: + break; + default: + emit_writeword(sl,&reg_cop2d[copr]); + break; + } +} + +static void c2ls_assemble(int i, const struct regstat *i_regs, int ccadj_) { int s,tl; int ar; int offset; int memtarget=0,c=0; - int jaddr2=0,type; + void *jaddr2=NULL; + enum stub_type type; int agr=AGEN1+(i&1); - int fastio_reg_override=0; - u_int hr,reglist=0; + int offset_reg = -1; + int fastio_reg_override = -1; + u_int reglist=get_host_reglist(i_regs->regmap); u_int copr=(source[i]>>16)&0x1f; - s=get_reg(i_regs->regmap,rs1[i]); + s=get_reg(i_regs->regmap,dops[i].rs1); tl=get_reg(i_regs->regmap,FTEMP); offset=imm[i]; - assert(rs1[i]>0); + assert(dops[i].rs1>0); assert(tl>=0); - for(hr=0;hrregmap[hr]>=0) reglist|=1<regmap[HOST_CCREG]==CCREG) reglist&=~(1<regmap,agr); if(ar<0) ar=get_reg(i_regs->regmap,-1); reglist|=1<=0) ar=s; assert(ar>=0); - if (opcode[i]==0x3a) { // SWC2 - cop2_get_dreg(copr,tl,HOST_TEMPREG); + cop2_do_stall_check(0, i, i_regs, reglist); + + if (dops[i].opcode==0x3a) { // SWC2 + cop2_get_dreg(copr,tl,-1); type=STOREW_STUB; } else type=LOADW_STUB; if(c&&!memtarget) { - jaddr2=(int)out; + jaddr2=out; emit_jmp(0); // inline_readstub/inline_writestub?
} else { if(!c) { - jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override); - } - else if(ram_offset&&memtarget) { - emit_addimm(ar,ram_offset,HOST_TEMPREG); - fastio_reg_override=HOST_TEMPREG; - } - if (opcode[i]==0x32) { // LWC2 - #ifdef HOST_IMM_ADDR32 - if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl); - else - #endif - int a=ar; - if(fastio_reg_override) a=fastio_reg_override; - emit_readword_indexed(0,a,tl); + jaddr2 = emit_fastpath_cmp_jump(i, i_regs, ar, + &offset_reg, &fastio_reg_override); + } + else if (ram_offset && memtarget) { + offset_reg = get_ro_reg(i_regs, 0); + } + switch (dops[i].opcode) { + case 0x32: { // LWC2 + int a = ar; + if (fastio_reg_override >= 0) + a = fastio_reg_override; + do_load_word(a, tl, offset_reg); + break; } - if (opcode[i]==0x3a) { // SWC2 + case 0x3a: { // SWC2 #ifdef DESTRUCTIVE_SHIFT if(!offset&&!c&&s>=0) emit_mov(s,ar); #endif - int a=ar; - if(fastio_reg_override) a=fastio_reg_override; - emit_writeword_indexed(tl,0,a); + int a = ar; + if (fastio_reg_override >= 0) + a = fastio_reg_override; + do_store_word(a, 0, tl, offset_reg, 1); + break; + } + default: + assert(0); } } + if (fastio_reg_override == HOST_TEMPREG || offset_reg == HOST_TEMPREG) + host_tempreg_release(); if(jaddr2) - add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist); - if(opcode[i]==0x3a) // SWC2 - if(!(i_regs->waswritten&(1<waswritten&(1<regmap,INVCP); assert(ir>=0); emit_cmpmem_indexedsr12_reg(ir,ar,1); #else - emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1); + emit_cmpmem_indexedsr12_imm(invalid_code,ar,1); #endif #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT) emit_callne(invalidate_addr_reg[ar]); #else - int jaddr3=(int)out; + void *jaddr3 = out; emit_jne(0); - add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<>11) & 0x1f; + signed char temp = get_reg(i_regs->regmap, -1); + + if (!HACK_ENABLED(NDHACK_NO_STALLS)) { + u_int reglist = reglist_exclude(get_host_reglist(i_regs->regmap), temp, -1); + if 
(dops[i].opcode2 == 0 || dops[i].opcode2 == 2) { // MFC2/CFC2 + signed char tl = get_reg(i_regs->regmap, dops[i].rt1); + reglist = reglist_exclude(reglist, tl, -1); + } + cop2_do_stall_check(0, i, i_regs, reglist); + } + if (dops[i].opcode2==0) { // MFC2 + signed char tl=get_reg(i_regs->regmap,dops[i].rt1); + if(tl>=0&&dops[i].rt1!=0) + cop2_get_dreg(copr,tl,temp); + } + else if (dops[i].opcode2==4) { // MTC2 + signed char sl=get_reg(i_regs->regmap,dops[i].rs1); + cop2_put_dreg(copr,sl,temp); + } + else if (dops[i].opcode2==2) // CFC2 + { + signed char tl=get_reg(i_regs->regmap,dops[i].rt1); + if(tl>=0&&dops[i].rt1!=0) + emit_readword(&reg_cop2c[copr],tl); } + else if (dops[i].opcode2==6) // CTC2 + { + signed char sl=get_reg(i_regs->regmap,dops[i].rs1); + switch(copr) { + case 4: + case 12: + case 20: + case 26: + case 27: + case 29: + case 30: + emit_signextend16(sl,temp); + break; + case 31: + c2op_ctc2_31_assemble(sl,temp); + break; + default: + temp=sl; + break; + } + emit_writeword(temp,&reg_cop2c[copr]); + assert(sl>=0); + } +} + +static void do_unalignedwritestub(int n) +{ + assem_debug("do_unalignedwritestub %x\n",start+stubs[n].a*4); + literal_pool(256); + set_jump_target(stubs[n].addr, out); + + int i=stubs[n].a; + struct regstat *i_regs=(struct regstat *)stubs[n].c; + int addr=stubs[n].b; + u_int reglist=stubs[n].e; + signed char *i_regmap=i_regs->regmap; + int temp2=get_reg(i_regmap,FTEMP); + int rt; + rt=get_reg(i_regmap,dops[i].rs2); + assert(rt>=0); + assert(addr>=0); + assert(dops[i].opcode==0x2a||dops[i].opcode==0x2e); // SWL/SWR only implemented + reglist|=(1<regmap,rt1[i]|64); - tl=get_reg(i_regs->regmap,rt1[i]); + //if(dops[i].opcode2==0x10||dops[i].opcode2==0x12) { // MFHI/MFLO + //if(dops[i].opcode2==0x11||dops[i].opcode2==0x13) { // MTHI/MTLO + if(dops[i].rt1) { + signed char sl,tl; + tl=get_reg(i_regs->regmap,dops[i].rt1); //assert(tl>=0); if(tl>=0) { - sh=get_reg(i_regs->regmap,rs1[i]|64); - sl=get_reg(i_regs->regmap,rs1[i]); +
sl=get_reg(i_regs->regmap,dops[i].rs1); if(sl>=0) emit_mov(sl,tl); - else emit_loadreg(rs1[i],tl); - if(th>=0) { - if(sh>=0) emit_mov(sh,th); - else emit_loadreg(rs1[i]|64,th); - } + else emit_loadreg(dops[i].rs1,tl); } } + if (dops[i].rs1 == HIREG || dops[i].rs1 == LOREG) // MFHI/MFLO + multdiv_do_stall(i, i_regs); } -#ifndef fconv_assemble -void fconv_assemble(int i,struct regstat *i_regs) +// call interpreter, exception handler, things that change pc/regs/cycles ... +static void call_c_cpu_handler(int i, const struct regstat *i_regs, int ccadj_, u_int pc, void *func) { - printf("Need fconv_assemble for this architecture.\n"); - exit(1); + signed char ccreg=get_reg(i_regs->regmap,CCREG); + assert(ccreg==HOST_CCREG); + assert(!is_delayslot); + (void)ccreg; + + emit_movimm(pc,3); // Get PC + emit_readword(&last_count,2); + emit_writeword(3,&psxRegs.pc); + emit_addimm(HOST_CCREG,ccadj_,HOST_CCREG); + emit_add(2,HOST_CCREG,2); + emit_writeword(2,&psxRegs.cycle); + emit_far_call(func); + emit_far_jump(jump_to_new_pc); } -#endif -#if 0 -void float_assemble(int i,struct regstat *i_regs) +static void syscall_assemble(int i, const struct regstat *i_regs, int ccadj_) { - printf("Need float_assemble for this architecture.\n"); - exit(1); + emit_movimm(0x20,0); // cause code + emit_movimm(0,1); // not in delay slot + call_c_cpu_handler(i, i_regs, ccadj_, start+i*4, psxException); } -#endif -void syscall_assemble(int i,struct regstat *i_regs) +static void hlecall_assemble(int i, const struct regstat *i_regs, int ccadj_) { - signed char ccreg=get_reg(i_regs->regmap,CCREG); - assert(ccreg==HOST_CCREG); - assert(!is_delayslot); - (void)ccreg; - emit_movimm(start+i*4,EAX); // Get PC - emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right? There should probably be an extra cycle... 
- emit_jmp((int)jump_syscall_hle); // XXX + void *hlefunc = psxNULL; + uint32_t hleCode = source[i] & 0x03ffffff; + if (hleCode < ARRAY_SIZE(psxHLEt)) + hlefunc = psxHLEt[hleCode]; + + call_c_cpu_handler(i, i_regs, ccadj_, start + i*4+4, hlefunc); } -void hlecall_assemble(int i,struct regstat *i_regs) +static void intcall_assemble(int i, const struct regstat *i_regs, int ccadj_) { - extern void psxNULL(); - signed char ccreg=get_reg(i_regs->regmap,CCREG); - assert(ccreg==HOST_CCREG); - assert(!is_delayslot); - (void)ccreg; - emit_movimm(start+i*4+4,0); // Get PC - uint32_t hleCode = source[i] & 0x03ffffff; - if (hleCode >= (sizeof(psxHLEt) / sizeof(psxHLEt[0]))) - emit_movimm((int)psxNULL,1); - else - emit_movimm((int)psxHLEt[hleCode],1); - emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX - emit_jmp((int)jump_hlecall); + call_c_cpu_handler(i, i_regs, ccadj_, start + i*4, execI); } -void intcall_assemble(int i,struct regstat *i_regs) +static void speculate_mov(int rs,int rt) { - signed char ccreg=get_reg(i_regs->regmap,CCREG); - assert(ccreg==HOST_CCREG); - assert(!is_delayslot); - (void)ccreg; - emit_movimm(start+i*4,0); // Get PC - emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); - emit_jmp((int)jump_intcall); + if(rt!=0) { + smrv_strong_next|=1<>dops[i].rs1)&1) speculate_mov(dops[i].rs1,dops[i].rt1); + else if((smrv_strong>>dops[i].rs2)&1) speculate_mov(dops[i].rs2,dops[i].rt1); + else if((smrv_weak>>dops[i].rs1)&1) speculate_mov_weak(dops[i].rs1,dops[i].rt1); + else if((smrv_weak>>dops[i].rs2)&1) speculate_mov_weak(dops[i].rs2,dops[i].rt1); + else { + smrv_strong_next&=~(1<=0) { + if(get_final_value(hr,i,&value)) + smrv[dops[i].rt1]=value; + else smrv[dops[i].rt1]=constmap[i][hr]; + smrv_strong_next|=1<>dops[i].rs1)&1) speculate_mov(dops[i].rs1,dops[i].rt1); + else if((smrv_weak>>dops[i].rs1)&1) speculate_mov_weak(dops[i].rs1,dops[i].rt1); + } + break; + case LOAD: + if(start<0x2000&&(dops[i].rt1==26||(smrv[dops[i].rt1]>>24)==0xa0)) { 
+ // special case for BIOS + smrv[dops[i].rt1]=0xa0000000; + smrv_strong_next|=1<>r)&1),(smrv_weak>>r)&1,regs[i].isconst,regs[i].wasconst); +#endif +} + +static void ujump_assemble(int i, const struct regstat *i_regs); +static void rjump_assemble(int i, const struct regstat *i_regs); +static void cjump_assemble(int i, const struct regstat *i_regs); +static void sjump_assemble(int i, const struct regstat *i_regs); +static void pagespan_assemble(int i, const struct regstat *i_regs); + +static int assemble(int i, const struct regstat *i_regs, int ccadj_) +{ + int ds = 0; + switch (dops[i].itype) { case ALU: - alu_assemble(i,i_regs);break; + alu_assemble(i, i_regs); + break; case IMM16: - imm16_assemble(i,i_regs);break; + imm16_assemble(i, i_regs); + break; case SHIFT: - shift_assemble(i,i_regs);break; + shift_assemble(i, i_regs); + break; case SHIFTIMM: - shiftimm_assemble(i,i_regs);break; + shiftimm_assemble(i, i_regs); + break; case LOAD: - load_assemble(i,i_regs);break; + load_assemble(i, i_regs, ccadj_); + break; case LOADLR: - loadlr_assemble(i,i_regs);break; + loadlr_assemble(i, i_regs, ccadj_); + break; case STORE: - store_assemble(i,i_regs);break; + store_assemble(i, i_regs, ccadj_); + break; case STORELR: - storelr_assemble(i,i_regs);break; + storelr_assemble(i, i_regs, ccadj_); + break; case COP0: - cop0_assemble(i,i_regs);break; + cop0_assemble(i, i_regs, ccadj_); + break; case COP1: - cop1_assemble(i,i_regs);break; + cop1_assemble(i, i_regs); + break; case C1LS: - c1ls_assemble(i,i_regs);break; + c1ls_assemble(i, i_regs); + break; case COP2: - cop2_assemble(i,i_regs);break; + cop2_assemble(i, i_regs); + break; case C2LS: - c2ls_assemble(i,i_regs);break; + c2ls_assemble(i, i_regs, ccadj_); + break; case C2OP: - c2op_assemble(i,i_regs);break; - case FCONV: - fconv_assemble(i,i_regs);break; - case FLOAT: - float_assemble(i,i_regs);break; - case FCOMP: - fcomp_assemble(i,i_regs);break; + c2op_assemble(i, i_regs); + break; case MULTDIV: - 
multdiv_assemble(i,i_regs);break; + multdiv_assemble(i, i_regs); + multdiv_prepare_stall(i, i_regs, ccadj_); + break; case MOV: - mov_assemble(i,i_regs);break; + mov_assemble(i, i_regs); + break; + case SYSCALL: + syscall_assemble(i, i_regs, ccadj_); + break; + case HLECALL: + hlecall_assemble(i, i_regs, ccadj_); + break; + case INTCALL: + intcall_assemble(i, i_regs, ccadj_); + break; + case UJUMP: + ujump_assemble(i, i_regs); + ds = 1; + break; + case RJUMP: + rjump_assemble(i, i_regs); + ds = 1; + break; + case CJUMP: + cjump_assemble(i, i_regs); + ds = 1; + break; + case SJUMP: + sjump_assemble(i, i_regs); + ds = 1; + break; + case SPAN: + pagespan_assemble(i, i_regs); + break; + case NOP: + case OTHER: + case NI: + // not handled, just skip + break; + default: + assert(0); + } + return ds; +} + +static void ds_assemble(int i, const struct regstat *i_regs) +{ + speculate_register_values(i); + is_delayslot = 1; + switch (dops[i].itype) { case SYSCALL: case HLECALL: case INTCALL: @@ -3561,36 +4146,26 @@ void ds_assemble(int i,struct regstat *i_regs) case RJUMP: case CJUMP: case SJUMP: - case FJUMP: SysPrintf("Jump in the delay slot. This is probably a bug.\n"); + break; + default: + assemble(i, i_regs, ccadj[i]); } - is_delayslot=0; + is_delayslot = 0; } // Is the branch target a valid internal jump? 
-int internal_branch(uint64_t i_is32,int addr) +static int internal_branch(int addr) { if(addr&1) return 0; // Indirect (register) jump if(addr>=start && addr>2; - // Delay slots are not valid branch targets - //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0; - // 64 -> 32 bit transition requires a recompile - /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32) - { - if(requires_32bit[t]&~i_is32) printf("optimizable: no\n"); - else printf("optimizable: yes\n"); - }*/ - //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0; return 1; } return 0; } -#ifndef wb_invalidate -void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32, - uint64_t u,uint64_t uu) +static void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t u) { int hr; for(hr=0;hr=0) { if((dirty>>hr)&1) { if(get_reg(entry,pre[hr])<0) { - if(pre[hr]<64) { - if(!((u>>pre[hr])&1)) { - emit_storereg(pre[hr],hr); - if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) { - emit_sarimm(hr,31,hr); - emit_storereg(pre[hr]|64,hr); - } - } - }else{ - if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) { - emit_storereg(pre[hr],hr); - } - } + assert(pre[hr]<64); + if(!((u>>pre[hr])&1)) + emit_storereg(pre[hr],hr); } } } @@ -3632,12 +4197,11 @@ void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t } } } -#endif // Load the specified registers // This only loads the registers given as arguments because // we don't want to load things that will be overwritten -void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2) +static void load_regs(signed char entry[],signed char regmap[],int rs1,int rs2) { int hr; // Load 32-bit regs @@ -3657,28 +4221,6 @@ void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2 } } } - //Load 64-bit regs - for(hr=0;hr=0) { - if(entry[hr]!=regmap[hr]) { - if(regmap[hr]-64==rs1||regmap[hr]-64==rs2) - { - 
assert(regmap[hr]!=64); - if((is32>>(regmap[hr]&63))&1) { - int lr=get_reg(regmap,regmap[hr]-64); - if(lr>=0) - emit_sarimm(lr,31,hr); - else - emit_loadreg(regmap[hr],hr); - } - else - { - emit_loadreg(regmap[hr],hr); - } - } - } - } - } } // Load registers prior to the start of a loop @@ -3713,41 +4255,41 @@ static void loop_preload(signed char pre[],signed char entry[]) // Generate address for load/store instruction // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads -void address_generation(int i,struct regstat *i_regs,signed char entry[]) +void address_generation(int i, const struct regstat *i_regs, signed char entry[]) { - if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) { + if (dops[i].is_load || dops[i].is_store) { int ra=-1; int agr=AGEN1+(i&1); - if(itype[i]==LOAD) { - ra=get_reg(i_regs->regmap,rt1[i]); + if(dops[i].itype==LOAD) { + ra=get_reg(i_regs->regmap,dops[i].rt1); if(ra<0) ra=get_reg(i_regs->regmap,-1); assert(ra>=0); } - if(itype[i]==LOADLR) { + if(dops[i].itype==LOADLR) { ra=get_reg(i_regs->regmap,FTEMP); } - if(itype[i]==STORE||itype[i]==STORELR) { + if(dops[i].itype==STORE||dops[i].itype==STORELR) { ra=get_reg(i_regs->regmap,agr); if(ra<0) ra=get_reg(i_regs->regmap,-1); } - if(itype[i]==C1LS||itype[i]==C2LS) { - if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2 + if(dops[i].itype==C2LS) { + if ((dops[i].opcode&0x3b)==0x31||(dops[i].opcode&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2 ra=get_reg(i_regs->regmap,FTEMP); else { // SWC1/SDC1/SWC2/SDC2 ra=get_reg(i_regs->regmap,agr); if(ra<0) ra=get_reg(i_regs->regmap,-1); } } - int rs=get_reg(i_regs->regmap,rs1[i]); + int rs=get_reg(i_regs->regmap,dops[i].rs1); if(ra>=0) { int offset=imm[i]; int c=(i_regs->wasconst>>rs)&1; - if(rs1[i]==0) { + if(dops[i].rs1==0) { // Using r0 as a base address if(!entry||entry[ra]!=agr) { - if (opcode[i]==0x22||opcode[i]==0x26) { + if (dops[i].opcode==0x22||dops[i].opcode==0x26) { 
emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR - }else if (opcode[i]==0x1a||opcode[i]==0x1b) { + }else if (dops[i].opcode==0x1a||dops[i].opcode==0x1b) { emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR }else{ emit_movimm(offset,ra); @@ -3755,29 +4297,26 @@ void address_generation(int i,struct regstat *i_regs,signed char entry[]) } // else did it in the previous cycle } else if(rs<0) { - if(!entry||entry[ra]!=rs1[i]) - emit_loadreg(rs1[i],ra); - //if(!entry||entry[ra]!=rs1[i]) + if(!entry||entry[ra]!=dops[i].rs1) + emit_loadreg(dops[i].rs1,ra); + //if(!entry||entry[ra]!=dops[i].rs1) // printf("poor load scheduling!\n"); } else if(c) { - if(rs1[i]!=rt1[i]||itype[i]!=LOAD) { + if(dops[i].rs1!=dops[i].rt1||dops[i].itype!=LOAD) { if(!entry||entry[ra]!=agr) { - if (opcode[i]==0x22||opcode[i]==0x26) { + if (dops[i].opcode==0x22||dops[i].opcode==0x26) { emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR - }else if (opcode[i]==0x1a||opcode[i]==0x1b) { + }else if (dops[i].opcode==0x1a||dops[i].opcode==0x1b) { emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR }else{ - #ifdef HOST_IMM_ADDR32 - if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2 - #endif emit_movimm(constmap[i][rs]+offset,ra); regs[i].loadedconst|=1<=0) { emit_addimm(rs,offset,ra); }else{ @@ -3787,33 +4326,30 @@ void address_generation(int i,struct regstat *i_regs,signed char entry[]) } } // Preload constants for next instruction - if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) { + if (dops[i+1].is_load || dops[i+1].is_store) { int agr,ra; // Actual address agr=AGEN1+((i+1)&1); ra=get_reg(i_regs->regmap,agr); if(ra>=0) { - int rs=get_reg(regs[i+1].regmap,rs1[i+1]); + int rs=get_reg(regs[i+1].regmap,dops[i+1].rs1); int offset=imm[i+1]; int c=(regs[i+1].wasconst>>rs)&1; - if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) { - if (opcode[i+1]==0x22||opcode[i+1]==0x26) { + 
if(c&&(dops[i+1].rs1!=dops[i+1].rt1||dops[i+1].itype!=LOAD)) { + if (dops[i+1].opcode==0x22||dops[i+1].opcode==0x26) { emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR - }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) { + }else if (dops[i+1].opcode==0x1a||dops[i+1].opcode==0x1b) { emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR }else{ - #ifdef HOST_IMM_ADDR32 - if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2 - #endif emit_movimm(constmap[i+1][rs]+offset,ra); regs[i+1].loadedconst|=1<>hr)&1)) break; - if(bt[i+1]) break; + if(dops[i+1].bt) break; i++; } if(i>hr)&1)) + if(dops[i+2].itype==LOAD&&dops[i+2].rs1==reg&&dops[i+2].rt1==reg&&((regs[i+1].wasconst>>hr)&1)) { // Precompute load address *value=constmap[i][hr]+imm[i+2]; return 1; } } - if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg) + if(dops[i+1].itype==LOAD&&dops[i+1].rs1==reg&&dops[i+1].rt1==reg) { // Precompute load address *value=constmap[i][hr]+imm[i+1]; - //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]); + //printf("c=%x imm=%lx\n",(long)constmap[i][hr],imm[i+1]); return 1; } } } *value=constmap[i][hr]; - //printf("c=%x\n",(int)constmap[i][hr]); + //printf("c=%lx\n",(long)constmap[i][hr]); if(i==slen-1) return 1; - if(reg<64) { - return !((unneeded_reg[i+1]>>reg)&1); - }else{ - return !((unneeded_reg_upper[i+1]>>reg)&1); - } + assert(reg < 64); + return !((unneeded_reg[i+1]>>reg)&1); } // Load registers with known constants -void load_consts(signed char pre[],signed char regmap[],int is32,int i) +static void load_consts(signed char pre[],signed char regmap[],int i) { int hr,hr2; // propagate loaded constant flags - if(i==0||bt[i]) + if(i==0||dops[i].bt) regs[i].loadedconst=0; else { for(hr=0;hr=0) { //if(entry[hr]!=regmap[hr]) { if(!((regs[i].loadedconst>>hr)&1)) { - if(((regs[i].isconst>>hr)&1)&®map[hr]<64&®map[hr]>0) { + assert(regmap[hr]<64); + if(((regs[i].isconst>>hr)&1)&®map[hr]>0) { int value,similar=0; 
if(get_final_value(hr,i,&value)) { // see if some other register has similar value @@ -3918,41 +4452,16 @@ void load_consts(signed char pre[],signed char regmap[],int is32,int i) } } } - // Load 64-bit regs - for(hr=0;hr=0) { - //if(entry[hr]!=regmap[hr]) { - if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) { - if(((regs[i].isconst>>hr)&1)&®map[hr]>64) { - if((is32>>(regmap[hr]&63))&1) { - int lr=get_reg(regmap,regmap[hr]-64); - assert(lr>=0); - emit_sarimm(lr,31,hr); - } - else - { - int value; - if(get_final_value(hr,i,&value)) { - if(value==0) { - emit_zeroreg(hr); - } - else { - emit_movimm(value,hr); - } - } - } - } - } - } - } } -void load_all_consts(signed char regmap[],int is32,u_int dirty,int i) + +static void load_all_consts(const signed char regmap[], u_int dirty, int i) { int hr; // Load 32-bit regs for(hr=0;hr=0&&((dirty>>hr)&1)) { - if(((regs[i].isconst>>hr)&1)&®map[hr]<64&®map[hr]>0) { + assert(regmap[hr] < 64); + if(((regs[i].isconst>>hr)&1)&®map[hr]>0) { int value=constmap[i][hr]; if(value==0) { emit_zeroreg(hr); @@ -3963,32 +4472,10 @@ void load_all_consts(signed char regmap[],int is32,u_int dirty,int i) } } } - // Load 64-bit regs - for(hr=0;hr=0&&((dirty>>hr)&1)) { - if(((regs[i].isconst>>hr)&1)&®map[hr]>64) { - if((is32>>(regmap[hr]&63))&1) { - int lr=get_reg(regmap,regmap[hr]-64); - assert(lr>=0); - emit_sarimm(lr,31,hr); - } - else - { - int value=constmap[i][hr]; - if(value==0) { - emit_zeroreg(hr); - } - else { - emit_movimm(value,hr); - } - } - } - } - } } // Write out all dirty registers (except cycle count) -void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty) +static void wb_dirtys(const signed char i_regmap[], uint64_t i_dirty) { int hr; for(hr=0;hr0) { if(i_regmap[hr]!=CCREG) { if((i_dirty>>hr)&1) { - if(i_regmap[hr]<64) { - emit_storereg(i_regmap[hr],hr); - }else{ - if( !((i_is32>>(i_regmap[hr]&63))&1) ) { - emit_storereg(i_regmap[hr],hr); - } - } + assert(i_regmap[hr]<64); + 
emit_storereg(i_regmap[hr],hr); } } } } } } + // Write out dirty registers that we need to reload (pair with load_needed_regs) // This writes the registers not written by store_regs_bt -void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr) +static void wb_needed_dirtys(const signed char i_regmap[], uint64_t i_dirty, int addr) { int hr; int t=(addr-start)>>2; @@ -4019,15 +4502,10 @@ void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,in if(hr!=EXCLUDE_REG) { if(i_regmap[hr]>0) { if(i_regmap[hr]!=CCREG) { - if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) { + if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1)) { if((i_dirty>>hr)&1) { - if(i_regmap[hr]<64) { - emit_storereg(i_regmap[hr],hr); - }else{ - if( !((i_is32>>(i_regmap[hr]&63))&1) ) { - emit_storereg(i_regmap[hr],hr); - } - } + assert(i_regmap[hr]<64); + emit_storereg(i_regmap[hr],hr); } } } @@ -4037,7 +4515,7 @@ void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,in } // Load all registers (except cycle count) -void load_all_regs(signed char i_regmap[]) +static void load_all_regs(const signed char i_regmap[]) { int hr; for(hr=0;hr=64&®s[t].regmap_entry[hr]>(regs[t].regmap_entry[hr]&63))&1) { - int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64); - if(lr<0) { - emit_loadreg(regs[t].regmap_entry[hr],hr); - } - else - { - emit_sarimm(lr,31,hr); - } - } - else - { - emit_loadreg(regs[t].regmap_entry[hr],hr); - } - } - } } // Store dirty registers prior to branch -void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr) +void store_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr) { - if(internal_branch(i_is32,addr)) + if(internal_branch(addr)) { int t=(addr-start)>>2; int hr; for(hr=0;hr0 && i_regmap[hr]!=CCREG) { - if(i_regmap[hr]!=regs[t].regmap_entry[hr] || 
!((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) { + if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1)) { if((i_dirty>>hr)&1) { - if(i_regmap[hr]<64) { - if(!((unneeded_reg[t]>>i_regmap[hr])&1)) { - emit_storereg(i_regmap[hr],hr); - if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) { - #ifdef DESTRUCTIVE_WRITEBACK - emit_sarimm(hr,31,hr); - emit_storereg(i_regmap[hr]|64,hr); - #else - emit_sarimm(hr,31,HOST_TEMPREG); - emit_storereg(i_regmap[hr]|64,HOST_TEMPREG); - #endif - } - } - }else{ - if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) { - emit_storereg(i_regmap[hr],hr); - } - } + assert(i_regmap[hr]<64); + if(!((unneeded_reg[t]>>i_regmap[hr])&1)) + emit_storereg(i_regmap[hr],hr); } } } @@ -4156,15 +4599,15 @@ void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int a else { // Branch out of this block, write out all dirty regs - wb_dirtys(i_regmap,i_is32,i_dirty); + wb_dirtys(i_regmap,i_dirty); } } // Load all needed registers for branch target -void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr) +static void load_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr) { //if(addr>=start && addr<(start+slen*4)) - if(internal_branch(i_is32,addr)) + if(internal_branch(addr)) { int t=(addr-start)>>2; int hr; @@ -4178,11 +4621,7 @@ void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int ad // Load 32-bit regs for(hr=0;hr=0&®s[t].regmap_entry[hr]>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) { - #else - if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) { - #endif + if(i_regmap[hr]!=regs[t].regmap_entry[hr]) { if(regs[t].regmap_entry[hr]==0) { emit_zeroreg(hr); } @@ -4193,37 +4632,10 @@ void load_regs_bt(signed char i_regmap[],uint64_t 
i_is32,uint64_t i_dirty,int ad } } } - //Load 64-bit regs - for(hr=0;hr=64&®s[t].regmap_entry[hr]>(regs[t].regmap_entry[hr]&63))&1) { - int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64); - if(lr<0) { - emit_loadreg(regs[t].regmap_entry[hr],hr); - } - else - { - emit_sarimm(lr,31,hr); - } - } - else - { - emit_loadreg(regs[t].regmap_entry[hr],hr); - } - } - else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) { - int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64); - assert(lr>=0); - emit_sarimm(lr,31,hr); - } - } - } } } -int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr) +static int match_bt(signed char i_regmap[],uint64_t i_dirty,int addr) { if(addr>=start && addr=64&&i_regmap[hr]>(i_regmap[hr]&63))&1)) - return 0; + assert(0); } } } @@ -4269,19 +4680,13 @@ int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr) } } } - if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1) - { - //printf("%x: is32 no match\n",addr); - return 0; - } } } } - //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0; // Delay slots are not valid branch targets - //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0; + //if(t>0&&(dops[t-1].is_jump) return 0; // Delay slots require additional processing, so do not match - if(is_ds[t]) return 0; + if(dops[t].is_ds) return 0; } else { @@ -4306,60 +4711,68 @@ int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr) return 1; } -// Used when a branch jumps into the delay slot of another branch -void ds_assemble_entry(int i) +#ifdef DRC_DBG +static void drc_dbg_emit_do_cmp(int i, int ccadj_) { - int t=(ba[i]-start)>>2; - if(!instr_addr[t]) instr_addr[t]=(u_int)out; - assem_debug("Assemble delay slot at %x\n",ba[i]); - assem_debug("<->\n"); - if(regs[t].regmap_entry[HOST_CCREG]==CCREG&®s[t].regmap[HOST_CCREG]!=CCREG) - 
wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32); - load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]); - address_generation(t,®s[t],regs[t].regmap_entry); - if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a) - load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP); - cop1_usable=0; - is_delayslot=0; - switch(itype[t]) { - case ALU: - alu_assemble(t,®s[t]);break; - case IMM16: - imm16_assemble(t,®s[t]);break; - case SHIFT: - shift_assemble(t,®s[t]);break; - case SHIFTIMM: - shiftimm_assemble(t,®s[t]);break; - case LOAD: - load_assemble(t,®s[t]);break; - case LOADLR: - loadlr_assemble(t,®s[t]);break; - case STORE: - store_assemble(t,®s[t]);break; - case STORELR: - storelr_assemble(t,®s[t]);break; - case COP0: - cop0_assemble(t,®s[t]);break; - case COP1: - cop1_assemble(t,®s[t]);break; - case C1LS: - c1ls_assemble(t,®s[t]);break; - case COP2: - cop2_assemble(t,®s[t]);break; - case C2LS: - c2ls_assemble(t,®s[t]);break; - case C2OP: - c2op_assemble(t,®s[t]);break; - case FCONV: - fconv_assemble(t,®s[t]);break; - case FLOAT: - float_assemble(t,®s[t]);break; - case FCOMP: - fcomp_assemble(t,®s[t]);break; - case MULTDIV: - multdiv_assemble(t,®s[t]);break; - case MOV: - mov_assemble(t,®s[t]);break; + extern void do_insn_cmp(); + //extern int cycle; + u_int hr, reglist = get_host_reglist(regs[i].regmap); + + assem_debug("//do_insn_cmp %08x\n", start+i*4); + save_regs(reglist); + // write out changed consts to match the interpreter + if (i > 0 && !dops[i].bt) { + for (hr = 0; hr < HOST_REGS; hr++) { + int reg = regs[i].regmap_entry[hr]; // regs[i-1].regmap[hr]; + if (hr == EXCLUDE_REG || reg < 0) + continue; + if (!((regs[i-1].isconst >> hr) & 1)) + continue; + if (i > 1 && reg == regs[i-2].regmap[hr] && constmap[i-1][hr] == constmap[i-2][hr]) + continue; + emit_movimm(constmap[i-1][hr],0); + emit_storereg(reg, 0); + } + } + emit_movimm(start+i*4,0); + 
emit_writeword(0,&pcaddr); + int cc = get_reg(regs[i].regmap_entry, CCREG); + if (cc < 0) + emit_loadreg(CCREG, cc = 0); + emit_addimm(cc, ccadj_, 0); + emit_writeword(0, &psxRegs.cycle); + emit_far_call(do_insn_cmp); + //emit_readword(&cycle,0); + //emit_addimm(0,2,0); + //emit_writeword(0,&cycle); + (void)get_reg2; + restore_regs(reglist); + assem_debug("\\\\do_insn_cmp\n"); +} +#else +#define drc_dbg_emit_do_cmp(x,y) +#endif + +// Used when a branch jumps into the delay slot of another branch +static void ds_assemble_entry(int i) +{ + int t = (ba[i] - start) >> 2; + int ccadj_ = -CLOCK_ADJUST(1); + if (!instr_addr[t]) + instr_addr[t] = out; + assem_debug("Assemble delay slot at %x\n",ba[i]); + assem_debug("<->\n"); + drc_dbg_emit_do_cmp(t, ccadj_); + if(regs[t].regmap_entry[HOST_CCREG]==CCREG&®s[t].regmap[HOST_CCREG]!=CCREG) + wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty); + load_regs(regs[t].regmap_entry,regs[t].regmap,dops[t].rs1,dops[t].rs2); + address_generation(t,®s[t],regs[t].regmap_entry); + if (ram_offset && (dops[t].is_load || dops[t].is_store)) + load_regs(regs[t].regmap_entry,regs[t].regmap,ROREG,ROREG); + if (dops[t].is_store) + load_regs(regs[t].regmap_entry,regs[t].regmap,INVCP,INVCP); + is_delayslot=0; + switch (dops[t].itype) { case SYSCALL: case HLECALL: case INTCALL: @@ -4368,137 +4781,155 @@ void ds_assemble_entry(int i) case RJUMP: case CJUMP: case SJUMP: - case FJUMP: SysPrintf("Jump in the delay slot. 
This is probably a bug.\n"); + break; + default: + assemble(t, ®s[t], ccadj_); } - store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4); - load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4); - if(internal_branch(regs[t].is32,ba[i]+4)) + store_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4); + load_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4); + if(internal_branch(ba[i]+4)) assem_debug("branch: internal\n"); else assem_debug("branch: external\n"); - assert(internal_branch(regs[t].is32,ba[i]+4)); - add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4)); + assert(internal_branch(ba[i]+4)); + add_to_linker(out,ba[i]+4,internal_branch(ba[i]+4)); emit_jmp(0); } -void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert) +static void emit_extjump(void *addr, u_int target) +{ + emit_extjump2(addr, target, dyna_linker); +} + +static void emit_extjump_ds(void *addr, u_int target) +{ + emit_extjump2(addr, target, dyna_linker_ds); +} + +// Load 2 immediates optimizing for small code size +static void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2) { - int count; - int jaddr; - int idle=0; + emit_movimm(imm1,rt1); + emit_movimm_from(imm1,rt1,imm2,rt2); +} + +static void do_cc(int i, const signed char i_regmap[], int *adj, + int addr, int taken, int invert) +{ + int count, count_plus2; + void *jaddr; + void *idle=NULL; int t=0; - if(itype[i]==RJUMP) + if(dops[i].itype==RJUMP) { *adj=0; } //if(ba[i]>=start && ba[i]<(start+slen*4)) - if(internal_branch(branch_regs[i].is32,ba[i])) + if(internal_branch(ba[i])) { t=(ba[i]-start)>>2; - if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle + if(dops[t].is_ds) *adj=-CLOCK_ADJUST(1); // Branch into delay slot adds an extra cycle else *adj=ccadj[t]; } else { *adj=0; } - count=ccadj[i]; + count = ccadj[i]; + count_plus2 = count + CLOCK_ADJUST(2); if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) { // Idle loop if(count&1) 
emit_addimm_and_set_flags(2*(count+2),HOST_CCREG); - idle=(int)out; + idle=out; //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles emit_andimm(HOST_CCREG,3,HOST_CCREG); - jaddr=(int)out; + jaddr=out; emit_jmp(0); } else if(*adj==0||invert) { - int cycles=CLOCK_ADJUST(count+2); + int cycles = count_plus2; // faster loop HACK +#if 0 if (t&&*adj) { int rel=t-i; if(-NO_CYCLE_PENALTY_THR>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) { - s1h=s2h=-1; + s2l=-1; } assert(s1l>=0); #ifdef DESTRUCTIVE_WRITEBACK - if(rs1[i]) { - if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1) - emit_loadreg(rs1[i],s1l); + if(dops[i].rs1) { + if((branch_regs[i].dirty>>s1l)&&1) + emit_loadreg(dops[i].rs1,s1l); } else { - if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1) - emit_loadreg(rs2[i],s1l); + if((branch_regs[i].dirty>>s1l)&1) + emit_loadreg(dops[i].rs2,s1l); } if(s2l>=0) - if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1) - emit_loadreg(rs2[i],s2l); + if((branch_regs[i].dirty>>s2l)&1) + emit_loadreg(dops[i].rs2,s2l); #endif int hr=0; int addr=-1,alt=-1,ntaddr=-1; while(hr=0) emit_cmp(s1l,s2l); - else emit_test(s1l,s1l); - emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr); - } - else + if(s2l>=0) emit_cmp(s1l,s2l); + else emit_test(s1l,s1l); + emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr); + #else + emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt); + if(s2l>=0) emit_cmp(s1l,s2l); + else emit_test(s1l,s1l); + emit_cmovne_reg(alt,addr); #endif - { - emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt); - if(s1h>=0) { - if(s2h>=0) emit_cmp(s1h,s2h); - else emit_test(s1h,s1h); - emit_cmovne_reg(alt,addr); - } - if(s2l>=0) emit_cmp(s1l,s2l); - else emit_test(s1l,s1l); - emit_cmovne_reg(alt,addr); - } } - if((opcode[i]&0x2f)==5) // BNE + if((dops[i].opcode&0x2f)==5) // BNE { #ifdef HAVE_CMOV_IMM - if(s1h<0) { - if(s2l>=0) emit_cmp(s1l,s2l); - else emit_test(s1l,s1l); - emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr); - } - else + if(s2l>=0) 
emit_cmp(s1l,s2l); + else emit_test(s1l,s1l); + emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr); + #else + emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt); + if(s2l>=0) emit_cmp(s1l,s2l); + else emit_test(s1l,s1l); + emit_cmovne_reg(alt,addr); #endif - { - emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt); - if(s1h>=0) { - if(s2h>=0) emit_cmp(s1h,s2h); - else emit_test(s1h,s1h); - emit_cmovne_reg(alt,addr); - } - if(s2l>=0) emit_cmp(s1l,s2l); - else emit_test(s1l,s1l); - emit_cmovne_reg(alt,addr); - } } - if((opcode[i]&0x2f)==6) // BLEZ + if((dops[i].opcode&0x2f)==6) // BLEZ { //emit_movimm(ba[i],alt); //emit_movimm(start+i*4+8,addr); emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr); emit_cmpimm(s1l,1); - if(s1h>=0) emit_mov(addr,ntaddr); emit_cmovl_reg(alt,addr); - if(s1h>=0) { - emit_test(s1h,s1h); - emit_cmovne_reg(ntaddr,addr); - emit_cmovs_reg(alt,addr); - } } - if((opcode[i]&0x2f)==7) // BGTZ + if((dops[i].opcode&0x2f)==7) // BGTZ { //emit_movimm(ba[i],addr); //emit_movimm(start+i*4+8,ntaddr); emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr); emit_cmpimm(s1l,1); - if(s1h>=0) emit_mov(addr,alt); emit_cmovl_reg(ntaddr,addr); - if(s1h>=0) { - emit_test(s1h,s1h); - emit_cmovne_reg(alt,addr); - emit_cmovs_reg(ntaddr,addr); - } } - if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ + if((dops[i].opcode==1)&&(dops[i].opcode2&0x2D)==0) // BLTZ { //emit_movimm(ba[i],alt); //emit_movimm(start+i*4+8,addr); emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr); - if(s1h>=0) emit_test(s1h,s1h); - else emit_test(s1l,s1l); + emit_test(s1l,s1l); emit_cmovs_reg(alt,addr); } - if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ + if((dops[i].opcode==1)&&(dops[i].opcode2&0x2D)==1) // BGEZ { //emit_movimm(ba[i],addr); //emit_movimm(start+i*4+8,alt); emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt); - if(s1h>=0) emit_test(s1h,s1h); - else emit_test(s1l,s1l); + emit_test(s1l,s1l); emit_cmovs_reg(alt,addr); } - if(opcode[i]==0x11 && opcode2[i]==0x08 ) { + if(dops[i].opcode==0x11 && 
dops[i].opcode2==0x08 ) { if(source[i]&0x10000) // BC1T { //emit_movimm(ba[i],alt); @@ -4636,79 +5035,55 @@ void do_ccstub(int n) emit_cmovne_reg(alt,addr); } } - emit_writeword(addr,(int)&pcaddr); + emit_writeword(addr,&pcaddr); } else - if(itype[i]==RJUMP) + if(dops[i].itype==RJUMP) { - int r=get_reg(branch_regs[i].regmap,rs1[i]); - if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) { + int r=get_reg(branch_regs[i].regmap,dops[i].rs1); + if (ds_writes_rjump_rs(i)) { r=get_reg(branch_regs[i].regmap,RTEMP); } - emit_writeword(r,(int)&pcaddr); + emit_writeword(r,&pcaddr); } - else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);} + else {SysPrintf("Unknown branch type in do_ccstub\n");abort();} } // Update cycle count assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1); - if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG); - emit_call((int)cc_interrupt); - if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG); - if(stubs[n][6]==TAKEN) { - if(internal_branch(branch_regs[i].is32,ba[i])) + if(stubs[n].a) emit_addimm(HOST_CCREG,(int)stubs[n].a,HOST_CCREG); + emit_far_call(cc_interrupt); + if(stubs[n].a) emit_addimm(HOST_CCREG,-(int)stubs[n].a,HOST_CCREG); + if(stubs[n].d==TAKEN) { + if(internal_branch(ba[i])) load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry); - else if(itype[i]==RJUMP) { + else if(dops[i].itype==RJUMP) { if(get_reg(branch_regs[i].regmap,RTEMP)>=0) - emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP)); + emit_readword(&pcaddr,get_reg(branch_regs[i].regmap,RTEMP)); else - emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i])); + emit_loadreg(dops[i].rs1,get_reg(branch_regs[i].regmap,dops[i].rs1)); } - }else if(stubs[n][6]==NOTTAKEN) { + }else if(stubs[n].d==NOTTAKEN) { if(i=0) { #ifdef USE_MINI_HT - if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) { + 
if(internal_branch(return_address)&&dops[i+1].rt1!=31) { int temp=-1; // note: must be ds-safe #ifdef HOST_TEMPREG temp=HOST_TEMPREG; @@ -4736,65 +5111,62 @@ static void ujump_assemble_write_ra(int i) #ifdef REG_PREFETCH if(temp>=0) { - if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp); + if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp); } #endif emit_movimm(return_address,rt); // PC into link register #ifdef IMM_PREFETCH - emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]); + emit_prefetch(hash_table_get(return_address)); #endif } } } -void ujump_assemble(int i,struct regstat *i_regs) +static void ujump_assemble(int i, const struct regstat *i_regs) { int ra_done=0; if(i==(ba[i]-start)>>2) assem_debug("idle loop\n"); address_generation(i+1,i_regs,regs[i].regmap_entry); #ifdef REG_PREFETCH int temp=get_reg(branch_regs[i].regmap,PTEMP); - if(rt1[i]==31&&temp>=0) + if(dops[i].rt1==31&&temp>=0) { signed char *i_regmap=i_regs->regmap; int return_address=start+i*4+8; if(get_reg(branch_regs[i].regmap,31)>0) - if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp); + if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp); } #endif - if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) { + if(dops[i].rt1==31&&(dops[i].rt1==dops[i+1].rs1||dops[i].rt1==dops[i+1].rs2)) { ujump_assemble_write_ra(i); // writeback ra for DS ra_done=1; } ds_assemble(i+1,i_regs); uint64_t bc_unneeded=branch_regs[i].u; - uint64_t bc_unneeded_upper=branch_regs[i].uu; - bc_unneeded|=1|(1LL<=0) emit_prefetchreg(temp); + if(dops[i].rt1==31&&temp>=0) emit_prefetchreg(temp); #endif do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0); - if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc); - load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); - 
if(internal_branch(branch_regs[i].is32,ba[i])) + if(adj) emit_addimm(cc, ccadj[i] + CLOCK_ADJUST(2) - adj, cc); + load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]); + if(internal_branch(ba[i])) assem_debug("branch: internal\n"); else assem_debug("branch: external\n"); - if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) { + if (internal_branch(ba[i]) && dops[(ba[i]-start)>>2].is_ds) { ds_assemble_entry(i); } else { - add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i])); + add_to_linker(out,ba[i],internal_branch(ba[i])); emit_jmp(0); } } @@ -4802,32 +5174,32 @@ void ujump_assemble(int i,struct regstat *i_regs) static void rjump_assemble_write_ra(int i) { int rt,return_address; - assert(rt1[i+1]!=rt1[i]); - assert(rt2[i+1]!=rt1[i]); - rt=get_reg(branch_regs[i].regmap,rt1[i]); + assert(dops[i+1].rt1!=dops[i].rt1); + assert(dops[i+1].rt2!=dops[i].rt1); + rt=get_reg(branch_regs[i].regmap,dops[i].rt1); assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]); assert(rt>=0); return_address=start+i*4+8; #ifdef REG_PREFETCH if(temp>=0) { - if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp); + if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp); } #endif emit_movimm(return_address,rt); // PC into link register #ifdef IMM_PREFETCH - emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]); + emit_prefetch(hash_table_get(return_address)); #endif } -void rjump_assemble(int i,struct regstat *i_regs) +static void rjump_assemble(int i, const struct regstat *i_regs) { int temp; int rs,cc; int ra_done=0; - rs=get_reg(branch_regs[i].regmap,rs1[i]); + rs=get_reg(branch_regs[i].regmap,dops[i].rs1); assert(rs>=0); - 
if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) { + if (ds_writes_rjump_rs(i)) { // Delay slot abuse, make a copy of the branch address register temp=get_reg(branch_regs[i].regmap,RTEMP); assert(temp>=0); @@ -4837,35 +5209,32 @@ void rjump_assemble(int i,struct regstat *i_regs) } address_generation(i+1,i_regs,regs[i].regmap_entry); #ifdef REG_PREFETCH - if(rt1[i]==31) + if(dops[i].rt1==31) { if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) { signed char *i_regmap=i_regs->regmap; int return_address=start+i*4+8; - if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp); + if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp); } } #endif #ifdef USE_MINI_HT - if(rs1[i]==31) { + if(dops[i].rs1==31) { int rh=get_reg(regs[i].regmap,RHASH); if(rh>=0) do_preload_rhash(rh); } #endif - if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) { + if(dops[i].rt1!=0&&(dops[i].rt1==dops[i+1].rs1||dops[i].rt1==dops[i+1].rs2)) { rjump_assemble_write_ra(i); ra_done=1; } ds_assemble(i+1,i_regs); uint64_t bc_unneeded=branch_regs[i].u; - uint64_t bc_unneeded_upper=branch_regs[i].uu; - bc_unneeded|=1|(1LL<>rs)&(branch_regs[i].is32>>rs1[i])&1) { - if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) { - emit_loadreg(rs1[i],rs); + if((branch_regs[i].dirty>>rs)&1) { + if(dops[i].rs1!=dops[i+1].rt1&&dops[i].rs1!=dops[i+1].rt2) { + emit_loadreg(dops[i].rs1,rs); } } #endif #ifdef REG_PREFETCH - if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp); + if(dops[i].rt1==31&&temp>=0) emit_prefetchreg(temp); #endif #ifdef USE_MINI_HT - if(rs1[i]==31) { + if(dops[i].rs1==31) { do_miniht_load(ht,rh); } #endif //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN); //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? 
- Shouldn't happen //assert(adj==0); - emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG); - add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0); - if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10) + emit_addimm_and_set_flags(ccadj[i] + CLOCK_ADJUST(2), HOST_CCREG); + add_stub(CC_STUB,out,NULL,0,i,-1,TAKEN,rs); + if(dops[i+1].itype==COP0&&(source[i+1]&0x3f)==0x10) // special case for RFE emit_jmp(0); else emit_jns(0); - //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1); + //load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1); #ifdef USE_MINI_HT - if(rs1[i]==31) { + if(dops[i].rs1==31) { do_miniht_jump(rs,rh,ht); } else #endif { - //if(rs!=EAX) emit_mov(rs,EAX); - //emit_jmp((int)jump_vaddr_eax); - emit_jmp(jump_vaddr_reg[rs]); - } - /* Check hash table - temp=!rs; - emit_mov(rs,temp); - emit_shrimm(rs,16,rs); - emit_xor(temp,rs,rs); - emit_movzwl_reg(rs,rs); - emit_shlimm(rs,4,rs); - emit_cmpmem_indexed((int)hash_table,rs,temp); - emit_jne((int)out+14); - emit_readword_indexed((int)hash_table+4,rs,rs); - emit_jmpreg(rs); - emit_cmpmem_indexed((int)hash_table+8,rs,temp); - emit_addimm_no_flags(8,rs); - emit_jeq((int)out-17); - // No hit on hash table, call compiler - emit_pushreg(temp); -//DEBUG > -#ifdef DEBUG_CYCLE_COUNT - emit_readword((int)&last_count,ECX); - emit_add(HOST_CCREG,ECX,HOST_CCREG); - emit_readword((int)&next_interupt,ECX); - emit_writeword(HOST_CCREG,(int)&Count); - emit_sub(HOST_CCREG,ECX,HOST_CCREG); - emit_writeword(ECX,(int)&last_count); -#endif -//DEBUG < - emit_storereg(CCREG,HOST_CCREG); - emit_call((int)get_addr); - emit_loadreg(CCREG,HOST_CCREG); - emit_addimm(ESP,4,ESP); - emit_jmpreg(EAX);*/ + do_jump_vaddr(rs); + } #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK - if(rt1[i]!=31&&iregmap; + const signed char *i_regmap = i_regs->regmap; int cc; int match; - match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); + 
match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]); assem_debug("match=%d\n",match); - int s1h,s1l,s2h,s2l; - int prev_cop1_usable=cop1_usable; + int s1l,s2l; int unconditional=0,nop=0; - int only32=0; int invert=0; - int internal=internal_branch(branch_regs[i].is32,ba[i]); + int internal=internal_branch(ba[i]); if(i==(ba[i]-start)>>2) assem_debug("idle loop\n"); if(!match) invert=1; #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(i>(ba[i]-start)>>2) invert=1; #endif + #ifdef __aarch64__ + invert=1; // because of near cond. branches + #endif - if(ooo[i]) { - s1l=get_reg(branch_regs[i].regmap,rs1[i]); - s1h=get_reg(branch_regs[i].regmap,rs1[i]|64); - s2l=get_reg(branch_regs[i].regmap,rs2[i]); - s2h=get_reg(branch_regs[i].regmap,rs2[i]|64); + if(dops[i].ooo) { + s1l=get_reg(branch_regs[i].regmap,dops[i].rs1); + s2l=get_reg(branch_regs[i].regmap,dops[i].rs2); } else { - s1l=get_reg(i_regmap,rs1[i]); - s1h=get_reg(i_regmap,rs1[i]|64); - s2l=get_reg(i_regmap,rs2[i]); - s2h=get_reg(i_regmap,rs2[i]|64); + s1l=get_reg(i_regmap,dops[i].rs1); + s2l=get_reg(i_regmap,dops[i].rs2); } - if(rs1[i]==0&&rs2[i]==0) + if(dops[i].rs1==0&&dops[i].rs2==0) { - if(opcode[i]&1) nop=1; + if(dops[i].opcode&1) nop=1; else unconditional=1; - //assert(opcode[i]!=5); - //assert(opcode[i]!=7); - //assert(opcode[i]!=0x15); - //assert(opcode[i]!=0x17); + //assert(dops[i].opcode!=5); + //assert(dops[i].opcode!=7); + //assert(dops[i].opcode!=0x15); + //assert(dops[i].opcode!=0x17); } - else if(rs1[i]==0) + else if(dops[i].rs1==0) { - s1l=s2l;s1h=s2h; - s2l=s2h=-1; - only32=(regs[i].was32>>rs2[i])&1; + s1l=s2l; + s2l=-1; } - else if(rs2[i]==0) + else if(dops[i].rs2==0) { - s2l=s2h=-1; - only32=(regs[i].was32>>rs1[i])&1; - } - else { - only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1; + s2l=-1; } - if(ooo[i]) { + if(dops[i].ooo) { // Out of order execution (delay slot first) //printf("OOOE\n"); address_generation(i+1,i_regs,regs[i].regmap_entry); ds_assemble(i+1,i_regs); int adj; 
uint64_t bc_unneeded=branch_regs[i].u; - uint64_t bc_unneeded_upper=branch_regs[i].uu; - bc_unneeded&=~((1LL<>2 || source[i+1]!=0) { - if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc); - load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); + if(adj) emit_addimm(cc, ccadj[i] + CLOCK_ADJUST(2) - adj, cc); + load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]); if(internal) assem_debug("branch: internal\n"); else assem_debug("branch: external\n"); - if(internal&&is_ds[(ba[i]-start)>>2]) { + if (internal && dops[(ba[i]-start)>>2].is_ds) { ds_assemble_entry(i); } else { - add_to_linker((int)out,ba[i],internal); + add_to_linker(out,ba[i],internal); emit_jmp(0); } #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK @@ -5052,230 +5376,152 @@ void cjump_assemble(int i,struct regstat *i_regs) } } else if(nop) { - emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc); - int jaddr=(int)out; + emit_addimm_and_set_flags(ccadj[i] + CLOCK_ADJUST(2), cc); + void *jaddr=out; emit_jns(0); - add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0); + add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0); } else { - int taken=0,nottaken=0,nottaken1=0; + void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL; do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert); - if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc); - if(!only32) - { - assert(s1h>=0); - if(opcode[i]==4) // BEQ - { - if(s2h>=0) emit_cmp(s1h,s2h); - else emit_test(s1h,s1h); - nottaken1=(int)out; - emit_jne(1); - } - if(opcode[i]==5) // BNE - { - if(s2h>=0) emit_cmp(s1h,s2h); - else emit_test(s1h,s1h); - if(invert) taken=(int)out; - else add_to_linker((int)out,ba[i],internal); - emit_jne(0); - } - if(opcode[i]==6) // BLEZ - { - emit_test(s1h,s1h); - if(invert) taken=(int)out; - else add_to_linker((int)out,ba[i],internal); - emit_js(0); - nottaken1=(int)out; - emit_jne(1); - } - if(opcode[i]==7) // BGTZ - { - emit_test(s1h,s1h); - nottaken1=(int)out; - emit_js(1); - 
if(invert) taken=(int)out; - else add_to_linker((int)out,ba[i],internal); - emit_jne(0); - } - } // if(!only32) + if(adj&&!invert) emit_addimm(cc, ccadj[i] + CLOCK_ADJUST(2) - adj, cc); //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]); assert(s1l>=0); - if(opcode[i]==4) // BEQ + if(dops[i].opcode==4) // BEQ { if(s2l>=0) emit_cmp(s1l,s2l); else emit_test(s1l,s1l); if(invert){ - nottaken=(int)out; - emit_jne(1); + nottaken=out; + emit_jne(DJT_1); }else{ - add_to_linker((int)out,ba[i],internal); + add_to_linker(out,ba[i],internal); emit_jeq(0); } } - if(opcode[i]==5) // BNE + if(dops[i].opcode==5) // BNE { if(s2l>=0) emit_cmp(s1l,s2l); else emit_test(s1l,s1l); if(invert){ - nottaken=(int)out; - emit_jeq(1); + nottaken=out; + emit_jeq(DJT_1); }else{ - add_to_linker((int)out,ba[i],internal); + add_to_linker(out,ba[i],internal); emit_jne(0); } } - if(opcode[i]==6) // BLEZ + if(dops[i].opcode==6) // BLEZ { emit_cmpimm(s1l,1); if(invert){ - nottaken=(int)out; - emit_jge(1); + nottaken=out; + emit_jge(DJT_1); }else{ - add_to_linker((int)out,ba[i],internal); + add_to_linker(out,ba[i],internal); emit_jl(0); } } - if(opcode[i]==7) // BGTZ + if(dops[i].opcode==7) // BGTZ { emit_cmpimm(s1l,1); if(invert){ - nottaken=(int)out; - emit_jl(1); + nottaken=out; + emit_jl(DJT_1); }else{ - add_to_linker((int)out,ba[i],internal); + add_to_linker(out,ba[i],internal); emit_jge(0); } } if(invert) { - if(taken) set_jump_target(taken,(int)out); + if(taken) set_jump_target(taken, out); #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK - if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) { + if (match && (!internal || !dops[(ba[i]-start)>>2].is_ds)) { if(adj) { - emit_addimm(cc,-CLOCK_ADJUST(adj),cc); - add_to_linker((int)out,ba[i],internal); + emit_addimm(cc,-adj,cc); + add_to_linker(out,ba[i],internal); 
}else{ emit_addnop(13); - add_to_linker((int)out,ba[i],internal*2); + add_to_linker(out,ba[i],internal*2); } emit_jmp(0); }else #endif { - if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc); - store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); - load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); + if(adj) emit_addimm(cc,-adj,cc); + store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]); + load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]); if(internal) assem_debug("branch: internal\n"); else assem_debug("branch: external\n"); - if(internal&&is_ds[(ba[i]-start)>>2]) { + if (internal && dops[(ba[i] - start) >> 2].is_ds) { ds_assemble_entry(i); } else { - add_to_linker((int)out,ba[i],internal); + add_to_linker(out,ba[i],internal); emit_jmp(0); } } - set_jump_target(nottaken,(int)out); + set_jump_target(nottaken, out); } - if(nottaken1) set_jump_target(nottaken1,(int)out); + if(nottaken1) set_jump_target(nottaken1, out); if(adj) { - if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc); + if(!invert) emit_addimm(cc,adj,cc); } } // (!unconditional) } // if(ooo) else { // In-order execution (branch first) - //if(likely[i]) printf("IOL\n"); - //else - //printf("IOE\n"); - int taken=0,nottaken=0,nottaken1=0; + void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL; if(!unconditional&&!nop) { - if(!only32) - { - assert(s1h>=0); - if((opcode[i]&0x2f)==4) // BEQ - { - if(s2h>=0) emit_cmp(s1h,s2h); - else emit_test(s1h,s1h); - nottaken1=(int)out; - emit_jne(2); - } - if((opcode[i]&0x2f)==5) // BNE - { - if(s2h>=0) emit_cmp(s1h,s2h); - else emit_test(s1h,s1h); - taken=(int)out; - emit_jne(1); - } - if((opcode[i]&0x2f)==6) // BLEZ - { - emit_test(s1h,s1h); - taken=(int)out; - emit_js(1); - nottaken1=(int)out; - emit_jne(2); - } - if((opcode[i]&0x2f)==7) // BGTZ - { - emit_test(s1h,s1h); - nottaken1=(int)out; - emit_js(2); - taken=(int)out; - emit_jne(1); - } - } // if(!only32) - //printf("branch(%d): 
eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]); assert(s1l>=0); - if((opcode[i]&0x2f)==4) // BEQ + if((dops[i].opcode&0x2f)==4) // BEQ { if(s2l>=0) emit_cmp(s1l,s2l); else emit_test(s1l,s1l); - nottaken=(int)out; - emit_jne(2); + nottaken=out; + emit_jne(DJT_2); } - if((opcode[i]&0x2f)==5) // BNE + if((dops[i].opcode&0x2f)==5) // BNE { if(s2l>=0) emit_cmp(s1l,s2l); else emit_test(s1l,s1l); - nottaken=(int)out; - emit_jeq(2); + nottaken=out; + emit_jeq(DJT_2); } - if((opcode[i]&0x2f)==6) // BLEZ + if((dops[i].opcode&0x2f)==6) // BLEZ { emit_cmpimm(s1l,1); - nottaken=(int)out; - emit_jge(2); + nottaken=out; + emit_jge(DJT_2); } - if((opcode[i]&0x2f)==7) // BGTZ + if((dops[i].opcode&0x2f)==7) // BGTZ { emit_cmpimm(s1l,1); - nottaken=(int)out; - emit_jl(2); + nottaken=out; + emit_jl(DJT_2); } } // if(!unconditional) int adj; uint64_t ds_unneeded=branch_regs[i].u; - uint64_t ds_unneeded_upper=branch_regs[i].uu; - ds_unneeded&=~((1LL<>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<>2]) { + if (internal && dops[(ba[i] - start) >> 2].is_ds) { ds_assemble_entry(i); } else { - add_to_linker((int)out,ba[i],internal); + add_to_linker(out,ba[i],internal); emit_jmp(0); } } // branch not taken - cop1_usable=prev_cop1_usable; if(!unconditional) { - if(nottaken1) set_jump_target(nottaken1,(int)out); - set_jump_target(nottaken,(int)out); + if(nottaken1) set_jump_target(nottaken1, out); + set_jump_target(nottaken, out); assem_debug("2:\n"); - if(!likely[i]) { - wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32, - ds_unneeded,ds_unneeded_upper); - load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]); - address_generation(i+1,&branch_regs[i],0); - load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG); - ds_assemble(i+1,&branch_regs[i]); - } 
+ wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded); + // load regs + load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2); + address_generation(i+1,&branch_regs[i],0); + if (ram_offset) + load_regs(regs[i].regmap,branch_regs[i].regmap,ROREG,ROREG); + load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP); + ds_assemble(i+1,&branch_regs[i]); cc=get_reg(branch_regs[i].regmap,CCREG); - if(cc==-1&&!likely[i]) { + if (cc == -1) { // Cycle count isn't in a register, temporarily load it then write it out emit_loadreg(CCREG,HOST_CCREG); - emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG); - int jaddr=(int)out; + emit_addimm_and_set_flags(ccadj[i] + CLOCK_ADJUST(2), HOST_CCREG); + void *jaddr=out; emit_jns(0); - add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0); + add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0); emit_storereg(CCREG,HOST_CCREG); } else{ cc=get_reg(i_regmap,CCREG); assert(cc==HOST_CCREG); - emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc); - int jaddr=(int)out; + emit_addimm_and_set_flags(ccadj[i] + CLOCK_ADJUST(2), cc); + void *jaddr=out; emit_jns(0); - add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0); + add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0); } } } } -void sjump_assemble(int i,struct regstat *i_regs) +static void sjump_assemble(int i, const struct regstat *i_regs) { - signed char *i_regmap=i_regs->regmap; + const signed char *i_regmap = i_regs->regmap; int cc; int match; - match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); + match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]); assem_debug("smatch=%d\n",match); - int s1h,s1l; - int prev_cop1_usable=cop1_usable; + int s1l; int unconditional=0,nevertaken=0; - int only32=0; int invert=0; - int internal=internal_branch(branch_regs[i].is32,ba[i]); + int internal=internal_branch(ba[i]); if(i==(ba[i]-start)>>2) assem_debug("idle 
loop\n"); if(!match) invert=1; #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(i>(ba[i]-start)>>2) invert=1; #endif + #ifdef __aarch64__ + invert=1; // because of near cond. branches + #endif - //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL) - //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL) + //if(dops[i].opcode2>=0x10) return; // FIXME (BxxZAL) + //assert(dops[i].opcode2<0x10||dops[i].rs1==0); // FIXME (BxxZAL) - if(ooo[i]) { - s1l=get_reg(branch_regs[i].regmap,rs1[i]); - s1h=get_reg(branch_regs[i].regmap,rs1[i]|64); + if(dops[i].ooo) { + s1l=get_reg(branch_regs[i].regmap,dops[i].rs1); } else { - s1l=get_reg(i_regmap,rs1[i]); - s1h=get_reg(i_regmap,rs1[i]|64); + s1l=get_reg(i_regmap,dops[i].rs1); } - if(rs1[i]==0) + if(dops[i].rs1==0) { - if(opcode2[i]&1) unconditional=1; + if(dops[i].opcode2&1) unconditional=1; else nevertaken=1; // These are never taken (r0 is never less than zero) - //assert(opcode2[i]!=0); - //assert(opcode2[i]!=2); - //assert(opcode2[i]!=0x10); - //assert(opcode2[i]!=0x12); - } - else { - only32=(regs[i].was32>>rs1[i])&1; + //assert(dops[i].opcode2!=0); + //assert(dops[i].opcode2!=2); + //assert(dops[i].opcode2!=0x10); + //assert(dops[i].opcode2!=0x12); } - if(ooo[i]) { + if(dops[i].ooo) { // Out of order execution (delay slot first) //printf("OOOE\n"); address_generation(i+1,i_regs,regs[i].regmap_entry); ds_assemble(i+1,i_regs); int adj; uint64_t bc_unneeded=branch_regs[i].u; - uint64_t bc_unneeded_upper=branch_regs[i].uu; - bc_unneeded&=~((1LL<>16)^return_address)&0xFFFF]); + if(!nevertaken) emit_prefetch(hash_table_get(return_address)); #endif } } cc=get_reg(branch_regs[i].regmap,CCREG); assert(cc==HOST_CCREG); if(unconditional) - store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); + store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]); //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional); assem_debug("cycle count (adj)\n"); if(unconditional) { 
do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0); if(i!=(ba[i]-start)>>2 || source[i+1]!=0) { - if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc); - load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); + if(adj) emit_addimm(cc, ccadj[i] + CLOCK_ADJUST(2) - adj, cc); + load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]); if(internal) assem_debug("branch: internal\n"); else assem_debug("branch: external\n"); - if(internal&&is_ds[(ba[i]-start)>>2]) { + if (internal && dops[(ba[i] - start) >> 2].is_ds) { ds_assemble_entry(i); } else { - add_to_linker((int)out,ba[i],internal); + add_to_linker(out,ba[i],internal); emit_jmp(0); } #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK @@ -5437,339 +5674,75 @@ void sjump_assemble(int i,struct regstat *i_regs) } } else if(nevertaken) { - emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc); - int jaddr=(int)out; + emit_addimm_and_set_flags(ccadj[i] + CLOCK_ADJUST(2), cc); + void *jaddr=out; emit_jns(0); - add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0); + add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0); } else { - int nottaken=0; + void *nottaken = NULL; do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert); - if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc); - if(!only32) - { - assert(s1h>=0); - if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL - { - emit_test(s1h,s1h); - if(invert){ - nottaken=(int)out; - emit_jns(1); - }else{ - add_to_linker((int)out,ba[i],internal); - emit_js(0); - } - } - if((opcode2[i]&0xf)==1) // BGEZ/BLTZAL - { - emit_test(s1h,s1h); - if(invert){ - nottaken=(int)out; - emit_js(1); - }else{ - add_to_linker((int)out,ba[i],internal); - emit_jns(0); - } - } - } // if(!only32) - else + if(adj&&!invert) emit_addimm(cc, ccadj[i] + CLOCK_ADJUST(2) - adj, cc); { assert(s1l>=0); - if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL + if((dops[i].opcode2&0xf)==0) // BLTZ/BLTZAL { emit_test(s1l,s1l); if(invert){ - nottaken=(int)out; - emit_jns(1); + nottaken=out; + 
emit_jns(DJT_1); }else{ - add_to_linker((int)out,ba[i],internal); + add_to_linker(out,ba[i],internal); emit_js(0); } } - if((opcode2[i]&0xf)==1) // BGEZ/BLTZAL + if((dops[i].opcode2&0xf)==1) // BGEZ/BLTZAL { emit_test(s1l,s1l); if(invert){ - nottaken=(int)out; - emit_js(1); + nottaken=out; + emit_js(DJT_1); }else{ - add_to_linker((int)out,ba[i],internal); + add_to_linker(out,ba[i],internal); emit_jns(0); } } - } // if(!only32) - - if(invert) { - #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK - if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) { - if(adj) { - emit_addimm(cc,-CLOCK_ADJUST(adj),cc); - add_to_linker((int)out,ba[i],internal); - }else{ - emit_addnop(13); - add_to_linker((int)out,ba[i],internal*2); - } - emit_jmp(0); - }else - #endif - { - if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc); - store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); - load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); - if(internal) - assem_debug("branch: internal\n"); - else - assem_debug("branch: external\n"); - if(internal&&is_ds[(ba[i]-start)>>2]) { - ds_assemble_entry(i); - } - else { - add_to_linker((int)out,ba[i],internal); - emit_jmp(0); - } - } - set_jump_target(nottaken,(int)out); - } - - if(adj) { - if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc); - } - } // (!unconditional) - } // if(ooo) - else - { - // In-order execution (branch first) - //printf("IOE\n"); - int nottaken=0; - if(rt1[i]==31) { - int rt,return_address; - rt=get_reg(branch_regs[i].regmap,31); - if(rt>=0) { - // Save the PC even if the branch is not taken - return_address=start+i*4+8; - emit_movimm(return_address,rt); // PC into link register - #ifdef IMM_PREFETCH - emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]); - #endif - } - } - if(!unconditional) { - //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d 
edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]); - if(!only32) - { - assert(s1h>=0); - if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL - { - emit_test(s1h,s1h); - nottaken=(int)out; - emit_jns(1); - } - if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL - { - emit_test(s1h,s1h); - nottaken=(int)out; - emit_js(1); - } - } // if(!only32) - else - { - assert(s1l>=0); - if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL - { - emit_test(s1l,s1l); - nottaken=(int)out; - emit_jns(1); - } - if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL - { - emit_test(s1l,s1l); - nottaken=(int)out; - emit_js(1); - } - } - } // if(!unconditional) - int adj; - uint64_t ds_unneeded=branch_regs[i].u; - uint64_t ds_unneeded_upper=branch_regs[i].uu; - ds_unneeded&=~((1LL<>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<>2]) { - ds_assemble_entry(i); - } - else { - add_to_linker((int)out,ba[i],internal); - emit_jmp(0); - } - } - // branch not taken - cop1_usable=prev_cop1_usable; - if(!unconditional) { - set_jump_target(nottaken,(int)out); - assem_debug("1:\n"); - if(!likely[i]) { - wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32, - ds_unneeded,ds_unneeded_upper); - load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]); - address_generation(i+1,&branch_regs[i],0); - load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG); - ds_assemble(i+1,&branch_regs[i]); - } - cc=get_reg(branch_regs[i].regmap,CCREG); - if(cc==-1&&!likely[i]) { - // Cycle count isn't in a register, temporarily load it then write it out - emit_loadreg(CCREG,HOST_CCREG); - emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG); - int jaddr=(int)out; - emit_jns(0); - add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0); - emit_storereg(CCREG,HOST_CCREG); - } - else{ - cc=get_reg(i_regmap,CCREG); 
- assert(cc==HOST_CCREG); - emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc); - int jaddr=(int)out; - emit_jns(0); - add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0); } - } - } -} - -void fjump_assemble(int i,struct regstat *i_regs) -{ - signed char *i_regmap=i_regs->regmap; - int cc; - int match; - match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); - assem_debug("fmatch=%d\n",match); - int fs,cs; - int eaddr; - int invert=0; - int internal=internal_branch(branch_regs[i].is32,ba[i]); - if(i==(ba[i]-start)>>2) assem_debug("idle loop\n"); - if(!match) invert=1; - #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK - if(i>(ba[i]-start)>>2) invert=1; - #endif - - if(ooo[i]) { - fs=get_reg(branch_regs[i].regmap,FSREG); - address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay? - } - else { - fs=get_reg(i_regmap,FSREG); - } - - // Check cop1 unusable - if(!cop1_usable) { - cs=get_reg(i_regmap,CSREG); - assert(cs>=0); - emit_testimm(cs,0x20000000); - eaddr=(int)out; - emit_jeq(0); - add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0); - cop1_usable=1; - } - - if(ooo[i]) { - // Out of order execution (delay slot first) - //printf("OOOE\n"); - ds_assemble(i+1,i_regs); - int adj; - uint64_t bc_unneeded=branch_regs[i].u; - uint64_t bc_unneeded_upper=branch_regs[i].uu; - bc_unneeded&=~((1LL<=0); - emit_testimm(fs,0x800000); - if(source[i]&0x10000) // BC1T - { - if(invert){ - nottaken=(int)out; - emit_jeq(1); - }else{ - add_to_linker((int)out,ba[i],internal); - emit_jne(0); - } - } - else // BC1F - if(invert){ - nottaken=(int)out; - emit_jne(1); - }else{ - add_to_linker((int)out,ba[i],internal); - emit_jeq(0); - } - { - } - } // if(!only32) if(invert) { - if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc); #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK - else if(match) emit_addnop(13); - #endif - store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); - 
load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); - if(internal) - assem_debug("branch: internal\n"); - else - assem_debug("branch: external\n"); - if(internal&&is_ds[(ba[i]-start)>>2]) { - ds_assemble_entry(i); - } - else { - add_to_linker((int)out,ba[i],internal); + if (match && (!internal || !dops[(ba[i] - start) >> 2].is_ds)) { + if(adj) { + emit_addimm(cc,-adj,cc); + add_to_linker(out,ba[i],internal); + }else{ + emit_addnop(13); + add_to_linker(out,ba[i],internal*2); + } emit_jmp(0); + }else + #endif + { + if(adj) emit_addimm(cc,-adj,cc); + store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]); + load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]); + if(internal) + assem_debug("branch: internal\n"); + else + assem_debug("branch: external\n"); + if (internal && dops[(ba[i] - start) >> 2].is_ds) { + ds_assemble_entry(i); + } + else { + add_to_linker(out,ba[i],internal); + emit_jmp(0); + } } - set_jump_target(nottaken,(int)out); + set_jump_target(nottaken, out); } if(adj) { - if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc); + if(!invert) emit_addimm(cc,adj,cc); } } // (!unconditional) } // if(ooo) @@ -5777,118 +5750,119 @@ void fjump_assemble(int i,struct regstat *i_regs) { // In-order execution (branch first) //printf("IOE\n"); - int nottaken=0; - if(1) { + void *nottaken = NULL; + if(dops[i].rt1==31) { + int rt,return_address; + rt=get_reg(branch_regs[i].regmap,31); + if(rt>=0) { + // Save the PC even if the branch is not taken + return_address=start+i*4+8; + emit_movimm(return_address,rt); // PC into link register + #ifdef IMM_PREFETCH + emit_prefetch(hash_table_get(return_address)); + #endif + } + } + if(!unconditional) { //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]); - if(1) { - assert(fs>=0); - 
emit_testimm(fs,0x800000); - if(source[i]&0x10000) // BC1T + assert(s1l>=0); + if((dops[i].opcode2&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL { - nottaken=(int)out; - emit_jeq(1); + emit_test(s1l,s1l); + nottaken=out; + emit_jns(DJT_1); } - else // BC1F + if((dops[i].opcode2&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL { - nottaken=(int)out; - emit_jne(1); + emit_test(s1l,s1l); + nottaken=out; + emit_js(DJT_1); } - } } // if(!unconditional) int adj; uint64_t ds_unneeded=branch_regs[i].u; - uint64_t ds_unneeded_upper=branch_regs[i].uu; - ds_unneeded&=~((1LL<>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<>2]) { - ds_assemble_entry(i); - } - else { - add_to_linker((int)out,ba[i],internal); - emit_jmp(0); + if(!nevertaken) { + //assem_debug("1:\n"); + wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded); + // load regs + load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2); + address_generation(i+1,&branch_regs[i],0); + if (ram_offset) + load_regs(regs[i].regmap,branch_regs[i].regmap,ROREG,ROREG); + load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP); + ds_assemble(i+1,&branch_regs[i]); + cc=get_reg(branch_regs[i].regmap,CCREG); + if(cc==-1) { + emit_loadreg(CCREG,cc=HOST_CCREG); + // CHECK: Is the following instruction (fall thru) allocated ok? 
+ } + assert(cc==HOST_CCREG); + store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]); + do_cc(i,i_regmap,&adj,ba[i],TAKEN,0); + assem_debug("cycle count (adj)\n"); + if(adj) emit_addimm(cc, ccadj[i] + CLOCK_ADJUST(2) - adj, cc); + load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]); + if(internal) + assem_debug("branch: internal\n"); + else + assem_debug("branch: external\n"); + if (internal && dops[(ba[i] - start) >> 2].is_ds) { + ds_assemble_entry(i); + } + else { + add_to_linker(out,ba[i],internal); + emit_jmp(0); + } } - // branch not taken - if(1) { // <- FIXME (don't need this) - set_jump_target(nottaken,(int)out); + if(!unconditional) { + set_jump_target(nottaken, out); assem_debug("1:\n"); - if(!likely[i]) { - wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32, - ds_unneeded,ds_unneeded_upper); - load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]); - address_generation(i+1,&branch_regs[i],0); - load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG); - ds_assemble(i+1,&branch_regs[i]); - } + wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded); + load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2); + address_generation(i+1,&branch_regs[i],0); + load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG); + ds_assemble(i+1,&branch_regs[i]); cc=get_reg(branch_regs[i].regmap,CCREG); - if(cc==-1&&!likely[i]) { + if (cc == -1) { // Cycle count isn't in a register, temporarily load it then write it out emit_loadreg(CCREG,HOST_CCREG); - emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG); - int jaddr=(int)out; + emit_addimm_and_set_flags(ccadj[i] + CLOCK_ADJUST(2), HOST_CCREG); + void *jaddr=out; emit_jns(0); - add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0); + add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0); emit_storereg(CCREG,HOST_CCREG); } else{ cc=get_reg(i_regmap,CCREG); 
assert(cc==HOST_CCREG); - emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc); - int jaddr=(int)out; + emit_addimm_and_set_flags(ccadj[i] + CLOCK_ADJUST(2), cc); + void *jaddr=out; emit_jns(0); - add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0); + add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0); } } } } -static void pagespan_assemble(int i,struct regstat *i_regs) +static void pagespan_assemble(int i, const struct regstat *i_regs) { - int s1l=get_reg(i_regs->regmap,rs1[i]); - int s1h=get_reg(i_regs->regmap,rs1[i]|64); - int s2l=get_reg(i_regs->regmap,rs2[i]); - int s2h=get_reg(i_regs->regmap,rs2[i]|64); - int taken=0; - int nottaken=0; + int s1l=get_reg(i_regs->regmap,dops[i].rs1); + int s2l=get_reg(i_regs->regmap,dops[i].rs2); + void *taken = NULL; + void *nottaken = NULL; int unconditional=0; - if(rs1[i]==0) + if(dops[i].rs1==0) { - s1l=s2l;s1h=s2h; - s2l=s2h=-1; + s1l=s2l; + s2l=-1; } - else if(rs2[i]==0) + else if(dops[i].rs2==0) { - s2l=s2h=-1; - } - if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) { - s1h=s2h=-1; + s2l=-1; } int hr=0; int addr=-1,alt=-1,ntaddr=-1; @@ -5897,8 +5871,8 @@ static void pagespan_assemble(int i,struct regstat *i_regs) while(hrregmap[hr]&63)!=rs1[i] && - (i_regs->regmap[hr]&63)!=rs2[i] ) + (i_regs->regmap[hr]&63)!=dops[i].rs1 && + (i_regs->regmap[hr]&63)!=dops[i].rs2 ) { addr=hr++;break; } @@ -5908,20 +5882,20 @@ static void pagespan_assemble(int i,struct regstat *i_regs) while(hrregmap[hr]&63)!=rs1[i] && - (i_regs->regmap[hr]&63)!=rs2[i] ) + (i_regs->regmap[hr]&63)!=dops[i].rs1 && + (i_regs->regmap[hr]&63)!=dops[i].rs2 ) { alt=hr++;break; } hr++; } - if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register + if((dops[i].opcode&0x2E)==6) // BLEZ/BGTZ needs another register { while(hrregmap[hr]&63)!=rs1[i] && - (i_regs->regmap[hr]&63)!=rs2[i] ) + (i_regs->regmap[hr]&63)!=dops[i].rs1 && + (i_regs->regmap[hr]&63)!=dops[i].rs2 ) { ntaddr=hr;break; } @@ -5929,39 +5903,39 @@ static void 
pagespan_assemble(int i,struct regstat *i_regs) } } assert(hrregmap,31); emit_movimm(start+i*4+8,rt); unconditional=1; } - if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR + if(dops[i].opcode==0&&(dops[i].opcode2&0x3E)==8) // JR/JALR { emit_mov(s1l,addr); - if(opcode2[i]==9) // JALR + if(dops[i].opcode2==9) // JALR { - int rt=get_reg(i_regs->regmap,rt1[i]); + int rt=get_reg(i_regs->regmap,dops[i].rt1); emit_movimm(start+i*4+8,rt); } } - if((opcode[i]&0x3f)==4) // BEQ + if((dops[i].opcode&0x3f)==4) // BEQ { - if(rs1[i]==rs2[i]) + if(dops[i].rs1==dops[i].rs2) { unconditional=1; } else #ifdef HAVE_CMOV_IMM - if(s1h<0) { + if(1) { if(s2l>=0) emit_cmp(s1l,s2l); else emit_test(s1l,s1l); emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr); @@ -5971,103 +5945,65 @@ static void pagespan_assemble(int i,struct regstat *i_regs) { assert(s1l>=0); emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt); - if(s1h>=0) { - if(s2h>=0) emit_cmp(s1h,s2h); - else emit_test(s1h,s1h); - emit_cmovne_reg(alt,addr); - } if(s2l>=0) emit_cmp(s1l,s2l); else emit_test(s1l,s1l); emit_cmovne_reg(alt,addr); } } - if((opcode[i]&0x3f)==5) // BNE + if((dops[i].opcode&0x3f)==5) // BNE { #ifdef HAVE_CMOV_IMM - if(s1h<0) { - if(s2l>=0) emit_cmp(s1l,s2l); - else emit_test(s1l,s1l); - emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr); - } - else + if(s2l>=0) emit_cmp(s1l,s2l); + else emit_test(s1l,s1l); + emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr); + #else + assert(s1l>=0); + emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt); + if(s2l>=0) emit_cmp(s1l,s2l); + else emit_test(s1l,s1l); + emit_cmovne_reg(alt,addr); #endif - { - assert(s1l>=0); - emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt); - if(s1h>=0) { - if(s2h>=0) emit_cmp(s1h,s2h); - else emit_test(s1h,s1h); - emit_cmovne_reg(alt,addr); - } - if(s2l>=0) emit_cmp(s1l,s2l); - else emit_test(s1l,s1l); - emit_cmovne_reg(alt,addr); - } } - if((opcode[i]&0x3f)==0x14) // BEQL + if((dops[i].opcode&0x3f)==0x14) // BEQL { - if(s1h>=0) { - if(s2h>=0) 
emit_cmp(s1h,s2h); - else emit_test(s1h,s1h); - nottaken=(int)out; - emit_jne(0); - } if(s2l>=0) emit_cmp(s1l,s2l); else emit_test(s1l,s1l); - if(nottaken) set_jump_target(nottaken,(int)out); - nottaken=(int)out; + if(nottaken) set_jump_target(nottaken, out); + nottaken=out; emit_jne(0); } - if((opcode[i]&0x3f)==0x15) // BNEL + if((dops[i].opcode&0x3f)==0x15) // BNEL { - if(s1h>=0) { - if(s2h>=0) emit_cmp(s1h,s2h); - else emit_test(s1h,s1h); - taken=(int)out; - emit_jne(0); - } if(s2l>=0) emit_cmp(s1l,s2l); else emit_test(s1l,s1l); - nottaken=(int)out; + nottaken=out; emit_jeq(0); - if(taken) set_jump_target(taken,(int)out); + if(taken) set_jump_target(taken, out); } - if((opcode[i]&0x3f)==6) // BLEZ + if((dops[i].opcode&0x3f)==6) // BLEZ { emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr); emit_cmpimm(s1l,1); - if(s1h>=0) emit_mov(addr,ntaddr); emit_cmovl_reg(alt,addr); - if(s1h>=0) { - emit_test(s1h,s1h); - emit_cmovne_reg(ntaddr,addr); - emit_cmovs_reg(alt,addr); - } } - if((opcode[i]&0x3f)==7) // BGTZ + if((dops[i].opcode&0x3f)==7) // BGTZ { emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr); emit_cmpimm(s1l,1); - if(s1h>=0) emit_mov(addr,alt); emit_cmovl_reg(ntaddr,addr); - if(s1h>=0) { - emit_test(s1h,s1h); - emit_cmovne_reg(alt,addr); - emit_cmovs_reg(ntaddr,addr); - } } - if((opcode[i]&0x3f)==0x16) // BLEZL + if((dops[i].opcode&0x3f)==0x16) // BLEZL { - assert((opcode[i]&0x3f)!=0x16); + assert((dops[i].opcode&0x3f)!=0x16); } - if((opcode[i]&0x3f)==0x17) // BGTZL + if((dops[i].opcode&0x3f)==0x17) // BGTZL { - assert((opcode[i]&0x3f)!=0x17); + assert((dops[i].opcode&0x3f)!=0x17); } - assert(opcode[i]!=1); // BLTZ/BGEZ + assert(dops[i].opcode!=1); // BLTZ/BGEZ //FIXME: Check CSREG - if(opcode[i]==0x11 && opcode2[i]==0x08 ) { + if(dops[i].opcode==0x11 && dops[i].opcode2==0x08 ) { if((source[i]&0x30000)==0) // BC1F { emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt); @@ -6083,20 +6019,20 @@ static void pagespan_assemble(int i,struct regstat *i_regs) 
if((source[i]&0x30000)==0x20000) // BC1FL { emit_testimm(s1l,0x800000); - nottaken=(int)out; + nottaken=out; emit_jne(0); } if((source[i]&0x30000)==0x30000) // BC1TL { emit_testimm(s1l,0x800000); - nottaken=(int)out; + nottaken=out; emit_jeq(0); } } assert(i_regs->regmap[HOST_CCREG]==CCREG); - wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty); - if(likely[i]||unconditional) + wb_dirtys(regs[i].regmap,regs[i].dirty); + if(unconditional) { emit_movimm(ba[i],HOST_BTREG); } @@ -6109,28 +6045,12 @@ static void pagespan_assemble(int i,struct regstat *i_regs) int target_addr=start+i*4+5; void *stub=out; void *compiled_target_addr=check_addr(target_addr); - emit_extjump_ds((int)branch_addr,target_addr); + emit_extjump_ds(branch_addr, target_addr); if(compiled_target_addr) { - set_jump_target((int)branch_addr,(int)compiled_target_addr); - add_link(target_addr,stub); - } - else set_jump_target((int)branch_addr,(int)stub); - if(likely[i]) { - // Not-taken path - set_jump_target((int)nottaken,(int)out); - wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty); - void *branch_addr=out; - emit_jmp(0); - int target_addr=start+i*4+8; - void *stub=out; - void *compiled_target_addr=check_addr(target_addr); - emit_extjump_ds((int)branch_addr,target_addr); - if(compiled_target_addr) { - set_jump_target((int)branch_addr,(int)compiled_target_addr); - add_link(target_addr,stub); - } - else set_jump_target((int)branch_addr,(int)stub); + set_jump_target(branch_addr, compiled_target_addr); + add_jump_out(target_addr,stub); } + else set_jump_target(branch_addr, stub); } // Assemble the delay slot for the above @@ -6141,58 +6061,21 @@ static void pagespan_ds() u_int page=get_page(vaddr); u_int vpage=get_vpage(vaddr); ll_add(jump_dirty+vpage,vaddr,(void *)out); - do_dirty_stub_ds(); + do_dirty_stub_ds(slen*4); ll_add(jump_in+page,vaddr,(void *)out); assert(regs[0].regmap_entry[HOST_CCREG]==CCREG); if(regs[0].regmap[HOST_CCREG]!=CCREG) - 
wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32); + wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty); if(regs[0].regmap[HOST_BTREG]!=BTREG) - emit_writeword(HOST_BTREG,(int)&branch_target); - load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]); + emit_writeword(HOST_BTREG,&branch_target); + load_regs(regs[0].regmap_entry,regs[0].regmap,dops[0].rs1,dops[0].rs2); address_generation(0,®s[0],regs[0].regmap_entry); - if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a) - load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP); - cop1_usable=0; + if (ram_offset && (dops[0].is_load || dops[0].is_store)) + load_regs(regs[0].regmap_entry,regs[0].regmap,ROREG,ROREG); + if (dops[0].is_store) + load_regs(regs[0].regmap_entry,regs[0].regmap,INVCP,INVCP); is_delayslot=0; - switch(itype[0]) { - case ALU: - alu_assemble(0,®s[0]);break; - case IMM16: - imm16_assemble(0,®s[0]);break; - case SHIFT: - shift_assemble(0,®s[0]);break; - case SHIFTIMM: - shiftimm_assemble(0,®s[0]);break; - case LOAD: - load_assemble(0,®s[0]);break; - case LOADLR: - loadlr_assemble(0,®s[0]);break; - case STORE: - store_assemble(0,®s[0]);break; - case STORELR: - storelr_assemble(0,®s[0]);break; - case COP0: - cop0_assemble(0,®s[0]);break; - case COP1: - cop1_assemble(0,®s[0]);break; - case C1LS: - c1ls_assemble(0,®s[0]);break; - case COP2: - cop2_assemble(0,®s[0]);break; - case C2LS: - c2ls_assemble(0,®s[0]);break; - case C2OP: - c2op_assemble(0,®s[0]);break; - case FCONV: - fconv_assemble(0,®s[0]);break; - case FLOAT: - float_assemble(0,®s[0]);break; - case FCOMP: - fcomp_assemble(0,®s[0]);break; - case MULTDIV: - multdiv_assemble(0,®s[0]);break; - case MOV: - mov_assemble(0,®s[0]);break; + switch (dops[0].itype) { case SYSCALL: case HLECALL: case INTCALL: @@ -6201,167 +6084,102 @@ static void pagespan_ds() case RJUMP: case CJUMP: case SJUMP: - case FJUMP: SysPrintf("Jump in the delay slot. 
This is probably a bug.\n"); + break; + default: + assemble(0, ®s[0], 0); } int btaddr=get_reg(regs[0].regmap,BTREG); if(btaddr<0) { btaddr=get_reg(regs[0].regmap,-1); - emit_readword((int)&branch_target,btaddr); + emit_readword(&branch_target,btaddr); } assert(btaddr!=HOST_CCREG); if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); #ifdef HOST_IMM8 + host_tempreg_acquire(); emit_movimm(start+4,HOST_TEMPREG); emit_cmp(btaddr,HOST_TEMPREG); + host_tempreg_release(); #else emit_cmpimm(btaddr,start+4); #endif - int branch=(int)out; + void *branch = out; emit_jeq(0); - store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1); - emit_jmp(jump_vaddr_reg[btaddr]); - set_jump_target(branch,(int)out); - store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4); - load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4); + store_regs_bt(regs[0].regmap,regs[0].dirty,-1); + do_jump_vaddr(btaddr); + set_jump_target(branch, out); + store_regs_bt(regs[0].regmap,regs[0].dirty,start+4); + load_regs_bt(regs[0].regmap,regs[0].dirty,start+4); } // Basic liveness analysis for MIPS registers void unneeded_registers(int istart,int iend,int r) { int i; - uint64_t u,uu,gte_u,b,bu,gte_bu; - uint64_t temp_u,temp_uu,temp_gte_u=0; - uint64_t tdep; + uint64_t u,gte_u,b,gte_b; + uint64_t temp_u,temp_gte_u=0; uint64_t gte_u_unknown=0; - if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED) + if (HACK_ENABLED(NDHACK_GTE_UNNEEDED)) gte_u_unknown=~0ll; if(iend==slen-1) { - u=1;uu=1; + u=1; gte_u=gte_u_unknown; }else{ - u=unneeded_reg[iend+1]; - uu=unneeded_reg_upper[iend+1]; - u=1;uu=1; + //u=unneeded_reg[iend+1]; + u=1; gte_u=gte_unneeded[iend+1]; } for (i=iend;i>=istart;i--) { //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r); - if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + if(dops[i].is_jump) { // If subroutine call, flag return address as a possible branch target - if(rt1[i]==31 && i=(start+slen*4)) { // Branch 
out of this block, flush all regs u=1; - uu=1; gte_u=gte_u_unknown; - /* Hexagon hack - if(itype[i]==UJUMP&&rt1[i]==31) - { - uu=u=0x300C00F; // Discard at, v0-v1, t6-t9 - } - if(itype[i]==RJUMP&&rs1[i]==31) - { - uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9 - } - if(start>0x80000400&&start<0x80000000+RAM_SIZE) { - if(itype[i]==UJUMP&&rt1[i]==31) - { - //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi - uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9 - } - if(itype[i]==RJUMP&&rs1[i]==31) - { - //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi - uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9 - } - }*/ branch_unneeded_reg[i]=u; - branch_unneeded_reg_upper[i]=uu; // Merge in delay slot - tdep=(~uu>>rt1[i+1])&1; - u|=(1LL<>2]=1; + dops[(ba[i]-start)>>2].bt=1; if(ba[i]<=start+i*4) { // Backward branch - if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000) + if(dops[i].is_ujump) { // Unconditional branch - temp_u=1;temp_uu=1; + temp_u=1; temp_gte_u=0; } else { // Conditional branch (not taken case) temp_u=unneeded_reg[i+2]; - temp_uu=unneeded_reg_upper[i+2]; temp_gte_u&=gte_unneeded[i+2]; } // Merge in delay slot - tdep=(~temp_uu>>rt1[i+1])&1; - temp_u|=(1LL<>rt1[i])&1; - temp_u|=(1LL<>2,i-1,r+1); }else{ unneeded_reg[(ba[i]-start)>>2]=1; - unneeded_reg_upper[(ba[i]-start)>>2]=1; gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown; } } /*else*/ if(1) { - if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000) + if (dops[i].is_ujump) { // Unconditional branch u=unneeded_reg[(ba[i]-start)>>2]; - uu=unneeded_reg_upper[(ba[i]-start)>>2]; gte_u=gte_unneeded[(ba[i]-start)>>2]; branch_unneeded_reg[i]=u; - branch_unneeded_reg_upper[i]=uu; - //u=1; - //uu=1; - //branch_unneeded_reg[i]=u; - //branch_unneeded_reg_upper[i]=uu; // Merge in delay slot - tdep=(~uu>>rt1[i+1])&1; - u|=(1LL<>2]; - bu=unneeded_reg_upper[(ba[i]-start)>>2]; - gte_bu=gte_unneeded[(ba[i]-start)>>2]; + gte_b=gte_unneeded[(ba[i]-start)>>2]; branch_unneeded_reg[i]=b; - 
branch_unneeded_reg_upper[i]=bu; - //b=1; - //bu=1; - //branch_unneeded_reg[i]=b; - //branch_unneeded_reg_upper[i]=bu; // Branch delay slot - tdep=(~uu>>rt1[i+1])&1; - b|=(1LL<>rt1[i])&1; + //u=1; // DEBUG // Written registers are unneeded - u|=1LL<>r)&1) { - if(r==HIREG) printf(" HI"); - else if(r==LOREG) printf(" LO"); - else printf(" r%d",r); - } - } - printf("\n");*/ - } - for (i=iend;i>=istart;i--) - { - unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL; + printf("\n"); + */ } } @@ -6530,12 +6284,12 @@ void clean_registers(int istart,int iend,int wr) } for (i=iend;i>=istart;i--) { - if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + if(dops[i].is_jump) { if(ba[i]=(start+slen*4)) { // Branch out of this block, flush all regs - if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000) + if (dops[i].is_ujump) { // Unconditional branch will_dirty_i=0; @@ -6543,17 +6297,17 @@ void clean_registers(int istart,int iend,int wr) // Merge in delay slot (will dirty) for(r=0;r33) will_dirty_i&=~(1<33) will_dirty_i&=~(1<33) will_dirty_i&=~(1<33) will_dirty_i&=~(1<>16)==0x1000) + if (dops[i].is_ujump) { // Unconditional branch temp_will_dirty=0; @@ -6623,17 +6377,17 @@ void clean_registers(int istart,int iend,int wr) // Merge in delay slot (will dirty) for(r=0;r33) temp_will_dirty&=~(1<33) temp_will_dirty&=~(1<33) temp_will_dirty&=~(1<33) temp_will_dirty&=~(1<>16)==0x1000) + if (dops[i].is_ujump) { // Unconditional branch will_dirty_i=0; @@ -6734,17 +6488,17 @@ void clean_registers(int istart,int iend,int wr) // Merge in delay slot for(r=0;r33) will_dirty_i&=~(1<33) will_dirty_i&=~(1<>2]>>(target_reg&63))&1)<>2]>>(target_reg&63))&1)<>2].regmap_entry[r]) { - will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<>2]&(1<33) will_dirty_i&=~(1<33) will_dirty_i&=~(1<33) will_dirty_i&=~(1<istart) { - if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) + if (!dops[i].is_jump) { // Don't store a register 
immediately after writing it, // may prevent dual-issue. - if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<>r)&1) { - printf(" r%d",r); - } - } - printf("\n");*/ - - //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) { regs[i].dirty|=will_dirty_i; #ifndef DESTRUCTIVE_WRITEBACK regs[i].dirty&=wont_dirty_i; - if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + if(dops[i].is_jump) { - if(i>16)!=0x1000) { + if (i < iend-1 && !dops[i].is_ujump) { for(r=0;r>14):*ba);break; + printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],dops[i].rs1,dops[i].rs2,i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break; case SJUMP: - printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break; - case FJUMP: - printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break; + printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],dops[i].rs1,start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break; case RJUMP: - if (opcode[i]==0x9&&rt1[i]!=31) - printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]); + if (dops[i].opcode==0x9&&dops[i].rt1!=31) + printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1); else - printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]); + printf (" %x: %s r%d\n",start+i*4,insn[i],dops[i].rs1); break; case SPAN: - printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break; + printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],dops[i].rs1,dops[i].rs2,ba[i]);break; case IMM16: - if(opcode[i]==0xf) //LUI - printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff); + if(dops[i].opcode==0xf) //LUI + printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],dops[i].rt1,imm[i]&0xffff); else - printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]); + printf (" %x: %s 
r%d,r%d,%d\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1,imm[i]); break; case LOAD: case LOADLR: - printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]); + printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1,imm[i]); break; case STORE: case STORELR: - printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]); + printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],dops[i].rs2,dops[i].rs1,imm[i]); break; case ALU: case SHIFT: - printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]); + printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1,dops[i].rs2); break; case MULTDIV: - printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]); + printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],dops[i].rs1,dops[i].rs2); break; case SHIFTIMM: - printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]); + printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1,imm[i]); break; case MOV: - if((opcode2[i]&0x1d)==0x10) - printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]); - else if((opcode2[i]&0x1d)==0x11) - printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]); + if((dops[i].opcode2&0x1d)==0x10) + printf (" %x: %s r%d\n",start+i*4,insn[i],dops[i].rt1); + else if((dops[i].opcode2&0x1d)==0x11) + printf (" %x: %s r%d\n",start+i*4,insn[i],dops[i].rs1); else printf (" %x: %s\n",start+i*4,insn[i]); break; case COP0: - if(opcode2[i]==0) - printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0 - else if(opcode2[i]==4) - printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0 + if(dops[i].opcode2==0) + printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],dops[i].rt1,(source[i]>>11)&0x1f); // MFC0 + else if(dops[i].opcode2==4) + printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],dops[i].rs1,(source[i]>>11)&0x1f); // MTC0 else printf (" %x: %s\n",start+i*4,insn[i]); break; case COP1: - if(opcode2[i]<3) - printf (" %x: %s 
r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1 - else if(opcode2[i]>3) - printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1 + if(dops[i].opcode2<3) + printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],dops[i].rt1,(source[i]>>11)&0x1f); // MFC1 + else if(dops[i].opcode2>3) + printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],dops[i].rs1,(source[i]>>11)&0x1f); // MTC1 else printf (" %x: %s\n",start+i*4,insn[i]); break; case COP2: - if(opcode2[i]<3) - printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2 - else if(opcode2[i]>3) - printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2 + if(dops[i].opcode2<3) + printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],dops[i].rt1,(source[i]>>11)&0x1f); // MFC2 + else if(dops[i].opcode2>3) + printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],dops[i].rs1,(source[i]>>11)&0x1f); // MTC2 else printf (" %x: %s\n",start+i*4,insn[i]); break; case C1LS: - printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]); + printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,dops[i].rs1,imm[i]); break; case C2LS: - printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]); + printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,dops[i].rs1,imm[i]); break; case INTCALL: printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]); @@ -7042,25 +6776,38 @@ static void disassemble_inst(int i) {} #define DRC_TEST_VAL 0x74657374 -static int new_dynarec_test(void) +static void new_dynarec_test(void) { - int (*testfunc)(void) = (void *)out; + int (*testfunc)(void); void *beginning; - int ret; + int ret[2]; + size_t i; - beginning = start_block(); - emit_movimm(DRC_TEST_VAL,0); // test - emit_jmpreg(14); - literal_pool(0); - end_block(beginning); - SysPrintf("testing if we can run recompiled code..\n"); - ret = testfunc(); - if 
(ret == DRC_TEST_VAL) + // check structure linkage + if ((u_char *)rcnts - (u_char *)&psxRegs != sizeof(psxRegs)) + { + SysPrintf("linkage_arm* miscompilation/breakage detected.\n"); + } + + SysPrintf("testing if we can run recompiled code...\n"); + ((volatile u_int *)out)[0]++; // make cache dirty + + for (i = 0; i < ARRAY_SIZE(ret); i++) { + out = ndrc->translation_cache; + beginning = start_block(); + emit_movimm(DRC_TEST_VAL + i, 0); // test + emit_ret(); + literal_pool(0); + end_block(beginning); + testfunc = beginning; + ret[i] = testfunc(); + } + + if (ret[0] == DRC_TEST_VAL && ret[1] == DRC_TEST_VAL + 1) SysPrintf("test passed.\n"); else - SysPrintf("test failed: %08x\n", ret); - out=(u_char *)BASE_ADDR; - return ret == DRC_TEST_VAL; + SysPrintf("test failed, will likely crash soon (r=%08x %08x)\n", ret[0], ret[1]); + out = ndrc->translation_cache; } // clear the state completely, instead of just marking @@ -7068,7 +6815,7 @@ static int new_dynarec_test(void) void new_dynarec_clear_full(void) { int n; - out=(u_char *)BASE_ADDR; + out = ndrc->translation_cache; memset(invalid_code,1,sizeof(invalid_code)); memset(hash_table,0xff,sizeof(hash_table)); memset(mini_ht,-1,sizeof(mini_ht)); @@ -7080,64 +6827,51 @@ void new_dynarec_clear_full(void) literalcount=0; stop_after_jal=0; inv_code_start=inv_code_end=~0; + f1_hack=0; // TLB for(n=0;n<4096;n++) ll_clear(jump_in+n); for(n=0;n<4096;n++) ll_clear(jump_out+n); for(n=0;n<4096;n++) ll_clear(jump_dirty+n); + + cycle_multiplier_old = cycle_multiplier; + new_dynarec_hacks_old = new_dynarec_hacks; } void new_dynarec_init(void) { SysPrintf("Init new dynarec\n"); -#ifdef _3DS - check_rosalina(); -#endif - - // allocate/prepare a buffer for translation cache - // see assem_arm.h for some explanation -#if defined(BASE_ADDR_FIXED) - if (mmap (translation_cache, 1 << TARGET_SIZE_2, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_PRIVATE | MAP_ANONYMOUS, - -1, 0) != translation_cache) - { - SysPrintf("mmap() failed: %s\n", 
strerror(errno)); - SysPrintf("disable BASE_ADDR_FIXED and recompile\n"); - abort(); - } -#elif defined(BASE_ADDR_DYNAMIC) -#ifdef VITA - sceBlock = getVMBlock();//sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2); +#ifdef BASE_ADDR_DYNAMIC + #ifdef VITA + sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2); if (sceBlock < 0) SysPrintf("sceKernelAllocMemBlockForVM failed\n"); - int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache); + int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&ndrc); if (ret < 0) SysPrintf("sceKernelGetMemBlockBase failed\n"); - - sceKernelOpenVMDomain(); - sceClibPrintf("translation_cache = 0x%08X \n ", translation_cache); -#elif defined(_MSC_VER) - base_addr = VirtualAlloc(NULL, 1<translation_cache) + sizeof(ndrc->tramp.ops), + PROT_READ | PROT_WRITE | PROT_EXEC) != 0) SysPrintf("mprotect() failed: %s\n", strerror(errno)); + #endif #endif -#endif - - out=(u_char *)BASE_ADDR; + out = ndrc->translation_cache; cycle_multiplier=200; new_dynarec_clear_full(); #ifdef HOST_IMM8 @@ -7146,9 +6880,7 @@ void new_dynarec_init(void) #endif arch_init(); new_dynarec_test(); -#ifndef RAM_FIXED - ram_offset=(u_int)rdram-0x80000000; -#endif + ram_offset=(uintptr_t)rdram-0x80000000; if (ram_offset!=0) SysPrintf("warning: RAM is not directly mapped, performance will suffer\n"); } @@ -7156,45 +6888,47 @@ void new_dynarec_init(void) void new_dynarec_cleanup(void) { int n; -#if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC) -#ifndef VITA -#if defined(_MSC_VER) - VirtualFree(base_addr, 0, MEM_RELEASE); -#else - if (munmap ((void *)BASE_ADDR, 1<= 0x80000000 && addr < 0x80000000+RAM_SIZE) { *limit = (addr & 0x80600000) + 0x00200000; - return (u_int *)((u_int)rdram + (addr&0x1fffff)); + return (u_int *)(rdram + (addr&0x1fffff)); } return NULL; } @@ -7238,7 +6972,7 @@ int new_dynarec_save_blocks(void *save, int size) u_int addr; o = 0; - for (p = 0; p < sizeof(jump_in) / sizeof(jump_in[0]); p++) { + for (p = 0; p < 
ARRAY_SIZE(jump_in); p++) { bcnt = 0; for (head = jump_in[p]; head != NULL; head = head->next) { tmp_blocks[bcnt].addr = head->vaddr; @@ -7299,22 +7033,52 @@ void new_dynarec_load_blocks(const void *save, int size) memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save)); } -int new_recompile_block(int addr) +static void apply_hacks(void) +{ + int i; + if (HACK_ENABLED(NDHACK_NO_COMPAT_HACKS)) + return; + /* special hack(s) */ + for (i = 0; i < slen - 4; i++) + { + // lui a4, 0xf200; jal ; addu a0, 2; slti v0, 28224 + if (source[i] == 0x3c04f200 && dops[i+1].itype == UJUMP + && source[i+2] == 0x34840002 && dops[i+3].opcode == 0x0a + && imm[i+3] == 0x6e40 && dops[i+3].rs1 == 2) + { + SysPrintf("PE2 hack @%08x\n", start + (i+3)*4); + dops[i + 3].itype = NOP; + } + } + i = slen; + if (i > 10 && source[i-1] == 0 && source[i-2] == 0x03e00008 + && source[i-4] == 0x8fbf0018 && source[i-6] == 0x00c0f809 + && dops[i-7].itype == STORE) + { + i = i-8; + if (dops[i].itype == IMM16) + i--; + // swl r2, 15(r6); swr r2, 12(r6); sw r6, *; jalr r6 + if (dops[i].itype == STORELR && dops[i].rs1 == 6 + && dops[i-1].itype == STORELR && dops[i-1].rs1 == 6) + { + SysPrintf("F1 hack from %08x\n", start); + if (f1_hack == 0) + f1_hack = ~0u; + } + } +} + +int new_recompile_block(u_int addr) { u_int pagelimit = 0; u_int state_rflags = 0; int i; - assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out); - //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out); + assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out); //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr); //if(debug) - //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum()); //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29); - /*if(Count>=312978186) { - rlist(); - }*/ - //rlist(); // this is just for speculation for (i = 1; i < 32; i++) { @@ -7323,7 +7087,7 @@ int new_recompile_block(int addr) } start = (u_int)addr&~3; 
- //assert(((u_int)addr&1)==0); + //assert(((u_int)addr&1)==0); // start-in-delay-slot flag new_dynarec_did_compile=1; if (Config.HLE && start == 0x80001000) // hlecall { @@ -7333,18 +7097,42 @@ int new_recompile_block(int addr) invalid_code[start>>12]=0; emit_movimm(start,0); - emit_writeword(0,(int)&pcaddr); - emit_jmp((int)new_dyna_leave); + emit_writeword(0,&pcaddr); + emit_far_jump(new_dyna_leave); literal_pool(0); end_block(beginning); ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning); return 0; } + else if (f1_hack == ~0u || (f1_hack != 0 && start == f1_hack)) { + void *beginning = start_block(); + u_int page = get_page(start); + emit_readword(&psxRegs.GPR.n.sp, 0); + emit_readptr(&mem_rtab, 1); + emit_shrimm(0, 12, 2); + emit_readptr_dualindexedx_ptrlen(1, 2, 1); + emit_addimm(0, 0x18, 0); + emit_adds_ptr(1, 1, 1); + emit_ldr_dualindexed(1, 0, 0); + emit_writeword(0, &psxRegs.GPR.r[26]); // lw k0, 0x18(sp) + emit_far_call(get_addr_ht); + emit_jmpreg(0); // jr k0 + literal_pool(0); + end_block(beginning); + + ll_add_flags(jump_in + page, start, state_rflags, beginning); + SysPrintf("F1 hack to %08x\n", start); + f1_hack = start; + return 0; + } + + cycle_multiplier_active = cycle_multiplier_override && cycle_multiplier == CYCLE_MULT_DEFAULT + ? 
cycle_multiplier_override : cycle_multiplier; source = get_source_start(start, &pagelimit); if (source == NULL) { SysPrintf("Compile at bogus memory address: %08x\n", addr); - exit(1); + abort(); } /* Pass 1: disassemble */ @@ -7367,9 +7155,11 @@ int new_recompile_block(int addr) /* Pass 1 disassembly */ for(i=0;!done;i++) { - bt[i]=0;likely[i]=0;ooo[i]=0;op2=0; + dops[i].bt=0; + dops[i].ooo=0; + op2=0; minimum_free_regs[i]=0; - opcode[i]=op=source[i]>>26; + dops[i].opcode=op=source[i]>>26; switch(op) { case 0x00: strcpy(insn[i],"special"); type=NI; @@ -7438,18 +7228,18 @@ int new_recompile_block(int addr) { case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break; case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break; - case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break; - case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break; - case 0x08: strcpy(insn[i],"TGEI"); type=NI; break; - case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break; - case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break; - case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break; - case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break; - case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break; + //case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break; + //case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break; + //case 0x08: strcpy(insn[i],"TGEI"); type=NI; break; + //case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break; + //case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break; + //case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break; + //case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break; + //case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break; case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break; case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break; - case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break; - case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break; + //case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break; + //case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break; } break; case 0x02: 
strcpy(insn[i],"J"); type=UJUMP; break; @@ -7471,133 +7261,14 @@ int new_recompile_block(int addr) switch(op2) { case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break; + case 0x02: strcpy(insn[i],"CFC0"); type=COP0; break; case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break; - case 0x10: strcpy(insn[i],"tlb"); type=NI; - switch(source[i]&0x3f) - { - case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break; - case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break; - case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break; - case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break; - case 0x10: strcpy(insn[i],"RFE"); type=COP0; break; - //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break; - } + case 0x06: strcpy(insn[i],"CTC0"); type=COP0; break; + case 0x10: strcpy(insn[i],"RFE"); type=COP0; break; } break; - case 0x11: strcpy(insn[i],"cop1"); type=NI; + case 0x11: strcpy(insn[i],"cop1"); type=COP1; op2=(source[i]>>21)&0x1f; - switch(op2) - { - case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break; - case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break; - case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break; - case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break; - case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break; - case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break; - case 0x08: strcpy(insn[i],"BC1"); type=FJUMP; - switch((source[i]>>16)&0x3) - { - case 0x00: strcpy(insn[i],"BC1F"); break; - case 0x01: strcpy(insn[i],"BC1T"); break; - case 0x02: strcpy(insn[i],"BC1FL"); break; - case 0x03: strcpy(insn[i],"BC1TL"); break; - } - break; - case 0x10: strcpy(insn[i],"C1.S"); type=NI; - switch(source[i]&0x3f) - { - case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break; - case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break; - case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break; - case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break; - case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break; - case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break; - case 0x06: 
strcpy(insn[i],"MOV.S"); type=FLOAT; break; - case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break; - case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break; - case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break; - case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break; - case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break; - case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break; - case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break; - case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break; - case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break; - case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break; - case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break; - case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break; - case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break; - case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break; - case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break; - case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break; - case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break; - case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break; - case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break; - case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break; - case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break; - case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break; - case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break; - case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break; - case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break; - case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break; - case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break; - case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break; - } - break; - case 0x11: strcpy(insn[i],"C1.D"); type=NI; - switch(source[i]&0x3f) - { - case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break; - case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break; - case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break; - case 0x03: strcpy(insn[i],"DIV.D"); 
type=FLOAT; break; - case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break; - case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break; - case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break; - case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break; - case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break; - case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break; - case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break; - case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break; - case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break; - case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break; - case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break; - case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break; - case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break; - case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break; - case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break; - case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break; - case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break; - case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break; - case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break; - case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break; - case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break; - case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break; - case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break; - case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break; - case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break; - case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break; - case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break; - case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break; - case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break; - case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break; - case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break; - } - break; - case 0x14: strcpy(insn[i],"C1.W"); type=NI; - switch(source[i]&0x3f) - { - case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break; - 
case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break; - } - break; - case 0x15: strcpy(insn[i],"C1.L"); type=NI; - switch(source[i]&0x3f) - { - case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break; - case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break; - } - break; - } break; #if 0 case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break; @@ -7645,7 +7316,7 @@ int new_recompile_block(int addr) #endif case 0x12: strcpy(insn[i],"COP2"); type=NI; op2=(source[i]>>21)&0x1f; - //if (op2 & 0x10) { + //if (op2 & 0x10) if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns if (gte_handlers[source[i]&0x3f]!=NULL) { if (gte_regnames[source[i]&0x3f]!=NULL) @@ -7670,195 +7341,155 @@ int new_recompile_block(int addr) SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr); break; } - itype[i]=type; - opcode2[i]=op2; + dops[i].itype=type; + dops[i].opcode2=op2; /* Get registers/immediates */ - lt1[i]=0; - us1[i]=0; - us2[i]=0; - dep1[i]=0; - dep2[i]=0; + dops[i].lt1=0; gte_rs[i]=gte_rt[i]=0; switch(type) { case LOAD: - rs1[i]=(source[i]>>21)&0x1f; - rs2[i]=0; - rt1[i]=(source[i]>>16)&0x1f; - rt2[i]=0; + dops[i].rs1=(source[i]>>21)&0x1f; + dops[i].rs2=0; + dops[i].rt1=(source[i]>>16)&0x1f; + dops[i].rt2=0; imm[i]=(short)source[i]; break; case STORE: case STORELR: - rs1[i]=(source[i]>>21)&0x1f; - rs2[i]=(source[i]>>16)&0x1f; - rt1[i]=0; - rt2[i]=0; + dops[i].rs1=(source[i]>>21)&0x1f; + dops[i].rs2=(source[i]>>16)&0x1f; + dops[i].rt1=0; + dops[i].rt2=0; imm[i]=(short)source[i]; - if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD break; case LOADLR: // LWL/LWR only load part of the register, // therefore the target register must be treated as a source too - rs1[i]=(source[i]>>21)&0x1f; - rs2[i]=(source[i]>>16)&0x1f; - rt1[i]=(source[i]>>16)&0x1f; - rt2[i]=0; + dops[i].rs1=(source[i]>>21)&0x1f; + dops[i].rs2=(source[i]>>16)&0x1f; + dops[i].rt1=(source[i]>>16)&0x1f; + dops[i].rt2=0; imm[i]=(short)source[i]; - 
if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL - if(op==0x26) dep1[i]=rt1[i]; // LWR break; case IMM16: - if (op==0x0f) rs1[i]=0; // LUI instruction has no source register - else rs1[i]=(source[i]>>21)&0x1f; - rs2[i]=0; - rt1[i]=(source[i]>>16)&0x1f; - rt2[i]=0; + if (op==0x0f) dops[i].rs1=0; // LUI instruction has no source register + else dops[i].rs1=(source[i]>>21)&0x1f; + dops[i].rs2=0; + dops[i].rt1=(source[i]>>16)&0x1f; + dops[i].rt2=0; if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI imm[i]=(unsigned short)source[i]; }else{ imm[i]=(short)source[i]; } - if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU - if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU - if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI break; case UJUMP: - rs1[i]=0; - rs2[i]=0; - rt1[i]=0; - rt2[i]=0; + dops[i].rs1=0; + dops[i].rs2=0; + dops[i].rt1=0; + dops[i].rt2=0; // The JAL instruction writes to r31. if (op&1) { - rt1[i]=31; + dops[i].rt1=31; } - rs2[i]=CCREG; + dops[i].rs2=CCREG; break; case RJUMP: - rs1[i]=(source[i]>>21)&0x1f; - rs2[i]=0; - rt1[i]=0; - rt2[i]=0; + dops[i].rs1=(source[i]>>21)&0x1f; + dops[i].rs2=0; + dops[i].rt1=0; + dops[i].rt2=0; // The JALR instruction writes to rd. 
if (op2&1) { - rt1[i]=(source[i]>>11)&0x1f; + dops[i].rt1=(source[i]>>11)&0x1f; } - rs2[i]=CCREG; + dops[i].rs2=CCREG; break; case CJUMP: - rs1[i]=(source[i]>>21)&0x1f; - rs2[i]=(source[i]>>16)&0x1f; - rt1[i]=0; - rt2[i]=0; + dops[i].rs1=(source[i]>>21)&0x1f; + dops[i].rs2=(source[i]>>16)&0x1f; + dops[i].rt1=0; + dops[i].rt2=0; if(op&2) { // BGTZ/BLEZ - rs2[i]=0; + dops[i].rs2=0; } - us1[i]=rs1[i]; - us2[i]=rs2[i]; - likely[i]=op>>4; break; case SJUMP: - rs1[i]=(source[i]>>21)&0x1f; - rs2[i]=CCREG; - rt1[i]=0; - rt2[i]=0; - us1[i]=rs1[i]; + dops[i].rs1=(source[i]>>21)&0x1f; + dops[i].rs2=CCREG; + dops[i].rt1=0; + dops[i].rt2=0; if(op2&0x10) { // BxxAL - rt1[i]=31; + dops[i].rt1=31; // NOTE: If the branch is not taken, r31 is still overwritten } - likely[i]=(op2&2)>>1; - break; - case FJUMP: - rs1[i]=FSREG; - rs2[i]=CSREG; - rt1[i]=0; - rt2[i]=0; - likely[i]=((source[i])>>17)&1; break; case ALU: - rs1[i]=(source[i]>>21)&0x1f; // source - rs2[i]=(source[i]>>16)&0x1f; // subtract amount - rt1[i]=(source[i]>>11)&0x1f; // destination - rt2[i]=0; - if(op2==0x2a||op2==0x2b) { // SLT/SLTU - us1[i]=rs1[i];us2[i]=rs2[i]; - } - else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR - dep1[i]=rs1[i];dep2[i]=rs2[i]; - } - else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB - dep1[i]=rs1[i];dep2[i]=rs2[i]; - } + dops[i].rs1=(source[i]>>21)&0x1f; // source + dops[i].rs2=(source[i]>>16)&0x1f; // subtract amount + dops[i].rt1=(source[i]>>11)&0x1f; // destination + dops[i].rt2=0; break; case MULTDIV: - rs1[i]=(source[i]>>21)&0x1f; // source - rs2[i]=(source[i]>>16)&0x1f; // divisor - rt1[i]=HIREG; - rt2[i]=LOREG; - if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU - us1[i]=rs1[i];us2[i]=rs2[i]; - } + dops[i].rs1=(source[i]>>21)&0x1f; // source + dops[i].rs2=(source[i]>>16)&0x1f; // divisor + dops[i].rt1=HIREG; + dops[i].rt2=LOREG; break; case MOV: - rs1[i]=0; - rs2[i]=0; - rt1[i]=0; - rt2[i]=0; - if(op2==0x10) rs1[i]=HIREG; // MFHI - if(op2==0x11) rt1[i]=HIREG; // MTHI - if(op2==0x12) 
rs1[i]=LOREG; // MFLO - if(op2==0x13) rt1[i]=LOREG; // MTLO - if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx - if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx - dep1[i]=rs1[i]; + dops[i].rs1=0; + dops[i].rs2=0; + dops[i].rt1=0; + dops[i].rt2=0; + if(op2==0x10) dops[i].rs1=HIREG; // MFHI + if(op2==0x11) dops[i].rt1=HIREG; // MTHI + if(op2==0x12) dops[i].rs1=LOREG; // MFLO + if(op2==0x13) dops[i].rt1=LOREG; // MTLO + if((op2&0x1d)==0x10) dops[i].rt1=(source[i]>>11)&0x1f; // MFxx + if((op2&0x1d)==0x11) dops[i].rs1=(source[i]>>21)&0x1f; // MTxx break; case SHIFT: - rs1[i]=(source[i]>>16)&0x1f; // target of shift - rs2[i]=(source[i]>>21)&0x1f; // shift amount - rt1[i]=(source[i]>>11)&0x1f; // destination - rt2[i]=0; - // DSLLV/DSRLV/DSRAV are 64-bit - if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i]; + dops[i].rs1=(source[i]>>16)&0x1f; // target of shift + dops[i].rs2=(source[i]>>21)&0x1f; // shift amount + dops[i].rt1=(source[i]>>11)&0x1f; // destination + dops[i].rt2=0; break; case SHIFTIMM: - rs1[i]=(source[i]>>16)&0x1f; - rs2[i]=0; - rt1[i]=(source[i]>>11)&0x1f; - rt2[i]=0; + dops[i].rs1=(source[i]>>16)&0x1f; + dops[i].rs2=0; + dops[i].rt1=(source[i]>>11)&0x1f; + dops[i].rt2=0; imm[i]=(source[i]>>6)&0x1f; // DSxx32 instructions if(op2>=0x3c) imm[i]|=0x20; - // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source - if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i]; break; case COP0: - rs1[i]=0; - rs2[i]=0; - rt1[i]=0; - rt2[i]=0; - if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0 - if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0 - if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status - if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET + dops[i].rs1=0; + dops[i].rs2=0; + dops[i].rt1=0; + dops[i].rt2=0; + if(op2==0||op2==2) dops[i].rt1=(source[i]>>16)&0x1F; // MFC0/CFC0 + if(op2==4||op2==6) dops[i].rs1=(source[i]>>16)&0x1F; // MTC0/CTC0 + if(op2==4&&((source[i]>>11)&0x1f)==12) dops[i].rt2=CSREG; // Status + if(op2==16) 
if((source[i]&0x3f)==0x18) dops[i].rs2=CCREG; // ERET break; case COP1: - rs1[i]=0; - rs2[i]=0; - rt1[i]=0; - rt2[i]=0; - if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1 - if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1 - if(op2==5) us1[i]=rs1[i]; // DMTC1 - rs2[i]=CSREG; + dops[i].rs1=0; + dops[i].rs2=0; + dops[i].rt1=0; + dops[i].rt2=0; + if(op2<3) dops[i].rt1=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1 + if(op2>3) dops[i].rs1=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1 + dops[i].rs2=CSREG; break; case COP2: - rs1[i]=0; - rs2[i]=0; - rt1[i]=0; - rt2[i]=0; - if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2 - if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2 - rs2[i]=CSREG; + dops[i].rs1=0; + dops[i].rs2=0; + dops[i].rt1=0; + dops[i].rt2=0; + if(op2<3) dops[i].rt1=(source[i]>>16)&0x1F; // MFC2/CFC2 + if(op2>3) dops[i].rs1=(source[i]>>16)&0x1F; // MTC2/CTC2 + dops[i].rs2=CSREG; int gr=(source[i]>>11)&0x1F; switch(op2) { @@ -7869,26 +7500,26 @@ int new_recompile_block(int addr) } break; case C1LS: - rs1[i]=(source[i]>>21)&0x1F; - rs2[i]=CSREG; - rt1[i]=0; - rt2[i]=0; + dops[i].rs1=(source[i]>>21)&0x1F; + dops[i].rs2=CSREG; + dops[i].rt1=0; + dops[i].rt2=0; imm[i]=(short)source[i]; break; case C2LS: - rs1[i]=(source[i]>>21)&0x1F; - rs2[i]=0; - rt1[i]=0; - rt2[i]=0; + dops[i].rs1=(source[i]>>21)&0x1F; + dops[i].rs2=0; + dops[i].rt1=0; + dops[i].rt2=0; imm[i]=(short)source[i]; if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2 else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2 break; case C2OP: - rs1[i]=0; - rs2[i]=0; - rt1[i]=0; - rt2[i]=0; + dops[i].rs1=0; + dops[i].rs2=0; + dops[i].rt1=0; + dops[i].rt2=0; gte_rs[i]=gte_reg_reads[source[i]&0x3f]; gte_rt[i]=gte_reg_writes[source[i]&0x3f]; gte_rt[i]|=1ll<<63; // every op changes flags @@ -7899,79 +7530,85 @@ int new_recompile_block(int addr) else gte_rs[i]|=3ll<<(v*2); } break; - case FLOAT: - case FCONV: - rs1[i]=0; - rs2[i]=CSREG; - rt1[i]=0; - rt2[i]=0; - break; - case FCOMP: - 
rs1[i]=FSREG; - rs2[i]=CSREG; - rt1[i]=FSREG; - rt2[i]=0; - break; case SYSCALL: case HLECALL: case INTCALL: - rs1[i]=CCREG; - rs2[i]=0; - rt1[i]=0; - rt2[i]=0; + dops[i].rs1=CCREG; + dops[i].rs2=0; + dops[i].rt1=0; + dops[i].rt2=0; break; default: - rs1[i]=0; - rs2[i]=0; - rt1[i]=0; - rt2[i]=0; + dops[i].rs1=0; + dops[i].rs2=0; + dops[i].rt1=0; + dops[i].rt2=0; } /* Calculate branch target addresses */ if(type==UJUMP) ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4); - else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1)) + else if(type==CJUMP&&dops[i].rs1==dops[i].rs2&&(op&1)) ba[i]=start+i*4+8; // Ignore never taken branch - else if(type==SJUMP&&rs1[i]==0&&!(op2&1)) + else if(type==SJUMP&&dops[i].rs1==0&&!(op2&1)) ba[i]=start+i*4+8; // Ignore never taken branch - else if(type==CJUMP||type==SJUMP||type==FJUMP) + else if(type==CJUMP||type==SJUMP) ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14); else ba[i]=-1; - if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) { + + /* simplify always (not)taken branches */ + if (type == CJUMP && dops[i].rs1 == dops[i].rs2) { + dops[i].rs1 = dops[i].rs2 = 0; + if (!(op & 1)) { + dops[i].itype = type = UJUMP; + dops[i].rs2 = CCREG; + } + } + else if (type == SJUMP && dops[i].rs1 == 0 && (op2 & 1)) + dops[i].itype = type = UJUMP; + + dops[i].is_jump = (dops[i].itype == RJUMP || dops[i].itype == UJUMP || dops[i].itype == CJUMP || dops[i].itype == SJUMP); + dops[i].is_ujump = (dops[i].itype == RJUMP || dops[i].itype == UJUMP); // || (source[i] >> 16) == 0x1000 // beq r0,r0 + dops[i].is_load = (dops[i].itype == LOAD || dops[i].itype == LOADLR || op == 0x32); // LWC2 + dops[i].is_store = (dops[i].itype == STORE || dops[i].itype == STORELR || op == 0x3a); // SWC2 + + /* messy cases to just pass over to the interpreter */ + if (i > 0 && dops[i-1].is_jump) { int do_in_intrp=0; // branch in delay slot? 
- if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) { + if (dops[i].is_jump) { // don't handle first branch and call interpreter if it's hit SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr); do_in_intrp=1; } // basic load delay detection - else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) { + else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&dops[i].rt1!=0) { int t=(ba[i-1]-start)/4; - if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) { + if(0 <= t && t < i &&(dops[i].rt1==dops[t].rs1||dops[i].rt1==dops[t].rs2)&&dops[t].itype!=CJUMP&&dops[t].itype!=SJUMP) { // jump target wants DS result - potential load delay effect SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr); do_in_intrp=1; - bt[t+1]=1; // expected return from interpreter + dops[t+1].bt=1; // expected return from interpreter } - else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&& - !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) { + else if(i>=2&&dops[i-2].rt1==2&&dops[i].rt1==2&&dops[i].rs1!=2&&dops[i].rs2!=2&&dops[i-1].rs1!=2&&dops[i-1].rs2!=2&& + !(i>=3&&dops[i-3].is_jump)) { // v0 overwrite like this is a sign of trouble, bail out SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr); do_in_intrp=1; } } if(do_in_intrp) { - rs1[i-1]=CCREG; - rs2[i-1]=rt1[i-1]=rt2[i-1]=0; + dops[i-1].rs1=CCREG; + dops[i-1].rs2=dops[i-1].rt1=dops[i-1].rt2=0; ba[i-1]=-1; - itype[i-1]=INTCALL; + dops[i-1].itype=INTCALL; done=2; i--; // don't compile the DS } } + /* Is this the end of the block? 
*/ - if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) { - if(rt1[i-1]==0) { // Continue past subroutine call (JAL) + if (i > 0 && dops[i-1].is_ujump) { + if(dops[i-1].rt1==0) { // Continue past subroutine call (JAL) done=2; } else { @@ -7984,8 +7621,8 @@ int new_recompile_block(int addr) // Don't get too close to the limit if(i>MAXBLOCK/2) done=1; } - if(itype[i]==SYSCALL&&stop_after_jal) done=1; - if(itype[i]==HLECALL||itype[i]==INTCALL) done=2; + if(dops[i].itype==SYSCALL&&stop_after_jal) done=1; + if(dops[i].itype==HLECALL||dops[i].itype==INTCALL) done=2; if(done==2) { // Does the block continue due to a branch? for(j=i-1;j>=0;j--) @@ -8000,19 +7637,21 @@ int new_recompile_block(int addr) assert(start+i*40); + apply_hacks(); + /* Pass 2 - Register dependencies and branch targets */ unneeded_registers(0,slen-1,0); @@ -8020,10 +7659,8 @@ int new_recompile_block(int addr) /* Pass 3 - Register allocation */ struct regstat current; // Current register allocations/status - current.is32=1; current.dirty=0; current.u=unneeded_reg[0]; - current.uu=unneeded_reg_upper[0]; clear_all_regs(current.regmap); alloc_reg(¤t,0,CCREG); dirty_reg(¤t,CCREG); @@ -8037,16 +7674,15 @@ int new_recompile_block(int addr) if((u_int)addr&1) { // First instruction is delay slot cc=-1; - bt[1]=1; + dops[1].bt=1; ds=1; unneeded_reg[0]=1; - unneeded_reg_upper[0]=1; current.regmap[HOST_BTREG]=BTREG; } for(i=0;i1) - { - if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL - { - if(rs1[i-2]==0||rs2[i-2]==0) - { - if(rs1[i-2]) { - current.is32|=1LL<=0) current.regmap[hr]=-1; - } - if(rs2[i-2]) { - current.is32|=1LL<=0) current.regmap[hr]=-1; - } - } - } - } - current.is32=-1LL; memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap)); regs[i].wasconst=current.isconst; - regs[i].was32=current.is32; regs[i].wasdirty=current.dirty; regs[i].loadedconst=0; - if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) { + if (!dops[i].is_jump) { 
if(i+1>rt1[i])&1) current.uu&=~((1LL<>rt1[i+1])&1) current.uu&=~((1LL<>rt1[i])&1) current.uu&=~((1LL<>r)&1) { regs[i].regmap_entry[hr]=-1; regs[i].regmap[hr]=-1; @@ -8153,16 +7752,6 @@ int new_recompile_block(int addr) //current.regmap[hr]=-1; }else regs[i].regmap_entry[hr]=r; - } - else { - if((current.uu>>(r&63))&1) { - regs[i].regmap_entry[hr]=-1; - regs[i].regmap[hr]=-1; - //Don't clear regs in the delay slot as the branch might need them - //current.regmap[hr]=-1; - }else - regs[i].regmap_entry[hr]=r; - } } } else { // First instruction expects CCREG to be allocated @@ -8174,25 +7763,24 @@ int new_recompile_block(int addr) } } else { // Not delay slot - switch(itype[i]) { + switch(dops[i].itype) { case UJUMP: //current.isconst=0; // DEBUG //current.wasconst=0; // DEBUG //regs[i].wasconst=0; // DEBUG - clear_const(¤t,rt1[i]); + clear_const(¤t,dops[i].rt1); alloc_cc(¤t,i); dirty_reg(¤t,CCREG); - if (rt1[i]==31) { + if (dops[i].rt1==31) { alloc_reg(¤t,i,31); dirty_reg(¤t,31); - //assert(rs1[i+1]!=31&&rs2[i+1]!=31); - //assert(rt1[i+1]!=rt1[i]); + //assert(dops[i+1].rs1!=31&&dops[i+1].rs2!=31); + //assert(dops[i+1].rt1!=dops[i].rt1); #ifdef REG_PREFETCH alloc_reg(¤t,i,PTEMP); #endif - //current.is32|=1LL<>rs1[i])&(current.is32>>rs2[i])&1)) - { - if(rs1[i]) alloc_reg64(¤t,i,rs1[i]); - if(rs2[i]) alloc_reg64(¤t,i,rs2[i]); - } - if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))|| - (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) { + if(dops[i].rs1) alloc_reg(¤t,i,dops[i].rs1); + if(dops[i].rs2) alloc_reg(¤t,i,dops[i].rs2); + if((dops[i].rs1&&(dops[i].rs1==dops[i+1].rt1||dops[i].rs1==dops[i+1].rt2))|| + (dops[i].rs2&&(dops[i].rs2==dops[i+1].rt1||dops[i].rs2==dops[i+1].rt2))) { // The delay slot overwrites one of our conditions. // Allocate the branch condition registers instead. 
current.isconst=0; current.wasconst=0; regs[i].wasconst=0; - if(rs1[i]) alloc_reg(¤t,i,rs1[i]); - if(rs2[i]) alloc_reg(¤t,i,rs2[i]); - if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1)) - { - if(rs1[i]) alloc_reg64(¤t,i,rs1[i]); - if(rs2[i]) alloc_reg64(¤t,i,rs2[i]); - } + if(dops[i].rs1) alloc_reg(¤t,i,dops[i].rs1); + if(dops[i].rs2) alloc_reg(¤t,i,dops[i].rs2); } else { - ooo[i]=1; + dops[i].ooo=1; delayslot_alloc(¤t,i+1); } } else - if((opcode[i]&0x3E)==6) // BLEZ/BGTZ + if((dops[i].opcode&0x3E)==6) // BLEZ/BGTZ { alloc_cc(¤t,i); dirty_reg(¤t,CCREG); - alloc_reg(¤t,i,rs1[i]); - if(!(current.is32>>rs1[i]&1)) - { - alloc_reg64(¤t,i,rs1[i]); - } - if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) { + alloc_reg(¤t,i,dops[i].rs1); + if(dops[i].rs1&&(dops[i].rs1==dops[i+1].rt1||dops[i].rs1==dops[i+1].rt2)) { // The delay slot overwrites one of our conditions. // Allocate the branch condition registers instead. current.isconst=0; current.wasconst=0; regs[i].wasconst=0; - if(rs1[i]) alloc_reg(¤t,i,rs1[i]); - if(!((current.is32>>rs1[i])&1)) - { - if(rs1[i]) alloc_reg64(¤t,i,rs1[i]); - } + if(dops[i].rs1) alloc_reg(¤t,i,dops[i].rs1); } else { - ooo[i]=1; + dops[i].ooo=1; delayslot_alloc(¤t,i+1); } } else // Don't alloc the delay slot yet because we might not execute it - if((opcode[i]&0x3E)==0x14) // BEQL/BNEL + if((dops[i].opcode&0x3E)==0x14) // BEQL/BNEL { current.isconst=0; current.wasconst=0; regs[i].wasconst=0; alloc_cc(¤t,i); dirty_reg(¤t,CCREG); - alloc_reg(¤t,i,rs1[i]); - alloc_reg(¤t,i,rs2[i]); - if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1)) - { - alloc_reg64(¤t,i,rs1[i]); - alloc_reg64(¤t,i,rs2[i]); - } + alloc_reg(¤t,i,dops[i].rs1); + alloc_reg(¤t,i,dops[i].rs2); } else - if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL + if((dops[i].opcode&0x3E)==0x16) // BLEZL/BGTZL { current.isconst=0; current.wasconst=0; regs[i].wasconst=0; alloc_cc(¤t,i); dirty_reg(¤t,CCREG); - alloc_reg(¤t,i,rs1[i]); - if(!(current.is32>>rs1[i]&1)) - { - alloc_reg64(¤t,i,rs1[i]); - } 
+ alloc_reg(¤t,i,dops[i].rs1); } ds=1; //current.isconst=0; @@ -8344,103 +7903,49 @@ int new_recompile_block(int addr) //current.isconst=0; //current.wasconst=0; //regs[i].wasconst=0; - clear_const(¤t,rs1[i]); - clear_const(¤t,rt1[i]); - //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ - if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ + clear_const(¤t,dops[i].rs1); + clear_const(¤t,dops[i].rt1); + //if((dops[i].opcode2&0x1E)==0x0) // BLTZ/BGEZ + if((dops[i].opcode2&0x0E)==0x0) // BLTZ/BGEZ { alloc_cc(¤t,i); dirty_reg(¤t,CCREG); - alloc_reg(¤t,i,rs1[i]); - if(!(current.is32>>rs1[i]&1)) - { - alloc_reg64(¤t,i,rs1[i]); - } - if (rt1[i]==31) { // BLTZAL/BGEZAL + alloc_reg(¤t,i,dops[i].rs1); + if (dops[i].rt1==31) { // BLTZAL/BGEZAL alloc_reg(¤t,i,31); dirty_reg(¤t,31); //#ifdef REG_PREFETCH //alloc_reg(¤t,i,PTEMP); //#endif - //current.is32|=1LL<>rs1[i])&1)) - { - if(rs1[i]) alloc_reg64(¤t,i,rs1[i]); - } + if(dops[i].rs1) alloc_reg(¤t,i,dops[i].rs1); } else { - ooo[i]=1; + dops[i].ooo=1; delayslot_alloc(¤t,i+1); } } else // Don't alloc the delay slot yet because we might not execute it - if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL + if((dops[i].opcode2&0x1E)==0x2) // BLTZL/BGEZL { current.isconst=0; current.wasconst=0; regs[i].wasconst=0; alloc_cc(¤t,i); dirty_reg(¤t,CCREG); - alloc_reg(¤t,i,rs1[i]); - if(!(current.is32>>rs1[i]&1)) - { - alloc_reg64(¤t,i,rs1[i]); - } + alloc_reg(¤t,i,dops[i].rs1); } ds=1; //current.isconst=0; break; - case FJUMP: - current.isconst=0; - current.wasconst=0; - regs[i].wasconst=0; - if(likely[i]==0) // BC1F/BC1T - { - // TODO: Theoretically we can run out of registers here on x86. - // The delay slot can allocate up to six, and we need to check - // CSREG before executing the delay slot. Possibly we can drop - // the cycle count and then reload it after checking that the - // FPU is in a usable state, or don't do out-of-order execution. 
- alloc_cc(¤t,i); - dirty_reg(¤t,CCREG); - alloc_reg(¤t,i,FSREG); - alloc_reg(¤t,i,CSREG); - if(itype[i+1]==FCOMP) { - // The delay slot overwrites the branch condition. - // Allocate the branch condition registers instead. - alloc_cc(¤t,i); - dirty_reg(¤t,CCREG); - alloc_reg(¤t,i,CSREG); - alloc_reg(¤t,i,FSREG); - } - else { - ooo[i]=1; - delayslot_alloc(¤t,i+1); - alloc_reg(¤t,i+1,CSREG); - } - } - else - // Don't alloc the delay slot yet because we might not execute it - if(likely[i]) // BC1FL/BC1TL - { - alloc_cc(¤t,i); - dirty_reg(¤t,CCREG); - alloc_reg(¤t,i,CSREG); - alloc_reg(¤t,i,FSREG); - } - ds=1; - current.isconst=0; - break; case IMM16: imm16_alloc(¤t,i); break; @@ -8471,8 +7976,9 @@ int new_recompile_block(int addr) cop0_alloc(¤t,i); break; case COP1: + break; case COP2: - cop1_alloc(¤t,i); + cop2_alloc(¤t,i); break; case C1LS: c1ls_alloc(¤t,i); @@ -8483,15 +7989,6 @@ int new_recompile_block(int addr) case C2OP: c2op_alloc(¤t,i); break; - case FCONV: - fconv_alloc(¤t,i); - break; - case FLOAT: - float_alloc(¤t,i); - break; - case FCOMP: - fcomp_alloc(¤t,i); - break; case SYSCALL: case HLECALL: case INTCALL: @@ -8502,20 +7999,6 @@ int new_recompile_block(int addr) break; } - // Drop the upper half of registers that have become 32-bit - current.uu|=current.is32&((1LL<>rt1[i])&1) current.uu&=~((1LL<>rt1[i+1])&1) current.uu&=~((1LL<>r)&1) { regs[i].regmap_entry[hr]=-1; //regs[i].regmap[hr]=-1; @@ -8551,14 +8035,6 @@ int new_recompile_block(int addr) }else regs[i].regmap_entry[hr]=r; } - else { - if((current.uu>>(r&63))&1) { - regs[i].regmap_entry[hr]=-1; - //regs[i].regmap[hr]=-1; - current.regmap[hr]=-1; - }else - regs[i].regmap_entry[hr]=r; - } } } else { // Branches expect CCREG to be allocated at the target @@ -8571,137 +8047,111 @@ int new_recompile_block(int addr) memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap)); } - if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800) - 
current.waswritten|=1<=0x800) - current.waswritten&=~(1<0&&(dops[i-1].itype==STORE||dops[i-1].itype==STORELR||(dops[i-1].itype==C2LS&&dops[i-1].opcode==0x3a))&&(u_int)imm[i-1]<0x800) + current.waswritten|=1<=0x800) + current.waswritten&=~(1<0) { - current.was32=current.is32; current.wasdirty=current.dirty; - switch(itype[i-1]) { + switch(dops[i-1].itype) { case UJUMP: memcpy(&branch_regs[i-1],¤t,sizeof(current)); branch_regs[i-1].isconst=0; branch_regs[i-1].wasconst=0; - branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<>rt1[i])&1) current.uu&=~((1LL<>rs1[i-1])&(current.is32>>rs2[i-1])&1)) - { - if(rs1[i-1]) alloc_reg64(¤t,i-1,rs1[i-1]); - if(rs2[i-1]) alloc_reg64(¤t,i-1,rs2[i-1]); - } + if(dops[i-1].rs1) alloc_reg(¤t,i-1,dops[i-1].rs1); + if(dops[i-1].rs2) alloc_reg(¤t,i-1,dops[i-1].rs2); } memcpy(&branch_regs[i-1],¤t,sizeof(current)); branch_regs[i-1].isconst=0; branch_regs[i-1].wasconst=0; memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(constmap[i])); } else - if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ + if((dops[i-1].opcode&0x3E)==6) // BLEZ/BGTZ { alloc_cc(¤t,i-1); dirty_reg(¤t,CCREG); - if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) { + if(dops[i-1].rs1==dops[i].rt1||dops[i-1].rs1==dops[i].rt2) { // The delay slot overwrote the branch condition // Delay slot goes after the test (in order) - current.u=branch_unneeded_reg[i-1]&~((1LL<>rt1[i])&1) current.uu&=~((1LL<>rs1[i-1]&1)) - { - alloc_reg64(¤t,i-1,rs1[i-1]); - } + alloc_reg(¤t,i-1,dops[i-1].rs1); } memcpy(&branch_regs[i-1],¤t,sizeof(current)); branch_regs[i-1].isconst=0; branch_regs[i-1].wasconst=0; memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(constmap[i])); } else // Alloc the delay slot in case the branch is taken - if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL + 
if((dops[i-1].opcode&0x3E)==0x14) // BEQL/BNEL { memcpy(&branch_regs[i-1],¤t,sizeof(current)); - branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<>rt1[i])&1) current.uu&=~((1LL<>rs1[i-1]&1)) - { - alloc_reg64(¤t,i-1,rs1[i-1]); - } + alloc_reg(¤t,i-1,dops[i-1].rs1); } memcpy(&branch_regs[i-1],¤t,sizeof(current)); branch_regs[i-1].isconst=0; branch_regs[i-1].wasconst=0; memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(constmap[i])); } else // Alloc the delay slot in case the branch is taken - if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL + if((dops[i-1].opcode2&0x1E)==2) // BLTZL/BGEZL { memcpy(&branch_regs[i-1],¤t,sizeof(current)); - branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<>16)==0x1000) + if (dops[i-1].is_ujump) { - if(rt1[i-1]==31) // JAL/JALR + if(dops[i-1].rt1==31) // JAL/JALR { // Subroutine call will return here, don't alloc any registers - current.is32=1; current.dirty=0; clear_all_regs(current.regmap); alloc_reg(¤t,i,CCREG); @@ -8836,7 +8235,6 @@ int new_recompile_block(int addr) else if(i+10&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL)) + ccadj[i] = CLOCK_ADJUST(cc); + if (i > 0 && (dops[i-1].is_jump || dops[i].itype == SYSCALL || dops[i].itype == HLECALL)) { cc=0; } #if !defined(DRC_DBG) - else if(itype[i]==C2OP&>e_cycletab[source[i]&0x3f]>2) - { - // GTE runs in parallel until accessed, divide by 2 for a rough guess - cc+=gte_cycletab[source[i]&0x3f]/2; - } - else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues + else if(dops[i].itype==C2OP&>e_cycletab[source[i]&0x3f]>2) { - cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER) + // 
this should really be removed since the real stalls have been implemented, + // but doing so causes sizeable perf regression against the older version + u_int gtec = gte_cycletab[source[i] & 0x3f]; + cc += HACK_ENABLED(NDHACK_NO_STALLS) ? gtec/2 : 2; } - else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i]) + else if(i>1&&dops[i].itype==STORE&&dops[i-1].itype==STORE&&dops[i-2].itype==STORE&&!dops[i].bt) { cc+=4; } - else if(itype[i]==C2LS) + else if(dops[i].itype==C2LS) { - cc+=4; + // same as with C2OP + cc += HACK_ENABLED(NDHACK_NO_STALLS) ? 4 : 2; } #endif else @@ -8896,12 +8291,10 @@ int new_recompile_block(int addr) cc++; } - flush_dirty_uppers(¤t); - if(!is_ds[i]) { - regs[i].is32=current.is32; + if(!dops[i].is_ds) { regs[i].dirty=current.dirty; regs[i].isconst=current.isconst; - memcpy(constmap[i],current_constmap,sizeof(current_constmap)); + memcpy(constmap[i],current_constmap,sizeof(constmap[i])); } for(hr=0;hr=0) { @@ -8921,7 +8314,7 @@ int new_recompile_block(int addr) for (i=slen-1;i>=0;i--) { int hr; - if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + if(dops[i].is_jump) { if(ba[i]=(start+slen*4)) { @@ -8942,7 +8335,7 @@ int new_recompile_block(int addr) } } // Conditional branch may need registers for following instructions - if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) + if (!dops[i].is_ujump) { if(i>dep1[i+1])&1)) { - if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<>dep2[i+1])&1)) { - if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<>dep1[i])&1)) { - if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<>dep2[i])&1)) { - if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) { - if((regmap_pre[i][hr]>0&®map_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) || - (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) { - if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<0&&!dops[i].bt&&((regs[i].wasdirty>>hr)&1)) { + 
if((regmap_pre[i][hr]>0&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1))) { + if(dops[i-1].rt1==(regmap_pre[i][hr]&63)) nr|=1<0&®s[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) || - (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) { - if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<0&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) { + if(dops[i-1].rt1==(regs[i].regmap_entry[hr]&63)) nr|=1<>hr)&1)) { if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1; - if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] && - (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] && - (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG) - { - if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) - { - if(likely[i]) { - regs[i].regmap[hr]=-1; - regs[i].isconst&=~(1<=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0) - { - d1=dep1[i+1]; - d2=dep2[i+1]; - } - if(itype[i+1]==STORE || itype[i+1]==STORELR || - (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2 - map=INVCP; - } - if(itype[i+1]==LOADLR || itype[i+1]==STORELR || - itype[i+1]==C1LS || itype[i+1]==C2LS) - temp=FTEMP; - if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] && - (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] && - (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] && - (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] && - (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 && - regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] && + int map1 = 0, map2 = 0, temp = 0; // or -1 ?? 
+ if (dops[i+1].is_load || dops[i+1].is_store) + map1 = ROREG; + if (dops[i+1].is_store) + map2 = INVCP; + if(dops[i+1].itype==LOADLR || dops[i+1].itype==STORELR || dops[i+1].itype==C2LS) + temp = FTEMP; + if((regs[i].regmap[hr]&63)!=dops[i].rs1 && (regs[i].regmap[hr]&63)!=dops[i].rs2 && + (regs[i].regmap[hr]&63)!=dops[i].rt1 && (regs[i].regmap[hr]&63)!=dops[i].rt2 && + (regs[i].regmap[hr]&63)!=dops[i+1].rt1 && (regs[i].regmap[hr]&63)!=dops[i+1].rt2 && + regs[i].regmap[hr]!=dops[i+1].rs1 && regs[i].regmap[hr]!=dops[i+1].rs2 && (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP && regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL && regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG && - regs[i].regmap[hr]!=map ) + regs[i].regmap[hr]!=map1 && regs[i].regmap[hr]!=map2) { regs[i].regmap[hr]=-1; regs[i].isconst&=~(1<>16)!=0x1000) + if (!dops[i].is_ujump) { - if(!likely[i]&&i0) { - int d1=0,d2=0,map=-1,temp=-1; - if(get_reg(regs[i].regmap,rt1[i]|64)>=0) + int map1 = -1, map2 = -1, temp=-1; + if (dops[i].is_load || dops[i].is_store) + map1 = ROREG; + if (dops[i].is_store) + map2 = INVCP; + if (dops[i].itype==LOADLR || dops[i].itype==STORELR || dops[i].itype==C2LS) + temp = FTEMP; + if((regs[i].regmap[hr]&63)!=dops[i].rt1 && (regs[i].regmap[hr]&63)!=dops[i].rt2 && + regs[i].regmap[hr]!=dops[i].rs1 && regs[i].regmap[hr]!=dops[i].rs2 && + (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map1 && regs[i].regmap[hr]!=map2 && + //(dops[i].itype!=SPAN||regs[i].regmap[hr]!=CCREG) + regs[i].regmap[hr] != CCREG) { - d1=dep1[i]; - d2=dep2[i]; - } - if(itype[i]==STORE || itype[i]==STORELR || - (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2 - map=INVCP; - } - if(itype[i]==LOADLR || itype[i]==STORELR || - itype[i]==C1LS || itype[i]==C2LS) - temp=FTEMP; - if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] && - (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] && - (regs[i].regmap[hr]^64)!=d1 && 
(regs[i].regmap[hr]^64)!=d2 && - regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] && - (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map && - (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG)) - { - if(i0) if(regmap_pre[i+1][hr]!=regs[i].regmap[hr]) - if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1)) { SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]); assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]); @@ -9174,8 +8512,8 @@ int new_recompile_block(int addr) } } } - } - } + } // if needed + } // for hr } /* Pass 5 - Pre-allocate registers */ @@ -9188,27 +8526,21 @@ int new_recompile_block(int addr) clear_all_regs(f_regmap); for(i=0;i=start && ba[i]<(start+i*4)) - if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU - ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD - ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS - ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT - ||itype[i+1]==FCOMP||itype[i+1]==FCONV - ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP) + if(dops[i+1].itype==NOP||dops[i+1].itype==MOV||dops[i+1].itype==ALU + ||dops[i+1].itype==SHIFTIMM||dops[i+1].itype==IMM16||dops[i+1].itype==LOAD + ||dops[i+1].itype==STORE||dops[i+1].itype==STORELR||dops[i+1].itype==C1LS + ||dops[i+1].itype==SHIFT||dops[i+1].itype==COP1 + ||dops[i+1].itype==COP2||dops[i+1].itype==C2LS||dops[i+1].itype==C2OP) { int t=(ba[i]-start)>>2; - if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots - if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated + if(t > 0 && !dops[t-1].is_jump) // loop_preload can't handle jumps into delay slots + if(t<2||(dops[t-2].itype!=UJUMP&&dops[t-2].itype!=RJUMP)||dops[t-2].rt1!=31) // call/ret assumes no registers allocated for(hr=0;hr64) { - if(!((regs[i].dirty>>hr)&1)) - 
f_regmap[hr]=regs[i].regmap[hr]; - else f_regmap[hr]=-1; - } - else if(regs[i].regmap[hr]>=0) { + if(regs[i].regmap[hr]>=0) { if(f_regmap[hr]!=regs[i].regmap[hr]) { // dealloc old register int n; @@ -9220,12 +8552,7 @@ int new_recompile_block(int addr) f_regmap[hr]=regs[i].regmap[hr]; } } - if(branch_regs[i].regmap[hr]>64) { - if(!((branch_regs[i].dirty>>hr)&1)) - f_regmap[hr]=branch_regs[i].regmap[hr]; - else f_regmap[hr]=-1; - } - else if(branch_regs[i].regmap[hr]>=0) { + if(branch_regs[i].regmap[hr]>=0) { if(f_regmap[hr]!=branch_regs[i].regmap[hr]) { // dealloc old register int n; @@ -9237,7 +8564,7 @@ int new_recompile_block(int addr) f_regmap[hr]=branch_regs[i].regmap[hr]; } } - if(ooo[i]) { + if(dops[i].ooo) { if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) f_regmap[hr]=branch_regs[i].regmap[hr]; }else{ @@ -9262,15 +8589,7 @@ int new_recompile_block(int addr) { //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r); if(r<34&&((unneeded_reg[j]>>r)&1)) break; - if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break; - if(r>63) { - // NB This can exclude the case where the upper-half - // register is lower numbered than the lower-half - // register. Not sure if it's worth fixing... 
- if(get_reg(regs[j].regmap,r&63)<0) break; - if(get_reg(regs[j].regmap_entry,r&63)<0) break; - if(regs[j].is32&(1LL<<(r&63))) break; - } + assert(r < 64); if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63) %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r); int k; @@ -9290,30 +8609,17 @@ int new_recompile_block(int addr) //printf("no-match due to different register\n"); break; } - if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) { + if (dops[k-2].is_jump) { //printf("no-match due to branch\n"); break; } // call/ret fast path assumes no registers allocated - if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) { + if(k>2&&(dops[k-3].itype==UJUMP||dops[k-3].itype==RJUMP)&&dops[k-3].rt1==31) { break; } - if(r>63) { - // NB This can exclude the case where the upper-half - // register is lower numbered than the lower-half - // register. Not sure if it's worth fixing... - if(get_reg(regs[k-1].regmap,r&63)<0) break; - if(regs[k-1].is32&(1LL<<(r&63))) break; - } + assert(r < 64); k--; } - if(i\n",hr,start+k*4); while(k>16)!=0x1000) { + if (!dops[i].is_ujump) { regmap_pre[i+2][hr]=f_regmap[hr]; regs[i+2].wasdirty&=~(1<>16)!=0x1000) { + if (!dops[k].is_ujump) { regmap_pre[k+2][hr]=f_regmap[hr]; regs[k+2].wasdirty&=~(1<>16)==0x1000) + if (dops[j].is_ujump) { // Stop on unconditional branch break; } - if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) + if(dops[j].itype==CJUMP||dops[j].itype==SJUMP) { - if(ooo[j]) { + if(dops[j].ooo) { if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break; }else{ @@ -9426,17 +8724,7 @@ int new_recompile_block(int addr) //printf("No free regs for store %x\n",start+j*4); break; } - if(f_regmap[hr]>=64) { - if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) { - break; - } - else - { - if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) { - break; - } - } - } + assert(f_regmap[hr]<64); } } } @@ -9447,11 +8735,7 @@ int new_recompile_block(int addr) for(hr=0;hr64) { - 
if(!((regs[i].dirty>>hr)&1)) - f_regmap[hr]=regs[i].regmap[hr]; - } - else if(regs[i].regmap[hr]>=0) { + if(regs[i].regmap[hr]>=0) { if(f_regmap[hr]!=regs[i].regmap[hr]) { // dealloc old register int n; @@ -9466,7 +8750,7 @@ int new_recompile_block(int addr) } } // Try to restore cycle count at branch targets - if(bt[i]) { + if(dops[i].bt) { for(j=i;j=0) { - score[hr]=0;earliest_available[hr]=i+1; - loop_start[hr]=MAXBLOCK; - } - if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) { - if(branch_regs[i].regmap[hr]>=0) { - score[hr]=0;earliest_available[hr]=i+2; - loop_start[hr]=MAXBLOCK; - } - } - } - // No register allocations after unconditional jumps - if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000) - { - for(hr=0;hr=0) break; - if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) { - if(branch_regs[j].regmap[hr]>=0) break; - if(ooo[j]) { - if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break; - }else{ - if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break; - } - } - else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break; - if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) { - int t=(ba[j]-start)>>2; - if(t=earliest_available[hr]) { - if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated - // Score a point for hoisting loop invariant - if(t>16)==0x1000) - { - // Stop on unconditional branch - break; - } - else - if(itype[j]==LOAD||itype[j]==LOADLR|| - itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) { - score[hr]++; - end[hr]=j; - } - } - } - } - // Find highest score and allocate that register - int maxscore=0; - for(hr=0;hrscore[maxscore]) { - maxscore=hr; - //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4); - } - } - } - if(score[maxscore]>1) - { - if(i=0) {printf("oops: %x %x was 
%d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);} - assert(regs[j].regmap[maxscore]<0); - if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg; - regs[j].regmap[maxscore]=reg; - regs[j].dirty&=~(1<>16)!=0x1000) { - regmap_pre[j+2][maxscore]=reg; - regs[j+2].wasdirty&=~(1<>2; - if(t==loop_start[maxscore]) { - if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated - regs[t].regmap_entry[maxscore]=reg; - } - } - else - { - if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) { - regmap_pre[j+1][maxscore]=reg; - regs[j+1].wasdirty&=~(1<=0) + if(dops[i+1].rs1) { + if((hr=get_reg(regs[i+1].regmap,dops[i+1].rs1))>=0) { if(regs[i].regmap[hr]<0&®s[i+1].regmap_entry[hr]<0) { @@ -9737,8 +8840,8 @@ int new_recompile_block(int addr) } } } - if(rs2[i+1]) { - if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0) + if(dops[i+1].rs2) { + if((hr=get_reg(regs[i+1].regmap,dops[i+1].rs2))>=0) { if(regs[i].regmap[hr]<0&®s[i+1].regmap_entry[hr]<0) { @@ -9754,14 +8857,14 @@ int new_recompile_block(int addr) } } // Preload target address for load instruction (non-constant) - if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) { - if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0) + if(dops[i+1].itype==LOAD&&dops[i+1].rs1&&get_reg(regs[i+1].regmap,dops[i+1].rs1)<0) { + if((hr=get_reg(regs[i+1].regmap,dops[i+1].rt1))>=0) { if(regs[i].regmap[hr]<0&®s[i+1].regmap_entry[hr]<0) { - regs[i].regmap[hr]=rs1[i+1]; - regmap_pre[i+1][hr]=rs1[i+1]; - regs[i+1].regmap_entry[hr]=rs1[i+1]; + regs[i].regmap[hr]=dops[i+1].rs1; + regmap_pre[i+1][hr]=dops[i+1].rs1; + regs[i+1].regmap_entry[hr]=dops[i+1].rs1; regs[i].isconst&=~(1<=0) + if(dops[i+1].lt1&&get_reg(regs[i+1].regmap,dops[i+1].rs1)<0) { + if((hr=get_reg(regs[i+1].regmap,dops[i+1].rt1))>=0) { if(regs[i].regmap[hr]<0&®s[i+1].regmap_entry[hr]<0) { - regs[i].regmap[hr]=rs1[i+1]; - 
regmap_pre[i+1][hr]=rs1[i+1]; - regs[i+1].regmap_entry[hr]=rs1[i+1]; + regs[i].regmap[hr]=dops[i+1].rs1; + regmap_pre[i+1][hr]=dops[i+1].rs1; + regs[i+1].regmap_entry[hr]=dops[i+1].rs1; regs[i].isconst&=~(1<=0); if(regs[i].regmap[hr]<0&®s[i+1].regmap_entry[hr]<0) { - regs[i].regmap[hr]=rs1[i+1]; - regmap_pre[i+1][hr]=rs1[i+1]; - regs[i+1].regmap_entry[hr]=rs1[i+1]; + regs[i].regmap[hr]=dops[i+1].rs1; + regmap_pre[i+1][hr]=dops[i+1].rs1; + regs[i+1].regmap_entry[hr]=dops[i+1].rs1; regs[i].isconst&=~(1<=0); if(regs[i].regmap[hr]<0&®s[i+1].regmap_entry[hr]<0) { - regs[i].regmap[hr]=rs1[i+1]; - regmap_pre[i+1][hr]=rs1[i+1]; - regs[i+1].regmap_entry[hr]=rs1[i+1]; + regs[i].regmap[hr]=dops[i+1].rs1; + regmap_pre[i+1][hr]=dops[i+1].rs1; + regs[i+1].regmap_entry[hr]=dops[i+1].rs1; regs[i].isconst&=~(1<=0&®s[i].regmap[hr]<0) { - int rs=get_reg(regs[i+1].regmap,rs1[i+1]); + int rs=get_reg(regs[i+1].regmap,dops[i+1].rs1); if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) { regs[i].regmap[hr]=AGEN1+((i+1)&1); regmap_pre[i+1][hr]=AGEN1+((i+1)&1); @@ -9875,19 +8978,19 @@ int new_recompile_block(int addr) /* Pass 7 - Identify 32-bit registers */ for (i=slen-1;i>=0;i--) { - if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + if(dops[i].itype==CJUMP||dops[i].itype==SJUMP) { // Conditional branch if((source[i]>>16)!=0x1000&&i>1)&1) printf("ecx "); @@ -9919,7 +9023,6 @@ int new_recompile_block(int addr) if((needed_reg[i]>>6)&1) printf("esi "); if((needed_reg[i]>>7)&1) printf("edi "); printf("\n"); - #if defined(__i386__) || defined(__x86_64__) printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]); printf("dirty: "); if(regs[i].wasdirty&1) printf("eax "); @@ -9978,31 +9081,23 @@ int new_recompile_block(int addr) if(regs[i].isconst) { printf("constants: "); #if defined(__i386__) || defined(__x86_64__) - 
if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]); - if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]); - if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]); - if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]); - if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]); - if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]); - if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]); + if(regs[i].isconst&1) printf("eax=%x ",(u_int)constmap[i][0]); + if((regs[i].isconst>>1)&1) printf("ecx=%x ",(u_int)constmap[i][1]); + if((regs[i].isconst>>2)&1) printf("edx=%x ",(u_int)constmap[i][2]); + if((regs[i].isconst>>3)&1) printf("ebx=%x ",(u_int)constmap[i][3]); + if((regs[i].isconst>>5)&1) printf("ebp=%x ",(u_int)constmap[i][5]); + if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]); + if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]); #endif - #ifdef __arm__ - if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]); - if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]); - if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]); - if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]); - if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]); - if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]); - if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]); - if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]); - if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]); - if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]); - if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]); - if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]); + #if defined(__arm__) || defined(__aarch64__) + int r; + for (r = 0; r < ARRAY_SIZE(constmap[i]); r++) + if ((regs[i].isconst >> r) & 1) + printf(" r%d=%x", r, (u_int)constmap[i][r]); #endif printf("\n"); } - 
if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) { + if(dops[i].is_jump) { #if defined(__i386__) || defined(__x86_64__) printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]); if(branch_regs[i].dirty&1) printf("eax "); @@ -10035,28 +9130,31 @@ int new_recompile_block(int addr) /* Pass 8 - Assembly */ linkcount=0;stubcount=0; ds=0;is_delayslot=0; - cop1_usable=0; - uint64_t is32_pre=0; u_int dirty_pre=0; void *beginning=start_block(); if((u_int)addr&1) { ds=1; pagespan_ds(); } - u_int instr_addr0_override=0; + void *instr_addr0_override = NULL; if (start == 0x80030000) { - // nasty hack for fastbios thing + // nasty hack for the fastbios thing // override block entry to this code - instr_addr0_override=(u_int)out; + instr_addr0_override = out; emit_movimm(start,0); // abuse io address var as a flag that we // have already returned here once - emit_readword((int)&address,1); - emit_writeword(0,(int)&pcaddr); - emit_writeword(0,(int)&address); + emit_readword(&address,1); + emit_writeword(0,&pcaddr); + emit_writeword(0,&address); emit_cmp(0,1); - emit_jne((int)new_dyna_leave); + #ifdef __aarch64__ + emit_jeq(out + 4*2); + emit_far_jump(new_dyna_leave); + #else + emit_jne(new_dyna_leave); + #endif } for(i=0;i>16)!=0x1000)) + if (i < 2 || !dops[i-2].is_ujump) { - wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre, - unneeded_reg[i],unneeded_reg_upper[i]); + wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,unneeded_reg[i]); } - if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) { - is32_pre=branch_regs[i].is32; + if((dops[i].itype==CJUMP||dops[i].itype==SJUMP)) { dirty_pre=branch_regs[i].dirty; }else{ - is32_pre=regs[i].is32; dirty_pre=regs[i].dirty; } #endif // write 
back - if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000)) + if (i < 2 || !dops[i-2].is_ujump) { - wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32, - unneeded_reg[i],unneeded_reg_upper[i]); + wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,unneeded_reg[i]); loop_preload(regmap_pre[i],regs[i].regmap_entry); } // branch target entry point - instr_addr[i]=(u_int)out; + instr_addr[i] = out; assem_debug("<->\n"); + drc_dbg_emit_do_cmp(i, ccadj[i]); + // load regs if(regs[i].regmap_entry[HOST_CCREG]==CCREG&®s[i].regmap[HOST_CCREG]!=CCREG) - wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32); - load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]); + wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty); + load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i].rs1,dops[i].rs2); address_generation(i,®s[i],regs[i].regmap_entry); - load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i); - if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + load_consts(regmap_pre[i],regs[i].regmap,i); + if(dops[i].is_jump) { // Load the delay slot registers if necessary - if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0)) - load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]); - if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0)) - load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]); - if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) - load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP); + if(dops[i+1].rs1!=dops[i].rs1&&dops[i+1].rs1!=dops[i].rs2&&(dops[i+1].rs1!=dops[i].rt1||dops[i].rt1==0)) + load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i+1].rs1,dops[i+1].rs1); + 
if(dops[i+1].rs2!=dops[i+1].rs1&&dops[i+1].rs2!=dops[i].rs1&&dops[i+1].rs2!=dops[i].rs2&&(dops[i+1].rs2!=dops[i].rt1||dops[i].rt1==0)) + load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i+1].rs2,dops[i+1].rs2); + if (ram_offset && (dops[i+1].is_load || dops[i+1].is_store)) + load_regs(regs[i].regmap_entry,regs[i].regmap,ROREG,ROREG); + if (dops[i+1].is_store) + load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP); } else if(i+1>16)==0x1000) + if (!dops[i].is_jump || dops[i].itype == CJUMP) + load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG); + if (ram_offset && (dops[i].is_load || dops[i].is_store)) + load_regs(regs[i].regmap_entry,regs[i].regmap,ROREG,ROREG); + if (dops[i].is_store) + load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP); + + ds = assemble(i, ®s[i], ccadj[i]); + + if (dops[i].is_ujump) literal_pool(1024); else literal_pool_jumpover(256); } } - //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000); + + assert(slen > 0); + if (slen > 0 && dops[slen-1].itype == INTCALL) { + // no ending needed for this block since INTCALL never returns + } // If the block did not end with an unconditional branch, // add a jump to the next instruction. 
- if(i>1) { - if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) { - assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP); + else if (i > 1) { + if (!dops[i-2].is_ujump && dops[i-1].itype != SPAN) { + assert(!dops[i-1].is_jump); assert(i==slen); - if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) { - store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4); + if(dops[i-2].itype!=CJUMP&&dops[i-2].itype!=SJUMP) { + store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4); if(regs[i-1].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); - emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG); - } - else if(!likely[i-2]) - { - store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4); - assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG); + emit_addimm(HOST_CCREG, ccadj[i-1] + CLOCK_ADJUST(1), HOST_CCREG); } else { - store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4); - assert(regs[i-2].regmap[HOST_CCREG]==CCREG); + store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].dirty,start+i*4); + assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG); } - add_to_linker((int)out,start+i*4,0); + add_to_linker(out,start+i*4,0); emit_jmp(0); } } else { assert(i>0); - assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP); - store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4); + assert(!dops[i-1].is_jump); + store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4); if(regs[i-1].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); - emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG); - add_to_linker((int)out,start+i*4,0); + emit_addimm(HOST_CCREG, ccadj[i-1] + CLOCK_ADJUST(1), HOST_CCREG); + add_to_linker(out,start+i*4,0); emit_jmp(0); } @@ -10232,7 +9274,7 @@ int new_recompile_block(int addr) // Stubs for(i=0;i 
%8x\n",link_addr[i][0],link_addr[i][1]); + assem_debug("%p -> %8x\n",link_addr[i].addr,link_addr[i].target); literal_pool(64); - if(!link_addr[i][2]) + if (!link_addr[i].ext) { - void *stub=out; - void *addr=check_addr(link_addr[i][1]); - emit_extjump(link_addr[i][0],link_addr[i][1]); - if(addr) { - set_jump_target(link_addr[i][0],(int)addr); - add_link(link_addr[i][1],stub); + void *stub = out; + void *addr = check_addr(link_addr[i].target); + emit_extjump(link_addr[i].addr, link_addr[i].target); + if (addr) { + set_jump_target(link_addr[i].addr, addr); + add_jump_out(link_addr[i].target,stub); } - else set_jump_target(link_addr[i][0],(int)stub); + else + set_jump_target(link_addr[i].addr, stub); } else { // Internal branch - int target=(link_addr[i][1]-start)>>2; + int target=(link_addr[i].target-start)>>2; assert(target>=0&&target>1); + //set_jump_target_fillslot(link_addr[i].addr,instr_addr[target],link_addr[i].ext>>1); //#else - set_jump_target(link_addr[i][0],instr_addr[target]); + set_jump_target(link_addr[i].addr, instr_addr[target]); //#endif } } + + u_int source_len = slen*4; + if (dops[slen-1].itype == INTCALL && source_len > 4) + // no need to treat the last instruction as compiled + // as interpreter fully handles it + source_len -= 4; + + if ((u_char *)copy + source_len > (u_char *)shadow + sizeof(shadow)) + copy = shadow; + // External Branch Targets (jump_in) - if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow; for(i=0;i>16)^vaddr)&0xFFFF]; - if(ht_bin[0]==vaddr) { - ht_bin[1]=entry_point; - } - if(ht_bin[2]==vaddr) { - ht_bin[3]=entry_point; - } + struct ht_entry *ht_bin = hash_table_get(vaddr); + if (ht_bin->vaddr[0] == vaddr) + ht_bin->tcaddr[0] = entry_point; + if (ht_bin->vaddr[1] == vaddr) + ht_bin->tcaddr[1] = entry_point; } } } @@ -10328,16 +9378,17 @@ int new_recompile_block(int addr) // Align code if(((u_int)out)&7) emit_addnop(13); #endif - assert((u_int)out-(u_int)beginning(u_int)BASE_ADDR+(1< ndrc->translation_cache + 
sizeof(ndrc->translation_cache) - MAX_OUTPUT_BLOCK_SIZE) + out = ndrc->translation_cache; // Trap writes to any of the pages we compiled for(i=start>>12;i<=(start+slen*4)>>12;i++) { @@ -10354,56 +9405,62 @@ int new_recompile_block(int addr) /* Pass 10 - Free memory by expiring oldest blocks */ - int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535; + int end=(((out-ndrc->translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535; while(expirep!=end) { int shift=TARGET_SIZE_2-3; // Divide into 8 blocks - int base=(int)BASE_ADDR+((expirep>>13)<> 13) << shift); // Base offset of this block + uintptr_t base_offs_s = base_offs >> shift; inv_debug("EXP: Phase %d\n",expirep); switch((expirep>>11)&3) { case 0: // Clear jump_in and jump_dirty - ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift); - ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift); - ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift); - ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift); + ll_remove_matching_addrs(jump_in+(expirep&2047),base_offs_s,shift); + ll_remove_matching_addrs(jump_dirty+(expirep&2047),base_offs_s,shift); + ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base_offs_s,shift); + ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base_offs_s,shift); break; case 1: // Clear pointers - ll_kill_pointers(jump_out[expirep&2047],base,shift); - ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift); + ll_kill_pointers(jump_out[expirep&2047],base_offs_s,shift); + ll_kill_pointers(jump_out[(expirep&2047)+2048],base_offs_s,shift); break; case 2: // Clear hash table for(i=0;i<32;i++) { - u_int *ht_bin=hash_table[((expirep&2047)<<5)+i]; - if((ht_bin[3]>>shift)==(base>>shift) || - ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) { - inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]); - ht_bin[2]=ht_bin[3]=-1; - } - if((ht_bin[1]>>shift)==(base>>shift) || - 
((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) { - inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]); - ht_bin[0]=ht_bin[2]; - ht_bin[1]=ht_bin[3]; - ht_bin[2]=ht_bin[3]=-1; + struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i]; + uintptr_t o1 = (u_char *)ht_bin->tcaddr[1] - ndrc->translation_cache; + uintptr_t o2 = o1 - MAX_OUTPUT_BLOCK_SIZE; + if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s) { + inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]); + ht_bin->vaddr[1] = -1; + ht_bin->tcaddr[1] = NULL; + } + o1 = (u_char *)ht_bin->tcaddr[0] - ndrc->translation_cache; + o2 = o1 - MAX_OUTPUT_BLOCK_SIZE; + if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s) { + inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]); + ht_bin->vaddr[0] = ht_bin->vaddr[1]; + ht_bin->tcaddr[0] = ht_bin->tcaddr[1]; + ht_bin->vaddr[1] = -1; + ht_bin->tcaddr[1] = NULL; } } break; case 3: // Clear jump_out - #ifdef __arm__ if((expirep&2047)==0) do_clear_cache(); - #endif - ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift); - ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift); + ll_remove_matching_addrs(jump_out+(expirep&2047),base_offs_s,shift); + ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base_offs_s,shift); break; } expirep=(expirep+1)&65535; } +#ifdef ASSEM_PRINT + fflush(stdout); +#endif return 0; } diff --git a/libpcsxcore/new_dynarec/new_dynarec.h b/libpcsxcore/new_dynarec/new_dynarec.h index 8c890518..8991faca 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.h +++ b/libpcsxcore/new_dynarec/new_dynarec.h @@ -1,27 +1,31 @@ -#ifndef __NEW_DYNAREC_H__ -#define __NEW_DYNAREC_H__ - -/* #define NEW_DYNAREC 1 */ +#define NEW_DYNAREC 1 extern int pcaddr; extern int pending_exception; extern int stop; extern int new_dynarec_did_compile; + +#define CYCLE_MULT_DEFAULT 175 extern int cycle_multiplier; // 100 for 1.0 +extern int cycle_multiplier_override; 
+extern int cycle_multiplier_old; #define NDHACK_NO_SMC_CHECK (1<<0) #define NDHACK_GTE_UNNEEDED (1<<1) #define NDHACK_GTE_NO_FLAGS (1<<2) +#define NDHACK_OVERRIDE_CYCLE_M (1<<3) +#define NDHACK_NO_STALLS (1<<4) +#define NDHACK_NO_COMPAT_HACKS (1<<5) extern int new_dynarec_hacks; +extern int new_dynarec_hacks_pergame; +extern int new_dynarec_hacks_old; void new_dynarec_init(void); void new_dynarec_cleanup(void); void new_dynarec_clear_full(void); -void new_dyna_start(void); +void new_dyna_start(void *context); int new_dynarec_save_blocks(void *save, int size); void new_dynarec_load_blocks(const void *save, int size); void invalidate_all_pages(void); void invalidate_block(unsigned int block); - -#endif /* __NEW_DYNAREC_H__ */ diff --git a/libpcsxcore/new_dynarec/new_dynarec_config.h b/libpcsxcore/new_dynarec/new_dynarec_config.h index 3b00780e..321bfbf3 100644 --- a/libpcsxcore/new_dynarec/new_dynarec_config.h +++ b/libpcsxcore/new_dynarec/new_dynarec_config.h @@ -1,15 +1,14 @@ -#ifndef __NEW_DYNAREC_CONFIG_H__ -#define __NEW_DYNAREC_CONFIG_H__ +#ifdef __arm__ #define CORTEX_A8_BRANCH_PREDICTION_HACK 1 +#endif + #define USE_MINI_HT 1 //#define REG_PREFETCH 1 -#if defined(__MACH__) +#if defined(__MACH__) || defined(VITA) #define NO_WRITE_EXEC 1 #endif #ifdef VITA #define BASE_ADDR_DYNAMIC 1 #endif - -#endif /* __NEW_DYNAREC_CONFIG_H__ */ diff --git a/libpcsxcore/new_dynarec/patches/trace_drc_chk b/libpcsxcore/new_dynarec/patches/trace_drc_chk new file mode 100644 index 00000000..e98a48e7 --- /dev/null +++ b/libpcsxcore/new_dynarec/patches/trace_drc_chk @@ -0,0 +1,133 @@ +diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c +index f1005db..ebd1d4f 100644 +--- a/libpcsxcore/new_dynarec/new_dynarec.c ++++ b/libpcsxcore/new_dynarec/new_dynarec.c +@@ -235,7 +235,7 @@ static struct decoded_insn + int new_dynarec_hacks_old; + int new_dynarec_did_compile; + +- #define HACK_ENABLED(x) ((new_dynarec_hacks | new_dynarec_hacks_pergame) & 
(x)) ++ #define HACK_ENABLED(x) ((NDHACK_NO_STALLS) & (x)) + + extern int cycle_count; // ... until end of the timeslice, counts -N -> 0 + extern int last_count; // last absolute target, often = next_interupt +@@ -471,6 +471,7 @@ int cycle_multiplier_old; + + static int CLOCK_ADJUST(int x) + { ++ return x * 2; + int m = cycle_multiplier_override && cycle_multiplier == CYCLE_MULT_DEFAULT + ? cycle_multiplier_override : cycle_multiplier; + int s=(x>>31)|1; +@@ -522,6 +523,9 @@ static int doesnt_expire_soon(void *tcaddr) + // This is called from the recompiled JR/JALR instructions + void noinline *get_addr(u_int vaddr) + { ++#ifdef DRC_DBG ++printf("get_addr %08x, pc=%08x\n", vaddr, psxRegs.pc); ++#endif + u_int page=get_page(vaddr); + u_int vpage=get_vpage(vaddr); + struct ll_entry *head; +@@ -6248,7 +6252,7 @@ void unneeded_registers(int istart,int iend,int r) + // R0 is always unneeded + u|=1; + // Save it +- unneeded_reg[i]=u; ++ unneeded_reg[i]=1;//u; + gte_unneeded[i]=gte_u; + /* + printf("ur (%d,%d) %x: ",istart,iend,start+i*4); +@@ -8794,6 +8798,7 @@ int new_recompile_block(u_int addr) + + // This allocates registers (if possible) one instruction prior + // to use, which can avoid a load-use penalty on certain CPUs. 
++#if 0 + for(i=0;i> 26; + switch (tmp) { +@@ -499,13 +501,15 @@ static void doBranch(u32 tar) { + } + break; + } +- ++#endif + psxBSC[psxRegs.code >> 26](); + + branch = 0; + psxRegs.pc = branchPC; + ++ psxRegs.cycle += BIAS; + psxBranchTest(); ++ psxRegs.cycle -= BIAS; + } + + /********************************************************* +@@ -615,12 +619,13 @@ void psxMULTU_stall() { + psxMULTU(); + } + ++#define doBranchNotTaken() do { psxRegs.cycle += BIAS; execI(); psxBranchTest(); psxRegs.cycle -= BIAS; } while(0) + /********************************************************* + * Register branch logic * + * Format: OP rs, offset * + *********************************************************/ +-#define RepZBranchi32(op) if(_i32(_rRs_) op 0) doBranch(_BranchTarget_); +-#define RepZBranchLinki32(op) { _SetLink(31); if(_i32(_rRs_) op 0) { doBranch(_BranchTarget_); } } ++#define RepZBranchi32(op) if(_i32(_rRs_) op 0) doBranch(_BranchTarget_); else doBranchNotTaken(); ++#define RepZBranchLinki32(op) { _SetLink(31); if(_i32(_rRs_) op 0) { doBranch(_BranchTarget_); } else doBranchNotTaken(); } + + void psxBGEZ() { RepZBranchi32(>=) } // Branch if Rs >= 0 + void psxBGEZAL() { RepZBranchLinki32(>=) } // Branch if Rs >= 0 and link +@@ -702,7 +707,7 @@ void psxRFE() { + * Register branch logic * + * Format: OP rs, rt, offset * + *********************************************************/ +-#define RepBranchi32(op) if(_i32(_rRs_) op _i32(_rRt_)) doBranch(_BranchTarget_); ++#define RepBranchi32(op) if(_i32(_rRs_) op _i32(_rRt_)) doBranch(_BranchTarget_); else doBranchNotTaken(); + + void psxBEQ() { RepBranchi32(==) } // Branch if Rs == Rt + void psxBNE() { RepBranchi32(!=) } // Branch if Rs != Rt +@@ -886,6 +891,7 @@ void MTC0(int reg, u32 val) { + case 12: // Status + psxRegs.CP0.r[12] = val; + psxTestSWInts(); ++ //psxBranchTest(); + break; + + case 13: // Cause +@@ -1027,6 +1033,23 @@ void intExecuteBlock() { + while (!branch2) execI(); + } + ++extern void do_insn_trace(void); 
++ ++void intExecuteT() { ++ for (;;) { ++ do_insn_trace(); ++ execI(); ++ } ++} ++ ++void intExecuteBlockT() { ++ branch2 = 0; ++ while (!branch2) { ++ do_insn_trace(); ++ execI(); ++ } ++} ++ + static void intClear(u32 Addr, u32 Size) { + } + +@@ -1049,7 +1072,7 @@ void intApplyConfig() { + assert(psxSPC[26] == psxDIV || psxSPC[26] == psxDIV_stall); + assert(psxSPC[27] == psxDIVU || psxSPC[27] == psxDIVU_stall); + +- if (Config.DisableStalls) { ++ if (1) { + psxBSC[18] = psxCOP2; + psxBSC[50] = gteLWC2; + psxBSC[58] = gteSWC2; +@@ -1091,9 +1114,10 @@ void execI() { + if (Config.Debug) ProcessDebug(); + + psxRegs.pc += 4; +- psxRegs.cycle += BIAS; + + psxBSC[psxRegs.code >> 26](); ++ ++ psxRegs.cycle += BIAS; + } + + R3000Acpu psxInt = { +diff --git a/libpcsxcore/psxmem.c b/libpcsxcore/psxmem.c +index 04aeec2..710a379 100644 +--- a/libpcsxcore/psxmem.c ++++ b/libpcsxcore/psxmem.c +@@ -217,11 +217,13 @@ void psxMemShutdown() { + } + + static int writeok = 1; ++extern u32 last_io_addr; + + u8 psxMemRead8(u32 mem) { + char *p; + u32 t; + ++ last_io_addr = mem; + t = mem >> 16; + if (t == 0x1f80 || t == 0x9f80 || t == 0xbf80) { + if ((mem & 0xffff) < 0x400) +@@ -247,6 +249,7 @@ u16 psxMemRead16(u32 mem) { + char *p; + u32 t; + ++ last_io_addr = mem; + t = mem >> 16; + if (t == 0x1f80 || t == 0x9f80 || t == 0xbf80) { + if ((mem & 0xffff) < 0x400) +@@ -272,6 +275,7 @@ u32 psxMemRead32(u32 mem) { + char *p; + u32 t; + ++ last_io_addr = mem; + t = mem >> 16; + if (t == 0x1f80 || t == 0x9f80 || t == 0xbf80) { + if ((mem & 0xffff) < 0x400) +@@ -297,6 +301,7 @@ void psxMemWrite8(u32 mem, u8 value) { + char *p; + u32 t; + ++ last_io_addr = mem; + t = mem >> 16; + if (t == 0x1f80 || t == 0x9f80 || t == 0xbf80) { + if ((mem & 0xffff) < 0x400) +@@ -324,6 +329,7 @@ void psxMemWrite16(u32 mem, u16 value) { + char *p; + u32 t; + ++ last_io_addr = mem; + t = mem >> 16; + if (t == 0x1f80 || t == 0x9f80 || t == 0xbf80) { + if ((mem & 0xffff) < 0x400) +@@ -351,6 +357,7 @@ void 
psxMemWrite32(u32 mem, u32 value) { + char *p; + u32 t; + ++ last_io_addr = mem; + // if ((mem&0x1fffff) == 0x71E18 || value == 0x48088800) SysPrintf("t2fix!!\n"); + t = mem >> 16; + if (t == 0x1f80 || t == 0x9f80 || t == 0xbf80) { +@@ -380,6 +387,8 @@ void psxMemWrite32(u32 mem, u32 value) { + } else { + int i; + ++extern u32 handler_cycle; ++handler_cycle = psxRegs.cycle; + switch (value) { + case 0x800: case 0x804: + if (writeok == 0) break; +diff --git a/libpcsxcore/r3000a.c b/libpcsxcore/r3000a.c +index 7e6f16b..0114947 100644 +--- a/libpcsxcore/r3000a.c ++++ b/libpcsxcore/r3000a.c +@@ -120,6 +120,8 @@ void psxException(u32 code, u32 bd) { + } + + void psxBranchTest() { ++ extern u32 irq_test_cycle; ++ irq_test_cycle = psxRegs.cycle; + if ((psxRegs.cycle - psxNextsCounter) >= psxNextCounter) + psxRcntUpdate(); + diff --git a/libpcsxcore/new_dynarec/backends/psx/pcsxmem.c b/libpcsxcore/new_dynarec/pcsxmem.c similarity index 95% rename from libpcsxcore/new_dynarec/backends/psx/pcsxmem.c rename to libpcsxcore/new_dynarec/pcsxmem.c index 647981ea..bb471b6a 100644 --- a/libpcsxcore/new_dynarec/backends/psx/pcsxmem.c +++ b/libpcsxcore/new_dynarec/pcsxmem.c @@ -6,11 +6,11 @@ */ #include -#include "../../../psxhw.h" -#include "../../../cdrom.h" -#include "../../../mdec.h" -#include "../../../gpu.h" -#include "../../../psxmem_map.h" +#include "../psxhw.h" +#include "../cdrom.h" +#include "../mdec.h" +#include "../gpu.h" +#include "../psxmem_map.h" #include "emu_if.h" #include "pcsxmem.h" @@ -22,27 +22,27 @@ //#define memprintf printf #define memprintf(...) 
-static u32 *mem_readtab; -static u32 *mem_writetab; -static u32 mem_iortab[(1+2+4) * 0x1000 / 4]; -static u32 mem_iowtab[(1+2+4) * 0x1000 / 4]; -static u32 mem_ffwtab[(1+2+4) * 0x1000 / 4]; -//static u32 mem_unmrtab[(1+2+4) * 0x1000 / 4]; -static u32 mem_unmwtab[(1+2+4) * 0x1000 / 4]; +static uintptr_t *mem_readtab; +static uintptr_t *mem_writetab; +static uintptr_t mem_iortab[(1+2+4) * 0x1000 / 4]; +static uintptr_t mem_iowtab[(1+2+4) * 0x1000 / 4]; +static uintptr_t mem_ffwtab[(1+2+4) * 0x1000 / 4]; +//static uintptr_t mem_unmrtab[(1+2+4) * 0x1000 / 4]; +static uintptr_t mem_unmwtab[(1+2+4) * 0x1000 / 4]; -// When this is called in a loop, and 'h' is a function pointer, clang will crash. +static #ifdef __clang__ -static __attribute__ ((noinline)) void map_item(u32 *out, const void *h, u32 flag) -#else -static void map_item(u32 *out, const void *h, u32 flag) +// When this is called in a loop, and 'h' is a function pointer, clang will crash. +__attribute__ ((noinline)) #endif +void map_item(uintptr_t *out, const void *h, uintptr_t flag) { - u32 hv = (u32)h; + uintptr_t hv = (uintptr_t)h; if (hv & 1) { SysPrintf("FATAL: %p has LSB set\n", h); abort(); } - *out = (hv >> 1) | (flag << 31); + *out = (hv >> 1) | (flag << (sizeof(hv) * 8 - 1)); } // size must be power of 2, at least 4k @@ -90,7 +90,7 @@ static void io_write_sio32(u32 value) sioWrite8((unsigned char)(value >> 24)); } -#ifndef DRC_DBG +#if !defined(DRC_DBG) && defined(__arm__) static void map_rcnt_rcount0(u32 mode) { @@ -306,7 +306,7 @@ void new_dyna_pcsx_mem_init(void) int i; // have to map these further to keep tcache close to .text - mem_readtab = psxMap(0x08000000, 0x200000 * 4, 0, MAP_TAG_LUTS); + mem_readtab = psxMap(0x08000000, 0x200000 * sizeof(mem_readtab[0]), 0, MAP_TAG_LUTS); if (mem_readtab == NULL) { SysPrintf("failed to map mem tables\n"); exit(1); diff --git a/libpcsxcore/new_dynarec/backends/psx/pcsxmem.h b/libpcsxcore/new_dynarec/pcsxmem.h similarity index 76% rename from 
libpcsxcore/new_dynarec/backends/psx/pcsxmem.h rename to libpcsxcore/new_dynarec/pcsxmem.h index 9d292a66..72892a8e 100644 --- a/libpcsxcore/new_dynarec/backends/psx/pcsxmem.h +++ b/libpcsxcore/new_dynarec/pcsxmem.h @@ -1,5 +1,3 @@ -#ifndef __PCSXMEM_H__ -#define __PCSXMEM_H__ extern u8 zero_mem[0x1000]; @@ -9,5 +7,3 @@ void new_dyna_pcsx_mem_load_state(void); void new_dyna_pcsx_mem_shutdown(void); int pcsxmem_is_handler_dynamic(unsigned int addr); - -#endif /* __PCSXMEM_H__ */ diff --git a/libpcsxcore/new_dynarec/backends/psx/pcsxmem_inline.c b/libpcsxcore/new_dynarec/pcsxmem_inline.c similarity index 78% rename from libpcsxcore/new_dynarec/backends/psx/pcsxmem_inline.c rename to libpcsxcore/new_dynarec/pcsxmem_inline.c index 305931ae..69227f2a 100644 --- a/libpcsxcore/new_dynarec/backends/psx/pcsxmem_inline.c +++ b/libpcsxcore/new_dynarec/pcsxmem_inline.c @@ -15,14 +15,16 @@ static int pcsx_direct_read(int type, u_int addr, int cc_adj, int cc, int rs, in case 0x1120: // rcnt2 count if (rt < 0) goto dont_care; if (cc < 0) return 0; - emit_readword((int)&rcnts[2].mode, HOST_TEMPREG); - emit_readword((int)&rcnts[2].cycleStart, rt); + host_tempreg_acquire(); + emit_readword(&rcnts[2].mode, HOST_TEMPREG); + emit_readword(&rcnts[2].cycleStart, rt); emit_testimm(HOST_TEMPREG, 0x200); - emit_readword((int)&last_count, HOST_TEMPREG); + emit_readword(&last_count, HOST_TEMPREG); emit_sub(HOST_TEMPREG, rt, HOST_TEMPREG); emit_add(HOST_TEMPREG, cc, HOST_TEMPREG); if (cc_adj) emit_addimm(HOST_TEMPREG, cc_adj, rt); + host_tempreg_release(); emit_shrne_imm(rt, 3, rt); mov_loadtype_adj(type!=LOADW_STUB?type:LOADH_STUB, rt, rt); goto hit; @@ -31,9 +33,11 @@ static int pcsx_direct_read(int type, u_int addr, int cc_adj, int cc, int rs, in case 0x1124: // rcnt mode if (rt < 0) return 0; t = (addr >> 4) & 3; - emit_readword((int)&rcnts[t].mode, rt); + emit_readword(&rcnts[t].mode, rt); + host_tempreg_acquire(); emit_andimm(rt, ~0x1800, HOST_TEMPREG); - emit_writeword(HOST_TEMPREG, 
(int)&rcnts[t].mode); + emit_writeword(HOST_TEMPREG, &rcnts[t].mode); + host_tempreg_release(); mov_loadtype_adj(type, rt, rt); goto hit; } diff --git a/libpcsxcore/plugins.c b/libpcsxcore/plugins.c index 34f24812..85380647 100644 --- a/libpcsxcore/plugins.c +++ b/libpcsxcore/plugins.c @@ -777,9 +777,7 @@ unsigned char _PADpoll(int port, unsigned char value) { //if no new request the pad return 0xff, for signaling connected if (reqPos >= respSize -#ifdef ICACHE_EMULATION && writeok -#endif ) return 0xff; switch(reqPos){ diff --git a/libpcsxcore/psxbios.c b/libpcsxcore/psxbios.c index 46e1595e..c1918327 100644 --- a/libpcsxcore/psxbios.c +++ b/libpcsxcore/psxbios.c @@ -36,6 +36,10 @@ #include "sio.h" #include +#if (defined(__GNUC__) && __GNUC__ >= 5) || defined(__clang__) +#pragma GCC diagnostic ignored "-Wpointer-sign" +#endif + #undef SysPrintf #define SysPrintf if (Config.PsxOut) printf @@ -1402,10 +1406,8 @@ void psxBios_FlushCache() { // 44 #ifdef PSXBIOS_LOG PSXBIOS_LOG("psxBios_%s\n", biosA0n[0x44]); #endif -#ifdef ICACHE_EMULATION - psxCpu->Notify(R3000ACPU_NOTIFY_CACHE_ISOLATED, NULL); - psxCpu->Notify(R3000ACPU_NOTIFY_CACHE_UNISOLATED, NULL); -#endif + psxCpu->Notify(R3000ACPU_NOTIFY_CACHE_ISOLATED, NULL); + psxCpu->Notify(R3000ACPU_NOTIFY_CACHE_UNISOLATED, NULL); pc0 = ra; } diff --git a/libpcsxcore/psxcommon.h b/libpcsxcore/psxcommon.h index 7e105508..3edab382 100644 --- a/libpcsxcore/psxcommon.h +++ b/libpcsxcore/psxcommon.h @@ -31,6 +31,13 @@ extern "C" { #include "config.h" +// XXX: don't care but maybe fix it someday +#if defined(__GNUC__) && __GNUC__ >= 8 +#pragma GCC diagnostic ignored "-Wformat-truncation" +#pragma GCC diagnostic ignored "-Wformat-overflow" +#pragma GCC diagnostic ignored "-Wstringop-truncation" +#endif + // System includes #include #include @@ -131,6 +138,7 @@ typedef struct { boolean UseNet; boolean VSyncWA; boolean icache_emulation; + boolean DisableStalls; u8 Cpu; // CPU_DYNAREC or CPU_INTERPRETER u8 PsxType; // PSX_TYPE_NTSC 
or PSX_TYPE_PAL #ifdef _WIN32 diff --git a/libpcsxcore/psxcounters.c b/libpcsxcore/psxcounters.c index e03bc948..b2cc07b2 100644 --- a/libpcsxcore/psxcounters.c +++ b/libpcsxcore/psxcounters.c @@ -60,20 +60,15 @@ static const u32 CountToOverflow = 0; static const u32 CountToTarget = 1; static const u32 FrameRate[] = { 60, 50 }; -static const u32 HSyncTotal[] = { 263, 314 }; // actually one more on odd lines for PAL +static const u32 HSyncTotal[] = { 263, 314 }; // actually one more on odd lines for PAL #define VBlankStart 240 #define VERBOSE_LEVEL 0 -#if VERBOSE_LEVEL > 0 -static const s32 VerboseLevel = VERBOSE_LEVEL; -#endif /******************************************************************************/ - -#ifndef NEW_DYNAREC +#ifdef DRC_DISABLE Rcnt rcnts[ CounterQuantity ]; #endif - u32 hSyncCount = 0; u32 frame_counter = 0; static u32 hsync_steps = 0; @@ -93,7 +88,7 @@ static void verboseLog( u32 level, const char *str, ... ) { #if VERBOSE_LEVEL > 0 - if( level <= VerboseLevel ) + if( level <= VERBOSE_LEVEL ) { va_list va; char buf[ 4096 ]; @@ -507,13 +502,16 @@ s32 psxRcntFreeze( void *f, s32 Mode ) if (Mode == 0) { // don't trust things from a savestate + rcnts[3].rate = 1; for( i = 0; i < CounterQuantity; ++i ) { _psxRcntWmode( i, rcnts[i].mode ); count = (psxRegs.cycle - rcnts[i].cycleStart) / rcnts[i].rate; _psxRcntWcount( i, count ); } - hsync_steps = (psxRegs.cycle - rcnts[3].cycleStart) / rcnts[3].target; + hsync_steps = 0; + if (rcnts[3].target) + hsync_steps = (psxRegs.cycle - rcnts[3].cycleStart) / rcnts[3].target; psxRcntSet(); base_cycle = 0; diff --git a/libpcsxcore/psxinterpreter.c b/libpcsxcore/psxinterpreter.c index f9e13bf2..5d931a9a 100644 --- a/libpcsxcore/psxinterpreter.c +++ b/libpcsxcore/psxinterpreter.c @@ -26,6 +26,8 @@ #include "gte.h" #include "psxhle.h" #include "debug.h" +#include "psxinterpreter.h" +#include static int branch = 0; static int branch2 = 0; @@ -47,8 +49,6 @@ void ProcessDebug() {} void StopDebugger() {} #endif -void 
execI(); - // Subsets void (*psxBSC[64])(); void (*psxSPC[64])(); @@ -57,64 +57,52 @@ void (*psxCP0[32])(); void (*psxCP2[64])(struct psxCP2Regs *regs); void (*psxCP2BSC[32])(); -#ifdef ICACHE_EMULATION +static u32 fetchNoCache(u32 pc) +{ + u32 *code = (u32 *)PSXM(pc); + return ((code == NULL) ? 0 : SWAP32(*code)); +} + /* Formula One 2001 : Use old CPU cache code when the RAM location is updated with new code (affects in-game racing) */ -static u8* ICache_Addr; -static u8* ICache_Code; -uint32_t *Read_ICache(uint32_t pc) -{ - uint32_t pc_bank, pc_offset, pc_cache; - uint8_t *IAddr, *ICode; +static struct cache_entry { + u32 tag; + u32 data[4]; +} ICache[256]; - pc_bank = pc >> 24; - pc_offset = pc & 0xffffff; - pc_cache = pc & 0xfff; - - IAddr = ICache_Addr; - ICode = ICache_Code; - - // cached - RAM - if (pc_bank == 0x80 || pc_bank == 0x00) +static u32 fetchICache(u32 pc) +{ + // cached? + if (pc < 0xa0000000) { - if (SWAP32(*(uint32_t *)(IAddr + pc_cache)) == pc_offset) - { - // Cache hit - return last opcode used - return (uint32_t *)(ICode + pc_cache); - } - else + // this is not how the hardware works but whatever + struct cache_entry *entry = &ICache[(pc & 0xff0) >> 4]; + + if (((entry->tag ^ pc) & 0xfffffff0) != 0 || pc < entry->tag) { - // Cache miss - addresses don't match - // - default: 0xffffffff (not init) - - // cache line is 4 bytes wide - pc_offset &= ~0xf; - pc_cache &= ~0xf; - - // address line - *(uint32_t *)(IAddr + pc_cache + 0x0) = SWAP32(pc_offset + 0x0); - *(uint32_t *)(IAddr + pc_cache + 0x4) = SWAP32(pc_offset + 0x4); - *(uint32_t *)(IAddr + pc_cache + 0x8) = SWAP32(pc_offset + 0x8); - *(uint32_t *)(IAddr + pc_cache + 0xc) = SWAP32(pc_offset + 0xc); - - // opcode line - pc_offset = pc & ~0xf; - *(uint32_t *)(ICode + pc_cache + 0x0) = psxMu32ref(pc_offset + 0x0); - *(uint32_t *)(ICode + pc_cache + 0x4) = psxMu32ref(pc_offset + 0x4); - *(uint32_t *)(ICode + pc_cache + 0x8) = psxMu32ref(pc_offset + 0x8); - *(uint32_t *)(ICode + pc_cache + 
0xc) = psxMu32ref(pc_offset + 0xc); + u32 *code = (u32 *)PSXM(pc & ~0x0f); + if (!code) + return 0; + + entry->tag = pc; + // treat as 4 words, although other configurations are said to be possible + switch (pc & 0x0c) + { + case 0x00: entry->data[0] = SWAP32(code[0]); + case 0x04: entry->data[1] = SWAP32(code[1]); + case 0x08: entry->data[2] = SWAP32(code[2]); + case 0x0c: entry->data[3] = SWAP32(code[3]); + } } + return entry->data[(pc & 0x0f) >> 2]; } - /* - TODO: Probably should add cached BIOS - */ - // default - return (uint32_t *)PSXM(pc); + return fetchNoCache(pc); } -#endif + +u32 (*fetch)(u32 pc) = fetchNoCache; static void delayRead(int reg, u32 bpc) { u32 rold, rnew; @@ -330,20 +318,7 @@ int psxTestLoadDelay(int reg, u32 tmp) { } void psxDelayTest(int reg, u32 bpc) { - u32 *code; - u32 tmp; - - #ifdef ICACHE_EMULATION - if (Config.icache_emulation) - { - code = Read_ICache(psxRegs.pc); - } - else - #endif - { - code = (u32 *)PSXM(psxRegs.pc); - } - tmp = ((code == NULL) ? 0 : SWAP32(*code)); + u32 tmp = fetch(psxRegs.pc); branch = 1; switch (psxTestLoadDelay(reg, tmp)) { @@ -363,20 +338,9 @@ void psxDelayTest(int reg, u32 bpc) { } static u32 psxBranchNoDelay(void) { - u32 *code; u32 temp; - #ifdef ICACHE_EMULATION - if (Config.icache_emulation) - { - code = Read_ICache(psxRegs.pc); - } - else - #endif - { - code = (u32 *)PSXM(psxRegs.pc); - } - psxRegs.code = ((code == NULL) ? 0 : SWAP32(*code)); + psxRegs.code = fetch(psxRegs.pc); switch (_Op_) { case 0x00: // SPECIAL switch (_Funct_) { @@ -494,7 +458,6 @@ static int psxDelayBranchTest(u32 tar1) { } static void doBranch(u32 tar) { - u32 *code; u32 tmp; branch2 = branch = 1; @@ -504,17 +467,7 @@ static void doBranch(u32 tar) { if (psxDelayBranchTest(tar)) return; - #ifdef ICACHE_EMULATION - if (Config.icache_emulation) - { - code = Read_ICache(psxRegs.pc); - } - else - #endif - { - code = (u32 *)PSXM(psxRegs.pc); - } - psxRegs.code = ((code == NULL) ? 
0 : SWAP32(*code)); + psxRegs.code = fetch(psxRegs.pc); debugI(); @@ -602,15 +555,27 @@ void psxDIV() { } else { _i32(_rLo_) = 0xFFFFFFFF; } +/* + * Notaz said that this was "not needed" for ARM platforms and could slow it down so let's disable for ARM. + * This fixes a crash issue that can happen when running Amidog's CPU test. + * (It still stays stuck to a black screen but at least it doesn't crash anymore) + */ +#if !defined(__arm__) && !defined(__aarch64__) } else if (_i32(_rRs_) == 0x80000000 && _i32(_rRt_) == 0xFFFFFFFF) { _i32(_rLo_) = 0x80000000; _i32(_rHi_) = 0; +#endif } else { _i32(_rLo_) = _i32(_rRs_) / _i32(_rRt_); _i32(_rHi_) = _i32(_rRs_) % _i32(_rRt_); } } +void psxDIV_stall() { + psxRegs.muldivBusyCycle = psxRegs.cycle + 37; + psxDIV(); +} + void psxDIVU() { if (_rRt_ != 0) { _rLo_ = _rRs_ / _rRt_; @@ -622,6 +587,11 @@ void psxDIVU() { } } +void psxDIVU_stall() { + psxRegs.muldivBusyCycle = psxRegs.cycle + 37; + psxDIVU(); +} + void psxMULT() { u64 res = (s64)((s64)_i32(_rRs_) * (s64)_i32(_rRt_)); @@ -629,6 +599,15 @@ void psxMULT() { psxRegs.GPR.n.hi = (u32)((res >> 32) & 0xffffffff); } +void psxMULT_stall() { + // approximate, but maybe good enough + u32 rs = _rRs_; + u32 lz = __builtin_clz(((rs ^ ((s32)rs >> 21)) | 1)); + u32 c = 7 + (2 - (lz / 11)) * 4; + psxRegs.muldivBusyCycle = psxRegs.cycle + c; + psxMULT(); +} + void psxMULTU() { u64 res = (u64)((u64)_u32(_rRs_) * (u64)_u32(_rRt_)); @@ -636,6 +615,14 @@ void psxMULTU() { psxRegs.GPR.n.hi = (u32)((res >> 32) & 0xffffffff); } +void psxMULTU_stall() { + // approximate, but maybe good enough + u32 lz = __builtin_clz(_rRs_ | 1); + u32 c = 7 + (2 - (lz / 11)) * 4; + psxRegs.muldivBusyCycle = psxRegs.cycle + c; + psxMULTU(); +} + /********************************************************* * Register branch logic * * Format: OP rs, offset * @@ -679,6 +666,18 @@ void psxLUI() { if (!_Rt_) return; _u32(_rRt_) = psxRegs.code << 16; } // Upper void psxMFHI() { if (!_Rd_) return; _rRd_ = _rHi_; } // Rd 
= Hi void psxMFLO() { if (!_Rd_) return; _rRd_ = _rLo_; } // Rd = Lo +static void mflohiCheckStall(void) +{ + u32 left = psxRegs.muldivBusyCycle - psxRegs.cycle; + if (left <= 37) { + //printf("muldiv stall %u\n", left); + psxRegs.cycle = psxRegs.muldivBusyCycle; + } +} + +void psxMFHI_stall() { mflohiCheckStall(); psxMFHI(); } +void psxMFLO_stall() { mflohiCheckStall(); psxMFLO(); } + /********************************************************* * Move to GPR to HI/LO & Register jump * * Format: OP rs * @@ -704,6 +703,7 @@ void psxRFE() { // SysPrintf("psxRFE\n"); psxRegs.CP0.n.Status = (psxRegs.CP0.n.Status & 0xfffffff0) | ((psxRegs.CP0.n.Status & 0x3c) >> 2); + psxTestSWInts(); } /********************************************************* @@ -727,7 +727,7 @@ void psxJAL() { _SetLink(31); doBranch(_JumpTarget_); } * Format: OP rs, rd * *********************************************************/ void psxJR() { - doBranch(_u32(_rRs_) & ~3); + doBranch(_rRs_ & ~3); psxJumpTest(); } @@ -933,20 +933,14 @@ void psxCOP0() { psxCP0[_Rs_](); } -void psxCOP1() { -#ifdef PSXCPU_LOG - PSXCPU_LOG("Attempted to use an invalid floating point instruction. Ignored.\n"); -#endif -} - void psxCOP2() { psxCP2[_Funct_]((struct psxCP2Regs *)&psxRegs.CP2D); } -void psxCOP3() { -#ifdef PSXCPU_LOG - PSXCPU_LOG("Attempted to access COP3. 
Ignored\n"); -#endif +void psxCOP2_stall() { + u32 f = _Funct_; + gteCheckStall(f); + psxCP2[f]((struct psxCP2Regs *)&psxRegs.CP2D); } void psxBASIC(struct psxCP2Regs *regs) { @@ -967,7 +961,7 @@ void psxHLE() { void (*psxBSC[64])() = { psxSPECIAL, psxREGIMM, psxJ , psxJAL , psxBEQ , psxBNE , psxBLEZ, psxBGTZ, psxADDI , psxADDIU , psxSLTI, psxSLTIU, psxANDI, psxORI , psxXORI, psxLUI , - psxCOP0 , psxCOP1 , psxCOP2, psxCOP3 , psxNULL, psxNULL, psxNULL, psxNULL, + psxCOP0 , psxNULL , psxCOP2, psxNULL , psxNULL, psxNULL, psxNULL, psxNULL, psxNULL , psxNULL , psxNULL, psxNULL , psxNULL, psxNULL, psxNULL, psxNULL, psxLB , psxLH , psxLWL , psxLW , psxLBU , psxLHU , psxLWR , psxNULL, psxSB , psxSH , psxSWL , psxSW , psxNULL, psxNULL, psxSWR , psxNULL, @@ -1023,35 +1017,11 @@ void (*psxCP2BSC[32])() = { /////////////////////////////////////////// static int intInit() { -#ifdef ICACHE_EMULATION - if (!ICache_Addr) - { - ICache_Addr = malloc(0x1000); - if (!ICache_Addr) - { - return -1; - } - } - - if (!ICache_Code) - { - ICache_Code = malloc(0x1000); - if (!ICache_Code) - { - return -1; - } - } - memset(ICache_Addr, 0xff, 0x1000); - memset(ICache_Code, 0xff, 0x1000); -#endif return 0; } static void intReset() { -#ifdef ICACHE_EMULATION - memset(ICache_Addr, 0xff, 0x1000); - memset(ICache_Code, 0xff, 0x1000); -#endif + memset(&ICache, 0xff, sizeof(ICache)); } void intExecute() { @@ -1069,41 +1039,60 @@ static void intClear(u32 Addr, u32 Size) { } void intNotify (int note, void *data) { -#ifdef ICACHE_EMULATION /* Gameblabla - Only clear the icache if it's isolated */ if (note == R3000ACPU_NOTIFY_CACHE_ISOLATED) { - memset(ICache_Addr, 0xff, 0x1000); - memset(ICache_Code, 0xff, 0x1000); + memset(&ICache, 0xff, sizeof(ICache)); } -#endif } -static void intShutdown() { -#ifdef ICACHE_EMULATION - if (ICache_Addr) - { - free(ICache_Addr); - ICache_Addr = NULL; +void intApplyConfig() { + assert(psxBSC[18] == psxCOP2 || psxBSC[18] == psxCOP2_stall); + assert(psxBSC[50] == gteLWC2 
|| psxBSC[50] == gteLWC2_stall); + assert(psxBSC[58] == gteSWC2 || psxBSC[58] == gteSWC2_stall); + assert(psxSPC[16] == psxMFHI || psxSPC[16] == psxMFHI_stall); + assert(psxSPC[18] == psxMFLO || psxSPC[18] == psxMFLO_stall); + assert(psxSPC[24] == psxMULT || psxSPC[24] == psxMULT_stall); + assert(psxSPC[25] == psxMULTU || psxSPC[25] == psxMULTU_stall); + assert(psxSPC[26] == psxDIV || psxSPC[26] == psxDIV_stall); + assert(psxSPC[27] == psxDIVU || psxSPC[27] == psxDIVU_stall); + + if (Config.DisableStalls) { + psxBSC[18] = psxCOP2; + psxBSC[50] = gteLWC2; + psxBSC[58] = gteSWC2; + psxSPC[16] = psxMFHI; + psxSPC[18] = psxMFLO; + psxSPC[24] = psxMULT; + psxSPC[25] = psxMULTU; + psxSPC[26] = psxDIV; + psxSPC[27] = psxDIVU; + } else { + psxBSC[18] = psxCOP2_stall; + psxBSC[50] = gteLWC2_stall; + psxBSC[58] = gteSWC2_stall; + psxSPC[16] = psxMFHI_stall; + psxSPC[18] = psxMFLO_stall; + psxSPC[24] = psxMULT_stall; + psxSPC[25] = psxMULTU_stall; + psxSPC[26] = psxDIV_stall; + psxSPC[27] = psxDIVU_stall; } - if (ICache_Code) - { - free(ICache_Code); - ICache_Code = NULL; - } -#endif + // dynarec may occasionally call the interpreter, in such a case the + // cache won't work (cache only works right if all fetches go through it) + if (!Config.icache_emulation || psxCpu != &psxInt) + fetch = fetchNoCache; + else + fetch = fetchICache; +} + +static void intShutdown() { } // interpreter execution void execI() { -#ifndef ICACHE_EMULATION - u32 *code = (u32 *)PSXM(psxRegs.pc); -#else - u32 *code = Read_ICache(psxRegs.pc); -#endif - - psxRegs.code = ((code == NULL) ? 
0 : SWAP32(*code)); + psxRegs.code = fetch(psxRegs.pc); debugI(); @@ -1121,8 +1110,7 @@ R3000Acpu psxInt = { intExecute, intExecuteBlock, intClear, -#ifdef ICACHE_EMULATION intNotify, -#endif + intApplyConfig, intShutdown }; diff --git a/libpcsxcore/psxinterpreter.h b/libpcsxcore/psxinterpreter.h new file mode 100644 index 00000000..89dd7ea1 --- /dev/null +++ b/libpcsxcore/psxinterpreter.h @@ -0,0 +1,7 @@ + +extern u32 (*fetch)(u32 pc); + +// called by "new_dynarec" +void execI(); +void psxNULL(); +void intApplyConfig(); diff --git a/libpcsxcore/psxmem.c b/libpcsxcore/psxmem.c index 7d9f8bf6..6f85f82f 100644 --- a/libpcsxcore/psxmem.c +++ b/libpcsxcore/psxmem.c @@ -54,16 +54,7 @@ void (*psxUnmapHook)(void *ptr, size_t size, enum psxMapTag tag); void *psxMap(unsigned long addr, size_t size, int is_fixed, enum psxMapTag tag) { -#ifdef LIGHTREC -#ifdef MAP_FIXED_NOREPLACE - int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE; -#else - int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED; -#endif -#else int flags = MAP_PRIVATE | MAP_ANONYMOUS; -#endif - int try_ = 0; unsigned long mask; void *req, *ret; @@ -156,36 +147,17 @@ int psxMemInit() { memset(psxMemRLUT, 0, 0x10000 * sizeof(void *)); memset(psxMemWLUT, 0, 0x10000 * sizeof(void *)); -#ifdef LIGHTREC - psxM = psxMap(0x30000000, 0x00210000, 1, MAP_TAG_RAM); - if (psxM == NULL) - psxM = psxMap(0x70000000, 0x00210000, 1, MAP_TAG_RAM); - -#else psxM = psxMap(0x80000000, 0x00210000, 1, MAP_TAG_RAM); -#endif -#ifndef RAM_FIXED if (psxM == NULL) psxM = psxMap(0x77000000, 0x00210000, 0, MAP_TAG_RAM); -#endif if (psxM == NULL) { SysMessage(_("mapping main RAM failed")); return -1; } psxP = &psxM[0x200000]; -#ifdef LIGHTREC - psxH = psxMap(0x4f800000, 0x10000, 0, MAP_TAG_OTHER); - if (psxH == NULL) - psxH = psxMap(0x8f800000, 0x10000, 0, MAP_TAG_OTHER); - - psxR = psxMap(0x4fc00000, 0x80000, 0, MAP_TAG_OTHER); - if (psxR == NULL) - psxR = psxMap(0x8fc00000, 0x80000, 0, MAP_TAG_OTHER); -#else psxH = 
psxMap(0x1f800000, 0x10000, 0, MAP_TAG_OTHER); psxR = psxMap(0x1fc00000, 0x80000, 0, MAP_TAG_OTHER); -#endif if (psxMemRLUT == NULL || psxMemWLUT == NULL || psxR == NULL || psxP == NULL || psxH == NULL) { @@ -351,7 +323,7 @@ void psxMemWrite8(u32 mem, u8 value) { if (Config.Debug) DebugCheckBP((mem & 0xffffff) | 0x80000000, W1); *(u8 *)(p + (mem & 0xffff)) = value; -#ifdef PSXREC +#ifndef DRC_DISABLE psxCpu->Clear((mem & (~3)), 1); #endif } else { @@ -378,7 +350,7 @@ void psxMemWrite16(u32 mem, u16 value) { if (Config.Debug) DebugCheckBP((mem & 0xffffff) | 0x80000000, W2); *(u16 *)(p + (mem & 0xffff)) = SWAPu16(value); -#ifdef PSXREC +#ifndef DRC_DISABLE psxCpu->Clear((mem & (~3)), 1); #endif } else { @@ -391,36 +363,12 @@ void psxMemWrite16(u32 mem, u16 value) { void psxMemWrite32(u32 mem, u32 value) { char *p; -#if defined(ICACHE_EMULATION) - /* Stores in PS1 code during cache isolation invalidate cachelines. - * It is assumed that cache-flush routines write to the lowest 4KB of - * address space for Icache, or 1KB for Dcache/scratchpad. - * Originally, stores had to check 'writeok' in psxRegs struct before - * writing to RAM. To eliminate this necessity, we could simply patch the - * BIOS 0x44 FlushCache() A0 jumptable entry. Unfortunately, this won't - * work for some games that use less-buggy non-BIOS cache-flush routines - * like '007 Tomorrow Never Dies', often provided by SN-systems, the PS1 - * toolchain provider. - * Instead, we backup the lowest 64KB PS1 RAM when the cache is isolated. - * All stores write to RAM regardless of cache state. Thus, cache-flush - * routines temporarily trash the lowest 4KB of PS1 RAM. Fortunately, they - * ran in a 'critical section' with interrupts disabled, so there's little - * worry of PS1 code ever reading the trashed contents. - * We point the relevant portions of psxMemRLUT[] to the 64KB backup while - * cache is isolated. This is in case the dynarec needs to recompile some - * code during isolation. 
As long as it reads code using psxMemRLUT[] ptrs, - * it should never see trashed RAM contents. - * - * -senquack, mips dynarec team, 2017 - */ - static u32 mem_bak[0x10000/4]; -#endif u32 t; - u32 m = mem & 0xffff; + // if ((mem&0x1fffff) == 0x71E18 || value == 0x48088800) SysPrintf("t2fix!!\n"); t = mem >> 16; if (t == 0x1f80 || t == 0x9f80 || t == 0xbf80) { - if (m < 0x400) + if ((mem & 0xffff) < 0x400) psxHu32ref(mem) = SWAPu32(value); else psxHwWrite32(mem, value); @@ -430,12 +378,12 @@ void psxMemWrite32(u32 mem, u32 value) { if (Config.Debug) DebugCheckBP((mem & 0xffffff) | 0x80000000, W4); *(u32 *)(p + (mem & 0xffff)) = SWAPu32(value); -#ifdef PSXREC +#ifndef DRC_DISABLE psxCpu->Clear(mem, 1); #endif } else { if (mem != 0xfffe0130) { -#ifdef PSXREC +#ifndef DRC_DISABLE if (!writeok) psxCpu->Clear(mem, 1); #endif @@ -448,39 +396,22 @@ void psxMemWrite32(u32 mem, u32 value) { switch (value) { case 0x800: case 0x804: - if (writeok == FALSE) break; - writeok = FALSE; + if (writeok == 0) break; + writeok = 0; memset(psxMemWLUT + 0x0000, 0, 0x80 * sizeof(void *)); memset(psxMemWLUT + 0x8000, 0, 0x80 * sizeof(void *)); memset(psxMemWLUT + 0xa000, 0, 0x80 * sizeof(void *)); -#ifdef ICACHE_EMULATION - /* Cache is now isolated, pending cache-flush sequence: - * Backup lower 64KB of PS1 RAM, adjust psxMemRLUT[]. 
- */ - memcpy((void*)mem_bak, (void*)psxM, sizeof(mem_bak)); - psxMemRLUT[0x0000] = psxMemRLUT[0x0020] = psxMemRLUT[0x0040] = psxMemRLUT[0x0060] = (u8 *)mem_bak; - psxMemRLUT[0x8000] = psxMemRLUT[0x8020] = psxMemRLUT[0x8040] = psxMemRLUT[0x8060] = (u8 *)mem_bak; - psxMemRLUT[0xa000] = psxMemRLUT[0xa020] = psxMemRLUT[0xa040] = psxMemRLUT[0xa060] = (u8 *)mem_bak; + /* Required for icache interpreter otherwise Armored Core won't boot on icache interpreter */ psxCpu->Notify(R3000ACPU_NOTIFY_CACHE_ISOLATED, NULL); -#endif break; case 0x00: case 0x1e988: - if (writeok == TRUE) break; - writeok = TRUE; + if (writeok == 1) break; + writeok = 1; for (i = 0; i < 0x80; i++) psxMemWLUT[i + 0x0000] = (void *)&psxM[(i & 0x1f) << 16]; memcpy(psxMemWLUT + 0x8000, psxMemWLUT, 0x80 * sizeof(void *)); memcpy(psxMemWLUT + 0xa000, psxMemWLUT, 0x80 * sizeof(void *)); -#ifdef ICACHE_EMULATION - /* Cache is now unisolated: - * Restore lower 64KB RAM contents and psxMemRLUT[]. - */ - memcpy((void*)psxM, (void*)mem_bak, sizeof(mem_bak)); - psxMemRLUT[0x0000] = psxMemRLUT[0x0020] = psxMemRLUT[0x0040] = psxMemRLUT[0x0060] = (u8 *)psxM; - psxMemRLUT[0x8000] = psxMemRLUT[0x8020] = psxMemRLUT[0x8040] = psxMemRLUT[0x8060] = (u8 *)psxM; - psxMemRLUT[0xa000] = psxMemRLUT[0xa020] = psxMemRLUT[0xa040] = psxMemRLUT[0xa060] = (u8 *)psxM; /* Dynarecs might take this opportunity to flush their code cache */ psxCpu->Notify(R3000ACPU_NOTIFY_CACHE_UNISOLATED, NULL); -#endif break; default: #ifdef PSXMEM_LOG diff --git a/libpcsxcore/psxmem.h b/libpcsxcore/psxmem.h index 36b46936..fbf5f67c 100644 --- a/libpcsxcore/psxmem.h +++ b/libpcsxcore/psxmem.h @@ -122,12 +122,6 @@ extern u8 **psxMemRLUT; #define PSXMu32ref(mem) (*(u32 *)PSXM(mem)) -#ifndef PSXREC -#if defined(NEW_DYNAREC) || defined(LIGHTREC) -#define PSXREC -#endif -#endif - int psxMemInit(); void psxMemReset(); void psxMemShutdown(); diff --git a/libpcsxcore/r3000a.c b/libpcsxcore/r3000a.c index 3288f5c3..3a7c3581 100644 --- a/libpcsxcore/r3000a.c +++ 
b/libpcsxcore/r3000a.c @@ -25,20 +25,22 @@ #include "cdrom.h" #include "mdec.h" #include "gte.h" +#include "psxinterpreter.h" R3000Acpu *psxCpu = NULL; -#ifndef NEW_DYNAREC +#ifdef DRC_DISABLE psxRegisters psxRegs; #endif int psxInit() { SysPrintf(_("Running PCSX Version %s (%s).\n"), PCSX_VERSION, __DATE__); -#if defined(NEW_DYNAREC) || defined(LIGHTREC) +#ifndef DRC_DISABLE if (Config.Cpu == CPU_INTERPRETER) { psxCpu = &psxInt; } else psxCpu = &psxRec; #else + Config.Cpu = CPU_INTERPRETER; psxCpu = &psxInt; #endif @@ -52,8 +54,8 @@ int psxInit() { void psxReset() { psxMemReset(); - memset(&psxRegs, 0x00, sizeof(psxRegs)); - writeok = TRUE; + memset(&psxRegs, 0, sizeof(psxRegs)); + psxRegs.pc = 0xbfc00000; // Start in bootstrap psxRegs.CP0.r[12] = 0x10900000; // COP0 enabled | BEV = 1 | TS = 1 @@ -81,20 +83,8 @@ void psxShutdown() { } void psxException(u32 code, u32 bd) { - #ifdef ICACHE_EMULATION - /* Without the CPU_INTERPRETER condition, this will make Lightrec crash. - * Hopefully a better solution than this mess is found. - Gameblabla - */ - if (Config.icache_emulation && Config.Cpu == CPU_INTERPRETER) - { - psxRegs.code = SWAPu32(*Read_ICache(psxRegs.pc)); - } - else - #endif - { - psxRegs.code = PSXMu32(psxRegs.pc); - } - + psxRegs.code = fetch(psxRegs.pc); + if (!Config.HLE && ((((psxRegs.code) >> 24) & 0xfe) == 0x4a)) { // "hokuto no ken" / "Crash Bandicot 2" ... 
// BIOS does not allow to return to GTE instructions @@ -112,7 +102,6 @@ void psxException(u32 code, u32 bd) { #ifdef PSXCPU_LOG PSXCPU_LOG("bd set!!!\n"); #endif - SysPrintf("bd set!!!\n"); psxRegs.CP0.n.Cause |= 0x80000000; psxRegs.CP0.n.EPC = (psxRegs.pc - 4); } else diff --git a/libpcsxcore/r3000a.h b/libpcsxcore/r3000a.h index a5166459..2d7ad40d 100644 --- a/libpcsxcore/r3000a.h +++ b/libpcsxcore/r3000a.h @@ -29,14 +29,11 @@ extern "C" { #include "psxcounters.h" #include "psxbios.h" -#ifdef ICACHE_EMULATION enum { R3000ACPU_NOTIFY_CACHE_ISOLATED = 0, R3000ACPU_NOTIFY_CACHE_UNISOLATED = 1, R3000ACPU_NOTIFY_DMA3_EXE_LOAD = 2 }; -extern uint32_t *Read_ICache(uint32_t pc); -#endif typedef struct { int (*Init)(); @@ -44,16 +41,14 @@ typedef struct { void (*Execute)(); /* executes up to a break */ void (*ExecuteBlock)(); /* executes up to a jump */ void (*Clear)(u32 Addr, u32 Size); -#ifdef ICACHE_EMULATION void (*Notify)(int note, void *data); -#endif + void (*ApplyConfig)(); void (*Shutdown)(); } R3000Acpu; extern R3000Acpu *psxCpu; extern R3000Acpu psxInt; extern R3000Acpu psxRec; -#define PSXREC typedef union { #if defined(__BIGENDIAN__) @@ -194,6 +189,11 @@ typedef struct { u32 cycle; u32 interrupt; struct { u32 sCycle, cycle; } intCycle[32]; + u32 gteBusyCycle; + u32 muldivBusyCycle; + // warning: changing anything in psxRegisters requires update of all + // asm in libpcsxcore/new_dynarec/, but this member can be replaced + u32 reserved[2]; } psxRegisters; extern boolean writeok; diff --git a/plugins/gpulib/vout_pl.c b/plugins/gpulib/vout_pl.c index 075e3c33..064b3490 100644 --- a/plugins/gpulib/vout_pl.c +++ b/plugins/gpulib/vout_pl.c @@ -64,7 +64,7 @@ static void check_mode_change(int force) void vout_update(void) { - int x = gpu.screen.x & ~1; // alignment needed by blitter + int x = gpu.screen.x; int y = gpu.screen.y; int w = gpu.screen.w; int h = gpu.screen.h; -- 2.39.2