overwrite dynarec related code with upstream version
author: notaz <notasas@gmail.com>
Mon, 13 Dec 2021 23:14:23 +0000 (01:14 +0200)
committer: notaz <notasas@gmail.com>
Tue, 14 Dec 2021 00:36:59 +0000 (02:36 +0200)
This gives a fast arm64 dynarec and many other ari64 dynarec fixes
from upstream. Although I tried to take care not to overwrite libretro
specific changes like lightrec, some things may have got lost or
broken. Only tested on rpi4 in 64bit mode.

Warning: untested on Android and most other platforms. If there are
issues from this merge, post a comment and tag me and I'll try to take
a look at it while I'm still active, at least for the time being.

46 files changed:
Makefile
Makefile.libretro
configure
frontend/cspace_neon.S
frontend/libretro.c
frontend/libretro_core_options.h
frontend/plugin_lib.c
jni/Android.mk
libpcsxcore/database.c [new file with mode: 0644]
libpcsxcore/database.h [new file with mode: 0644]
libpcsxcore/gte.c
libpcsxcore/gte.h
libpcsxcore/gte_neon.S
libpcsxcore/lightrec/plugin.c
libpcsxcore/misc.c
libpcsxcore/new_dynarec/arm/assem_arm.c [deleted file]
libpcsxcore/new_dynarec/arm/assem_arm.h [deleted file]
libpcsxcore/new_dynarec/arm/linkage_offsets.h [deleted file]
libpcsxcore/new_dynarec/assem_arm.c [new file with mode: 0644]
libpcsxcore/new_dynarec/assem_arm.h [new file with mode: 0644]
libpcsxcore/new_dynarec/assem_arm64.c [new file with mode: 0644]
libpcsxcore/new_dynarec/assem_arm64.h [new file with mode: 0644]
libpcsxcore/new_dynarec/emu_if.c [moved from libpcsxcore/new_dynarec/backends/psx/emu_if.c with 87% similarity]
libpcsxcore/new_dynarec/emu_if.h [moved from libpcsxcore/new_dynarec/backends/psx/emu_if.h with 86% similarity]
libpcsxcore/new_dynarec/linkage_arm.S [moved from libpcsxcore/new_dynarec/arm/linkage_arm.S with 91% similarity]
libpcsxcore/new_dynarec/linkage_arm64.S [new file with mode: 0644]
libpcsxcore/new_dynarec/linkage_offsets.h [new file with mode: 0644]
libpcsxcore/new_dynarec/new_dynarec.c
libpcsxcore/new_dynarec/new_dynarec.h
libpcsxcore/new_dynarec/new_dynarec_config.h
libpcsxcore/new_dynarec/patches/trace_drc_chk [new file with mode: 0644]
libpcsxcore/new_dynarec/patches/trace_intr [new file with mode: 0644]
libpcsxcore/new_dynarec/pcsxmem.c [moved from libpcsxcore/new_dynarec/backends/psx/pcsxmem.c with 95% similarity]
libpcsxcore/new_dynarec/pcsxmem.h [moved from libpcsxcore/new_dynarec/backends/psx/pcsxmem.h with 76% similarity]
libpcsxcore/new_dynarec/pcsxmem_inline.c [moved from libpcsxcore/new_dynarec/backends/psx/pcsxmem_inline.c with 78% similarity]
libpcsxcore/plugins.c
libpcsxcore/psxbios.c
libpcsxcore/psxcommon.h
libpcsxcore/psxcounters.c
libpcsxcore/psxinterpreter.c
libpcsxcore/psxinterpreter.h [new file with mode: 0644]
libpcsxcore/psxmem.c
libpcsxcore/psxmem.h
libpcsxcore/r3000a.c
libpcsxcore/r3000a.h
plugins/gpulib/vout_pl.c

index 1d70f64..3f33bd3 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -48,7 +48,7 @@ CFLAGS += -DPCNT
 endif
 
 # core
-OBJS += libpcsxcore/cdriso.o libpcsxcore/cdrom.o libpcsxcore/cheat.o \
+OBJS += libpcsxcore/cdriso.o libpcsxcore/cdrom.o libpcsxcore/cheat.o libpcsxcore/database.o \
        libpcsxcore/decode_xa.o libpcsxcore/mdec.o \
        libpcsxcore/misc.o libpcsxcore/plugins.o libpcsxcore/ppf.o libpcsxcore/psxbios.o \
        libpcsxcore/psxcommon.o libpcsxcore/psxcounters.o libpcsxcore/psxdma.o libpcsxcore/psxhle.o \
@@ -113,26 +113,28 @@ CFLAGS += -Ideps/mman
 OBJS += deps/mman/mman.o
 endif
 else ifeq "$(DYNAREC)" "ari64"
-CFLAGS += -DNEW_DYNAREC
-OBJS += libpcsxcore/new_dynarec/backends/psx/emu_if.o \
-               libpcsxcore/new_dynarec/new_dynarec.o \
-               libpcsxcore/new_dynarec/arm/linkage_arm.o \
-               libpcsxcore/new_dynarec/backends/psx/pcsxmem.o
-libpcsxcore/new_dynarec/new_dynarec.o: libpcsxcore/new_dynarec/arm/assem_arm.c \
-       libpcsxcore/new_dynarec/backends/psx/pcsxmem_inline.c
+OBJS += libpcsxcore/new_dynarec/new_dynarec.o
+OBJS += libpcsxcore/new_dynarec/pcsxmem.o
+ ifeq "$(ARCH)" "arm"
+ OBJS += libpcsxcore/new_dynarec/linkage_arm.o
+ libpcsxcore/new_dynarec/new_dynarec.o: libpcsxcore/new_dynarec/assem_arm.c
+ else ifneq (,$(findstring $(ARCH),aarch64 arm64))
+ OBJS += libpcsxcore/new_dynarec/linkage_arm64.o
+ libpcsxcore/new_dynarec/new_dynarec.o: libpcsxcore/new_dynarec/assem_arm64.c
+ else
+ $(error no dynarec support for architecture $(ARCH))
+ endif
 else
-OBJS += libpcsxcore/new_dynarec/backends/psx/emu_if.o
-libpcsxcore/new_dynarec/backends/psx/emu_if.o: CFLAGS += -DDRC_DISABLE
-frontend/libretro.o: CFLAGS += -DDRC_DISABLE
+CFLAGS += -DDRC_DISABLE
 endif
+OBJS += libpcsxcore/new_dynarec/emu_if.o
+libpcsxcore/new_dynarec/new_dynarec.o: libpcsxcore/new_dynarec/pcsxmem_inline.c
 ifdef DRC_DBG
-libpcsxcore/new_dynarec/backends/psx/emu_if.o: CFLAGS += -D_FILE_OFFSET_BITS=64
+libpcsxcore/new_dynarec/emu_if.o: CFLAGS += -D_FILE_OFFSET_BITS=64
 CFLAGS += -DDRC_DBG
 endif
-ifeq "$(DRC_CACHE_BASE)" "1"
-libpcsxcore/new_dynarec/%.o: CFLAGS += -DBASE_ADDR_FIXED=1
-libpcsxcore/new_dynarec/backends/psx/%.o: CFLAGS += -DBASE_ADDR_FIXED=1
-libpcsxcore/new_dynarec/arm/%.o: CFLAGS += -DBASE_ADDR_FIXED=1
+ifeq "$(BASE_ADDR_DYNAMIC)" "1"
+libpcsxcore/new_dynarec/%.o: CFLAGS += -DBASE_ADDR_DYNAMIC=1
 endif
 
 # spu
@@ -312,9 +314,6 @@ OBJS += libretro-common/time/rtime.o
 OBJS += libretro-common/vfs/vfs_implementation.o
 CFLAGS += -DUSE_LIBRETRO_VFS
 endif
-ifeq "$(ENABLE_ICACHE_EMULATION)" "1"
-CFLAGS += -DICACHE_EMULATION
-endif
 OBJS += frontend/libretro.o
 CFLAGS += -Ilibretro-common/include
 CFLAGS += -DFRONTEND_SUPPORTS_RGB565
@@ -331,6 +330,7 @@ ifeq "$(USE_PLUGIN_LIB)" "1"
 OBJS += frontend/plugin_lib.o
 OBJS += frontend/libpicofe/linux/plat.o
 OBJS += frontend/libpicofe/readpng.o frontend/libpicofe/fonts.o
+frontend/libpicofe/linux/plat.o: CFLAGS += -DNO_HOME_DIR
 ifeq "$(HAVE_NEON)" "1"
 OBJS += frontend/libpicofe/arm/neon_scale2x.o
 OBJS += frontend/libpicofe/arm/neon_eagle2x.o
index e1ba947..59bc575 100644 (file)
@@ -4,7 +4,6 @@ DEBUG ?= 0
 WANT_ZLIB ?= 1
 HAVE_CHD ?= 1
 USE_LIBRETRO_VFS ?= 0
-ENABLE_ICACHE_EMULATION ?= 1
 
 # Dynarec options: lightrec, ari64
 DYNAREC ?= lightrec
@@ -76,7 +75,7 @@ else ifneq (,$(findstring h5,$(platform)))
        fpic := -fPIC
        SHARED := -shared -Wl,-version-script=link.T
        ARCH = arm64
-       DYNAREC ?= lightrec
+       DYNAREC ?= ari64
        CFLAGS += -fomit-frame-pointer -ffast-math -DARM
        CPUFLAGS += -march=armv8-a+crc -mfpu=neon-fp-armv8 -mcpu=cortex-a53 -mtune=cortex-a53
 
@@ -344,7 +343,7 @@ else ifeq ($(platform), rpi3_64)
        TARGET := $(TARGET_NAME)_libretro.so
        ARCH := arm64
        BUILTIN_GPU = unai
-       DYNAREC = lightrec
+       DYNAREC = ari64
        fpic := -fPIC
        CFLAGS += -march=armv8-a+crc+simd -mtune=cortex-a53 -ftree-vectorize
 
@@ -364,7 +363,7 @@ else ifeq ($(platform), rpi4_64)
         TARGET := $(TARGET_NAME)_libretro.so
         ARCH := arm64
         BUILTIN_GPU = unai
-        DYNAREC = lightrec
+       DYNAREC = ari64
         fpic := -fPIC
         CFLAGS += -march=armv8-a+crc+simd -mtune=cortex-a72 -ftree-vectorize
 
index f3a50d1..a618231 100755 (executable)
--- a/configure
+++ b/configure
@@ -45,7 +45,6 @@ sound_driver_list="oss alsa pulseaudio sdl libretro"
 sound_drivers=""
 plugins="plugins/spunull/spunull.so \
 plugins/dfxvideo/gpu_peops.so plugins/gpu_unai/gpu_unai.so"
-ram_fixed="no"
 drc_cache_base="no"
 have_armv5=""
 have_armv6=""
@@ -76,6 +75,7 @@ config_mak="config.mak"
 fail()
 {
   echo "$@"
+  if test -n "$DUMP_CONFIG_LOG"; then cat config.log; fi
   exit 1
 }
 
@@ -88,21 +88,18 @@ set_platform()
     ;;
   pandora)
     sound_drivers="oss alsa"
-    ram_fixed="yes"
     drc_cache_base="yes"
     optimize_cortexa8="yes"
     have_arm_neon="yes"
     need_xlib="yes"
     ;;
   maemo)
-    ram_fixed="yes"
     drc_cache_base="yes"
     optimize_cortexa8="yes"
     have_arm_neon="yes"
     ;;
   caanoo)
     sound_drivers="oss"
-    ram_fixed="yes"
     drc_cache_base="yes"
     optimize_arm926ej="yes"
     need_warm="yes"
@@ -278,7 +275,11 @@ arm*)
     echo "  CFLAGS=-march=armv7-a ./configure ..."
   fi
   ;;
+aarch64)
+  ;;
 *)
+  # dynarec only available on ARM
+  enable_dynarec="no"
   ;;
 esac
 
@@ -291,10 +292,6 @@ if [ "$ARCH" != "arm" -o "$have_armv6" = "yes" ]; then
   PLUGIN_CFLAGS="$PLUGIN_CFLAGS -fPIC"
 fi
 
-if [ "$ram_fixed" = "yes" ]; then
-  CFLAGS="$CFLAGS -DRAM_FIXED"
-fi
-
 case "$platform" in
 generic)
   need_sdl="yes"
@@ -548,15 +545,12 @@ echo >> $config_mak
 
 if [ "$platform" = "libretro" ]; then
   echo "TARGET = libretro.so" >> $config_mak
-  echo "HAVE_CHD = 1" >> $config_mak
 fi
 echo "ARCH = $ARCH" >> $config_mak
 echo "PLATFORM = $platform" >> $config_mak
 echo "BUILTIN_GPU = $builtin_gpu" >> $config_mak
 echo "SOUND_DRIVERS = $sound_drivers" >> $config_mak
-if [ "$platform" != "libretro" ]; then
-  echo "PLUGINS = $plugins" >> $config_mak
-fi
+echo "PLUGINS = $plugins" >> $config_mak
 if [ "$have_arm_neon" = "yes" ]; then
   echo "HAVE_NEON = 1" >> $config_mak
 fi
@@ -572,7 +566,7 @@ if [ "$enable_dynarec" = "yes" ]; then
   echo "USE_DYNAREC = 1" >> $config_mak
 fi
 if [ "$drc_cache_base" = "yes" ]; then
-  echo "DRC_CACHE_BASE = 1" >> $config_mak
+  echo "BASE_ADDR_DYNAMIC = 1" >> $config_mak
 fi
 if [ "$have_c64x_dsp" = "yes" ]; then
   echo "HAVE_C64_TOOLS = 1" >> $config_mak
index 56ab304..4cb3d4c 100644 (file)
@@ -183,8 +183,8 @@ FUNCTION(bgr888_to_rgb888): @ dst, src, bytes
     umull       r12,r2, r3, r2
 0:
     pld         [r1, #48*3]
-    vld3.8      {d0-d2}, [r1, :64]!
-    vld3.8      {d3-d5}, [r1, :64]!
+    vld3.8      {d0-d2}, [r1]!
+    vld3.8      {d3-d5}, [r1]!
     vswp        d0, d2
     vswp        d3, d5
     vst3.8      {d0-d2}, [r0, :64]!
@@ -207,8 +207,8 @@ FUNCTION(bgr888_to_rgb565): @ dst, src, bytes
     vdup.16     q15, r3
 0:
     pld         [r1, #48*3]
-    vld3.8      {d1-d3}, [r1, :64]!
-    vld3.8      {d5-d7}, [r1, :64]!
+    vld3.8      {d1-d3}, [r1]!
+    vld3.8      {d5-d7}, [r1]!
 
     vshll.u8    q8, d2, #3      @ g
     vshll.u8    q9, d6, #3
index 6fee42a..8252e15 100644 (file)
@@ -1628,7 +1628,7 @@ static void update_variables(bool in_flight)
          display_internal_fps = true;
    }
 
-#if defined(LIGHTREC) || defined(NEW_DYNAREC)
+#ifndef DRC_DISABLE
    var.value = NULL;
    var.key = "pcsx_rearmed_drc";
 
@@ -1661,7 +1661,8 @@ static void update_variables(bool in_flight)
          psxCpu->Reset(); // not really a reset..
       }
    }
-#endif /* LIGHTREC || NEW_DYNAREC */
+#endif /* !DRC_DISABLE */
+   psxCpu->ApplyConfig();
 
    var.value = NULL;
    var.key = "pcsx_rearmed_spu_reverb";
@@ -1700,7 +1701,6 @@ static void update_variables(bool in_flight)
          Config.RCntFix = 1;
    }
    
-#ifdef ICACHE_EMULATION
    var.value = NULL;
    var.key = "pcsx_rearmed_icache_emulation";
 
@@ -1711,7 +1711,6 @@ static void update_variables(bool in_flight)
       else if (strcmp(var.value, "enabled") == 0)
          Config.icache_emulation = 1;
    }
-#endif
 
    var.value = NULL;
    var.key = "pcsx_rearmed_inuyasha_fix";
@@ -2054,7 +2053,7 @@ static void update_variables(bool in_flight)
       GunconAdjustRatioY = atof(var.value);
    }
 
-#ifdef NEW_DYNAREC
+#if !defined(DRC_DISABLE) && !defined(LIGHTREC)
    var.value = NULL;
    var.key = "pcsx_rearmed_nosmccheck";
    if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value)
@@ -2093,7 +2092,27 @@ static void update_variables(bool in_flight)
       int psxclock = atoi(var.value);
       cycle_multiplier = 10000 / psxclock;
    }
-#endif /* NEW_DYNAREC */
+
+   var.value = NULL;
+   var.key = "pcsx_rearmed_nocompathacks";
+   if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value)
+   {
+      if (strcmp(var.value, "enabled") == 0)
+         new_dynarec_hacks |= NDHACK_NO_COMPAT_HACKS;
+      else
+         new_dynarec_hacks &= ~NDHACK_NO_COMPAT_HACKS;
+   }
+#endif /* !DRC_DISABLE && !LIGHTREC */
+
+   var.value = NULL;
+   var.key = "pcsx_rearmed_nostalls";
+   if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value)
+   {
+      if (strcmp(var.value, "enabled") == 0)
+         Config.DisableStalls = 1;
+      else
+         Config.DisableStalls = 0;
+   }
 
    var.value = NULL;
    var.key = "pcsx_rearmed_input_sensitivity";
index 6a754cf..3e1daf2 100644 (file)
@@ -479,7 +479,7 @@ struct retro_core_option_definition option_defs_us[] = {
 #endif
    },
 
-#if defined(LIGHTREC) || defined(NEW_DYNAREC)
+#ifndef DRC_DISABLE
    {
       "pcsx_rearmed_drc",
       "Dynamic Recompiler",
@@ -491,9 +491,9 @@ struct retro_core_option_definition option_defs_us[] = {
       },
       "enabled",
    },
-#endif /* LIGHTREC || NEW_DYNAREC */
+#endif
 
-#ifdef NEW_DYNAREC
+#if !defined(DRC_DISABLE) && !defined(LIGHTREC)
    {
       "pcsx_rearmed_psxclock",
       "PSX CPU Clock",
@@ -582,7 +582,7 @@ struct retro_core_option_definition option_defs_us[] = {
       "57",
 #endif
    },
-#endif /* NEW_DYNAREC */
+#endif /* !DRC_DISABLE && !LIGHTREC */
 
 #ifdef GPU_NEON
    {
@@ -969,7 +969,7 @@ struct retro_core_option_definition option_defs_us[] = {
       "disabled",
    },
 
-#ifdef NEW_DYNAREC
+#if !defined(DRC_DISABLE) && !defined(LIGHTREC)
    {
       "pcsx_rearmed_nosmccheck",
       "(Speed Hack) Disable SMC Checks",
@@ -1003,7 +1003,29 @@ struct retro_core_option_definition option_defs_us[] = {
       },
       "disabled",
    },
-#endif /* NEW_DYNAREC */
+   {
+      "pcsx_rearmed_nostalls",
+      "Disable CPU/GTE stalls",
+      "Will cause some games to run too fast.",
+      {
+         { "disabled", NULL },
+         { "enabled",  NULL },
+         { NULL, NULL },
+      },
+      "disabled",
+   },
+   {
+      "pcsx_rearmed_nocompathacks",
+      "Disable compat hacks",
+      "Disables game-specific compatibility hacks.",
+      {
+         { "disabled", NULL },
+         { "enabled",  NULL },
+         { NULL, NULL },
+      },
+      "disabled",
+   },
+#endif /* !DRC_DISABLE && !LIGHTREC */
 
    { NULL, NULL, NULL, {{0}}, NULL },
 };
index eee255b..eb9d48e 100644 (file)
@@ -400,6 +400,8 @@ static void pl_vout_flip(const void *vram, int stride, int bgr24, int w, int h)
 #endif
        else
        {
+               src = (void *)((uintptr_t)src & ~3); // align for the blitter
+
                for (; h1-- > 0; dest += dstride * 2, src += stride)
                {
                        bgr555_to_rgb565(dest, src, w * 2);
index 644e2af..50aa696 100644 (file)
@@ -25,6 +25,7 @@ EXTRA_INCLUDES :=
 SOURCES_C := $(CORE_DIR)/cdriso.c \
              $(CORE_DIR)/cdrom.c \
              $(CORE_DIR)/cheat.c \
+             $(CORE_DIR)/database.c \
              $(CORE_DIR)/decode_xa.c \
              $(CORE_DIR)/mdec.c \
              $(CORE_DIR)/misc.c \
@@ -115,7 +116,7 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
 else ifeq ($(TARGET_ARCH_ABI),armeabi)
   HAVE_ARI64=1
 else ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
-  HAVE_LIGHTREC=1
+  HAVE_ARI64=1
 else ifeq ($(TARGET_ARCH_ABI),x86_64)
   HAVE_LIGHTREC=1
 else ifeq ($(TARGET_ARCH_ABI),x86)
@@ -125,13 +126,17 @@ else
 endif
 
 ifeq ($(HAVE_ARI64),1)
-  COREFLAGS   += -DNEW_DYNAREC
-  SOURCES_ASM += $(CORE_DIR)/gte_arm.S \
-                 $(SPU_DIR)/arm_utils.S \
-                 $(DYNAREC_DIR)/arm/linkage_arm.S
   SOURCES_C   += $(DYNAREC_DIR)/new_dynarec.c \
-                 $(DYNAREC_DIR)/backends/psx/pcsxmem.c
+                 $(DYNAREC_DIR)/pcsxmem.c
+  ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
+    SOURCES_ASM += $(DYNAREC_DIR)/linkage_arm64.S
+  else
+    SOURCES_ASM += $(CORE_DIR)/gte_arm.S \
+                   $(SPU_DIR)/arm_utils.S \
+                   $(DYNAREC_DIR)/linkage_arm.S
+  endif
 endif
+  SOURCES_C   += $(DYNAREC_DIR)/emu_if.c
 
 ifeq ($(HAVE_LIGHTREC),1)
   COREFLAGS   += -DLIGHTREC -DLIGHTREC_STATIC
@@ -164,7 +169,6 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
                  $(NEON_DIR)/psx_gpu/psx_gpu_arm_neon.S \
                  $(FRONTEND_DIR)/cspace_neon.S
   SOURCES_C   += $(NEON_DIR)/psx_gpu_if.c
-  SOURCES_C   += $(DYNAREC_DIR)/backends/psx/emu_if.c
 else ifeq ($(TARGET_ARCH_ABI),armeabi)
   COREFLAGS += -DUSE_GPULIB=1 -DGPU_UNAI
   SOURCES_ASM += $(UNAI_DIR)/gpu_arm.S \
diff --git a/libpcsxcore/database.c b/libpcsxcore/database.c
new file mode 100644 (file)
index 0000000..61312e0
--- /dev/null
@@ -0,0 +1,47 @@
+#include "misc.h"
+#include "sio.h"
+#include "new_dynarec/new_dynarec.h"
+
+/* It's duplicated from emu_if.c */
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
+
+static const char MemorycardHack_db[8][10] =
+{
+       /* Lifeforce Tenka, also known as Codename Tenka */
+       {"SLES00613"},
+       {"SLED00690"},
+       {"SLES00614"},
+       {"SLES00615"},
+       {"SLES00616"},
+       {"SLES00617"},
+       {"SCUS94409"}
+};
+
+/* Function for automatic patching according to GameID. */
+void Apply_Hacks_Cdrom()
+{
+       uint32_t i;
+       
+       /* Apply Memory card hack for Codename Tenka. (The game needs one of the memory card slots to be empty) */
+       for(i=0;i<ARRAY_SIZE(MemorycardHack_db);i++)
+       {
+               if (strncmp(CdromId, MemorycardHack_db[i], 9) == 0)
+               {
+                       /* Disable the second memory card slot for the game */
+                       Config.Mcd2[0] = 0;
+                       /* This also needs to be done because in sio.c, they don't use Config.Mcd2 for that purpose */
+                       McdDisable[1] = 1;
+               }
+       }
+
+       /* Dynarec game-specific hacks */
+       new_dynarec_hacks_pergame = 0;
+       cycle_multiplier_override = 0;
+
+       /* Internal Section is fussy about timings */
+       if (strcmp(CdromId, "SLPS01868") == 0)
+       {
+               cycle_multiplier_override = 202;
+               new_dynarec_hacks_pergame |= NDHACK_OVERRIDE_CYCLE_M;
+       }
+}
diff --git a/libpcsxcore/database.h b/libpcsxcore/database.h
new file mode 100644 (file)
index 0000000..fbb564d
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef DATABASE_H
+#define DATABASE_H
+
+extern void Apply_Hacks_Cdrom();
+
+#endif
index 239d2e5..5164a89 100644 (file)
 //  sign-extended by bug in original hardware, according to Nocash docs
 //  GTE section 'Screen Offset and Distance'. The emulator does this
 //  sign extension when it is loaded to GTE by CTC2.
-//#define gteH   (regs->CP2C.p[26].sw.l)
-#define gteH   (regs->CP2C.p[26].w.l)
+//#define gteH   (psxRegs.CP2C.p[26].sw.l)
+#define gteH   (psxRegs.CP2C.p[26].w.l)
 #define gteDQA (regs->CP2C.p[27].sw.l)
 #define gteDQB (((s32 *)regs->CP2C.r)[28])
 #define gteZSF3 (regs->CP2C.p[29].sw.l)
@@ -260,6 +260,7 @@ static inline u32 limE_(psxCP2Regs *regs, u32 result) {
 #define A3U(x) (x)
 #endif
 
+
 //senquack - n param should be unsigned (will be 'gteH' reg which is u16)
 #ifdef GTE_USE_NATIVE_DIVIDE
 INLINE u32 DIVIDE(u16 n, u16 d) {
@@ -274,6 +275,32 @@ INLINE u32 DIVIDE(u16 n, u16 d) {
 
 #ifndef FLAGLESS
 
+const unsigned char gte_cycletab[64] = {
+       /*   1   2   3   4   5   6   7   8   9   a   b   c   d   e   f */
+        0, 15,  0,  0,  0,  0,  8,  0,  0,  0,  0,  0,  6,  0,  0,  0,
+        8,  8,  8, 19, 13,  0, 44,  0,  0,  0,  0, 17, 11,  0, 14,  0,
+       30,  0,  0,  0,  0,  0,  0,  0,  5,  8, 17,  0,  0,  5,  6,  0,
+       23,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5,  5, 39,
+};
+
+// warning: called by the dynarec
+int gteCheckStallRaw(u32 op_cycles, psxRegisters *regs) {
+       u32 left = regs->gteBusyCycle - regs->cycle;
+       int stall = 0;
+
+       if (left <= 44) {
+               //printf("c %2u stall %2u %u\n", op_cycles, left, regs->cycle);
+               regs->cycle = regs->gteBusyCycle;
+               stall = left;
+       }
+       regs->gteBusyCycle = regs->cycle + op_cycles;
+       return stall;
+}
+
+void gteCheckStall(u32 op) {
+       gteCheckStallRaw(gte_cycletab[op], &psxRegs);
+}
+
 u32 MFC2(int reg) {
        psxCP2Regs *regs = &psxRegs.CP2;
        switch (reg) {
@@ -321,9 +348,10 @@ void MTC2(u32 value, int reg) {
 
                case 28:
                        gteIRGB = value;
-                       gteIR1 = (value & 0x1f) << 7;
-                       gteIR2 = (value & 0x3e0) << 2;
-                       gteIR3 = (value & 0x7c00) >> 3;
+                       // not gteIR1 etc. just to be consistent with dynarec
+                       regs->CP2D.n.ir1 = (value & 0x1f) << 7;
+                       regs->CP2D.n.ir2 = (value & 0x3e0) << 2;
+                       regs->CP2D.n.ir3 = (value & 0x7c00) >> 3;
                        break;
 
                case 30:
@@ -377,13 +405,11 @@ void CTC2(u32 value, int reg) {
 }
 
 void gteMFC2() {
-       psxRegs.cycle += 1;
        if (!_Rt_) return;
        psxRegs.GPR.r[_Rt_] = MFC2(_Rd_);
 }
 
 void gteCFC2() {
-       psxRegs.cycle += 1;
        if (!_Rt_) return;
        psxRegs.GPR.r[_Rt_] = psxRegs.CP2C.r[_Rd_];
 }
@@ -403,10 +429,19 @@ void gteLWC2() {
 }
 
 void gteSWC2() {
-       //psxRegs.cycle += 1;
        psxMemWrite32(_oB_, MFC2(_Rt_));
 }
 
+void gteLWC2_stall() {
+       gteCheckStall(0);
+       gteLWC2();
+}
+
+void gteSWC2_stall() {
+       gteCheckStall(0);
+       gteSWC2();
+}
+
 #endif // FLAGLESS
 
 #if 0
@@ -428,7 +463,6 @@ void gteRTPS(psxCP2Regs *regs) {
 #ifdef GTE_LOG
        GTE_LOG("GTE RTPS\n");
 #endif
-       psxRegs.cycle += 15;
        gteFLAG = 0;
 
        gteMAC1 = A1((((s64)gteTRX << 12) + (gteR11 * gteVX0) + (gteR12 * gteVY0) + (gteR13 * gteVZ0)) >> 12);
@@ -461,7 +495,6 @@ void gteRTPT(psxCP2Regs *regs) {
 #ifdef GTE_LOG
        GTE_LOG("GTE RTPT\n");
 #endif
-       psxRegs.cycle += 23;
        gteFLAG = 0;
 
        gteSZ0 = gteSZ3;
@@ -500,7 +533,6 @@ void gteMVMVA(psxCP2Regs *regs) {
        GTE_LOG("GTE MVMVA\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 8;
 
        gteMAC1 = A1((((s64)CV1(cv) << 12) + (MX11(mx) * vx) + (MX12(mx) * vy) + (MX13(mx) * vz)) >> shift);
        gteMAC2 = A2((((s64)CV2(cv) << 12) + (MX21(mx) * vx) + (MX22(mx) * vy) + (MX23(mx) * vz)) >> shift);
@@ -516,7 +548,6 @@ void gteNCLIP(psxCP2Regs *regs) {
        GTE_LOG("GTE NCLIP\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 8;
 
        gteMAC0 = F((s64)gteSX0 * (gteSY1 - gteSY2) +
                                gteSX1 * (gteSY2 - gteSY0) +
@@ -528,7 +559,6 @@ void gteAVSZ3(psxCP2Regs *regs) {
        GTE_LOG("GTE AVSZ3\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 5;
 
        gteMAC0 = F((s64)gteZSF3 * (gteSZ1 + gteSZ2 + gteSZ3));
        gteOTZ = limD(gteMAC0 >> 12);
@@ -539,7 +569,6 @@ void gteAVSZ4(psxCP2Regs *regs) {
        GTE_LOG("GTE AVSZ4\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 6;
 
        gteMAC0 = F((s64)gteZSF4 * (gteSZ0 + gteSZ1 + gteSZ2 + gteSZ3));
        gteOTZ = limD(gteMAC0 >> 12);
@@ -553,7 +582,6 @@ void gteSQR(psxCP2Regs *regs) {
        GTE_LOG("GTE SQR\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 5;
 
        gteMAC1 = (gteIR1 * gteIR1) >> shift;
        gteMAC2 = (gteIR2 * gteIR2) >> shift;
@@ -568,7 +596,6 @@ void gteNCCS(psxCP2Regs *regs) {
        GTE_LOG("GTE NCCS\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 17;
 
        gteMAC1 = ((s64)(gteL11 * gteVX0) + (gteL12 * gteVY0) + (gteL13 * gteVZ0)) >> 12;
        gteMAC2 = ((s64)(gteL21 * gteVX0) + (gteL22 * gteVY0) + (gteL23 * gteVZ0)) >> 12;
@@ -605,7 +632,6 @@ void gteNCCT(psxCP2Regs *regs) {
        GTE_LOG("GTE NCCT\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 39;
 
        for (v = 0; v < 3; v++) {
                vx = VX(v);
@@ -644,7 +670,6 @@ void gteNCDS(psxCP2Regs *regs) {
        GTE_LOG("GTE NCDS\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 19;
 
        gteMAC1 = ((s64)(gteL11 * gteVX0) + (gteL12 * gteVY0) + (gteL13 * gteVZ0)) >> 12;
        gteMAC2 = ((s64)(gteL21 * gteVX0) + (gteL22 * gteVY0) + (gteL23 * gteVZ0)) >> 12;
@@ -681,7 +706,6 @@ void gteNCDT(psxCP2Regs *regs) {
        GTE_LOG("GTE NCDT\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 44;
 
        for (v = 0; v < 3; v++) {
                vx = VX(v);
@@ -723,7 +747,6 @@ void gteOP(psxCP2Regs *regs) {
        GTE_LOG("GTE OP\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 6;
 
        gteMAC1 = ((gteR22 * gteIR3) - (gteR33 * gteIR2)) >> shift;
        gteMAC2 = ((gteR33 * gteIR1) - (gteR11 * gteIR3)) >> shift;
@@ -744,7 +767,6 @@ void gteDCPL(psxCP2Regs *regs) {
        GTE_LOG("GTE DCPL\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 8;
 
        gteMAC1 = RIR1 + ((gteIR0 * limB1(A1U((s64)gteRFC - RIR1), 0)) >> 12);
        gteMAC2 = GIR2 + ((gteIR0 * limB1(A2U((s64)gteGFC - GIR2), 0)) >> 12);
@@ -769,7 +791,6 @@ void gteGPF(psxCP2Regs *regs) {
        GTE_LOG("GTE GPF\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 5;
 
        gteMAC1 = (gteIR0 * gteIR1) >> shift;
        gteMAC2 = (gteIR0 * gteIR2) >> shift;
@@ -793,7 +814,6 @@ void gteGPL(psxCP2Regs *regs) {
        GTE_LOG("GTE GPL\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 5;
 
        gteMAC1 = A1((((s64)gteMAC1 << shift) + (gteIR0 * gteIR1)) >> shift);
        gteMAC2 = A2((((s64)gteMAC2 << shift) + (gteIR0 * gteIR2)) >> shift);
@@ -817,7 +837,6 @@ void gteDPCS(psxCP2Regs *regs) {
        GTE_LOG("GTE DPCS\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 8;
 
        gteMAC1 = ((gteR << 16) + (gteIR0 * limB1(A1U(((s64)gteRFC - (gteR << 4)) << (12 - shift)), 0))) >> 12;
        gteMAC2 = ((gteG << 16) + (gteIR0 * limB2(A2U(((s64)gteGFC - (gteG << 4)) << (12 - shift)), 0))) >> 12;
@@ -841,7 +860,6 @@ void gteDPCT(psxCP2Regs *regs) {
        GTE_LOG("GTE DPCT\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 17;
 
        for (v = 0; v < 3; v++) {
                gteMAC1 = ((gteR0 << 16) + (gteIR0 * limB1(A1U((s64)gteRFC - (gteR0 << 4)), 0))) >> 12;
@@ -865,7 +883,6 @@ void gteNCS(psxCP2Regs *regs) {
        GTE_LOG("GTE NCS\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 14;
 
        gteMAC1 = ((s64)(gteL11 * gteVX0) + (gteL12 * gteVY0) + (gteL13 * gteVZ0)) >> 12;
        gteMAC2 = ((s64)(gteL21 * gteVX0) + (gteL22 * gteVY0) + (gteL23 * gteVZ0)) >> 12;
@@ -896,7 +913,6 @@ void gteNCT(psxCP2Regs *regs) {
        GTE_LOG("GTE NCT\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 30;
 
        for (v = 0; v < 3; v++) {
                vx = VX(v);
@@ -928,7 +944,6 @@ void gteCC(psxCP2Regs *regs) {
        GTE_LOG("GTE CC\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 11;
 
        gteMAC1 = A1((((s64)gteRBK << 12) + (gteLR1 * gteIR1) + (gteLR2 * gteIR2) + (gteLR3 * gteIR3)) >> 12);
        gteMAC2 = A2((((s64)gteGBK << 12) + (gteLG1 * gteIR1) + (gteLG2 * gteIR2) + (gteLG3 * gteIR3)) >> 12);
@@ -959,7 +974,6 @@ void gteINTPL(psxCP2Regs *regs) {
        GTE_LOG("GTE INTPL\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 8;
 
        gteMAC1 = ((gteIR1 << 12) + (gteIR0 * limB1(A1U((s64)gteRFC - gteIR1), 0))) >> shift;
        gteMAC2 = ((gteIR2 << 12) + (gteIR0 * limB2(A2U((s64)gteGFC - gteIR2), 0))) >> shift;
@@ -980,7 +994,6 @@ void gteCDP(psxCP2Regs *regs) {
        GTE_LOG("GTE CDP\n");
 #endif
        gteFLAG = 0;
-       psxRegs.cycle += 13;
 
        gteMAC1 = A1((((s64)gteRBK << 12) + (gteLR1 * gteIR1) + (gteLR2 * gteIR2) + (gteLR3 * gteIR3)) >> 12);
        gteMAC2 = A2((((s64)gteGBK << 12) + (gteLG1 * gteIR1) + (gteLG2 * gteIR2) + (gteLG3 * gteIR3)) >> 12);
index 8bc6988..8f133f5 100644 (file)
@@ -67,6 +67,12 @@ extern "C" {
 
 struct psxCP2Regs;
 
+extern const unsigned char gte_cycletab[64];
+
+int  gteCheckStallRaw(u32 op_cycles, psxRegisters *regs);
+void gteCheckStall(u32 op);
+
+// for lightrec
 u32 MFC2(int reg);
 void MTC2(u32 value, int reg);
 void CTC2(u32 value, int reg);
@@ -77,6 +83,8 @@ void gteMTC2();
 void gteCTC2();
 void gteLWC2();
 void gteSWC2();
+void gteLWC2_stall();
+void gteSWC2_stall();
 
 void gteRTPS(struct psxCP2Regs *regs);
 void gteOP(struct psxCP2Regs *regs);
index 60065f8..2799caa 100644 (file)
@@ -6,7 +6,7 @@
  */
 
 #include "arm_features.h"
-#include "new_dynarec/arm/linkage_offsets.h"
+#include "new_dynarec/linkage_offsets.h"
 
 .syntax unified
 .text
index 3e68a9c..bb4138b 100644 (file)
@@ -37,6 +37,9 @@
 #      define unlikely(x)     (x)
 #endif
 
+psxRegisters psxRegs;
+Rcnt rcnts[4];
+
 static struct lightrec_state *lightrec_state;
 
 static char *name = "retroarch.exe";
@@ -47,18 +50,6 @@ static bool lightrec_debug;
 static bool lightrec_very_debug;
 static u32 lightrec_begin_cycles;
 
-int stop;
-u32 cycle_multiplier;
-int new_dynarec_hacks;
-
-/* Unused for now */
-u32 event_cycles[PSXINT_COUNT];
-u32 next_interupt;
-
-void new_dyna_before_save() {}
-void new_dyna_after_save() {}
-void new_dyna_freeze(void *f, int i) {}
-
 enum my_cp2_opcodes {
        OP_CP2_RTPS             = 0x01,
        OP_CP2_NCLIP            = 0x06,
@@ -578,7 +569,6 @@ static void lightrec_plugin_clear(u32 addr, u32 size)
                lightrec_invalidate(lightrec_state, addr, size * 4);
 }
 
-#ifdef ICACHE_EMULATION
 static void lightrec_plugin_notify(int note, void *data)
 {
        /*
@@ -595,7 +585,10 @@ static void lightrec_plugin_notify(int note, void *data)
                        break;
        }*/
 }
-#endif
+
+static void lightrec_plugin_apply_config()
+{
+}
 
 static void lightrec_plugin_shutdown(void)
 {
@@ -615,8 +608,7 @@ R3000Acpu psxRec =
        lightrec_plugin_execute,
        lightrec_plugin_execute_block,
        lightrec_plugin_clear,
-#ifdef ICACHE_EMULATION
        lightrec_plugin_notify,
-#endif
+       lightrec_plugin_apply_config,
        lightrec_plugin_shutdown,
 };
index 8911bac..be501a2 100644 (file)
 * Miscellaneous functions, including savestates and CD-ROM loading.
 */
 
+#include <stddef.h>
 #include "misc.h"
 #include "cdrom.h"
 #include "mdec.h"
 #include "gpu.h"
 #include "ppf.h"
+#include "database.h"
 #include <zlib.h>
 
 char CdromId[10] = "";
@@ -388,17 +390,25 @@ int CheckCdrom() {
                strcpy(CdromId, "SLUS99999");
 
        if (Config.PsxAuto) { // autodetect system (pal or ntsc)
-               if (CdromId[2] == 'e' || CdromId[2] == 'E')
+               if (
+                       /* Make sure Wild Arms SCUS-94608 is not detected as a PAL game. */
+                       ((CdromId[0] == 's' || CdromId[0] == 'S') && (CdromId[2] == 'e' || CdromId[2] == 'E')) ||
+                       !strncmp(CdromId, "DTLS3035", 8) ||
+                       !strncmp(CdromId, "PBPX95001", 9) || // according to redump.org, these PAL
+                       !strncmp(CdromId, "PBPX95007", 9) || // discs have a non-standard ID;
+                       !strncmp(CdromId, "PBPX95008", 9))   // add more serials if they are discovered.
                        Config.PsxType = PSX_TYPE_PAL; // pal
                else Config.PsxType = PSX_TYPE_NTSC; // ntsc
        }
 
        if (CdromLabel[0] == ' ') {
-               memcpy(CdromLabel, CdromId, 9);
+               strncpy(CdromLabel, CdromId, 9);
        }
        SysPrintf(_("CD-ROM Label: %.32s\n"), CdromLabel);
        SysPrintf(_("CD-ROM ID: %.9s\n"), CdromId);
        SysPrintf(_("CD-ROM EXE Name: %.255s\n"), exename);
+       
+       Apply_Hacks_Cdrom();
 
        BuildPPFCache();
 
@@ -621,7 +631,8 @@ int SaveState(const char *file) {
        SaveFuncs.write(f, psxM, 0x00200000);
        SaveFuncs.write(f, psxR, 0x00080000);
        SaveFuncs.write(f, psxH, 0x00010000);
-       SaveFuncs.write(f, (void *)&psxRegs, sizeof(psxRegs));
+       // only partial save of psxRegisters to maintain savestate compat
+       SaveFuncs.write(f, &psxRegs, offsetof(psxRegisters, gteBusyCycle));
 
        // gpu
        gpufP = (GPUFreeze_t *)malloc(sizeof(GPUFreeze_t));
@@ -690,7 +701,8 @@ int LoadState(const char *file) {
        SaveFuncs.read(f, psxM, 0x00200000);
        SaveFuncs.read(f, psxR, 0x00080000);
        SaveFuncs.read(f, psxH, 0x00010000);
-       SaveFuncs.read(f, (void *)&psxRegs, sizeof(psxRegs));
+       SaveFuncs.read(f, &psxRegs, offsetof(psxRegisters, gteBusyCycle));
+       psxRegs.gteBusyCycle = psxRegs.cycle;
 
        if (Config.HLE)
                psxBiosFreeze(0);
@@ -777,7 +789,7 @@ int RecvPcsxInfo() {
        NET_recvData(&Config.Cpu, sizeof(Config.Cpu), PSE_NET_BLOCKING);
        if (tmp != Config.Cpu) {
                psxCpu->Shutdown();
-#if defined(NEW_DYNAREC) || defined(LIGHTREC)
+#ifndef DRC_DISABLE
                if (Config.Cpu == CPU_INTERPRETER) psxCpu = &psxInt;
                else psxCpu = &psxRec;
 #else
diff --git a/libpcsxcore/new_dynarec/arm/assem_arm.c b/libpcsxcore/new_dynarec/arm/assem_arm.c
deleted file mode 100644 (file)
index a373bd3..0000000
+++ /dev/null
@@ -1,4157 +0,0 @@
-/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
- *   Mupen64plus/PCSX - assem_arm.c                                        *
- *   Copyright (C) 2009-2011 Ari64                                         *
- *   Copyright (C) 2010-2011 Gražvydas "notaz" Ignotas                     *
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- *   This program is distributed in the hope that it will be useful,       *
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
- *   GNU General Public License for more details.                          *
- *                                                                         *
- *   You should have received a copy of the GNU General Public License     *
- *   along with this program; if not, write to the                         *
- *   Free Software Foundation, Inc.,                                       *
- *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
- * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-
-#include "../../gte.h"
-#define FLAGLESS
-#include "../../gte.h"
-#undef FLAGLESS
-#include "../../gte_arm.h"
-#include "../../gte_neon.h"
-#include "pcnt.h"
-#include "arm_features.h"
-
-#if   defined(BASE_ADDR_FIXED) // nothing is defined here for this configuration
-#elif defined(BASE_ADDR_DYNAMIC)
-char *translation_cache; // pointer only; presumably assigned at runtime elsewhere (TODO confirm)
-#else
-char translation_cache[1 << TARGET_SIZE_2] __attribute__((aligned(4096))); // static buffer of 1<<TARGET_SIZE_2 bytes, 4096-byte aligned
-#endif
-
-#ifndef __MACH__
-#define CALLER_SAVE_REGS 0x100f // bits 0-3 and 12 set (presumably r0-r3 and r12 -- confirm against users)
-#else
-#define CALLER_SAVE_REGS 0x120f // same mask plus bit 9 when __MACH__ is defined
-#endif
-
-#define unused __attribute__((unused)) // shorthand for marking definitions that may go unreferenced
-
-extern int cycle_count; // the objects in this group are only declared here; their definitions are outside this section
-extern int last_count;
-extern int pcaddr;
-extern int pending_exception;
-extern int branch_target;
-extern uint64_t readmem_dword;
-extern void *dynarec_local; // emit_loadreg/emit_storereg compute fp-relative offsets from this object's address
-extern u_int mini_ht[32][2]; // 32 rows of 2 words
-
-void indirect_jump_indexed(); // declarations only; presumably implemented in the assembly linkage file (TODO confirm)
-void indirect_jump();
-void do_interrupt();
-void jump_vaddr_r0(); // one jump_vaddr_rN routine per register number listed below; none is declared for r11
-void jump_vaddr_r1();
-void jump_vaddr_r2();
-void jump_vaddr_r3();
-void jump_vaddr_r4();
-void jump_vaddr_r5();
-void jump_vaddr_r6();
-void jump_vaddr_r7();
-void jump_vaddr_r8();
-void jump_vaddr_r9();
-void jump_vaddr_r10();
-void jump_vaddr_r12();
-
-const u_int jump_vaddr_reg[16] = { // routine address per index 0-15; NOTE(review): the (int) casts assume a code address fits in int
-  (int)jump_vaddr_r0,
-  (int)jump_vaddr_r1,
-  (int)jump_vaddr_r2,
-  (int)jump_vaddr_r3,
-  (int)jump_vaddr_r4,
-  (int)jump_vaddr_r5,
-  (int)jump_vaddr_r6,
-  (int)jump_vaddr_r7,
-  (int)jump_vaddr_r8,
-  (int)jump_vaddr_r9,
-  (int)jump_vaddr_r10,
-  0, // index 11: no routine
-  (int)jump_vaddr_r12,
-  0, // index 13: no routine
-  0, // index 14: no routine
-  0}; // index 15: no routine
-
-void invalidate_addr_r0(); // declarations only; one invalidate_addr_rN routine per register number listed, none for r11
-void invalidate_addr_r1();
-void invalidate_addr_r2();
-void invalidate_addr_r3();
-void invalidate_addr_r4();
-void invalidate_addr_r5();
-void invalidate_addr_r6();
-void invalidate_addr_r7();
-void invalidate_addr_r8();
-void invalidate_addr_r9();
-void invalidate_addr_r10();
-void invalidate_addr_r12();
-
-const u_int invalidate_addr_reg[16] = { // routine address per index 0-15, laid out like jump_vaddr_reg[]
-  (int)invalidate_addr_r0,
-  (int)invalidate_addr_r1,
-  (int)invalidate_addr_r2,
-  (int)invalidate_addr_r3,
-  (int)invalidate_addr_r4,
-  (int)invalidate_addr_r5,
-  (int)invalidate_addr_r6,
-  (int)invalidate_addr_r7,
-  (int)invalidate_addr_r8,
-  (int)invalidate_addr_r9,
-  (int)invalidate_addr_r10,
-  0, // index 11: no routine
-  (int)invalidate_addr_r12,
-  0, // index 13: no routine
-  0, // index 14: no routine
-  0}; // index 15: no routine
-
-static u_int needs_clear_cache[1<<(TARGET_SIZE_2-17)]; // 1<<(TARGET_SIZE_2-17) words; presumably a bitmap of cache regions awaiting a flush -- users are not in this section
-
-/* Linker */
-
-static void set_jump_target(int addr,u_int target) // Rewrite the instruction word at addr so that it refers to target
-{
-  u_char *ptr=(u_char *)addr;
-  u_int *ptr2=(u_int *)ptr;
-  if(ptr[3]==0xe2) { // byte 3 of the word is 0xe2 (the top byte when stored little-endian -- presumably always the case here)
-    assert((target-(u_int)ptr2-8)<1024);
-    assert((addr&3)==0);
-    assert((target&3)==0);
-    *ptr2=(*ptr2&0xFFFFF000)|((target-(u_int)ptr2-8)>>2)|0xF00; // keep the upper 20 bits; low 12 bits become ((target-addr-8)>>2)|0xF00
-    //printf("target=%x addr=%x insn=%x\n",target,addr,*ptr2);
-  }
-  else if(ptr[3]==0x72) {
-    // generated by emit_jno_unlikely
-    if((target-(u_int)ptr2-8)<1024) { // distance below 1024: same encoding as the 0xe2 case
-      assert((addr&3)==0);
-      assert((target&3)==0);
-      *ptr2=(*ptr2&0xFFFFF000)|((target-(u_int)ptr2-8)>>2)|0xF00;
-    }
-    else if((target-(u_int)ptr2-8)<4096&&!((target-(u_int)ptr2-8)&15)) { // distance below 4096 and a multiple of 16
-      assert((addr&3)==0);
-      assert((target&3)==0);
-      *ptr2=(*ptr2&0xFFFFF000)|((target-(u_int)ptr2-8)>>4)|0xE00; // low 12 bits become ((target-addr-8)>>4)|0xE00
-    }
-    else *ptr2=(0x7A000000)|(((target-(u_int)ptr2-8)<<6)>>8); // otherwise the whole word is replaced: 0x7A000000 | 24-bit word offset
-  }
-  else {
-    assert((ptr[3]&0x0e)==0xa);
-    *ptr2=(*ptr2&0xFF000000)|(((target-(u_int)ptr2-8)<<6)>>8); // keep the top byte; low 24 bits = ((target-addr-8)>>2)&0xffffff
-  }
-}
-
-// This optionally copies the instruction from the target of the branch into
-// the space before the branch.  Works, but the difference in speed is
-// usually insignificant.
-#if 0 // compiled out
-static void set_jump_target_fillslot(int addr,u_int target,int copy)
-{
-  u_char *ptr=(u_char *)addr;
-  u_int *ptr2=(u_int *)ptr;
-  assert(!copy||ptr2[-1]==0xe28dd000); // copying requires the word before the branch to be 0xe28dd000
-  if(ptr[3]==0xe2) {
-    assert(!copy);
-    assert((target-(u_int)ptr2-8)<4096);
-    *ptr2=(*ptr2&0xFFFFF000)|(target-(u_int)ptr2-8); // low 12 bits become the byte distance target-addr-8
-  }
-  else {
-    assert((ptr[3]&0x0e)==0xa);
-    u_int target_insn=*(u_int *)target; // first instruction word at the branch target
-    if((target_insn&0x0e100000)==0) { // ALU, no immediate, no flags
-      copy=0;
-    }
-    if((target_insn&0x0c100000)==0x04100000) { // Load
-      copy=0;
-    }
-    if(target_insn&0x08000000) { // any word with bit 27 set is not copied either
-      copy=0;
-    }
-    if(copy) {
-      ptr2[-1]=target_insn; // place the copied instruction in the word before the branch
-      target+=4; // and branch past the original copy of it
-    }
-    *ptr2=(*ptr2&0xFF000000)|(((target-(u_int)ptr2-8)<<6)>>8); // keep the top byte; low 24 bits = word offset to target
-  }
-}
-#endif
-
-/* Literal pool */
-static void add_literal(int addr,int val) // Append the pair (addr, val) to literals[]
-{
-  assert(literalcount<sizeof(literals)/sizeof(literals[0])); // the table must still have a free row
-  literals[literalcount][0]=addr;
-  literals[literalcount][1]=val;
-  literalcount++;
-}
-
-// Given a pointer to an external jump stub (as produced by emit_extjump2),
-// return the pointer stored in the literal slot that the stub's second word loads.
-static void *find_extjump_insn(void *stub)
-{
-  int *ptr=(int *)(stub+4); // second word of the stub
-  assert((*ptr&0x0fff0000)==0x059f0000); // ldr rx, [pc, #ofs]
-  u_int offset=*ptr&0xfff; // 12-bit offset field of that ldr
-  void **l_ptr=(void *)ptr+offset+8; // literal slot address: address of the ldr + 8 + offset
-  return *l_ptr;
-}
-
-// find where an external branch is linked to, using the address of its stub:
-// get address that insn one after stub loads (dyna_linker arg1),
-// treat it as a pointer to branch insn,
-// return addr where that branch jumps to
-static int get_pointer(void *stub)
-{
-  //printf("get_pointer(%x)\n",(int)stub);
-  int *i_ptr=find_extjump_insn(stub);
-  assert((*i_ptr&0x0f000000)==0x0a000000); // bits 24-27 must be 0xa
-  return (int)i_ptr+((*i_ptr<<8)>>6)+8; // insn address + 8 + 24-bit field scaled by 4; NOTE(review): sign extension relies on >> of a negative int being arithmetic
-}
-
-// Find the "clean" entry point from a "dirty" entry point
-// by skipping past the call to verify_code
-static u_int get_clean_addr(int addr)
-{
-  int *ptr=(int *)addr;
-  #ifndef HAVE_ARMV7
-  ptr+=4; // skip 4 words without HAVE_ARMV7
-  #else
-  ptr+=6; // skip 6 words with HAVE_ARMV7
-  #endif
-  if((*ptr&0xFF000000)!=0xeb000000) ptr++; // allow one extra word before the bl
-  assert((*ptr&0xFF000000)==0xeb000000); // bl instruction
-  ptr++; // step past the bl
-  if((*ptr&0xFF000000)==0xea000000) { // top byte 0xea: the next word is followed as a jump
-    return (int)ptr+((*ptr<<8)>>6)+8; // follow jump
-  }
-  return (u_int)ptr; // otherwise the word after the bl is the result
-}
-
-static int verify_dirty(u_int *ptr) // Decode (source, copy, len) from the code at ptr; returns nonzero when the two regions hold equal bytes
-{
-  #ifndef HAVE_ARMV7
-  u_int offset;
-  // get from literal pool
-  assert((*ptr&0xFFFF0000)==0xe59f0000); // first word: value read from literal slot -> source
-  offset=*ptr&0xfff;
-  u_int source=*(u_int*)((void *)ptr+offset+8);
-  ptr++;
-  assert((*ptr&0xFFFF0000)==0xe59f0000); // second word -> copy
-  offset=*ptr&0xfff;
-  u_int copy=*(u_int*)((void *)ptr+offset+8);
-  ptr++;
-  assert((*ptr&0xFFFF0000)==0xe59f0000); // third word -> len
-  offset=*ptr&0xfff;
-  u_int len=*(u_int*)((void *)ptr+offset+8);
-  ptr++;
-  ptr++; // one further word is skipped without being inspected
-  #else
-  // ARMv7 movw/movt
-  assert((*ptr&0xFFF00000)==0xe3000000);
-  u_int source=(ptr[0]&0xFFF)+((ptr[0]>>4)&0xF000)+((ptr[2]<<16)&0xFFF0000)+((ptr[2]<<12)&0xF0000000); // low 16 bits from word 0, high 16 bits from word 2
-  u_int copy=(ptr[1]&0xFFF)+((ptr[1]>>4)&0xF000)+((ptr[3]<<16)&0xFFF0000)+((ptr[3]<<12)&0xF0000000); // low 16 bits from word 1, high 16 bits from word 3
-  u_int len=(ptr[4]&0xFFF)+((ptr[4]>>4)&0xF000); // 16-bit value from word 4 only
-  ptr+=6;
-  #endif
-  if((*ptr&0xFF000000)!=0xeb000000) ptr++; // allow one extra word before the bl
-  assert((*ptr&0xFF000000)==0xeb000000); // bl instruction
-  //printf("verify_dirty: %x %x %x\n",source,copy,len);
-  return !memcmp((void *)source,(void *)copy,len); // 1 when the len bytes match, 0 otherwise
-}
-
-// This doesn't necessarily find all clean entry points, just
-// guarantees that it's not dirty
-static int isclean(int addr)
-{
-  #ifndef HAVE_ARMV7
-  u_int *ptr=((u_int *)addr)+4; // same 4-word / 6-word skip as get_clean_addr
-  #else
-  u_int *ptr=((u_int *)addr)+6;
-  #endif
-  if((*ptr&0xFF000000)!=0xeb000000) ptr++;
-  if((*ptr&0xFF000000)!=0xeb000000) return 1; // no bl found at the expected place: reported as clean
-  if((int)ptr+((*ptr<<8)>>6)+8==(int)verify_code) return 0; // the bl calls one of the verify routines: not clean
-  if((int)ptr+((*ptr<<8)>>6)+8==(int)verify_code_vm) return 0;
-  if((int)ptr+((*ptr<<8)>>6)+8==(int)verify_code_ds) return 0;
-  return 1;
-}
-
-// get source that block at addr was compiled from (host pointers)
-static void get_bounds(int addr,u_int *start,u_int *end) // writes source to *start and source+len to *end
-{
-  u_int *ptr=(u_int *)addr;
-  #ifndef HAVE_ARMV7
-  u_int offset;
-  // get from literal pool
-  assert((*ptr&0xFFFF0000)==0xe59f0000);
-  offset=*ptr&0xfff;
-  u_int source=*(u_int*)((void *)ptr+offset+8);
-  ptr++;
-  //assert((*ptr&0xFFFF0000)==0xe59f0000);
-  //offset=*ptr&0xfff;
-  //u_int copy=*(u_int*)((void *)ptr+offset+8);
-  ptr++; // the second word (the "copy" load in verify_dirty) is skipped, not decoded
-  assert((*ptr&0xFFFF0000)==0xe59f0000);
-  offset=*ptr&0xfff;
-  u_int len=*(u_int*)((void *)ptr+offset+8);
-  ptr++;
-  ptr++;
-  #else
-  // ARMv7 movw/movt
-  assert((*ptr&0xFFF00000)==0xe3000000);
-  u_int source=(ptr[0]&0xFFF)+((ptr[0]>>4)&0xF000)+((ptr[2]<<16)&0xFFF0000)+((ptr[2]<<12)&0xF0000000); // low 16 bits from word 0, high 16 bits from word 2
-  //u_int copy=(ptr[1]&0xFFF)+((ptr[1]>>4)&0xF000)+((ptr[3]<<16)&0xFFF0000)+((ptr[3]<<12)&0xF0000000);
-  u_int len=(ptr[4]&0xFFF)+((ptr[4]>>4)&0xF000); // 16-bit value from word 4
-  ptr+=6;
-  #endif
-  if((*ptr&0xFF000000)!=0xeb000000) ptr++;
-  assert((*ptr&0xFF000000)==0xeb000000); // bl instruction
-  *start=source;
-  *end=source+len;
-}
-
-/* Register allocation */
-
-// Note: registers are allocated clean (unmodified state)
-// if you intend to modify the register, you must call dirty_reg().
-static void alloc_reg(struct regstat *cur,int i,signed char reg) // Give reg a slot in cur->regmap[] while processing instruction index i
-{
-  int r,hr;
-  int preferred_reg = (reg&7); // default preference: low three bits of reg
-  if(reg==CCREG) preferred_reg=HOST_CCREG;
-  if(reg==PTEMP||reg==FTEMP) preferred_reg=12; // index 12 is "r12" in regname[]
-
-  // Don't allocate unused registers
-  if((cur->u>>reg)&1) return;
-
-  // see if it's already allocated
-  for(hr=0;hr<HOST_REGS;hr++)
-  {
-    if(cur->regmap[hr]==reg) return;
-  }
-
-  // Keep the same mapping if the register was already allocated in a loop
-  preferred_reg = loop_reg(i,reg,preferred_reg);
-
-  // Try to allocate the preferred register
-  if(cur->regmap[preferred_reg]==-1) { // -1 marks a free slot
-    cur->regmap[preferred_reg]=reg;
-    cur->dirty&=~(1<<preferred_reg);
-    cur->isconst&=~(1<<preferred_reg);
-    return;
-  }
-  r=cur->regmap[preferred_reg];
-  if(r<64&&((cur->u>>r)&1)) { // occupant is flagged in cur->u: take over its slot
-    cur->regmap[preferred_reg]=reg;
-    cur->dirty&=~(1<<preferred_reg);
-    cur->isconst&=~(1<<preferred_reg);
-    return;
-  }
-  if(r>=64&&((cur->uu>>(r&63))&1)) { // occupant is an upper half (id>=64) flagged in cur->uu: take over its slot
-    cur->regmap[preferred_reg]=reg;
-    cur->dirty&=~(1<<preferred_reg);
-    cur->isconst&=~(1<<preferred_reg);
-    return;
-  }
-
-  // Clear any unneeded registers
-  // We try to keep the mapping consistent, if possible, because it
-  // makes branches easier (especially loops).  So we try to allocate
-  // first (see above) before removing old mappings.  If this is not
-  // possible then go ahead and clear out the registers that are no
-  // longer needed.
-  for(hr=0;hr<HOST_REGS;hr++)
-  {
-    r=cur->regmap[hr];
-    if(r>=0) {
-      if(r<64) {
-        if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;} // only the first such slot is freed
-      }
-      else
-      {
-        if((cur->uu>>(r&63))&1) {cur->regmap[hr]=-1;break;}
-      }
-    }
-  }
-  // Try to allocate any available register, but prefer
-  // registers that have not been used recently.
-  if(i>0) {
-    for(hr=0;hr<HOST_REGS;hr++) {
-      if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
-        if(regs[i-1].regmap[hr]!=rs1[i-1]&&regs[i-1].regmap[hr]!=rs2[i-1]&&regs[i-1].regmap[hr]!=rt1[i-1]&&regs[i-1].regmap[hr]!=rt2[i-1]) {
-          cur->regmap[hr]=reg;
-          cur->dirty&=~(1<<hr);
-          cur->isconst&=~(1<<hr);
-          return;
-        }
-      }
-    }
-  }
-  // Try to allocate any available register
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
-      cur->regmap[hr]=reg;
-      cur->dirty&=~(1<<hr);
-      cur->isconst&=~(1<<hr);
-      return;
-    }
-  }
-
-  // Ok, now we have to evict someone
-  // Pick a register we hopefully won't need soon
-  u_char hsn[MAXREG+1];
-  memset(hsn,10,sizeof(hsn)); // every entry starts at 10, the highest level scanned below
-  int j;
-  lsn(hsn,i,&preferred_reg); // helper defined outside this section; it fills in hsn[] (exact semantics not visible here)
-  //printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",cur->regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]);
-  //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
-  if(i>0) {
-    // Don't evict the cycle count at entry points, otherwise the entry
-    // stub will have to write it.
-    if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
-    if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2;
-    for(j=10;j>=3;j--) // scan hsn levels from 10 down to 3
-    {
-      // Alloc preferred register if available
-      if(hsn[r=cur->regmap[preferred_reg]&63]==j) {
-        for(hr=0;hr<HOST_REGS;hr++) {
-          // Evict both parts of a 64-bit register
-          if((cur->regmap[hr]&63)==r) {
-            cur->regmap[hr]=-1;
-            cur->dirty&=~(1<<hr);
-            cur->isconst&=~(1<<hr);
-          }
-        }
-        cur->regmap[preferred_reg]=reg;
-        return;
-      }
-      for(r=1;r<=MAXREG;r++)
-      {
-        if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { // skip registers named by instruction i-1
-          for(hr=0;hr<HOST_REGS;hr++) { // first pass: slots holding the upper half (r+64)
-            if(hr!=HOST_CCREG||j<hsn[CCREG]) { // HOST_CCREG is only taken when j is below hsn[CCREG]
-              if(cur->regmap[hr]==r+64) {
-                cur->regmap[hr]=reg;
-                cur->dirty&=~(1<<hr);
-                cur->isconst&=~(1<<hr);
-                return;
-              }
-            }
-          }
-          for(hr=0;hr<HOST_REGS;hr++) { // second pass: slots holding r itself
-            if(hr!=HOST_CCREG||j<hsn[CCREG]) {
-              if(cur->regmap[hr]==r) {
-                cur->regmap[hr]=reg;
-                cur->dirty&=~(1<<hr);
-                cur->isconst&=~(1<<hr);
-                return;
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  for(j=10;j>=0;j--) // last resort: levels 10..0, without the instruction i-1 and HOST_CCREG restrictions
-  {
-    for(r=1;r<=MAXREG;r++)
-    {
-      if(hsn[r]==j) {
-        for(hr=0;hr<HOST_REGS;hr++) {
-          if(cur->regmap[hr]==r+64) {
-            cur->regmap[hr]=reg;
-            cur->dirty&=~(1<<hr);
-            cur->isconst&=~(1<<hr);
-            return;
-          }
-        }
-        for(hr=0;hr<HOST_REGS;hr++) {
-          if(cur->regmap[hr]==r) {
-            cur->regmap[hr]=reg;
-            cur->dirty&=~(1<<hr);
-            cur->isconst&=~(1<<hr);
-            return;
-          }
-        }
-      }
-    }
-  }
-  SysPrintf("This shouldn't happen (alloc_reg)");exit(1); // nothing could be evicted: fatal
-}
-
-static void alloc_reg64(struct regstat *cur,int i,signed char reg) // Calls alloc_reg for reg, then gives the upper half (stored as reg|64) its own slot
-{
-  int preferred_reg = 8+(reg&1); // upper halves prefer slot 8 or 9
-  int r,hr;
-
-  // allocate the lower 32 bits
-  alloc_reg(cur,i,reg);
-
-  // Don't allocate unused registers
-  if((cur->uu>>reg)&1) return; // upper half is flagged in cur->uu
-
-  // see if the upper half is already allocated
-  for(hr=0;hr<HOST_REGS;hr++)
-  {
-    if(cur->regmap[hr]==reg+64) return;
-  }
-
-  // Keep the same mapping if the register was already allocated in a loop
-  preferred_reg = loop_reg(i,reg,preferred_reg);
-
-  // Try to allocate the preferred register
-  if(cur->regmap[preferred_reg]==-1) { // -1 marks a free slot
-    cur->regmap[preferred_reg]=reg|64;
-    cur->dirty&=~(1<<preferred_reg);
-    cur->isconst&=~(1<<preferred_reg);
-    return;
-  }
-  r=cur->regmap[preferred_reg];
-  if(r<64&&((cur->u>>r)&1)) { // occupant is flagged in cur->u: take over its slot
-    cur->regmap[preferred_reg]=reg|64;
-    cur->dirty&=~(1<<preferred_reg);
-    cur->isconst&=~(1<<preferred_reg);
-    return;
-  }
-  if(r>=64&&((cur->uu>>(r&63))&1)) { // occupant is an upper half flagged in cur->uu: take over its slot
-    cur->regmap[preferred_reg]=reg|64;
-    cur->dirty&=~(1<<preferred_reg);
-    cur->isconst&=~(1<<preferred_reg);
-    return;
-  }
-
-  // Clear any unneeded registers
-  // We try to keep the mapping consistent, if possible, because it
-  // makes branches easier (especially loops).  So we try to allocate
-  // first (see above) before removing old mappings.  If this is not
-  // possible then go ahead and clear out the registers that are no
-  // longer needed.
-  for(hr=HOST_REGS-1;hr>=0;hr--) // scans from the highest slot downward (alloc_reg scans upward)
-  {
-    r=cur->regmap[hr];
-    if(r>=0) {
-      if(r<64) {
-        if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;} // only the first such slot is freed
-      }
-      else
-      {
-        if((cur->uu>>(r&63))&1) {cur->regmap[hr]=-1;break;}
-      }
-    }
-  }
-  // Try to allocate any available register, but prefer
-  // registers that have not been used recently.
-  if(i>0) {
-    for(hr=0;hr<HOST_REGS;hr++) {
-      if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
-        if(regs[i-1].regmap[hr]!=rs1[i-1]&&regs[i-1].regmap[hr]!=rs2[i-1]&&regs[i-1].regmap[hr]!=rt1[i-1]&&regs[i-1].regmap[hr]!=rt2[i-1]) {
-          cur->regmap[hr]=reg|64;
-          cur->dirty&=~(1<<hr);
-          cur->isconst&=~(1<<hr);
-          return;
-        }
-      }
-    }
-  }
-  // Try to allocate any available register
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
-      cur->regmap[hr]=reg|64;
-      cur->dirty&=~(1<<hr);
-      cur->isconst&=~(1<<hr);
-      return;
-    }
-  }
-
-  // Ok, now we have to evict someone
-  // Pick a register we hopefully won't need soon
-  u_char hsn[MAXREG+1];
-  memset(hsn,10,sizeof(hsn)); // every entry starts at 10, the highest level scanned below
-  int j;
-  lsn(hsn,i,&preferred_reg); // helper defined outside this section; it fills in hsn[] (exact semantics not visible here)
-  //printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",cur->regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]);
-  //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
-  if(i>0) {
-    // Don't evict the cycle count at entry points, otherwise the entry
-    // stub will have to write it.
-    if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
-    if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2;
-    for(j=10;j>=3;j--) // scan hsn levels from 10 down to 3
-    {
-      // Alloc preferred register if available
-      if(hsn[r=cur->regmap[preferred_reg]&63]==j) {
-        for(hr=0;hr<HOST_REGS;hr++) {
-          // Evict both parts of a 64-bit register
-          if((cur->regmap[hr]&63)==r) {
-            cur->regmap[hr]=-1;
-            cur->dirty&=~(1<<hr);
-            cur->isconst&=~(1<<hr);
-          }
-        }
-        cur->regmap[preferred_reg]=reg|64;
-        return;
-      }
-      for(r=1;r<=MAXREG;r++)
-      {
-        if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { // skip registers named by instruction i-1
-          for(hr=0;hr<HOST_REGS;hr++) { // first pass: slots holding the upper half (r+64)
-            if(hr!=HOST_CCREG||j<hsn[CCREG]) { // HOST_CCREG is only taken when j is below hsn[CCREG]
-              if(cur->regmap[hr]==r+64) {
-                cur->regmap[hr]=reg|64;
-                cur->dirty&=~(1<<hr);
-                cur->isconst&=~(1<<hr);
-                return;
-              }
-            }
-          }
-          for(hr=0;hr<HOST_REGS;hr++) { // second pass: slots holding r itself
-            if(hr!=HOST_CCREG||j<hsn[CCREG]) {
-              if(cur->regmap[hr]==r) {
-                cur->regmap[hr]=reg|64;
-                cur->dirty&=~(1<<hr);
-                cur->isconst&=~(1<<hr);
-                return;
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  for(j=10;j>=0;j--) // last resort: levels 10..0, without the instruction i-1 and HOST_CCREG restrictions
-  {
-    for(r=1;r<=MAXREG;r++)
-    {
-      if(hsn[r]==j) {
-        for(hr=0;hr<HOST_REGS;hr++) {
-          if(cur->regmap[hr]==r+64) {
-            cur->regmap[hr]=reg|64;
-            cur->dirty&=~(1<<hr);
-            cur->isconst&=~(1<<hr);
-            return;
-          }
-        }
-        for(hr=0;hr<HOST_REGS;hr++) {
-          if(cur->regmap[hr]==r) {
-            cur->regmap[hr]=reg|64;
-            cur->dirty&=~(1<<hr);
-            cur->isconst&=~(1<<hr);
-            return;
-          }
-        }
-      }
-    }
-  }
-  SysPrintf("This shouldn't happen");exit(1); // nothing could be evicted: fatal
-}
-
-// Allocate a temporary register.  This is done without regard to
-// dirty status or whether the register we request is on the unneeded list
-// Note: This will only allocate one register, even if called multiple times
-static void alloc_reg_temp(struct regstat *cur,int i,signed char reg)
-{
-  int r,hr;
-  int preferred_reg = -1; // only handed to lsn() below; this function never reads it otherwise
-
-  // see if it's already allocated
-  for(hr=0;hr<HOST_REGS;hr++)
-  {
-    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==reg) return;
-  }
-
-  // Try to allocate any available register
-  for(hr=HOST_REGS-1;hr>=0;hr--) { // highest free slot first
-    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
-      cur->regmap[hr]=reg;
-      cur->dirty&=~(1<<hr);
-      cur->isconst&=~(1<<hr);
-      return;
-    }
-  }
-
-  // Find an unneeded register
-  for(hr=HOST_REGS-1;hr>=0;hr--)
-  {
-    r=cur->regmap[hr];
-    if(r>=0) {
-      if(r<64) {
-        if((cur->u>>r)&1) {
-          if(i==0||((unneeded_reg[i-1]>>r)&1)) { // must also be flagged for the previous instruction (unless i is 0)
-            cur->regmap[hr]=reg;
-            cur->dirty&=~(1<<hr);
-            cur->isconst&=~(1<<hr);
-            return;
-          }
-        }
-      }
-      else
-      {
-        if((cur->uu>>(r&63))&1) {
-          if(i==0||((unneeded_reg_upper[i-1]>>(r&63))&1)) { // same test against the upper-half table
-            cur->regmap[hr]=reg;
-            cur->dirty&=~(1<<hr);
-            cur->isconst&=~(1<<hr);
-            return;
-          }
-        }
-      }
-    }
-  }
-
-  // Ok, now we have to evict someone
-  // Pick a register we hopefully won't need soon
-  // TODO: we might want to follow unconditional jumps here
-  // TODO: get rid of dupe code and make this into a function
-  u_char hsn[MAXREG+1];
-  memset(hsn,10,sizeof(hsn)); // every entry starts at 10, the highest level scanned below
-  int j;
-  lsn(hsn,i,&preferred_reg); // helper defined outside this section; it fills in hsn[] (exact semantics not visible here)
-  //printf("hsn: %d %d %d %d %d %d %d\n",hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
-  if(i>0) {
-    // Don't evict the cycle count at entry points, otherwise the entry
-    // stub will have to write it.
-    if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
-    if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2;
-    for(j=10;j>=3;j--) // scan hsn levels from 10 down to 3
-    {
-      for(r=1;r<=MAXREG;r++)
-      {
-        if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { // skip registers named by instruction i-1
-          for(hr=0;hr<HOST_REGS;hr++) { // first pass: slots holding the upper half (r+64)
-            if(hr!=HOST_CCREG||hsn[CCREG]>2) { // HOST_CCREG is only taken while hsn[CCREG] is above 2
-              if(cur->regmap[hr]==r+64) {
-                cur->regmap[hr]=reg;
-                cur->dirty&=~(1<<hr);
-                cur->isconst&=~(1<<hr);
-                return;
-              }
-            }
-          }
-          for(hr=0;hr<HOST_REGS;hr++) { // second pass: slots holding r itself
-            if(hr!=HOST_CCREG||hsn[CCREG]>2) {
-              if(cur->regmap[hr]==r) {
-                cur->regmap[hr]=reg;
-                cur->dirty&=~(1<<hr);
-                cur->isconst&=~(1<<hr);
-                return;
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  for(j=10;j>=0;j--) // last resort: levels 10..0, without the instruction i-1 and HOST_CCREG restrictions
-  {
-    for(r=1;r<=MAXREG;r++)
-    {
-      if(hsn[r]==j) {
-        for(hr=0;hr<HOST_REGS;hr++) {
-          if(cur->regmap[hr]==r+64) {
-            cur->regmap[hr]=reg;
-            cur->dirty&=~(1<<hr);
-            cur->isconst&=~(1<<hr);
-            return;
-          }
-        }
-        for(hr=0;hr<HOST_REGS;hr++) {
-          if(cur->regmap[hr]==r) {
-            cur->regmap[hr]=reg;
-            cur->dirty&=~(1<<hr);
-            cur->isconst&=~(1<<hr);
-            return;
-          }
-        }
-      }
-    }
-  }
-  SysPrintf("This shouldn't happen");exit(1); // nothing could be evicted: fatal
-}
-
-// Allocate a specific ARM register: put reg into slot hr, whatever hr held before.
-static void alloc_arm_reg(struct regstat *cur,int i,signed char reg,int hr) // i is not used in the body
-{
-  int n;
-  int dirty=0;
-
-  // see if it's already allocated (and dealloc it)
-  for(n=0;n<HOST_REGS;n++)
-  {
-    if(n!=EXCLUDE_REG&&cur->regmap[n]==reg) {
-      dirty=(cur->dirty>>n)&1; // remember the dirty bit of the slot being vacated
-      cur->regmap[n]=-1;
-    }
-  }
-
-  cur->regmap[hr]=reg;
-  cur->dirty&=~(1<<hr);
-  cur->dirty|=dirty<<hr; // carry the remembered dirty bit over to slot hr
-  cur->isconst&=~(1<<hr);
-}
-
-// Alloc cycle count into dedicated register (CCREG goes into slot HOST_CCREG)
-static void alloc_cc(struct regstat *cur,int i)
-{
-  alloc_arm_reg(cur,i,CCREG,HOST_CCREG);
-}
-
-/* Special alloc */
-
-
-/* Assembler */
-
-static unused char regname[16][4] = { // printable name per register index 0-15, used by the assem_debug() calls below
- "r0",
- "r1",
- "r2",
- "r3",
- "r4",
- "r5",
- "r6",
- "r7",
- "r8",
- "r9",
- "r10",
- "fp", // index 11
- "r12",
- "sp", // index 13
- "lr", // index 14
- "pc"}; // index 15
-
-static void output_w32(u_int word) // Store one 32-bit word at the output cursor `out` and advance the cursor
-{
-  *((u_int *)out)=word;
-  out+=4; // cursor moves by 4 (presumably bytes; the type of `out` is declared outside this section)
-}
-
-static u_int rd_rn_rm(u_int rd, u_int rn, u_int rm) // Pack three register numbers into instruction fields; each must be below 16
-{
-  assert(rd<16);
-  assert(rn<16);
-  assert(rm<16);
-  return((rn<<16)|(rd<<12)|rm); // rn at bits 16-19, rd at bits 12-15, rm at bits 0-3
-}
-
-static u_int rd_rn_imm_shift(u_int rd, u_int rn, u_int imm, u_int shift) // Pack rd, rn and an 8-bit immediate with an even shift amount
-{
-  assert(rd<16);
-  assert(rn<16);
-  assert(imm<256);
-  assert((shift&1)==0); // only even shift amounts are accepted
-  return((rn<<16)|(rd<<12)|(((32-shift)&30)<<7)|imm); // (32-shift)&30 lands in bits 8-11, in the same position genimm() uses
-}
-
-static u_int genimm(u_int imm,u_int *encoded) // Try to express imm as an 8-bit value plus an even rotation; returns 1 and sets *encoded on success, 0 otherwise
-{
-  *encoded=0;
-  if(imm==0) return 1; // zero encodes as 0
-  int i=32;
-  while(i>0)
-  {
-    if(imm<256) {
-      *encoded=((i&30)<<7)|imm; // i&30 goes to bits 8-11, the 8-bit value to bits 0-7
-      return 1;
-    }
-    imm=(imm>>2)|(imm<<30);i-=2; // rotate imm right by 2 and retry, 16 positions in total
-  }
-  return 0; // no rotation leaves a value below 256
-}
-
-static void genimm_checked(u_int imm,u_int *encoded) // genimm() for callers that require imm to be encodable
-{
-  u_int ret=genimm(imm,encoded);
-  assert(ret);
-  (void)ret; // keeps ret referenced (presumably to avoid an unused-variable warning when assert is compiled out)
-}
-
-static u_int genjmp(u_int addr) // Compute the 24-bit branch offset field from the current output position to addr
-{
-  int offset=addr-(int)out-8; // distance is measured from out+8
-  if(offset<-33554432||offset>=33554432) { // outside +/-32 MiB
-    if (addr>2) { // addr values 0..2 fall through and yield 0 (presumably placeholder targets -- confirm with callers)
-      SysPrintf("genjmp: out of range: %08x\n", offset);
-      exit(1);
-    }
-    return 0;
-  }
-  return ((u_int)offset>>2)&0xffffff; // word offset truncated to 24 bits
-}
-
-static void emit_mov(int rs,int rt) // Emit "mov rt,rs" (base word 0xe1a00000)
-{
-  assem_debug("mov %s,%s\n",regname[rt],regname[rs]);
-  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs));
-}
-
-static void emit_movs(int rs,int rt) // Emit "movs rt,rs" (base word 0xe1b00000)
-{
-  assem_debug("movs %s,%s\n",regname[rt],regname[rs]);
-  output_w32(0xe1b00000|rd_rn_rm(rt,0,rs));
-}
-
-static void emit_add(int rs1,int rs2,int rt) // Emit "add rt,rs1,rs2" (base word 0xe0800000)
-{
-  assem_debug("add %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe0800000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_adds(int rs1,int rs2,int rt) // Emit "adds rt,rs1,rs2" (base word 0xe0900000)
-{
-  assem_debug("adds %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe0900000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_adcs(int rs1,int rs2,int rt) // Emit "adcs rt,rs1,rs2" (base word 0xe0b00000)
-{
-  assem_debug("adcs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe0b00000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_sbc(int rs1,int rs2,int rt) // Emit "sbc rt,rs1,rs2" (base word 0xe0c00000)
-{
-  assem_debug("sbc %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe0c00000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_sbcs(int rs1,int rs2,int rt) // Emit "sbcs rt,rs1,rs2" (base word 0xe0d00000)
-{
-  assem_debug("sbcs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe0d00000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_neg(int rs, int rt) // Emit "rsb rt,rs,#0", i.e. rt = 0 - rs (base word 0xe2600000)
-{
-  assem_debug("rsb %s,%s,#0\n",regname[rt],regname[rs]);
-  output_w32(0xe2600000|rd_rn_rm(rt,rs,0));
-}
-
-static void emit_negs(int rs, int rt) // Emit "rsbs rt,rs,#0" (base word 0xe2700000)
-{
-  assem_debug("rsbs %s,%s,#0\n",regname[rt],regname[rs]);
-  output_w32(0xe2700000|rd_rn_rm(rt,rs,0));
-}
-
-static void emit_sub(int rs1,int rs2,int rt) // Emit "sub rt,rs1,rs2" (base word 0xe0400000)
-{
-  assem_debug("sub %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe0400000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_subs(int rs1,int rs2,int rt) // Emit "subs rt,rs1,rs2" (base word 0xe0500000)
-{
-  assem_debug("subs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe0500000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_zeroreg(int rt) // Emit "mov rt,#0" (base word 0xe3a00000)
-{
-  assem_debug("mov %s,#0\n",regname[rt]);
-  output_w32(0xe3a00000|rd_rn_rm(rt,0,0));
-}
-
-static void emit_loadlp(u_int imm,u_int rt) // Load imm into rt through the literal pool
-{
-  add_literal((int)out,imm); // record (address of the ldr about to be emitted, value)
-  assem_debug("ldr %s,pc+? [=%x]\n",regname[rt],imm);
-  output_w32(0xe5900000|rd_rn_rm(rt,15,0)); // ldr rt,[pc] with a zero offset field; presumably filled in when the literals are written out
-}
-
-static void emit_movw(u_int imm,u_int rt) // Emit "movw rt,#imm"; imm must be below 65536
-{
-  assert(imm<65536);
-  assem_debug("movw %s,#%d (0x%x)\n",regname[rt],imm,imm);
-  output_w32(0xe3000000|rd_rn_rm(rt,0,0)|(imm&0xfff)|((imm<<4)&0xf0000)); // imm bits 0-11 stay in place; imm bits 12-15 go to word bits 16-19
-}
-
-static void emit_movt(u_int imm,u_int rt) // Emit "movt rt" using the upper 16 bits of imm; the lower 16 bits of imm are ignored
-{
-  assem_debug("movt %s,#%d (0x%x)\n",regname[rt],imm&0xffff0000,imm&0xffff0000);
-  output_w32(0xe3400000|rd_rn_rm(rt,0,0)|((imm>>16)&0xfff)|((imm>>12)&0xf0000)); // imm bits 16-27 go to word bits 0-11; imm bits 28-31 go to word bits 16-19
-}
-
-static void emit_movimm(u_int imm,u_int rt) // Load the constant imm into rt, trying the available encodings in the order below
-{
-  u_int armval;
-  if(genimm(imm,&armval)) { // 1) imm itself is encodable: mov
-    assem_debug("mov %s,#%d\n",regname[rt],imm);
-    output_w32(0xe3a00000|rd_rn_rm(rt,0,0)|armval);
-  }else if(genimm(~imm,&armval)) { // 2) the complement is encodable: mvn
-    assem_debug("mvn %s,#%d\n",regname[rt],imm);
-    output_w32(0xe3e00000|rd_rn_rm(rt,0,0)|armval);
-  }else if(imm<65536) { // 3) 16-bit value
-    #ifndef HAVE_ARMV7
-    assem_debug("mov %s,#%d\n",regname[rt],imm&0xFF00);
-    output_w32(0xe3a00000|rd_rn_imm_shift(rt,0,imm>>8,8)); // high byte via mov
-    assem_debug("add %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF);
-    output_w32(0xe2800000|rd_rn_imm_shift(rt,rt,imm&0xff,0)); // then add the low byte
-    #else
-    emit_movw(imm,rt);
-    #endif
-  }else{ // 4) anything else
-    #ifndef HAVE_ARMV7
-    emit_loadlp(imm,rt); // literal pool load
-    #else
-    emit_movw(imm&0x0000FFFF,rt); // low half, then high half
-    emit_movt(imm&0xFFFF0000,rt);
-    #endif
-  }
-}
-
-static void emit_pcreladdr(u_int rt) // Emit "add rt,pc,#?" with a zero immediate field (presumably patched afterwards, e.g. by set_jump_target's 0xe2 case -- confirm)
-{
-  assem_debug("add %s,pc,#?\n",regname[rt]);
-  output_w32(0xe2800000|rd_rn_rm(rt,15,0));
-}
-
-static void emit_loadreg(int r, int hr) // Emit code that loads the stored value of register id r into host register hr
-{
-  if(r&64) { // ids with bit 6 set (upper halves) are rejected
-    SysPrintf("64bit load in 32bit mode!\n");
-    assert(0);
-    return;
-  }
-  if((r&63)==0)
-    emit_zeroreg(hr); // id 0 is produced as the constant 0 instead of a load
-  else {
-    int addr=((int)reg)+((r&63)<<REG_SHIFT)+((r&64)>>4); // default: entry inside reg[]; the r&64 term is always 0 after the check above
-    if((r&63)==HIREG) addr=(int)&hi+((r&64)>>4);
-    if((r&63)==LOREG) addr=(int)&lo+((r&64)>>4);
-    if(r==CCREG) addr=(int)&cycle_count;
-    if(r==CSREG) addr=(int)&Status;
-    if(r==FSREG) addr=(int)&FCR31;
-    if(r==INVCP) addr=(int)&invc_ptr;
-    u_int offset = addr-(u_int)&dynarec_local; // offset of the chosen object from dynarec_local
-    assert(offset<4096); // must fit the 12-bit offset field OR-ed into the ldr below
-    assem_debug("ldr %s,fp+%d\n",regname[hr],offset);
-    output_w32(0xe5900000|rd_rn_rm(hr,FP,0)|offset);
-  }
-}
-
-static void emit_storereg(int r, int hr) // Emit code that stores host register hr to the storage of register id r
-{
-  if(r&64) { // ids with bit 6 set (upper halves) are rejected
-    SysPrintf("64bit store in 32bit mode!\n");
-    assert(0);
-    return;
-  }
-  int addr=((int)reg)+((r&63)<<REG_SHIFT)+((r&64)>>4); // default: entry inside reg[]; the r&64 term is always 0 after the check above
-  if((r&63)==HIREG) addr=(int)&hi+((r&64)>>4);
-  if((r&63)==LOREG) addr=(int)&lo+((r&64)>>4);
-  if(r==CCREG) addr=(int)&cycle_count;
-  if(r==FSREG) addr=(int)&FCR31; // unlike emit_loadreg, CSREG and INVCP get no special address here
-  u_int offset = addr-(u_int)&dynarec_local; // offset of the chosen object from dynarec_local
-  assert(offset<4096); // must fit the 12-bit offset field OR-ed into the str below
-  assem_debug("str %s,fp+%d\n",regname[hr],offset);
-  output_w32(0xe5800000|rd_rn_rm(hr,FP,0)|offset);
-}
-
-static void emit_test(int rs, int rt) // Emit "tst rs,rt" (base word 0xe1100000); the destination field is 0
-{
-  assem_debug("tst %s,%s\n",regname[rs],regname[rt]);
-  output_w32(0xe1100000|rd_rn_rm(0,rs,rt));
-}
-
-static void emit_testimm(int rs,int imm) // Emit "tst rs,#imm" (base word 0xe3100000); imm must be encodable by genimm()
-{
-  u_int armval;
-  assem_debug("tst %s,#%d\n",regname[rs],imm);
-  genimm_checked(imm,&armval);
-  output_w32(0xe3100000|rd_rn_rm(0,rs,0)|armval);
-}
-
-static void emit_testeqimm(int rs,int imm) // Emit "tsteq rs,#imm": same as emit_testimm but with top nibble 0 instead of 0xe (the "eq" condition per the debug text)
-{
-  u_int armval;
-  assem_debug("tsteq %s,$%d\n",regname[rs],imm);
-  genimm_checked(imm,&armval);
-  output_w32(0x03100000|rd_rn_rm(0,rs,0)|armval);
-}
-
-static void emit_not(int rs,int rt) // Emit "mvn rt,rs" (base word 0xe1e00000)
-{
-  assem_debug("mvn %s,%s\n",regname[rt],regname[rs]);
-  output_w32(0xe1e00000|rd_rn_rm(rt,0,rs));
-}
-
-static void emit_mvnmi(int rs,int rt)
-{
-  assem_debug("mvnmi %s,%s\n",regname[rt],regname[rs]);
-  output_w32(0x41e00000|rd_rn_rm(rt,0,rs));
-}
-
-static void emit_and(u_int rs1,u_int rs2,u_int rt)
-{
-  assem_debug("and %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe0000000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_or(u_int rs1,u_int rs2,u_int rt)
-{
-  assem_debug("orr %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe1800000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_or_and_set_flags(int rs1,int rs2,int rt)
-{
-  assem_debug("orrs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe1900000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_orrshl_imm(u_int rs,u_int imm,u_int rt)
-{
-  assert(rs<16);
-  assert(rt<16);
-  assert(imm<32);
-  assem_debug("orr %s,%s,%s,lsl #%d\n",regname[rt],regname[rt],regname[rs],imm);
-  output_w32(0xe1800000|rd_rn_rm(rt,rt,rs)|(imm<<7));
-}
-
-static void emit_orrshr_imm(u_int rs,u_int imm,u_int rt)
-{
-  assert(rs<16);
-  assert(rt<16);
-  assert(imm<32);
-  assem_debug("orr %s,%s,%s,lsr #%d\n",regname[rt],regname[rt],regname[rs],imm);
-  output_w32(0xe1800020|rd_rn_rm(rt,rt,rs)|(imm<<7));
-}
-
-static void emit_xor(u_int rs1,u_int rs2,u_int rt)
-{
-  assem_debug("eor %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe0200000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_addimm(u_int rs,int imm,u_int rt)
-{
-  assert(rs<16);
-  assert(rt<16);
-  if(imm!=0) {
-    u_int armval;
-    if(genimm(imm,&armval)) {
-      assem_debug("add %s,%s,#%d\n",regname[rt],regname[rs],imm);
-      output_w32(0xe2800000|rd_rn_rm(rt,rs,0)|armval);
-    }else if(genimm(-imm,&armval)) {
-      assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],-imm);
-      output_w32(0xe2400000|rd_rn_rm(rt,rs,0)|armval);
-    #ifdef HAVE_ARMV7
-    }else if(rt!=rs&&(u_int)imm<65536) {
-      emit_movw(imm&0x0000ffff,rt);
-      emit_add(rs,rt,rt);
-    }else if(rt!=rs&&(u_int)-imm<65536) {
-      emit_movw(-imm&0x0000ffff,rt);
-      emit_sub(rs,rt,rt);
-    #endif
-    }else if((u_int)-imm<65536) {
-      assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],(-imm)&0xFF00);
-      assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF);
-      output_w32(0xe2400000|rd_rn_imm_shift(rt,rs,(-imm)>>8,8));
-      output_w32(0xe2400000|rd_rn_imm_shift(rt,rt,(-imm)&0xff,0));
-    }else {
-      do {
-        int shift = (ffs(imm) - 1) & ~1;
-        int imm8 = imm & (0xff << shift);
-        genimm_checked(imm8,&armval);
-        assem_debug("add %s,%s,#0x%x\n",regname[rt],regname[rs],imm8);
-        output_w32(0xe2800000|rd_rn_rm(rt,rs,0)|armval);
-        rs = rt;
-        imm &= ~imm8;
-      }
-      while (imm != 0);
-    }
-  }
-  else if(rs!=rt) emit_mov(rs,rt);
-}
-
-static void emit_addimm_and_set_flags(int imm,int rt)
-{
-  assert(imm>-65536&&imm<65536);
-  u_int armval;
-  if(genimm(imm,&armval)) {
-    assem_debug("adds %s,%s,#%d\n",regname[rt],regname[rt],imm);
-    output_w32(0xe2900000|rd_rn_rm(rt,rt,0)|armval);
-  }else if(genimm(-imm,&armval)) {
-    assem_debug("subs %s,%s,#%d\n",regname[rt],regname[rt],imm);
-    output_w32(0xe2500000|rd_rn_rm(rt,rt,0)|armval);
-  }else if(imm<0) {
-    assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF00);
-    assem_debug("subs %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF);
-    output_w32(0xe2400000|rd_rn_imm_shift(rt,rt,(-imm)>>8,8));
-    output_w32(0xe2500000|rd_rn_imm_shift(rt,rt,(-imm)&0xff,0));
-  }else{
-    assem_debug("add %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF00);
-    assem_debug("adds %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF);
-    output_w32(0xe2800000|rd_rn_imm_shift(rt,rt,imm>>8,8));
-    output_w32(0xe2900000|rd_rn_imm_shift(rt,rt,imm&0xff,0));
-  }
-}
-
-static void emit_addimm_no_flags(u_int imm,u_int rt)
-{
-  emit_addimm(rt,imm,rt);
-}
-
-static void emit_addnop(u_int r)
-{
-  assert(r<16);
-  assem_debug("add %s,%s,#0 (nop)\n",regname[r],regname[r]);
-  output_w32(0xe2800000|rd_rn_rm(r,r,0));
-}
-
-static void emit_adcimm(u_int rs,int imm,u_int rt)
-{
-  u_int armval;
-  genimm_checked(imm,&armval);
-  assem_debug("adc %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  output_w32(0xe2a00000|rd_rn_rm(rt,rs,0)|armval);
-}
-
-static void emit_rscimm(int rs,int imm,u_int rt)
-{
-  assert(0);
-  u_int armval;
-  genimm_checked(imm,&armval);
-  assem_debug("rsc %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  output_w32(0xe2e00000|rd_rn_rm(rt,rs,0)|armval);
-}
-
-static void emit_addimm64_32(int rsh,int rsl,int imm,int rth,int rtl)
-{
-  // TODO: if(genimm(imm,&armval)) ...
-  // else
-  emit_movimm(imm,HOST_TEMPREG);
-  emit_adds(HOST_TEMPREG,rsl,rtl);
-  emit_adcimm(rsh,0,rth);
-}
-
-static void emit_andimm(int rs,int imm,int rt)
-{
-  u_int armval;
-  if(imm==0) {
-    emit_zeroreg(rt);
-  }else if(genimm(imm,&armval)) {
-    assem_debug("and %s,%s,#%d\n",regname[rt],regname[rs],imm);
-    output_w32(0xe2000000|rd_rn_rm(rt,rs,0)|armval);
-  }else if(genimm(~imm,&armval)) {
-    assem_debug("bic %s,%s,#%d\n",regname[rt],regname[rs],imm);
-    output_w32(0xe3c00000|rd_rn_rm(rt,rs,0)|armval);
-  }else if(imm==65535) {
-    #ifndef HAVE_ARMV6
-    assem_debug("bic %s,%s,#FF000000\n",regname[rt],regname[rs]);
-    output_w32(0xe3c00000|rd_rn_rm(rt,rs,0)|0x4FF);
-    assem_debug("bic %s,%s,#00FF0000\n",regname[rt],regname[rt]);
-    output_w32(0xe3c00000|rd_rn_rm(rt,rt,0)|0x8FF);
-    #else
-    assem_debug("uxth %s,%s\n",regname[rt],regname[rs]);
-    output_w32(0xe6ff0070|rd_rn_rm(rt,0,rs));
-    #endif
-  }else{
-    assert(imm>0&&imm<65535);
-    #ifndef HAVE_ARMV7
-    assem_debug("mov r14,#%d\n",imm&0xFF00);
-    output_w32(0xe3a00000|rd_rn_imm_shift(HOST_TEMPREG,0,imm>>8,8));
-    assem_debug("add r14,r14,#%d\n",imm&0xFF);
-    output_w32(0xe2800000|rd_rn_imm_shift(HOST_TEMPREG,HOST_TEMPREG,imm&0xff,0));
-    #else
-    emit_movw(imm,HOST_TEMPREG);
-    #endif
-    assem_debug("and %s,%s,r14\n",regname[rt],regname[rs]);
-    output_w32(0xe0000000|rd_rn_rm(rt,rs,HOST_TEMPREG));
-  }
-}
-
-static void emit_orimm(int rs,int imm,int rt)
-{
-  u_int armval;
-  if(imm==0) {
-    if(rs!=rt) emit_mov(rs,rt);
-  }else if(genimm(imm,&armval)) {
-    assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rs],imm);
-    output_w32(0xe3800000|rd_rn_rm(rt,rs,0)|armval);
-  }else{
-    assert(imm>0&&imm<65536);
-    assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF00);
-    assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF);
-    output_w32(0xe3800000|rd_rn_imm_shift(rt,rs,imm>>8,8));
-    output_w32(0xe3800000|rd_rn_imm_shift(rt,rt,imm&0xff,0));
-  }
-}
-
-static void emit_xorimm(int rs,int imm,int rt)
-{
-  u_int armval;
-  if(imm==0) {
-    if(rs!=rt) emit_mov(rs,rt);
-  }else if(genimm(imm,&armval)) {
-    assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rs],imm);
-    output_w32(0xe2200000|rd_rn_rm(rt,rs,0)|armval);
-  }else{
-    assert(imm>0&&imm<65536);
-    assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF00);
-    assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF);
-    output_w32(0xe2200000|rd_rn_imm_shift(rt,rs,imm>>8,8));
-    output_w32(0xe2200000|rd_rn_imm_shift(rt,rt,imm&0xff,0));
-  }
-}
-
-static void emit_shlimm(int rs,u_int imm,int rt)
-{
-  assert(imm>0);
-  assert(imm<32);
-  //if(imm==1) ...
-  assem_debug("lsl %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|(imm<<7));
-}
-
-static void emit_lsls_imm(int rs,int imm,int rt)
-{
-  assert(imm>0);
-  assert(imm<32);
-  assem_debug("lsls %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  output_w32(0xe1b00000|rd_rn_rm(rt,0,rs)|(imm<<7));
-}
-
-static unused void emit_lslpls_imm(int rs,int imm,int rt)
-{
-  assert(imm>0);
-  assert(imm<32);
-  assem_debug("lslpls %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  output_w32(0x51b00000|rd_rn_rm(rt,0,rs)|(imm<<7));
-}
-
-static void emit_shrimm(int rs,u_int imm,int rt)
-{
-  assert(imm>0);
-  assert(imm<32);
-  assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x20|(imm<<7));
-}
-
-static void emit_sarimm(int rs,u_int imm,int rt)
-{
-  assert(imm>0);
-  assert(imm<32);
-  assem_debug("asr %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x40|(imm<<7));
-}
-
-static void emit_rorimm(int rs,u_int imm,int rt)
-{
-  assert(imm>0);
-  assert(imm<32);
-  assem_debug("ror %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x60|(imm<<7));
-}
-
-static void emit_shldimm(int rs,int rs2,u_int imm,int rt)
-{
-  assem_debug("shld %%%s,%%%s,%d\n",regname[rt],regname[rs2],imm);
-  assert(imm>0);
-  assert(imm<32);
-  //if(imm==1) ...
-  assem_debug("lsl %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|(imm<<7));
-  assem_debug("orr %s,%s,%s,lsr #%d\n",regname[rt],regname[rt],regname[rs2],32-imm);
-  output_w32(0xe1800020|rd_rn_rm(rt,rt,rs2)|((32-imm)<<7));
-}
-
-static void emit_shrdimm(int rs,int rs2,u_int imm,int rt)
-{
-  assem_debug("shrd %%%s,%%%s,%d\n",regname[rt],regname[rs2],imm);
-  assert(imm>0);
-  assert(imm<32);
-  //if(imm==1) ...
-  assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  output_w32(0xe1a00020|rd_rn_rm(rt,0,rs)|(imm<<7));
-  assem_debug("orr %s,%s,%s,lsl #%d\n",regname[rt],regname[rt],regname[rs2],32-imm);
-  output_w32(0xe1800000|rd_rn_rm(rt,rt,rs2)|((32-imm)<<7));
-}
-
-static void emit_signextend16(int rs,int rt)
-{
-  #ifndef HAVE_ARMV6
-  emit_shlimm(rs,16,rt);
-  emit_sarimm(rt,16,rt);
-  #else
-  assem_debug("sxth %s,%s\n",regname[rt],regname[rs]);
-  output_w32(0xe6bf0070|rd_rn_rm(rt,0,rs));
-  #endif
-}
-
-static void emit_signextend8(int rs,int rt)
-{
-  #ifndef HAVE_ARMV6
-  emit_shlimm(rs,24,rt);
-  emit_sarimm(rt,24,rt);
-  #else
-  assem_debug("sxtb %s,%s\n",regname[rt],regname[rs]);
-  output_w32(0xe6af0070|rd_rn_rm(rt,0,rs));
-  #endif
-}
-
-static void emit_shl(u_int rs,u_int shift,u_int rt)
-{
-  assert(rs<16);
-  assert(rt<16);
-  assert(shift<16);
-  //if(imm==1) ...
-  assem_debug("lsl %s,%s,%s\n",regname[rt],regname[rs],regname[shift]);
-  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x10|(shift<<8));
-}
-
-static void emit_shr(u_int rs,u_int shift,u_int rt)
-{
-  assert(rs<16);
-  assert(rt<16);
-  assert(shift<16);
-  assem_debug("lsr %s,%s,%s\n",regname[rt],regname[rs],regname[shift]);
-  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x30|(shift<<8));
-}
-
-static void emit_sar(u_int rs,u_int shift,u_int rt)
-{
-  assert(rs<16);
-  assert(rt<16);
-  assert(shift<16);
-  assem_debug("asr %s,%s,%s\n",regname[rt],regname[rs],regname[shift]);
-  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x50|(shift<<8));
-}
-
-static void emit_orrshl(u_int rs,u_int shift,u_int rt)
-{
-  assert(rs<16);
-  assert(rt<16);
-  assert(shift<16);
-  assem_debug("orr %s,%s,%s,lsl %s\n",regname[rt],regname[rt],regname[rs],regname[shift]);
-  output_w32(0xe1800000|rd_rn_rm(rt,rt,rs)|0x10|(shift<<8));
-}
-
-static void emit_orrshr(u_int rs,u_int shift,u_int rt)
-{
-  assert(rs<16);
-  assert(rt<16);
-  assert(shift<16);
-  assem_debug("orr %s,%s,%s,lsr %s\n",regname[rt],regname[rt],regname[rs],regname[shift]);
-  output_w32(0xe1800000|rd_rn_rm(rt,rt,rs)|0x30|(shift<<8));
-}
-
-static void emit_cmpimm(int rs,int imm)
-{
-  u_int armval;
-  if(genimm(imm,&armval)) {
-    assem_debug("cmp %s,#%d\n",regname[rs],imm);
-    output_w32(0xe3500000|rd_rn_rm(0,rs,0)|armval);
-  }else if(genimm(-imm,&armval)) {
-    assem_debug("cmn %s,#%d\n",regname[rs],imm);
-    output_w32(0xe3700000|rd_rn_rm(0,rs,0)|armval);
-  }else if(imm>0) {
-    assert(imm<65536);
-    emit_movimm(imm,HOST_TEMPREG);
-    assem_debug("cmp %s,r14\n",regname[rs]);
-    output_w32(0xe1500000|rd_rn_rm(0,rs,HOST_TEMPREG));
-  }else{
-    assert(imm>-65536);
-    emit_movimm(-imm,HOST_TEMPREG);
-    assem_debug("cmn %s,r14\n",regname[rs]);
-    output_w32(0xe1700000|rd_rn_rm(0,rs,HOST_TEMPREG));
-  }
-}
-
-static void emit_cmovne_imm(int imm,int rt)
-{
-  assem_debug("movne %s,#%d\n",regname[rt],imm);
-  u_int armval;
-  genimm_checked(imm,&armval);
-  output_w32(0x13a00000|rd_rn_rm(rt,0,0)|armval);
-}
-
-static void emit_cmovl_imm(int imm,int rt)
-{
-  assem_debug("movlt %s,#%d\n",regname[rt],imm);
-  u_int armval;
-  genimm_checked(imm,&armval);
-  output_w32(0xb3a00000|rd_rn_rm(rt,0,0)|armval);
-}
-
-static void emit_cmovb_imm(int imm,int rt)
-{
-  assem_debug("movcc %s,#%d\n",regname[rt],imm);
-  u_int armval;
-  genimm_checked(imm,&armval);
-  output_w32(0x33a00000|rd_rn_rm(rt,0,0)|armval);
-}
-
-static void emit_cmovs_imm(int imm,int rt)
-{
-  assem_debug("movmi %s,#%d\n",regname[rt],imm);
-  u_int armval;
-  genimm_checked(imm,&armval);
-  output_w32(0x43a00000|rd_rn_rm(rt,0,0)|armval);
-}
-
-static void emit_cmove_reg(int rs,int rt)
-{
-  assem_debug("moveq %s,%s\n",regname[rt],regname[rs]);
-  output_w32(0x01a00000|rd_rn_rm(rt,0,rs));
-}
-
-static void emit_cmovne_reg(int rs,int rt)
-{
-  assem_debug("movne %s,%s\n",regname[rt],regname[rs]);
-  output_w32(0x11a00000|rd_rn_rm(rt,0,rs));
-}
-
-static void emit_cmovl_reg(int rs,int rt)
-{
-  assem_debug("movlt %s,%s\n",regname[rt],regname[rs]);
-  output_w32(0xb1a00000|rd_rn_rm(rt,0,rs));
-}
-
-static void emit_cmovs_reg(int rs,int rt)
-{
-  assem_debug("movmi %s,%s\n",regname[rt],regname[rs]);
-  output_w32(0x41a00000|rd_rn_rm(rt,0,rs));
-}
-
-static void emit_slti32(int rs,int imm,int rt)
-{
-  if(rs!=rt) emit_zeroreg(rt);
-  emit_cmpimm(rs,imm);
-  if(rs==rt) emit_movimm(0,rt);
-  emit_cmovl_imm(1,rt);
-}
-
-static void emit_sltiu32(int rs,int imm,int rt)
-{
-  if(rs!=rt) emit_zeroreg(rt);
-  emit_cmpimm(rs,imm);
-  if(rs==rt) emit_movimm(0,rt);
-  emit_cmovb_imm(1,rt);
-}
-
-static void emit_slti64_32(int rsh,int rsl,int imm,int rt)
-{
-  assert(rsh!=rt);
-  emit_slti32(rsl,imm,rt);
-  if(imm>=0)
-  {
-    emit_test(rsh,rsh);
-    emit_cmovne_imm(0,rt);
-    emit_cmovs_imm(1,rt);
-  }
-  else
-  {
-    emit_cmpimm(rsh,-1);
-    emit_cmovne_imm(0,rt);
-    emit_cmovl_imm(1,rt);
-  }
-}
-
-static void emit_sltiu64_32(int rsh,int rsl,int imm,int rt)
-{
-  assert(rsh!=rt);
-  emit_sltiu32(rsl,imm,rt);
-  if(imm>=0)
-  {
-    emit_test(rsh,rsh);
-    emit_cmovne_imm(0,rt);
-  }
-  else
-  {
-    emit_cmpimm(rsh,-1);
-    emit_cmovne_imm(1,rt);
-  }
-}
-
-static void emit_cmp(int rs,int rt)
-{
-  assem_debug("cmp %s,%s\n",regname[rs],regname[rt]);
-  output_w32(0xe1500000|rd_rn_rm(0,rs,rt));
-}
-
-static void emit_set_gz32(int rs, int rt)
-{
-  //assem_debug("set_gz32\n");
-  emit_cmpimm(rs,1);
-  emit_movimm(1,rt);
-  emit_cmovl_imm(0,rt);
-}
-
-static void emit_set_nz32(int rs, int rt)
-{
-  //assem_debug("set_nz32\n");
-  if(rs!=rt) emit_movs(rs,rt);
-  else emit_test(rs,rs);
-  emit_cmovne_imm(1,rt);
-}
-
-static void emit_set_gz64_32(int rsh, int rsl, int rt)
-{
-  //assem_debug("set_gz64\n");
-  emit_set_gz32(rsl,rt);
-  emit_test(rsh,rsh);
-  emit_cmovne_imm(1,rt);
-  emit_cmovs_imm(0,rt);
-}
-
-static void emit_set_nz64_32(int rsh, int rsl, int rt)
-{
-  //assem_debug("set_nz64\n");
-  emit_or_and_set_flags(rsh,rsl,rt);
-  emit_cmovne_imm(1,rt);
-}
-
-static void emit_set_if_less32(int rs1, int rs2, int rt)
-{
-  //assem_debug("set if less (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]);
-  if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt);
-  emit_cmp(rs1,rs2);
-  if(rs1==rt||rs2==rt) emit_movimm(0,rt);
-  emit_cmovl_imm(1,rt);
-}
-
-static void emit_set_if_carry32(int rs1, int rs2, int rt)
-{
-  //assem_debug("set if carry (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]);
-  if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt);
-  emit_cmp(rs1,rs2);
-  if(rs1==rt||rs2==rt) emit_movimm(0,rt);
-  emit_cmovb_imm(1,rt);
-}
-
-static void emit_set_if_less64_32(int u1, int l1, int u2, int l2, int rt)
-{
-  //assem_debug("set if less64 (%%%s,%%%s,%%%s,%%%s),%%%s\n",regname[u1],regname[l1],regname[u2],regname[l2],regname[rt]);
-  assert(u1!=rt);
-  assert(u2!=rt);
-  emit_cmp(l1,l2);
-  emit_movimm(0,rt);
-  emit_sbcs(u1,u2,HOST_TEMPREG);
-  emit_cmovl_imm(1,rt);
-}
-
-static void emit_set_if_carry64_32(int u1, int l1, int u2, int l2, int rt)
-{
-  //assem_debug("set if carry64 (%%%s,%%%s,%%%s,%%%s),%%%s\n",regname[u1],regname[l1],regname[u2],regname[l2],regname[rt]);
-  assert(u1!=rt);
-  assert(u2!=rt);
-  emit_cmp(l1,l2);
-  emit_movimm(0,rt);
-  emit_sbcs(u1,u2,HOST_TEMPREG);
-  emit_cmovb_imm(1,rt);
-}
-
-static void emit_call(int a)
-{
-  assem_debug("bl %x (%x+%x)\n",a,(int)out,a-(int)out-8);
-  u_int offset=genjmp(a);
-  output_w32(0xeb000000|offset);
-}
-
-static void emit_jmp(int a)
-{
-  assem_debug("b %x (%x+%x)\n",a,(int)out,a-(int)out-8);
-  u_int offset=genjmp(a);
-  output_w32(0xea000000|offset);
-}
-
-static void emit_jne(int a)
-{
-  assem_debug("bne %x\n",a);
-  u_int offset=genjmp(a);
-  output_w32(0x1a000000|offset);
-}
-
-static void emit_jeq(int a)
-{
-  assem_debug("beq %x\n",a);
-  u_int offset=genjmp(a);
-  output_w32(0x0a000000|offset);
-}
-
-static void emit_js(int a)
-{
-  assem_debug("bmi %x\n",a);
-  u_int offset=genjmp(a);
-  output_w32(0x4a000000|offset);
-}
-
-static void emit_jns(int a)
-{
-  assem_debug("bpl %x\n",a);
-  u_int offset=genjmp(a);
-  output_w32(0x5a000000|offset);
-}
-
-static void emit_jl(int a)
-{
-  assem_debug("blt %x\n",a);
-  u_int offset=genjmp(a);
-  output_w32(0xba000000|offset);
-}
-
-static void emit_jge(int a)
-{
-  assem_debug("bge %x\n",a);
-  u_int offset=genjmp(a);
-  output_w32(0xaa000000|offset);
-}
-
-static void emit_jno(int a)
-{
-  assem_debug("bvc %x\n",a);
-  u_int offset=genjmp(a);
-  output_w32(0x7a000000|offset);
-}
-
-static void emit_jc(int a)
-{
-  assem_debug("bcs %x\n",a);
-  u_int offset=genjmp(a);
-  output_w32(0x2a000000|offset);
-}
-
-static void emit_jcc(int a)
-{
-  assem_debug("bcc %x\n",a);
-  u_int offset=genjmp(a);
-  output_w32(0x3a000000|offset);
-}
-
-static void emit_callreg(u_int r)
-{
-  assert(r<15);
-  assem_debug("blx %s\n",regname[r]);
-  output_w32(0xe12fff30|r);
-}
-
-static void emit_jmpreg(u_int r)
-{
-  assem_debug("mov pc,%s\n",regname[r]);
-  output_w32(0xe1a00000|rd_rn_rm(15,0,r));
-}
-
-static void emit_readword_indexed(int offset, int rs, int rt)
-{
-  assert(offset>-4096&&offset<4096);
-  assem_debug("ldr %s,%s+%d\n",regname[rt],regname[rs],offset);
-  if(offset>=0) {
-    output_w32(0xe5900000|rd_rn_rm(rt,rs,0)|offset);
-  }else{
-    output_w32(0xe5100000|rd_rn_rm(rt,rs,0)|(-offset));
-  }
-}
-
-static void emit_readword_dualindexedx4(int rs1, int rs2, int rt)
-{
-  assem_debug("ldr %s,%s,%s lsl #2\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe7900000|rd_rn_rm(rt,rs1,rs2)|0x100);
-}
-
-static void emit_ldrcc_dualindexed(int rs1, int rs2, int rt)
-{
-  assem_debug("ldrcc %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0x37900000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_ldrccb_dualindexed(int rs1, int rs2, int rt)
-{
-  assem_debug("ldrccb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0x37d00000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_ldrccsb_dualindexed(int rs1, int rs2, int rt)
-{
-  assem_debug("ldrccsb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0x319000d0|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_ldrcch_dualindexed(int rs1, int rs2, int rt)
-{
-  assem_debug("ldrcch %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0x319000b0|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_ldrccsh_dualindexed(int rs1, int rs2, int rt)
-{
-  assem_debug("ldrccsh %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0x319000f0|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_readword_indexed_tlb(int addr, int rs, int map, int rt)
-{
-  if(map<0) emit_readword_indexed(addr, rs, rt);
-  else {
-    assert(addr==0);
-    emit_readword_dualindexedx4(rs, map, rt);
-  }
-}
-
-static void emit_readdword_indexed_tlb(int addr, int rs, int map, int rh, int rl)
-{
-  if(map<0) {
-    if(rh>=0) emit_readword_indexed(addr, rs, rh);
-    emit_readword_indexed(addr+4, rs, rl);
-  }else{
-    assert(rh!=rs);
-    if(rh>=0) emit_readword_indexed_tlb(addr, rs, map, rh);
-    emit_addimm(map,1,map);
-    emit_readword_indexed_tlb(addr, rs, map, rl);
-  }
-}
-
-static void emit_movsbl_indexed(int offset, int rs, int rt)
-{
-  assert(offset>-256&&offset<256);
-  assem_debug("ldrsb %s,%s+%d\n",regname[rt],regname[rs],offset);
-  if(offset>=0) {
-    output_w32(0xe1d000d0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf));
-  }else{
-    output_w32(0xe15000d0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf));
-  }
-}
-
-static void emit_movsbl_indexed_tlb(int addr, int rs, int map, int rt)
-{
-  if(map<0) emit_movsbl_indexed(addr, rs, rt);
-  else {
-    if(addr==0) {
-      emit_shlimm(map,2,map);
-      assem_debug("ldrsb %s,%s+%s\n",regname[rt],regname[rs],regname[map]);
-      output_w32(0xe19000d0|rd_rn_rm(rt,rs,map));
-    }else{
-      assert(addr>-256&&addr<256);
-      assem_debug("add %s,%s,%s,lsl #2\n",regname[rt],regname[rs],regname[map]);
-      output_w32(0xe0800000|rd_rn_rm(rt,rs,map)|(2<<7));
-      emit_movsbl_indexed(addr, rt, rt);
-    }
-  }
-}
-
-static void emit_movswl_indexed(int offset, int rs, int rt)
-{
-  assert(offset>-256&&offset<256);
-  assem_debug("ldrsh %s,%s+%d\n",regname[rt],regname[rs],offset);
-  if(offset>=0) {
-    output_w32(0xe1d000f0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf));
-  }else{
-    output_w32(0xe15000f0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf));
-  }
-}
-
-static void emit_movzbl_indexed(int offset, int rs, int rt)
-{
-  assert(offset>-4096&&offset<4096);
-  assem_debug("ldrb %s,%s+%d\n",regname[rt],regname[rs],offset);
-  if(offset>=0) {
-    output_w32(0xe5d00000|rd_rn_rm(rt,rs,0)|offset);
-  }else{
-    output_w32(0xe5500000|rd_rn_rm(rt,rs,0)|(-offset));
-  }
-}
-
-static void emit_movzbl_dualindexedx4(int rs1, int rs2, int rt)
-{
-  assem_debug("ldrb %s,%s,%s lsl #2\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe7d00000|rd_rn_rm(rt,rs1,rs2)|0x100);
-}
-
-static void emit_movzbl_indexed_tlb(int addr, int rs, int map, int rt)
-{
-  if(map<0) emit_movzbl_indexed(addr, rs, rt);
-  else {
-    if(addr==0) {
-      emit_movzbl_dualindexedx4(rs, map, rt);
-    }else{
-      emit_addimm(rs,addr,rt);
-      emit_movzbl_dualindexedx4(rt, map, rt);
-    }
-  }
-}
-
-static void emit_movzwl_indexed(int offset, int rs, int rt)
-{
-  assert(offset>-256&&offset<256);
-  assem_debug("ldrh %s,%s+%d\n",regname[rt],regname[rs],offset);
-  if(offset>=0) {
-    output_w32(0xe1d000b0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf));
-  }else{
-    output_w32(0xe15000b0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf));
-  }
-}
-
-static void emit_ldrd(int offset, int rs, int rt)
-{
-  assert(offset>-256&&offset<256);
-  assem_debug("ldrd %s,%s+%d\n",regname[rt],regname[rs],offset);
-  if(offset>=0) {
-    output_w32(0xe1c000d0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf));
-  }else{
-    output_w32(0xe14000d0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf));
-  }
-}
-
-static void emit_readword(int addr, int rt)
-{
-  u_int offset = addr-(u_int)&dynarec_local;
-  assert(offset<4096);
-  assem_debug("ldr %s,fp+%d\n",regname[rt],offset);
-  output_w32(0xe5900000|rd_rn_rm(rt,FP,0)|offset);
-}
-
-static unused void emit_movsbl(int addr, int rt)
-{
-  u_int offset = addr-(u_int)&dynarec_local;
-  assert(offset<256);
-  assem_debug("ldrsb %s,fp+%d\n",regname[rt],offset);
-  output_w32(0xe1d000d0|rd_rn_rm(rt,FP,0)|((offset<<4)&0xf00)|(offset&0xf));
-}
-
-static unused void emit_movswl(int addr, int rt)
-{
-  u_int offset = addr-(u_int)&dynarec_local;
-  assert(offset<256);
-  assem_debug("ldrsh %s,fp+%d\n",regname[rt],offset);
-  output_w32(0xe1d000f0|rd_rn_rm(rt,FP,0)|((offset<<4)&0xf00)|(offset&0xf));
-}
-
-static unused void emit_movzbl(int addr, int rt)
-{
-  u_int offset = addr-(u_int)&dynarec_local;
-  assert(offset<4096);
-  assem_debug("ldrb %s,fp+%d\n",regname[rt],offset);
-  output_w32(0xe5d00000|rd_rn_rm(rt,FP,0)|offset);
-}
-
-static unused void emit_movzwl(int addr, int rt)
-{
-  u_int offset = addr-(u_int)&dynarec_local;
-  assert(offset<256);
-  assem_debug("ldrh %s,fp+%d\n",regname[rt],offset);
-  output_w32(0xe1d000b0|rd_rn_rm(rt,FP,0)|((offset<<4)&0xf00)|(offset&0xf));
-}
-
-static void emit_writeword_indexed(int rt, int offset, int rs)
-{
-  assert(offset>-4096&&offset<4096);
-  assem_debug("str %s,%s+%d\n",regname[rt],regname[rs],offset);
-  if(offset>=0) {
-    output_w32(0xe5800000|rd_rn_rm(rt,rs,0)|offset);
-  }else{
-    output_w32(0xe5000000|rd_rn_rm(rt,rs,0)|(-offset));
-  }
-}
-
-static void emit_writeword_dualindexedx4(int rt, int rs1, int rs2)
-{
-  assem_debug("str %s,%s,%s lsl #2\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe7800000|rd_rn_rm(rt,rs1,rs2)|0x100);
-}
-
-static void emit_writeword_indexed_tlb(int rt, int addr, int rs, int map, int temp)
-{
-  if(map<0) emit_writeword_indexed(rt, addr, rs);
-  else {
-    assert(addr==0);
-    emit_writeword_dualindexedx4(rt, rs, map);
-  }
-}
-
-static void emit_writedword_indexed_tlb(int rh, int rl, int addr, int rs, int map, int temp)
-{
-  if(map<0) {
-    if(rh>=0) emit_writeword_indexed(rh, addr, rs);
-    emit_writeword_indexed(rl, addr+4, rs);
-  }else{
-    assert(rh>=0);
-    if(temp!=rs) emit_addimm(map,1,temp);
-    emit_writeword_indexed_tlb(rh, addr, rs, map, temp);
-    if(temp!=rs) emit_writeword_indexed_tlb(rl, addr, rs, temp, temp);
-    else {
-      emit_addimm(rs,4,rs);
-      emit_writeword_indexed_tlb(rl, addr, rs, map, temp);
-    }
-  }
-}
-
-static void emit_writehword_indexed(int rt, int offset, int rs)
-{
-  assert(offset>-256&&offset<256);
-  assem_debug("strh %s,%s+%d\n",regname[rt],regname[rs],offset);
-  if(offset>=0) {
-    output_w32(0xe1c000b0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf));
-  }else{
-    output_w32(0xe14000b0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf));
-  }
-}
-
-static void emit_writebyte_indexed(int rt, int offset, int rs)
-{
-  assert(offset>-4096&&offset<4096);
-  assem_debug("strb %s,%s+%d\n",regname[rt],regname[rs],offset);
-  if(offset>=0) {
-    output_w32(0xe5c00000|rd_rn_rm(rt,rs,0)|offset);
-  }else{
-    output_w32(0xe5400000|rd_rn_rm(rt,rs,0)|(-offset));
-  }
-}
-
-static void emit_writebyte_dualindexedx4(int rt, int rs1, int rs2)
-{
-  assem_debug("strb %s,%s,%s lsl #2\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0xe7c00000|rd_rn_rm(rt,rs1,rs2)|0x100);
-}
-
-static void emit_writebyte_indexed_tlb(int rt, int addr, int rs, int map, int temp)
-{
-  if(map<0) emit_writebyte_indexed(rt, addr, rs);
-  else {
-    if(addr==0) {
-      emit_writebyte_dualindexedx4(rt, rs, map);
-    }else{
-      emit_addimm(rs,addr,temp);
-      emit_writebyte_dualindexedx4(rt, temp, map);
-    }
-  }
-}
-
-static void emit_strcc_dualindexed(int rs1, int rs2, int rt)
-{
-  assem_debug("strcc %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0x37800000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_strccb_dualindexed(int rs1, int rs2, int rt)
-{
-  assem_debug("strccb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0x37c00000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_strcch_dualindexed(int rs1, int rs2, int rt)
-{
-  assem_debug("strcch %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0x318000b0|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_writeword(int rt, int addr)
-{
-  u_int offset = addr-(u_int)&dynarec_local;
-  assert(offset<4096);
-  assem_debug("str %s,fp+%d\n",regname[rt],offset);
-  output_w32(0xe5800000|rd_rn_rm(rt,FP,0)|offset);
-}
-
-static unused void emit_writehword(int rt, int addr)
-{
-  u_int offset = addr-(u_int)&dynarec_local;
-  assert(offset<256);
-  assem_debug("strh %s,fp+%d\n",regname[rt],offset);
-  output_w32(0xe1c000b0|rd_rn_rm(rt,FP,0)|((offset<<4)&0xf00)|(offset&0xf));
-}
-
-static unused void emit_writebyte(int rt, int addr)
-{
-  u_int offset = addr-(u_int)&dynarec_local;
-  assert(offset<4096);
-  assem_debug("strb %s,fp+%d\n",regname[rt],offset);
-  output_w32(0xe5c00000|rd_rn_rm(rt,FP,0)|offset);
-}
-
-static void emit_umull(u_int rs1, u_int rs2, u_int hi, u_int lo)
-{
-  assem_debug("umull %s, %s, %s, %s\n",regname[lo],regname[hi],regname[rs1],regname[rs2]);
-  assert(rs1<16);
-  assert(rs2<16);
-  assert(hi<16);
-  assert(lo<16);
-  output_w32(0xe0800090|(hi<<16)|(lo<<12)|(rs2<<8)|rs1);
-}
-
-static void emit_smull(u_int rs1, u_int rs2, u_int hi, u_int lo)
-{
-  assem_debug("smull %s, %s, %s, %s\n",regname[lo],regname[hi],regname[rs1],regname[rs2]);
-  assert(rs1<16);
-  assert(rs2<16);
-  assert(hi<16);
-  assert(lo<16);
-  output_w32(0xe0c00090|(hi<<16)|(lo<<12)|(rs2<<8)|rs1);
-}
-
-static void emit_clz(int rs,int rt)
-{
-  assem_debug("clz %s,%s\n",regname[rt],regname[rs]);
-  output_w32(0xe16f0f10|rd_rn_rm(rt,0,rs));
-}
-
-static void emit_subcs(int rs1,int rs2,int rt)
-{
-  assem_debug("subcs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0x20400000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_shrcc_imm(int rs,u_int imm,int rt)
-{
-  assert(imm>0);
-  assert(imm<32);
-  assem_debug("lsrcc %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  output_w32(0x31a00000|rd_rn_rm(rt,0,rs)|0x20|(imm<<7));
-}
-
-static void emit_shrne_imm(int rs,u_int imm,int rt)
-{
-  assert(imm>0);
-  assert(imm<32);
-  assem_debug("lsrne %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  output_w32(0x11a00000|rd_rn_rm(rt,0,rs)|0x20|(imm<<7));
-}
-
-static void emit_negmi(int rs, int rt)
-{
-  assem_debug("rsbmi %s,%s,#0\n",regname[rt],regname[rs]);
-  output_w32(0x42600000|rd_rn_rm(rt,rs,0));
-}
-
-static void emit_negsmi(int rs, int rt)
-{
-  assem_debug("rsbsmi %s,%s,#0\n",regname[rt],regname[rs]);
-  output_w32(0x42700000|rd_rn_rm(rt,rs,0));
-}
-
-static void emit_orreq(u_int rs1,u_int rs2,u_int rt)
-{
-  assem_debug("orreq %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0x01800000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_orrne(u_int rs1,u_int rs2,u_int rt)
-{
-  assem_debug("orrne %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
-  output_w32(0x11800000|rd_rn_rm(rt,rs1,rs2));
-}
-
-static void emit_bic_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt)
-{
-  assem_debug("bic %s,%s,%s lsl %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]);
-  output_w32(0xe1C00000|rd_rn_rm(rt,rs1,rs2)|0x10|(shift<<8));
-}
-
-static void emit_biceq_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt)
-{
-  assem_debug("biceq %s,%s,%s lsl %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]);
-  output_w32(0x01C00000|rd_rn_rm(rt,rs1,rs2)|0x10|(shift<<8));
-}
-
-static void emit_bicne_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt)
-{
-  assem_debug("bicne %s,%s,%s lsl %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]);
-  output_w32(0x11C00000|rd_rn_rm(rt,rs1,rs2)|0x10|(shift<<8));
-}
-
-static void emit_bic_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt)
-{
-  assem_debug("bic %s,%s,%s lsr %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]);
-  output_w32(0xe1C00000|rd_rn_rm(rt,rs1,rs2)|0x30|(shift<<8));
-}
-
-static void emit_biceq_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt)
-{
-  assem_debug("biceq %s,%s,%s lsr %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]);
-  output_w32(0x01C00000|rd_rn_rm(rt,rs1,rs2)|0x30|(shift<<8));
-}
-
-static void emit_bicne_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt)
-{
-  assem_debug("bicne %s,%s,%s lsr %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]);
-  output_w32(0x11C00000|rd_rn_rm(rt,rs1,rs2)|0x30|(shift<<8));
-}
-
-static void emit_teq(int rs, int rt)
-{
-  assem_debug("teq %s,%s\n",regname[rs],regname[rt]);
-  output_w32(0xe1300000|rd_rn_rm(0,rs,rt));
-}
-
-// Emit "rsb rt,rs,#imm": rt = imm - rs. The immediate is encoded by
-// genimm_checked() (presumably it insists on an encodable value).
-static void emit_rsbimm(int rs, int imm, int rt)
-{
-  u_int imm_enc;
-  genimm_checked(imm,&imm_enc);
-  assem_debug("rsb %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  u_int insn=0xe2600000|rd_rn_rm(rt,rs,0);
-  output_w32(insn|imm_enc);
-}
-
-// Load two immediates (imm1 -> rt1, imm2 -> rt2), favouring small code:
-// after loading imm1, rt2 is derived from rt1 with a single add/sub when the
-// difference fits an ARM immediate; otherwise imm2 is loaded on its own.
-static void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2)
-{
-  u_int delta_enc;
-  emit_movimm(imm1,rt1);
-  if(genimm(imm2-imm1,&delta_enc)) {
-    // rt2 = rt1 + (imm2 - imm1)
-    assem_debug("add %s,%s,#%d\n",regname[rt2],regname[rt1],imm2-imm1);
-    output_w32(0xe2800000|rd_rn_rm(rt2,rt1,0)|delta_enc);
-    return;
-  }
-  if(genimm(imm1-imm2,&delta_enc)) {
-    // rt2 = rt1 - (imm1 - imm2)
-    assem_debug("sub %s,%s,#%d\n",regname[rt2],regname[rt1],imm1-imm2);
-    output_w32(0xe2400000|rd_rn_rm(rt2,rt1,0)|delta_enc);
-    return;
-  }
-  // No short form available: load the second value independently.
-  emit_movimm(imm2,rt2);
-}
-
-// Select between two immediates on the current flags, favouring small code:
-// rt ends up holding imm1 when EQ and imm2 when NE.
-// This will only be called if HAVE_CMOV_IMM is defined.
-static void emit_cmov2imm_e_ne_compact(int imm1,int imm2,u_int rt)
-{
-  u_int delta_enc;
-  if(genimm(imm2-imm1,&delta_enc)) {
-    // Load imm1, then conditionally add the difference.
-    emit_movimm(imm1,rt);
-    assem_debug("addne %s,%s,#%d\n",regname[rt],regname[rt],imm2-imm1);
-    output_w32(0x12800000|rd_rn_rm(rt,rt,0)|delta_enc);
-    return;
-  }
-  if(genimm(imm1-imm2,&delta_enc)) {
-    // Load imm1, then conditionally subtract the difference.
-    emit_movimm(imm1,rt);
-    assem_debug("subne %s,%s,#%d\n",regname[rt],regname[rt],imm1-imm2);
-    output_w32(0x12400000|rd_rn_rm(rt,rt,0)|delta_enc);
-    return;
-  }
-  #ifndef HAVE_ARMV7
-  // Pre-ARMv7: load imm1, then conditionally replace it from the literal pool.
-  emit_movimm(imm1,rt);
-  add_literal((int)out,imm2);
-  assem_debug("ldrne %s,pc+? [=%x]\n",regname[rt],imm2);
-  output_w32(0x15900000|rd_rn_rm(rt,15,0));
-  #else
-  // ARMv7: build imm1 with movw/movt and conditionally overwrite each
-  // 16-bit half that differs from imm2.
-  emit_movw(imm1&0x0000FFFF,rt);
-  if((imm1&0xFFFF)!=(imm2&0xFFFF)) {
-    assem_debug("movwne %s,#%d (0x%x)\n",regname[rt],imm2&0xFFFF,imm2&0xFFFF);
-    output_w32(0x13000000|rd_rn_rm(rt,0,0)|(imm2&0xfff)|((imm2<<4)&0xf0000));
-  }
-  emit_movt(imm1&0xFFFF0000,rt);
-  if((imm1&0xFFFF0000)!=(imm2&0xFFFF0000)) {
-    assem_debug("movtne %s,#%d (0x%x)\n",regname[rt],imm2&0xffff0000,imm2&0xffff0000);
-    output_w32(0x13400000|rd_rn_rm(rt,0,0)|((imm2>>16)&0xfff)|((imm2>>12)&0xf0000));
-  }
-  #endif
-}
-
-// Special case for checking invalid_code: load the byte at
-// base + (r >> 12) into HOST_TEMPREG and compare it against imm.
-static void emit_cmpmem_indexedsr12_reg(int base,int r,int imm)
-{
-  assert(imm<128&&imm>=0);
-  assert(r>=0&&r<16);
-  assem_debug("ldrb lr,%s,%s lsr #12\n",regname[base],regname[r]);
-  u_int insn=0xe7d00000|rd_rn_rm(HOST_TEMPREG,base,r);
-  insn|=0x620; // index register shifted: LSR #12
-  output_w32(insn);
-  emit_cmpimm(HOST_TEMPREG,imm);
-}
-
-// Emit "blne a": branch-with-link to address a, taken only when NE.
-static void emit_callne(int a)
-{
-  assem_debug("blne %x\n",a);
-  u_int insn=0x1b000000;
-  insn|=genjmp(a); // branch offset field computed by genjmp()
-  output_w32(insn);
-}
-
-// Emit "pld [r]", a cache preload hint; used to preload hash table entries.
-static unused void emit_prefetchreg(int r)
-{
-  assem_debug("pld %s\n",regname[r]);
-  u_int insn=0xf5d0f000;
-  insn|=rd_rn_rm(0,r,0); // only the base register field is filled in
-  output_w32(insn);
-}
-
-// Special case for mini_ht: emit "ldreq rt,[rs,#offset]", a word load that
-// only executes when EQ. The offset must fit the 12-bit immediate field.
-static void emit_ldreq_indexed(int rs, u_int offset, int rt)
-{
-  assert(offset<4096);
-  assem_debug("ldreq %s,[%s, #%d]\n",regname[rt],regname[rs],offset);
-  u_int insn=0x05900000|rd_rn_rm(rt,rs,0);
-  output_w32(insn|offset);
-}
-
-// Emit "bicne rt,rs,#imm": rt = rs & ~imm, executed only when NE.
-static unused void emit_bicne_imm(int rs,int imm,int rt)
-{
-  u_int imm_enc;
-  genimm_checked(imm,&imm_enc);
-  assem_debug("bicne %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  u_int insn=0x13c00000|rd_rn_rm(rt,rs,0);
-  output_w32(insn|imm_enc);
-}
-
-// Emit "biccs rt,rs,#imm": rt = rs & ~imm, executed only when CS (carry set).
-static unused void emit_biccs_imm(int rs,int imm,int rt)
-{
-  u_int imm_enc;
-  genimm_checked(imm,&imm_enc);
-  assem_debug("biccs %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  u_int insn=0x23c00000|rd_rn_rm(rt,rs,0);
-  output_w32(insn|imm_enc);
-}
-
-// Emit "bicvc rt,rs,#imm": rt = rs & ~imm, executed only when VC
-// (overflow clear).
-static unused void emit_bicvc_imm(int rs,int imm,int rt)
-{
-  u_int imm_enc;
-  genimm_checked(imm,&imm_enc);
-  assem_debug("bicvc %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  u_int insn=0x73c00000|rd_rn_rm(rt,rs,0);
-  output_w32(insn|imm_enc);
-}
-
-// Emit "bichi rt,rs,#imm": rt = rs & ~imm, executed only when HI
-// (unsigned higher).
-static unused void emit_bichi_imm(int rs,int imm,int rt)
-{
-  u_int imm_enc;
-  genimm_checked(imm,&imm_enc);
-  assem_debug("bichi %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  u_int insn=0x83c00000|rd_rn_rm(rt,rs,0);
-  output_w32(insn|imm_enc);
-}
-
-// Emit "orrvs rt,rs,#imm": rt = rs | imm, executed only when VS
-// (overflow set).
-static unused void emit_orrvs_imm(int rs,int imm,int rt)
-{
-  u_int imm_enc;
-  genimm_checked(imm,&imm_enc);
-  assem_debug("orrvs %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  u_int insn=0x63800000|rd_rn_rm(rt,rs,0);
-  output_w32(insn|imm_enc);
-}
-
-// Emit "orrne rt,rs,#imm": rt = rs | imm, executed only when NE.
-static void emit_orrne_imm(int rs,int imm,int rt)
-{
-  u_int imm_enc;
-  genimm_checked(imm,&imm_enc);
-  assem_debug("orrne %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  u_int insn=0x13800000|rd_rn_rm(rt,rs,0);
-  output_w32(insn|imm_enc);
-}
-
-// Emit "andne rt,rs,#imm": rt = rs & imm, executed only when NE.
-static void emit_andne_imm(int rs,int imm,int rt)
-{
-  u_int imm_enc;
-  genimm_checked(imm,&imm_enc);
-  assem_debug("andne %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  u_int insn=0x12000000|rd_rn_rm(rt,rs,0);
-  output_w32(insn|imm_enc);
-}
-
-// Emit "addpl rt,rs,#imm": rt = rs + imm, executed only when PL
-// (result non-negative).
-static unused void emit_addpl_imm(int rs,int imm,int rt)
-{
-  u_int imm_enc;
-  genimm_checked(imm,&imm_enc);
-  assem_debug("addpl %s,%s,#%d\n",regname[rt],regname[rs],imm);
-  u_int insn=0x52800000|rd_rn_rm(rt,rs,0);
-  output_w32(insn|imm_enc);
-}
-
-// "Jump if no overflow" expressed as "addvc pc,pc,#imm" rather than a
-// branch instruction. The immediate is emitted as zero here; the target a is
-// only used for the debug listing.
-static void emit_jno_unlikely(int a)
-{
-  assem_debug("addvc pc,pc,#? (%x)\n",a);
-  u_int insn=0x72800000;
-  insn|=rd_rn_rm(15,15,0); // destination and first operand are both pc
-  output_w32(insn);
-}
-
-// Emit "stmia fp,{...}" storing every register whose bit is set in reglist
-// to the memory addressed by fp. Emits nothing for an empty list.
-static void save_regs_all(u_int reglist)
-{
-  int reg;
-  if(reglist==0)
-    return;
-  assem_debug("stmia fp,{");
-  for(reg=0;reg<16;reg++) {
-    if(((reglist>>reg)&1)==0)
-      continue;
-    assem_debug("r%d,",reg);
-  }
-  assem_debug("}\n");
-  output_w32(0xe88b0000|reglist);
-}
-
-// Emit "ldmia fp,{...}" reloading every register whose bit is set in reglist
-// from the memory addressed by fp. Emits nothing for an empty list.
-static void restore_regs_all(u_int reglist)
-{
-  int reg;
-  if(reglist==0)
-    return;
-  assem_debug("ldmia fp,{");
-  for(reg=0;reg<16;reg++) {
-    if(((reglist>>reg)&1)==0)
-      continue;
-    assem_debug("r%d,",reg);
-  }
-  assem_debug("}\n");
-  output_w32(0xe89b0000|reglist);
-}
-
-// Save registers before a function call. Only the caller-save registers
-// (r0-r3, r12) present in reglist are stored.
-static void save_regs(u_int reglist)
-{
-  save_regs_all(reglist&CALLER_SAVE_REGS);
-}
-
-// Restore registers after a function call; the counterpart of save_regs(),
-// limited to the same caller-save subset of reglist.
-static void restore_regs(u_int reglist)
-{
-  restore_regs_all(reglist&CALLER_SAVE_REGS);
-}
-
-/* Stubs/epilogue */
-
-// Flush the pending literal pool at the current output position.
-// Each literals[] entry holds {address of the referencing instruction, value}.
-// With n!=0 the flush is skipped while the oldest pending reference is still
-// less than 4096-n bytes behind the output pointer; n==0 forces the flush.
-static void literal_pool(int n)
-{
-  if(!literalcount) return;
-  if(n) {
-    if((int)out-literals[0][0]<4096-n) return;
-  }
-  u_int *ptr;
-  int i;
-  for(i=0;i<literalcount;i++)
-  {
-    u_int l_addr=(u_int)out;
-    int j;
-    // Reuse the slot of an earlier literal with the same value, if any.
-    for(j=0;j<i;j++) {
-      if(literals[j][1]==literals[i][1]) {
-        //printf("dup %08x\n",literals[i][1]);
-        l_addr=literals[j][0];
-        break;
-      }
-    }
-    ptr=(u_int *)literals[i][0];
-    // pc-relative offset; the -8 accounts for pc reading two instructions ahead
-    u_int offset=l_addr-(u_int)ptr-8;
-    assert(offset<4096);
-    assert(!(offset&3));
-    *ptr|=offset; // patch the offset into the already-emitted load
-    if(l_addr==(u_int)out) {
-      literals[i][0]=l_addr; // remember for dupes
-      output_w32(literals[i][1]);
-    }
-  }
-  literalcount=0;
-}
-
-// Flush the literal pool in the middle of straight-line code: emit a branch
-// over the pool, dump the pool, then point the branch just past it.
-// The skip rule for n is the same one literal_pool() applies.
-static void literal_pool_jumpover(int n)
-{
-  if(literalcount==0)
-    return;
-  if(n!=0&&(int)out-literals[0][0]<4096-n)
-    return;
-  int branch_addr=(int)out;
-  emit_jmp(0); // placeholder target, patched below
-  literal_pool(0);
-  set_jump_target(branch_addr,(int)out);
-}
-
-// Emit a stub for a jump that leaves the current block: r0 receives the
-// target address, r1 the address of the branch instruction at addr, and
-// control is passed to the given linker routine.
-static void emit_extjump2(u_int addr, int target, int linker)
-{
-  u_char *branch=(u_char *)addr;
-  // The instruction at addr must already be a branch.
-  assert((branch[3]&0x0e)==0xa);
-  (void)branch;
-
-  emit_loadlp(target,0);
-  emit_loadlp(addr,1);
-  assert(addr>=BASE_ADDR&&addr<(BASE_ADDR+(1<<TARGET_SIZE_2)));
-#ifdef DEBUG_CYCLE_COUNT
-  // Debug aid: recompute the cycle bookkeeping before leaving the block.
-  emit_readword((int)&last_count,ECX);
-  emit_add(HOST_CCREG,ECX,HOST_CCREG);
-  emit_readword((int)&next_interupt,ECX);
-  emit_writeword(HOST_CCREG,(int)&Count);
-  emit_sub(HOST_CCREG,ECX,HOST_CCREG);
-  emit_writeword(ECX,(int)&last_count);
-#endif
-  emit_jmp(linker);
-}
-
-// External jump stub that hands off to dyna_linker.
-static void emit_extjump(int addr, int target)
-{
-  int linker=(int)dyna_linker;
-  emit_extjump2(addr, target, linker);
-}
-
-// External jump stub that hands off to dyna_linker_ds.
-static void emit_extjump_ds(int addr, int target)
-{
-  int linker=(int)dyna_linker_ds;
-  emit_extjump2(addr, target, linker);
-}
-
-// Put rt_val into rt, potentially making use of rs, which is known to hold
-// rs_val. Tried in order: mov, mvn, add/sub relative to rs, and finally a
-// general immediate load.
-static void emit_movimm_from(u_int rs_val,int rs,u_int rt_val,int rt)
-{
-  u_int enc;
-  int delta;
-
-  // Directly encodable constant.
-  if(genimm(rt_val,&enc)) {
-    assem_debug("mov %s,#%d\n",regname[rt],rt_val);
-    output_w32(0xe3a00000|rd_rn_rm(rt,0,0)|enc);
-    return;
-  }
-  // Encodable once inverted.
-  if(genimm(~rt_val,&enc)) {
-    assem_debug("mvn %s,#%d\n",regname[rt],rt_val);
-    output_w32(0xe3e00000|rd_rn_rm(rt,0,0)|enc);
-    return;
-  }
-  // Reachable from the known value in rs with one add or sub.
-  delta=rt_val-rs_val;
-  if(genimm(delta,&enc)) {
-    assem_debug("add %s,%s,#%d\n",regname[rt],regname[rs],delta);
-    output_w32(0xe2800000|rd_rn_rm(rt,rs,0)|enc);
-    return;
-  }
-  if(genimm(-delta,&enc)) {
-    assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],-delta);
-    output_w32(0xe2400000|rd_rn_rm(rt,rs,0)|enc);
-    return;
-  }
-  emit_movimm(rt_val,rt);
-}
-
-// Return 1 if emit_movimm_from() can do its job cheaply, i.e. v1 and v2 are
-// equal or their difference (either sign) reduces to fewer than 9
-// significant bits once trailing pairs of zero bits are dropped; 0 otherwise.
-static int is_similar_value(u_int v1,u_int v2)
-{
-  u_int bits;
-  int delta;
-
-  if(v1==v2)
-    return 1;
-  delta=v2-v1;
-
-  // Positive difference.
-  bits=delta;
-  while(bits!=0&&(bits&3)==0)
-    bits>>=2;
-  if(bits<0x100)
-    return 1;
-
-  // Negated difference.
-  bits=-delta;
-  while(bits!=0&&(bits&3)==0)
-    bits>>=2;
-  return bits<0x100;
-}
-
-// Move host registers a0 and a1 into the argument registers r0 and r1.
-// A negative register number means "nothing to pass" for that slot.
-// Trashes r2 when the two values have to be swapped.
-static void pass_args(int a0, int a1)
-{
-  if(a1==0&&a0!=0) {
-    // The second argument currently lives in r0: move it out of the way
-    // before r0 is overwritten.
-    if(a0==1) {
-      // must swap r0 and r1, going through r2
-      emit_mov(a0,2);
-      emit_mov(a1,1);
-      emit_mov(2,0);
-    }
-    else {
-      emit_mov(a1,1);
-      if(a0>=0)
-        emit_mov(a0,0);
-    }
-    return;
-  }
-  if(a0>0)
-    emit_mov(a0,0);
-  if(a1>=0&&a1!=1)
-    emit_mov(a1,1);
-}
-
-// Move a loaded value from rs to rt, applying the sign/zero extension that
-// matches the load stub type. Any other type is a programming error.
-static void mov_loadtype_adj(int type,int rs,int rt)
-{
-  if(type==LOADB_STUB) {
-    emit_signextend8(rs,rt);
-  }
-  else if(type==LOADBU_STUB) {
-    emit_andimm(rs,0xff,rt);
-  }
-  else if(type==LOADH_STUB) {
-    emit_signextend16(rs,rt);
-  }
-  else if(type==LOADHU_STUB) {
-    emit_andimm(rs,0xffff,rt);
-  }
-  else if(type==LOADW_STUB) {
-    // full word: a plain move, skipped when already in place
-    if(rs!=rt)
-      emit_mov(rs,rt);
-  }
-  else {
-    assert(0);
-  }
-}
-
-#include "../backends/psx/pcsxmem.h"
-#include "../backends/psx/pcsxmem_inline.c"
-
-// Emit the out-of-line slow path for a memory load (stub n).
-// Fields of stubs[n] as used here: [0] stub type, [1] address of the branch
-// to patch so it lands on this stub, [2] return address, [3] instruction
-// index, [4] host register holding the address, [5] struct regstat pointer,
-// [6] cycle adjustment, [7] list of live host registers.
-// The stub first looks the address up in mem_rtab; the table-driven load is
-// attempted inline, otherwise one of the jump_handler_read* routines is
-// called.
-static void do_readstub(int n)
-{
-  assem_debug("do_readstub %x\n",start+stubs[n][3]*4);
-  literal_pool(256);
-  set_jump_target(stubs[n][1],(int)out);
-  int type=stubs[n][0];
-  int i=stubs[n][3];
-  int rs=stubs[n][4];
-  struct regstat *i_regs=(struct regstat *)stubs[n][5];
-  u_int reglist=stubs[n][7];
-  signed char *i_regmap=i_regs->regmap;
-  int rt;
-  if(itype[i]==C1LS||itype[i]==C2LS||itype[i]==LOADLR) {
-    rt=get_reg(i_regmap,FTEMP);
-  }else{
-    rt=get_reg(i_regmap,rt1[i]);
-  }
-  assert(rs>=0);
-  int r,temp=-1,temp2=HOST_TEMPREG,regs_saved=0,restore_jump=0;
-  reglist|=(1<<rs);
-  // Look for a free scratch register among r0-r9 and r12 (mask 0x13ff).
-  for(r=0;r<=12;r++) {
-    if(((1<<r)&0x13ff)&&((1<<r)&reglist)==0) {
-      temp=r; break;
-    }
-  }
-  // The destination register gets overwritten anyway, no need to preserve it.
-  if(rt>=0&&rt1[i]!=0)
-    reglist&=~(1<<rt);
-  if(temp==-1) {
-    // Nothing free: save the registers up front and take r0 (or r2 if the
-    // address is in r0).
-    save_regs(reglist);
-    regs_saved=1;
-    temp=(rs==0)?2:0;
-  }
-  // Prefer r1 for the table entry when it is available.
-  if((regs_saved||(reglist&2)==0)&&temp!=1&&rs!=1)
-    temp2=1;
-  emit_readword((int)&mem_rtab,temp);
-  emit_shrimm(rs,12,temp2);
-  emit_readword_dualindexedx4(temp,temp2,temp2);
-  // Shift the table entry left by one with flags updated; the conditional
-  // loads and the jcc below depend on the resulting flags.
-  emit_lsls_imm(temp2,1,temp2);
-  if(itype[i]==C1LS||itype[i]==C2LS||(rt>=0&&rt1[i]!=0)) {
-    switch(type) {
-      case LOADB_STUB:  emit_ldrccsb_dualindexed(temp2,rs,rt); break;
-      case LOADBU_STUB: emit_ldrccb_dualindexed(temp2,rs,rt); break;
-      case LOADH_STUB:  emit_ldrccsh_dualindexed(temp2,rs,rt); break;
-      case LOADHU_STUB: emit_ldrcch_dualindexed(temp2,rs,rt); break;
-      case LOADW_STUB:  emit_ldrcc_dualindexed(temp2,rs,rt); break;
-    }
-  }
-  if(regs_saved) {
-    restore_jump=(int)out;
-    emit_jcc(0); // jump to reg restore
-  }
-  else
-    emit_jcc(stubs[n][2]); // return address
-
-  // From here on: handler call path.
-  if(!regs_saved)
-    save_regs(reglist);
-  int handler=0;
-  if(type==LOADB_STUB||type==LOADBU_STUB)
-    handler=(int)jump_handler_read8;
-  if(type==LOADH_STUB||type==LOADHU_STUB)
-    handler=(int)jump_handler_read16;
-  if(type==LOADW_STUB)
-    handler=(int)jump_handler_read32;
-  assert(handler!=0);
-  // r0 = address, r1 = shifted table entry, r2 = adjusted cycle count
-  pass_args(rs,temp2);
-  int cc=get_reg(i_regmap,CCREG);
-  if(cc<0)
-    emit_loadreg(CCREG,2);
-  emit_addimm(cc<0?2:cc,CLOCK_ADJUST((int)stubs[n][6]+1),2);
-  emit_call(handler);
-  if(itype[i]==C1LS||itype[i]==C2LS||(rt>=0&&rt1[i]!=0)) {
-    // result comes back in r0
-    mov_loadtype_adj(type,0,rt);
-  }
-  if(restore_jump)
-    set_jump_target(restore_jump,(int)out);
-  restore_regs(reglist);
-  emit_jmp(stubs[n][2]); // return address
-}
-
-// Return the memhandler for addr, or get the directly accessible host
-// address (stored in *addr_host) and return 0.
-//
-// table is indexed by addr>>12. An entry with bit 31 clear encodes a direct
-// mapping: (entry<<1)+addr is the host address. An entry with bit 31 set
-// points (after <<1) to a second-level array holding per-word entries first,
-// then per-halfword entries at index 0x1000/4, then per-byte entries at
-// index 0x1000/4+0x1000/2; which part is consulted depends on type.
-// Second-level entries use the same bit-31 convention, but a direct entry
-// there is relative to the offset inside the page (addr&0xfff).
-static u_int get_direct_memhandler(void *table,u_int addr,int type,u_int *addr_host)
-{
-  // Unsigned constant: 1<<31 would shift into the sign bit of a signed int.
-  const u_int handler_flag=1u<<31;
-  u_int l1,l2=0;
-  l1=((u_int *)table)[addr>>12];
-  if((l1&handler_flag)==0) {
-    u_int v=l1<<1;
-    *addr_host=v+addr;
-    return 0;
-  }
-  else {
-    l1<<=1;
-    if(type==LOADB_STUB||type==LOADBU_STUB||type==STOREB_STUB)
-      l2=((u_int *)l1)[0x1000/4 + 0x1000/2 + (addr&0xfff)];
-    else if(type==LOADH_STUB||type==LOADHU_STUB||type==STOREH_STUB)
-      l2=((u_int *)l1)[0x1000/4 + (addr&0xfff)/2];
-    else
-      l2=((u_int *)l1)[(addr&0xfff)/4];
-    if((l2&handler_flag)==0) {
-      u_int v=l2<<1;
-      *addr_host=v+(addr&0xfff);
-      return 0;
-    }
-    return l2<<1;
-  }
-}
-
-// Emit inline code for a load from the constant address addr.
-// type is the load stub type, i the instruction index, regmap the host
-// register map, target the guest register being loaded, adj the cycle
-// adjustment and reglist the live host registers.
-// Depending on what mem_rtab says about addr this becomes a direct host
-// memory access or a call to a memory handler.
-static void inline_readstub(int type, int i, u_int addr, signed char regmap[], int target, int adj, u_int reglist)
-{
-  int rs=get_reg(regmap,target);
-  int rt=get_reg(regmap,target);
-  if(rs<0) rs=get_reg(regmap,-1);
-  assert(rs>=0);
-  u_int handler,host_addr=0,is_dynamic,far_call=0;
-  int cc=get_reg(regmap,CCREG);
-  if(pcsx_direct_read(type,addr,CLOCK_ADJUST(adj+1),cc,target?rs:-1,rt))
-    return;
-  handler=get_direct_memhandler(mem_rtab,addr,type,&host_addr);
-  if (handler==0) {
-    // Directly mapped memory: no handler call needed.
-    if(rt<0||rt1[i]==0)
-      return;
-    if(addr!=host_addr)
-      emit_movimm_from(addr,rs,host_addr,rs);
-    switch(type) {
-      case LOADB_STUB:  emit_movsbl_indexed(0,rs,rt); break;
-      case LOADBU_STUB: emit_movzbl_indexed(0,rs,rt); break;
-      case LOADH_STUB:  emit_movswl_indexed(0,rs,rt); break;
-      case LOADHU_STUB: emit_movzwl_indexed(0,rs,rt); break;
-      case LOADW_STUB:  emit_readword_indexed(0,rs,rt); break;
-      default:          assert(0);
-    }
-    return;
-  }
-  is_dynamic=pcsxmem_is_handler_dynamic(addr);
-  if(is_dynamic) {
-    // Go through the generic jump_handler_read* routines instead of the
-    // handler found above.
-    if(type==LOADB_STUB||type==LOADBU_STUB)
-      handler=(int)jump_handler_read8;
-    if(type==LOADH_STUB||type==LOADHU_STUB)
-      handler=(int)jump_handler_read16;
-    if(type==LOADW_STUB)
-      handler=(int)jump_handler_read32;
-  }
-
-  // call a memhandler
-  if(rt>=0&&rt1[i]!=0)
-    reglist&=~(1<<rt);
-  save_regs(reglist);
-  if(target==0)
-    emit_movimm(addr,0);
-  else if(rs!=0)
-    emit_mov(rs,0);
-  // +-32MB is the reach of a direct branch-with-link
-  int offset=(int)handler-(int)out-8;
-  if(offset<-33554432||offset>=33554432) {
-    // unreachable memhandler, a plugin func perhaps
-    emit_movimm(handler,12);
-    far_call=1;
-  }
-  if(cc<0)
-    emit_loadreg(CCREG,2);
-  if(is_dynamic) {
-    // r1 = shifted table entry, r2 = adjusted cycle count
-    emit_movimm(((u_int *)mem_rtab)[addr>>12]<<1,1);
-    emit_addimm(cc<0?2:cc,CLOCK_ADJUST(adj+1),2);
-  }
-  else {
-    // Store last_count + adjusted cycle count to Count before the call.
-    emit_readword((int)&last_count,3);
-    emit_addimm(cc<0?2:cc,CLOCK_ADJUST(adj+1),2);
-    emit_add(2,3,2);
-    emit_writeword(2,(int)&Count);
-  }
-
-  if(far_call)
-    emit_callreg(12);
-  else
-    emit_call(handler);
-
-  // The result is taken from r0 and extended according to the access width.
-  if(rt>=0&&rt1[i]!=0) {
-    switch(type) {
-      case LOADB_STUB:  emit_signextend8(0,rt); break;
-      case LOADBU_STUB: emit_andimm(0,0xff,rt); break;
-      case LOADH_STUB:  emit_signextend16(0,rt); break;
-      case LOADHU_STUB: emit_andimm(0,0xffff,rt); break;
-      case LOADW_STUB:  if(rt!=0) emit_mov(0,rt); break;
-      default:          assert(0);
-    }
-  }
-  restore_regs(reglist);
-}
-
-// Emit the out-of-line slow path for a memory store (stub n).
-// Fields of stubs[n] as used here: [0] stub type, [1] address of the branch
-// to patch so it lands on this stub, [2] return address, [3] instruction
-// index, [4] host register holding the address, [5] struct regstat pointer,
-// [6] cycle adjustment, [7] list of live host registers.
-// The stub looks the address up in mem_wtab; the table-driven store is
-// attempted inline, otherwise one of the jump_handler_write* routines is
-// called.
-static void do_writestub(int n)
-{
-  assem_debug("do_writestub %x\n",start+stubs[n][3]*4);
-  literal_pool(256);
-  set_jump_target(stubs[n][1],(int)out);
-  int type=stubs[n][0];
-  int i=stubs[n][3];
-  int rs=stubs[n][4];
-  struct regstat *i_regs=(struct regstat *)stubs[n][5];
-  u_int reglist=stubs[n][7];
-  signed char *i_regmap=i_regs->regmap;
-  int rt,r;
-  if(itype[i]==C1LS||itype[i]==C2LS) {
-    rt=get_reg(i_regmap,r=FTEMP);
-  }else{
-    rt=get_reg(i_regmap,r=rs2[i]);
-  }
-  assert(rs>=0);
-  assert(rt>=0);
-  int rtmp,temp=-1,temp2=HOST_TEMPREG,regs_saved=0,restore_jump=0,ra;
-  int reglist2=reglist|(1<<rs)|(1<<rt);
-  // Look for a free scratch register among r0-r9 and r12 (mask 0x13ff).
-  for(rtmp=0;rtmp<=12;rtmp++) {
-    if(((1<<rtmp)&0x13ff)&&((1<<rtmp)&reglist2)==0) {
-      temp=rtmp; break;
-    }
-  }
-  if(temp==-1) {
-    // Nothing free: save the registers up front and take the first of r0-r3
-    // that holds neither the address nor the data.
-    save_regs(reglist);
-    regs_saved=1;
-    for(rtmp=0;rtmp<=3;rtmp++)
-      if(rtmp!=rs&&rtmp!=rt)
-        {temp=rtmp;break;}
-  }
-  // Prefer r3 for the table entry when it is available; the handler call
-  // below expects the entry there.
-  if((regs_saved||(reglist2&8)==0)&&temp!=3&&rs!=3&&rt!=3)
-    temp2=3;
-  emit_readword((int)&mem_wtab,temp);
-  emit_shrimm(rs,12,temp2);
-  emit_readword_dualindexedx4(temp,temp2,temp2);
-  // Shift the table entry left by one with flags updated; the conditional
-  // stores and the jcc below depend on the resulting flags.
-  emit_lsls_imm(temp2,1,temp2);
-  switch(type) {
-    case STOREB_STUB: emit_strccb_dualindexed(temp2,rs,rt); break;
-    case STOREH_STUB: emit_strcch_dualindexed(temp2,rs,rt); break;
-    case STOREW_STUB: emit_strcc_dualindexed(temp2,rs,rt); break;
-    default:          assert(0);
-  }
-  if(regs_saved) {
-    restore_jump=(int)out;
-    emit_jcc(0); // jump to reg restore
-  }
-  else
-    emit_jcc(stubs[n][2]); // return address (invcode check)
-
-  // From here on: handler call path.
-  if(!regs_saved)
-    save_regs(reglist);
-  int handler=0;
-  switch(type) {
-    case STOREB_STUB: handler=(int)jump_handler_write8; break;
-    case STOREH_STUB: handler=(int)jump_handler_write16; break;
-    case STOREW_STUB: handler=(int)jump_handler_write32; break;
-  }
-  assert(handler!=0);
-  // r0 = address, r1 = data, r2 = adjusted cycle count, r3 = table entry
-  pass_args(rs,rt);
-  if(temp2!=3)
-    emit_mov(temp2,3);
-  int cc=get_reg(i_regmap,CCREG);
-  if(cc<0)
-    emit_loadreg(CCREG,2);
-  emit_addimm(cc<0?2:cc,CLOCK_ADJUST((int)stubs[n][6]+1),2);
-  // returns new cycle_count
-  emit_call(handler);
-  // Undo the adjustment on the returned count and put it back where the
-  // cycle counter lives.
-  emit_addimm(0,-CLOCK_ADJUST((int)stubs[n][6]+1),cc<0?2:cc);
-  if(cc<0)
-    emit_storereg(CCREG,2);
-  if(restore_jump)
-    set_jump_target(restore_jump,(int)out);
-  restore_regs(reglist);
-  ra=stubs[n][2];
-  emit_jmp(ra);
-}
-
-// Emit inline code for a store to the constant address addr.
-// type is the store stub type, regmap the host register map, target the
-// guest register holding the data, adj the cycle adjustment and reglist the
-// live host registers. Directly mapped memory is written in place; anything
-// else goes through jump_handler_write_h with the handler in r3.
-static void inline_writestub(int type, int i, u_int addr, signed char regmap[], int target, int adj, u_int reglist)
-{
-  int addr_reg=get_reg(regmap,-1);
-  int data_reg=get_reg(regmap,target);
-  assert(addr_reg>=0);
-  assert(data_reg>=0);
-  u_int host_addr=0;
-  u_int handler=get_direct_memhandler(mem_wtab,addr,type,&host_addr);
-
-  if(handler==0) {
-    // Direct access: rebase the address register if needed and store.
-    if(addr!=host_addr)
-      emit_movimm_from(addr,addr_reg,host_addr,addr_reg);
-    switch(type) {
-      case STOREB_STUB: emit_writebyte_indexed(data_reg,0,addr_reg); break;
-      case STOREH_STUB: emit_writehword_indexed(data_reg,0,addr_reg); break;
-      case STOREW_STUB: emit_writeword_indexed(data_reg,0,addr_reg); break;
-      default:          assert(0);
-    }
-    return;
-  }
-
-  // Handler call: r0 = address, r1 = data, r2 = adjusted cycle count,
-  // r3 = handler.
-  save_regs(reglist);
-  pass_args(addr_reg,data_reg);
-  int cc=get_reg(regmap,CCREG);
-  int cc_reg=cc<0?2:cc;
-  if(cc<0)
-    emit_loadreg(CCREG,2);
-  emit_addimm(cc_reg,CLOCK_ADJUST(adj+1),2);
-  emit_movimm(handler,3);
-  // the callee returns the new cycle count in r0
-  emit_call((int)jump_handler_write_h);
-  emit_addimm(0,-CLOCK_ADJUST(adj+1),cc_reg);
-  if(cc<0)
-    emit_storereg(CCREG,2);
-  restore_regs(reglist);
-}
-
-// Emit the out-of-line stub for SWL/SWR (stub n).
-// Fields of stubs[n] as used here: [1] address of the branch to patch,
-// [2] return address, [3] instruction index, [4] struct regstat pointer,
-// [5] host register holding the address, [6] cycle adjustment, [7] list of
-// live host registers.
-// The active (#if 1) variant simply calls jump_handle_swl/jump_handle_swr;
-// the #else branch is an older read-modify-write sequence kept for reference.
-static void do_unalignedwritestub(int n)
-{
-  assem_debug("do_unalignedwritestub %x\n",start+stubs[n][3]*4);
-  literal_pool(256);
-  set_jump_target(stubs[n][1],(int)out);
-
-  int i=stubs[n][3];
-  struct regstat *i_regs=(struct regstat *)stubs[n][4];
-  int addr=stubs[n][5];
-  u_int reglist=stubs[n][7];
-  signed char *i_regmap=i_regs->regmap;
-  int temp2=get_reg(i_regmap,FTEMP);
-  int rt;
-  rt=get_reg(i_regmap,rs2[i]);
-  assert(rt>=0);
-  assert(addr>=0);
-  assert(opcode[i]==0x2a||opcode[i]==0x2e); // SWL/SWR only implemented
-  // Preserve the address register; the FTEMP scratch register need not be.
-  reglist|=(1<<addr);
-  reglist&=~(1<<temp2);
-
-#if 1
-  // don't bother with it and call write handler
-  save_regs(reglist);
-  // r0 = address, r1 = data, r2 = adjusted cycle count
-  pass_args(addr,rt);
-  int cc=get_reg(i_regmap,CCREG);
-  if(cc<0)
-    emit_loadreg(CCREG,2);
-  emit_addimm(cc<0?2:cc,CLOCK_ADJUST((int)stubs[n][6]+1),2);
-  emit_call((int)(opcode[i]==0x2a?jump_handle_swl:jump_handle_swr));
-  // Undo the adjustment on the value left in r0 by the handler.
-  emit_addimm(0,-CLOCK_ADJUST((int)stubs[n][6]+1),cc<0?2:cc);
-  if(cc<0)
-    emit_storereg(CCREG,2);
-  restore_regs(reglist);
-  emit_jmp(stubs[n][2]); // return address
-#else
-  emit_andimm(addr,0xfffffffc,temp2);
-  emit_writeword(temp2,(int)&address);
-
-  save_regs(reglist);
-  emit_shrimm(addr,16,1);
-  int cc=get_reg(i_regmap,CCREG);
-  if(cc<0) {
-    emit_loadreg(CCREG,2);
-  }
-  emit_movimm((u_int)readmem,0);
-  emit_addimm(cc<0?2:cc,2*stubs[n][6]+2,2);
-  emit_call((int)&indirect_jump_indexed);
-  restore_regs(reglist);
-
-  emit_readword((int)&readmem_dword,temp2);
-  int temp=addr; //hmh
-  emit_shlimm(addr,3,temp);
-  emit_andimm(temp,24,temp);
-#ifdef BIG_ENDIAN_MIPS
-  if (opcode[i]==0x2e) // SWR
-#else
-  if (opcode[i]==0x2a) // SWL
-#endif
-    emit_xorimm(temp,24,temp);
-  emit_movimm(-1,HOST_TEMPREG);
-  if (opcode[i]==0x2a) { // SWL
-    emit_bic_lsr(temp2,HOST_TEMPREG,temp,temp2);
-    emit_orrshr(rt,temp,temp2);
-  }else{
-    emit_bic_lsl(temp2,HOST_TEMPREG,temp,temp2);
-    emit_orrshl(rt,temp,temp2);
-  }
-  emit_readword((int)&address,addr);
-  emit_writeword(temp2,(int)&word);
-  //save_regs(reglist); // don't need to, no state changes
-  emit_shrimm(addr,16,1);
-  emit_movimm((u_int)writemem,0);
-  //emit_call((int)&indirect_jump_indexed);
-  emit_mov(15,14);
-  emit_readword_dualindexedx4(0,1,15);
-  emit_readword((int)&Count,HOST_TEMPREG);
-  emit_readword((int)&next_interupt,2);
-  emit_addimm(HOST_TEMPREG,-2*stubs[n][6]-2,HOST_TEMPREG);
-  emit_writeword(2,(int)&last_count);
-  emit_sub(HOST_TEMPREG,2,cc<0?HOST_TEMPREG:cc);
-  if(cc<0) {
-    emit_storereg(CCREG,HOST_TEMPREG);
-  }
-  restore_regs(reglist);
-  emit_jmp(stubs[n][2]); // return address
-#endif
-}
-
-// Emit the out-of-line stub that calls invalidate_addr() (stub n).
-// Fields of stubs[n] as used here: [1] address of the branch to patch,
-// [2] return address, [3] list of live host registers, [4] host register
-// holding the address to invalidate.
-static void do_invstub(int n)
-{
-  literal_pool(20);
-  u_int live_regs=stubs[n][3];
-  set_jump_target(stubs[n][1],(int)out);
-  save_regs(live_regs);
-  // The address goes in r0; skip the move if it is already there.
-  if(stubs[n][4]!=0)
-    emit_mov(stubs[n][4],0);
-  emit_call((int)&invalidate_addr);
-  restore_regs(live_regs);
-  emit_jmp(stubs[n][2]); // back to the return address
-}
-
-// Emit the "dirty" entry stub for instruction i: load r1 = source,
-// r2 = copy, r3 = slen*4 and r0 = guest address of the instruction, call
-// verify_code (or verify_code_vm for start >= 0xC0000000), then reload
-// registers and jump to the compiled instruction.
-// Returns the address right after the verify call, or instr_addr[i] if
-// load_regs_entry() emitted nothing there.
-int do_dirty_stub(int i)
-{
-  assem_debug("do_dirty_stub %x\n",start+i*4);
-  u_int addr=(u_int)source;
-  // Careful about the code output here, verify_dirty needs to parse it.
-  #ifndef HAVE_ARMV7
-  emit_loadlp(addr,1);
-  emit_loadlp((int)copy,2);
-  emit_loadlp(slen*4,3);
-  #else
-  emit_movw(addr&0x0000FFFF,1);
-  emit_movw(((u_int)copy)&0x0000FFFF,2);
-  emit_movt(addr&0xFFFF0000,1);
-  emit_movt(((u_int)copy)&0xFFFF0000,2);
-  emit_movw(slen*4,3);
-  #endif
-  emit_movimm(start+i*4,0);
-  emit_call((int)start<(int)0xC0000000?(int)&verify_code:(int)&verify_code_vm);
-  int entry=(int)out;
-  load_regs_entry(i);
-  // No register reloads were needed: the clean entry point can be used as is.
-  if(entry==(int)out) entry=instr_addr[i];
-  emit_jmp(instr_addr[i]);
-  return entry;
-}
-
-// Emit the verification sequence for a delay-slot entry: load r1 with the
-// source address (or start itself when start >= 0xC0000000), r2 = copy,
-// r3 = slen*4, r0 = start+1, then call verify_code_ds.
-// Takes no arguments; declared with (void) so the definition is a prototype.
-static void do_dirty_stub_ds(void)
-{
-  // Careful about the code output here, verify_dirty needs to parse it.
-  #ifndef HAVE_ARMV7
-  emit_loadlp((int)start<(int)0xC0000000?(int)source:(int)start,1);
-  emit_loadlp((int)copy,2);
-  emit_loadlp(slen*4,3);
-  #else
-  emit_movw(((int)start<(int)0xC0000000?(u_int)source:(u_int)start)&0x0000FFFF,1);
-  emit_movw(((u_int)copy)&0x0000FFFF,2);
-  emit_movt(((int)start<(int)0xC0000000?(u_int)source:(u_int)start)&0xFFFF0000,1);
-  emit_movt(((u_int)copy)&0xFFFF0000,2);
-  emit_movw(slen*4,3);
-  #endif
-  emit_movimm(start+1,0);
-  emit_call((int)&verify_code_ds);
-}
-
-// Emit the out-of-line stub taken when a coprocessor instruction raises an
-// exception (stub n). Fields of stubs[n] as used here: [1] address of the
-// branch to patch, [3] instruction index, [5] struct regstat pointer,
-// [6] non-zero when the instruction sits in a delay slot.
-// State is written back, the PC is placed in EAX and control goes to
-// fp_exception or fp_exception_ds.
-static void do_cop1stub(int n)
-{
-  literal_pool(256);
-  assem_debug("do_cop1stub %x\n",start+stubs[n][3]*4);
-  set_jump_target(stubs[n][1],(int)out);
-  int i=stubs[n][3];
-  struct regstat *i_regs=(struct regstat *)stubs[n][5];
-  int in_delay_slot=stubs[n][6];
-  if(in_delay_slot==0) {
-    load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
-  }
-  wb_dirtys(i_regs->regmap_entry,i_regs->was32,i_regs->wasdirty);
-  if(regs[i].regmap_entry[HOST_CCREG]!=CCREG)
-    emit_loadreg(CCREG,HOST_CCREG);
-  // Get PC; for a delay slot this is the address of the preceding branch.
-  emit_movimm(start+(i-in_delay_slot)*4,EAX);
-  // CHECK: is this right?  There should probably be an extra cycle...
-  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
-  if(in_delay_slot)
-    emit_jmp((int)fp_exception_ds);
-  else
-    emit_jmp((int)fp_exception);
-}
-
-/* Special assem */
-
-// Assemble a variable shift for instruction i: the 32-bit SLLV/SRLV/SRAV
-// (opcode2 <= 0x07) or the 64-bit DSLLV/DSRLV/DSRAV. Nothing is emitted when
-// the destination is the zero register or has no host register assigned.
-static void shift_assemble_arm(int i,struct regstat *i_regs)
-{
-  if(rt1[i]) {
-    if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
-    {
-      signed char s,t,shift;
-      t=get_reg(i_regs->regmap,rt1[i]);
-      s=get_reg(i_regs->regmap,rs1[i]);
-      shift=get_reg(i_regs->regmap,rs2[i]);
-      if(t>=0){
-        if(rs1[i]==0)
-        {
-          // shifting the zero register always yields zero
-          emit_zeroreg(t);
-        }
-        else if(rs2[i]==0)
-        {
-          // shift amount is the zero register: plain move
-          assert(s>=0);
-          if(s!=t) emit_mov(s,t);
-        }
-        else
-        {
-          // only the low 5 bits of the shift register are used
-          emit_andimm(shift,31,HOST_TEMPREG);
-          if(opcode2[i]==4) // SLLV
-          {
-            emit_shl(s,HOST_TEMPREG,t);
-          }
-          if(opcode2[i]==6) // SRLV
-          {
-            emit_shr(s,HOST_TEMPREG,t);
-          }
-          if(opcode2[i]==7) // SRAV
-          {
-            emit_sar(s,HOST_TEMPREG,t);
-          }
-        }
-      }
-    } else { // DSLLV/DSRLV/DSRAV
-      // 64-bit values are kept in two host registers; reg|64 is the upper half
-      signed char sh,sl,th,tl,shift;
-      th=get_reg(i_regs->regmap,rt1[i]|64);
-      tl=get_reg(i_regs->regmap,rt1[i]);
-      sh=get_reg(i_regs->regmap,rs1[i]|64);
-      sl=get_reg(i_regs->regmap,rs1[i]);
-      shift=get_reg(i_regs->regmap,rs2[i]);
-      if(tl>=0){
-        if(rs1[i]==0)
-        {
-          emit_zeroreg(tl);
-          if(th>=0) emit_zeroreg(th);
-        }
-        else if(rs2[i]==0)
-        {
-          assert(sl>=0);
-          if(sl!=tl) emit_mov(sl,tl);
-          if(th>=0&&sh!=th) emit_mov(sh,th);
-        }
-        else
-        {
-          // FIXME: What if shift==tl ?
-          assert(shift!=tl);
-          int temp=get_reg(i_regs->regmap,-1);
-          int real_th=th;
-          if(th<0&&opcode2[i]!=0x14) {th=temp;} // DSLLV doesn't need a temporary register
-          assert(sl>=0);
-          assert(sh>=0);
-          emit_andimm(shift,31,HOST_TEMPREG);
-          // Each variant first combines both halves for a shift of 0..31,
-          // then tests bit 5 of the shift amount and, if it is set, moves
-          // one half into the other to account for the extra 32.
-          if(opcode2[i]==0x14) // DSLLV
-          {
-            if(th>=0) emit_shl(sh,HOST_TEMPREG,th);
-            emit_rsbimm(HOST_TEMPREG,32,HOST_TEMPREG);
-            // NOTE(review): th is used here without the th>=0 guard applied
-            // to the neighbouring lines - confirm th cannot be negative.
-            emit_orrshr(sl,HOST_TEMPREG,th);
-            emit_andimm(shift,31,HOST_TEMPREG);
-            emit_testimm(shift,32);
-            emit_shl(sl,HOST_TEMPREG,tl);
-            if(th>=0) emit_cmovne_reg(tl,th);
-            emit_cmovne_imm(0,tl);
-          }
-          if(opcode2[i]==0x16) // DSRLV
-          {
-            assert(th>=0);
-            emit_shr(sl,HOST_TEMPREG,tl);
-            emit_rsbimm(HOST_TEMPREG,32,HOST_TEMPREG);
-            emit_orrshl(sh,HOST_TEMPREG,tl);
-            emit_andimm(shift,31,HOST_TEMPREG);
-            emit_testimm(shift,32);
-            emit_shr(sh,HOST_TEMPREG,th);
-            emit_cmovne_reg(th,tl);
-            if(real_th>=0) emit_cmovne_imm(0,th);
-          }
-          if(opcode2[i]==0x17) // DSRAV
-          {
-            assert(th>=0);
-            emit_shr(sl,HOST_TEMPREG,tl);
-            emit_rsbimm(HOST_TEMPREG,32,HOST_TEMPREG);
-            if(real_th>=0) {
-              assert(temp>=0);
-              emit_sarimm(th,31,temp);
-            }
-            emit_orrshl(sh,HOST_TEMPREG,tl);
-            emit_andimm(shift,31,HOST_TEMPREG);
-            emit_testimm(shift,32);
-            emit_sar(sh,HOST_TEMPREG,th);
-            emit_cmovne_reg(th,tl);
-            if(real_th>=0) emit_cmovne_reg(temp,th);
-          }
-        }
-      }
-    }
-  }
-}
-
-// Copy the speculated value of guest register rs to rt and mark rt as
-// strongly speculated for the next instruction. Writes to register 0 are
-// ignored.
-static void speculate_mov(int rs,int rt)
-{
-  if(rt!=0) {
-    // Unsigned shift: the mask has one bit per guest register, and 1<<31
-    // would overflow a signed int if rt is 31.
-    smrv_strong_next|=1u<<rt;
-    smrv[rt]=smrv[rs];
-  }
-}
-
-// Copy the speculated value of guest register rs to rt and mark rt as
-// weakly speculated for the next instruction. Writes to register 0 are
-// ignored.
-static void speculate_mov_weak(int rs,int rt)
-{
-  if(rt!=0) {
-    // Unsigned shift: the mask has one bit per guest register, and 1<<31
-    // would overflow a signed int if rt is 31.
-    smrv_weak_next|=1u<<rt;
-    smrv[rt]=smrv[rs];
-  }
-}
-
-// Update the speculated guest register values (smrv[]) and the strong/weak
-// confidence bitmasks for instruction i. The *_next masks computed for the
-// previous instruction become current, then the effect of instruction i on
-// its destination register is recorded in the *_next masks.
-static void speculate_register_values(int i)
-{
-  if(i==0) {
-    // Block start: seed the speculation from the current guest registers.
-    memcpy(smrv,psxRegs.GPR.r,sizeof(smrv));
-    // gp,sp are likely to stay the same throughout the block
-    smrv_strong_next=(1<<28)|(1<<29)|(1<<30);
-    smrv_weak_next=~smrv_strong_next;
-    //printf(" llr %08x\n", smrv[4]);
-  }
-  smrv_strong=smrv_strong_next;
-  smrv_weak=smrv_weak_next;
-  switch(itype[i]) {
-    case ALU:
-      // The result inherits the speculation of a source operand, strong
-      // sources taking precedence; with no speculated source it is unknown.
-      if     ((smrv_strong>>rs1[i])&1) speculate_mov(rs1[i],rt1[i]);
-      else if((smrv_strong>>rs2[i])&1) speculate_mov(rs2[i],rt1[i]);
-      else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]);
-      else if((smrv_weak>>rs2[i])&1) speculate_mov_weak(rs2[i],rt1[i]);
-      else {
-        smrv_strong_next&=~(1<<rt1[i]);
-        smrv_weak_next&=~(1<<rt1[i]);
-      }
-      break;
-    case SHIFTIMM:
-      smrv_strong_next&=~(1<<rt1[i]);
-      smrv_weak_next&=~(1<<rt1[i]);
-      // fallthrough
-    case IMM16:
-      if(rt1[i]&&is_const(&regs[i],rt1[i])) {
-        // Destination is a known constant: record it as a strong value.
-        int value,hr=get_reg(regs[i].regmap,rt1[i]);
-        if(hr>=0) {
-          if(get_final_value(hr,i,&value))
-               smrv[rt1[i]]=value;
-          else smrv[rt1[i]]=constmap[i][hr];
-          smrv_strong_next|=1<<rt1[i];
-        }
-      }
-      else {
-        if     ((smrv_strong>>rs1[i])&1) speculate_mov(rs1[i],rt1[i]);
-        else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]);
-      }
-      break;
-    case LOAD:
-      if(start<0x2000&&(rt1[i]==26||(smrv[rt1[i]]>>24)==0xa0)) {
-        // special case for BIOS
-        smrv[rt1[i]]=0xa0000000;
-        smrv_strong_next|=1<<rt1[i];
-        break;
-      }
-      // fallthrough
-    case SHIFT:
-    case LOADLR:
-    case MOV:
-      // Result not tracked: drop any speculation on the destination.
-      smrv_strong_next&=~(1<<rt1[i]);
-      smrv_weak_next&=~(1<<rt1[i]);
-      break;
-    case COP0:
-    case COP2:
-      if(opcode2[i]==0||opcode2[i]==2) { // MFC/CFC
-        smrv_strong_next&=~(1<<rt1[i]);
-        smrv_weak_next&=~(1<<rt1[i]);
-      }
-      break;
-    case C2LS:
-      if (opcode[i]==0x32) { // LWC2
-        smrv_strong_next&=~(1<<rt1[i]);
-        smrv_weak_next&=~(1<<rt1[i]);
-      }
-      break;
-  }
-#if 0
-  int r=4;
-  printf("x %08x %08x %d %d c %08x %08x\n",smrv[r],start+i*4,
-    ((smrv_strong>>r)&1),(smrv_weak>>r)&1,regs[i].isconst,regs[i].wasconst);
-#endif
-}
-
-// Memory region classes returned by get_ptr_mem_type().
-enum {
-  MTYPE_8000 = 0, // default: anything not matched by the ranges below
-  MTYPE_8020,     // 0x80200000..0x807fffff
-  MTYPE_0000,     // below 0x00200000
-  MTYPE_A000,     // 0xa0000000..0xa01fffff
-  MTYPE_1F80,     // 0x1f800000..0x1f800fff
-};
-
-// Classify guest address a into one of the MTYPE_* region classes.
-static int get_ptr_mem_type(u_int a)
-{
-  if(a>=0x00200000) {
-    if(a>=0x1f800000 && a<0x1f801000)
-      return MTYPE_1F80;
-    if(a>=0x80200000 && a<0x80800000)
-      return MTYPE_8020;
-    if(a>=0xa0000000 && a<0xa0200000)
-      return MTYPE_A000;
-    return MTYPE_8000;
-  }
-
-  // Low addresses: normally the RAM mirror at 0. The first 0x1000 bytes are
-  // deliberately reported wrong when the block itself runs from 0xbfc..... or
-  // 0xa0......, because the memhandler must be used for the BIOS self-test
-  // to pass (007 does similar stuff from the a00 mirror).
-  if(a<0x1000&&((start>>20)==0xbfc||(start>>24)==0xa0))
-    return MTYPE_8000;
-  return MTYPE_0000;
-}
-
-// Emit the address range check that guards the fast path of a memory access
-// for instruction i. addr is the host register holding the guest address.
-// If the address had to be transformed, the register holding the result is
-// stored in *addr_reg_override. Returns the address of the emitted
-// conditional branch (emitted with target 0), or 0 if none was emitted.
-static int emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override)
-{
-  int jaddr=0,type=0;
-  int mr=rs1[i];
-  // Guess the memory region from the speculated value of the base register.
-  if(((smrv_strong|smrv_weak)>>mr)&1) {
-    type=get_ptr_mem_type(smrv[mr]);
-    //printf("set %08x @%08x r%d %d\n", smrv[mr], start+i*4, mr, type);
-  }
-  else {
-    // use the mirror we are running on
-    type=get_ptr_mem_type(start);
-    //printf("set nospec   @%08x r%d %d\n", start+i*4, mr, type);
-  }
-
-  // For the RAM mirrors, fold the address into HOST_TEMPREG so that the
-  // common RAM check below (type==0) can be used.
-  if(type==MTYPE_8020) { // RAM 80200000+ mirror
-    emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
-    addr=*addr_reg_override=HOST_TEMPREG;
-    type=0;
-  }
-  else if(type==MTYPE_0000) { // RAM 0 mirror
-    emit_orimm(addr,0x80000000,HOST_TEMPREG);
-    addr=*addr_reg_override=HOST_TEMPREG;
-    type=0;
-  }
-  else if(type==MTYPE_A000) { // RAM A mirror
-    emit_andimm(addr,~0x20000000,HOST_TEMPREG);
-    addr=*addr_reg_override=HOST_TEMPREG;
-    type=0;
-  }
-  else if(type==MTYPE_1F80) { // scratchpad
-    if (psxH == (void *)0x1f800000) {
-      // psxH sits at its guest address: range-check the offset into the
-      // 0x1000-byte window.
-      emit_addimm(addr,-0x1f800000,HOST_TEMPREG);
-      emit_cmpimm(HOST_TEMPREG,0x1000);
-      jaddr=(int)out;
-      emit_jc(0);
-    }
-    else {
-      // do usual RAM check, jump will go to the right handler
-      type=0;
-    }
-  }
-
-  if(type==0)
-  {
-    emit_cmpimm(addr,RAM_SIZE);
-    jaddr=(int)out;
-    #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
-    // Hint to branch predictor that the branch is unlikely to be taken
-    if(rs1[i]>=28)
-      emit_jno_unlikely(0);
-    else
-    #endif
-      emit_jno(0);
-    if(ram_offset!=0) {
-      // RAM is not mapped at its guest address: apply the offset.
-      emit_addimm(addr,ram_offset,HOST_TEMPREG);
-      addr=*addr_reg_override=HOST_TEMPREG;
-    }
-  }
-
-  return jaddr;
-}
-
-#define shift_assemble shift_assemble_arm
-
-static void loadlr_assemble_arm(int i,struct regstat *i_regs)
-{
-  int s,th,tl,temp,temp2,addr,map=-1;
-  int offset;
-  int jaddr=0;
-  int memtarget=0,c=0;
-  int fastload_reg_override=0;
-  u_int hr,reglist=0;
-  th=get_reg(i_regs->regmap,rt1[i]|64);
-  tl=get_reg(i_regs->regmap,rt1[i]);
-  s=get_reg(i_regs->regmap,rs1[i]);
-  temp=get_reg(i_regs->regmap,-1);
-  temp2=get_reg(i_regs->regmap,FTEMP);
-  addr=get_reg(i_regs->regmap,AGEN1+(i&1));
-  assert(addr<0);
-  offset=imm[i];
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
-  }
-  reglist|=1<<temp;
-  if(offset||s<0||c) addr=temp2;
-  else addr=s;
-  if(s>=0) {
-    c=(i_regs->wasconst>>s)&1;
-    if(c) {
-      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
-    }
-  }
-  if(!c) {
-    #ifdef RAM_OFFSET
-    map=get_reg(i_regs->regmap,ROREG);
-    if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
-    #endif
-    emit_shlimm(addr,3,temp);
-    if (opcode[i]==0x22||opcode[i]==0x26) {
-      emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR
-    }else{
-      emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR
-    }
-    jaddr=emit_fastpath_cmp_jump(i,temp2,&fastload_reg_override);
-  }
-  else {
-    if(ram_offset&&memtarget) {
-      emit_addimm(temp2,ram_offset,HOST_TEMPREG);
-      fastload_reg_override=HOST_TEMPREG;
-    }
-    if (opcode[i]==0x22||opcode[i]==0x26) {
-      emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR
-    }else{
-      emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR
-    }
-  }
-  if (opcode[i]==0x22||opcode[i]==0x26) { // LWL/LWR
-    if(!c||memtarget) {
-      int a=temp2;
-      if(fastload_reg_override) a=fastload_reg_override;
-      //emit_readword_indexed((int)rdram-0x80000000,temp2,temp2);
-      emit_readword_indexed_tlb(0,a,map,temp2);
-      if(jaddr) add_stub(LOADW_STUB,jaddr,(int)out,i,temp2,(int)i_regs,ccadj[i],reglist);
-    }
-    else
-      inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj[i],reglist);
-    if(rt1[i]) {
-      assert(tl>=0);
-      emit_andimm(temp,24,temp);
-#ifdef BIG_ENDIAN_MIPS
-      if (opcode[i]==0x26) // LWR
-#else
-      if (opcode[i]==0x22) // LWL
-#endif
-        emit_xorimm(temp,24,temp);
-      emit_movimm(-1,HOST_TEMPREG);
-      if (opcode[i]==0x26) {
-        emit_shr(temp2,temp,temp2);
-        emit_bic_lsr(tl,HOST_TEMPREG,temp,tl);
-      }else{
-        emit_shl(temp2,temp,temp2);
-        emit_bic_lsl(tl,HOST_TEMPREG,temp,tl);
-      }
-      emit_or(temp2,tl,tl);
-    }
-    //emit_storereg(rt1[i],tl); // DEBUG
-  }
-  if (opcode[i]==0x1A||opcode[i]==0x1B) { // LDL/LDR
-    // FIXME: little endian, fastload_reg_override
-    int temp2h=get_reg(i_regs->regmap,FTEMP|64);
-    if(!c||memtarget) {
-      //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,temp2,temp2h);
-      //emit_readword_indexed((int)rdram-0x7FFFFFFC,temp2,temp2);
-      emit_readdword_indexed_tlb(0,temp2,map,temp2h,temp2);
-      if(jaddr) add_stub(LOADD_STUB,jaddr,(int)out,i,temp2,(int)i_regs,ccadj[i],reglist);
-    }
-    else
-      inline_readstub(LOADD_STUB,i,(constmap[i][s]+offset)&0xFFFFFFF8,i_regs->regmap,FTEMP,ccadj[i],reglist);
-    if(rt1[i]) {
-      assert(th>=0);
-      assert(tl>=0);
-      emit_testimm(temp,32);
-      emit_andimm(temp,24,temp);
-      if (opcode[i]==0x1A) { // LDL
-        emit_rsbimm(temp,32,HOST_TEMPREG);
-        emit_shl(temp2h,temp,temp2h);
-        emit_orrshr(temp2,HOST_TEMPREG,temp2h);
-        emit_movimm(-1,HOST_TEMPREG);
-        emit_shl(temp2,temp,temp2);
-        emit_cmove_reg(temp2h,th);
-        emit_biceq_lsl(tl,HOST_TEMPREG,temp,tl);
-        emit_bicne_lsl(th,HOST_TEMPREG,temp,th);
-        emit_orreq(temp2,tl,tl);
-        emit_orrne(temp2,th,th);
-      }
-      if (opcode[i]==0x1B) { // LDR
-        emit_xorimm(temp,24,temp);
-        emit_rsbimm(temp,32,HOST_TEMPREG);
-        emit_shr(temp2,temp,temp2);
-        emit_orrshl(temp2h,HOST_TEMPREG,temp2);
-        emit_movimm(-1,HOST_TEMPREG);
-        emit_shr(temp2h,temp,temp2h);
-        emit_cmovne_reg(temp2,tl);
-        emit_bicne_lsr(th,HOST_TEMPREG,temp,th);
-        emit_biceq_lsr(tl,HOST_TEMPREG,temp,tl);
-        emit_orrne(temp2h,th,th);
-        emit_orreq(temp2h,tl,tl);
-      }
-    }
-  }
-}
-#define loadlr_assemble loadlr_assemble_arm
-
-static void cop0_assemble(int i,struct regstat *i_regs)
-{
-  if(opcode2[i]==0) // MFC0
-  {
-    signed char t=get_reg(i_regs->regmap,rt1[i]);
-    char copr=(source[i]>>11)&0x1f;
-    //assert(t>=0); // Why does this happen?  OOT is weird
-    if(t>=0&&rt1[i]!=0) {
-      emit_readword((int)&reg_cop0+copr*4,t);
-    }
-  }
-  else if(opcode2[i]==4) // MTC0
-  {
-    signed char s=get_reg(i_regs->regmap,rs1[i]);
-    char copr=(source[i]>>11)&0x1f;
-    assert(s>=0);
-    wb_register(rs1[i],i_regs->regmap,i_regs->dirty,i_regs->is32);
-    if(copr==9||copr==11||copr==12||copr==13) {
-      emit_readword((int)&last_count,HOST_TEMPREG);
-      emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc
-      emit_add(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
-      emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
-      emit_writeword(HOST_CCREG,(int)&Count);
-    }
-    // What a mess.  The status register (12) can enable interrupts,
-    // so needs a special case to handle a pending interrupt.
-    // The interrupt must be taken immediately, because a subsequent
-    // instruction might disable interrupts again.
-    if(copr==12||copr==13) {
-      if (is_delayslot) {
-        // burn cycles to cause cc_interrupt, which will
-        // reschedule next_interupt. Relies on CCREG from above.
-        assem_debug("MTC0 DS %d\n", copr);
-        emit_writeword(HOST_CCREG,(int)&last_count);
-        emit_movimm(0,HOST_CCREG);
-        emit_storereg(CCREG,HOST_CCREG);
-        emit_loadreg(rs1[i],1);
-        emit_movimm(copr,0);
-        emit_call((int)pcsx_mtc0_ds);
-        emit_loadreg(rs1[i],s);
-        return;
-      }
-      emit_movimm(start+i*4+4,HOST_TEMPREG);
-      emit_writeword(HOST_TEMPREG,(int)&pcaddr);
-      emit_movimm(0,HOST_TEMPREG);
-      emit_writeword(HOST_TEMPREG,(int)&pending_exception);
-    }
-    //else if(copr==12&&is_delayslot) emit_call((int)MTC0_R12);
-    //else
-    if(s==HOST_CCREG)
-      emit_loadreg(rs1[i],1);
-    else if(s!=1)
-      emit_mov(s,1);
-    emit_movimm(copr,0);
-    emit_call((int)pcsx_mtc0);
-    if(copr==9||copr==11||copr==12||copr==13) {
-      emit_readword((int)&Count,HOST_CCREG);
-      emit_readword((int)&next_interupt,HOST_TEMPREG);
-      emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
-      emit_sub(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
-      emit_writeword(HOST_TEMPREG,(int)&last_count);
-      emit_storereg(CCREG,HOST_CCREG);
-    }
-    if(copr==12||copr==13) {
-      assert(!is_delayslot);
-      emit_readword((int)&pending_exception,14);
-      emit_test(14,14);
-      emit_jne((int)&do_interrupt);
-    }
-    emit_loadreg(rs1[i],s);
-    if(get_reg(i_regs->regmap,rs1[i]|64)>=0)
-      emit_loadreg(rs1[i]|64,get_reg(i_regs->regmap,rs1[i]|64));
-    cop1_usable=0;
-  }
-  else
-  {
-    assert(opcode2[i]==0x10);
-    if((source[i]&0x3f)==0x10) // RFE
-    {
-      emit_readword((int)&Status,0);
-      emit_andimm(0,0x3c,1);
-      emit_andimm(0,~0xf,0);
-      emit_orrshr_imm(1,2,0);
-      emit_writeword(0,(int)&Status);
-    }
-  }
-}
-
-static void cop2_get_dreg(u_int copr,signed char tl,signed char temp)
-{
-  switch (copr) {
-    case 1:
-    case 3:
-    case 5:
-    case 8:
-    case 9:
-    case 10:
-    case 11:
-      emit_readword((int)&reg_cop2d[copr],tl);
-      emit_signextend16(tl,tl);
-      emit_writeword(tl,(int)&reg_cop2d[copr]); // hmh
-      break;
-    case 7:
-    case 16:
-    case 17:
-    case 18:
-    case 19:
-      emit_readword((int)&reg_cop2d[copr],tl);
-      emit_andimm(tl,0xffff,tl);
-      emit_writeword(tl,(int)&reg_cop2d[copr]);
-      break;
-    case 15:
-      emit_readword((int)&reg_cop2d[14],tl); // SXY2
-      emit_writeword(tl,(int)&reg_cop2d[copr]);
-      break;
-    case 28:
-    case 29:
-      emit_readword((int)&reg_cop2d[9],temp);
-      emit_testimm(temp,0x8000); // do we need this?
-      emit_andimm(temp,0xf80,temp);
-      emit_andne_imm(temp,0,temp);
-      emit_shrimm(temp,7,tl);
-      emit_readword((int)&reg_cop2d[10],temp);
-      emit_testimm(temp,0x8000);
-      emit_andimm(temp,0xf80,temp);
-      emit_andne_imm(temp,0,temp);
-      emit_orrshr_imm(temp,2,tl);
-      emit_readword((int)&reg_cop2d[11],temp);
-      emit_testimm(temp,0x8000);
-      emit_andimm(temp,0xf80,temp);
-      emit_andne_imm(temp,0,temp);
-      emit_orrshl_imm(temp,3,tl);
-      emit_writeword(tl,(int)&reg_cop2d[copr]);
-      break;
-    default:
-      emit_readword((int)&reg_cop2d[copr],tl);
-      break;
-  }
-}
-
-static void cop2_put_dreg(u_int copr,signed char sl,signed char temp)
-{
-  switch (copr) {
-    case 15:
-      emit_readword((int)&reg_cop2d[13],temp);  // SXY1
-      emit_writeword(sl,(int)&reg_cop2d[copr]);
-      emit_writeword(temp,(int)&reg_cop2d[12]); // SXY0
-      emit_readword((int)&reg_cop2d[14],temp);  // SXY2
-      emit_writeword(sl,(int)&reg_cop2d[14]);
-      emit_writeword(temp,(int)&reg_cop2d[13]); // SXY1
-      break;
-    case 28:
-      emit_andimm(sl,0x001f,temp);
-      emit_shlimm(temp,7,temp);
-      emit_writeword(temp,(int)&reg_cop2d[9]);
-      emit_andimm(sl,0x03e0,temp);
-      emit_shlimm(temp,2,temp);
-      emit_writeword(temp,(int)&reg_cop2d[10]);
-      emit_andimm(sl,0x7c00,temp);
-      emit_shrimm(temp,3,temp);
-      emit_writeword(temp,(int)&reg_cop2d[11]);
-      emit_writeword(sl,(int)&reg_cop2d[28]);
-      break;
-    case 30:
-      emit_movs(sl,temp);
-      emit_mvnmi(temp,temp);
-#ifdef HAVE_ARMV5
-      emit_clz(temp,temp);
-#else
-      emit_movs(temp,HOST_TEMPREG);
-      emit_movimm(0,temp);
-      emit_jeq((int)out+4*4);
-      emit_addpl_imm(temp,1,temp);
-      emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG);
-      emit_jns((int)out-2*4);
-#endif
-      emit_writeword(sl,(int)&reg_cop2d[30]);
-      emit_writeword(temp,(int)&reg_cop2d[31]);
-      break;
-    case 31:
-      break;
-    default:
-      emit_writeword(sl,(int)&reg_cop2d[copr]);
-      break;
-  }
-}
-
-static void cop2_assemble(int i,struct regstat *i_regs)
-{
-  u_int copr=(source[i]>>11)&0x1f;
-  signed char temp=get_reg(i_regs->regmap,-1);
-  if (opcode2[i]==0) { // MFC2
-    signed char tl=get_reg(i_regs->regmap,rt1[i]);
-    if(tl>=0&&rt1[i]!=0)
-      cop2_get_dreg(copr,tl,temp);
-  }
-  else if (opcode2[i]==4) { // MTC2
-    signed char sl=get_reg(i_regs->regmap,rs1[i]);
-    cop2_put_dreg(copr,sl,temp);
-  }
-  else if (opcode2[i]==2) // CFC2
-  {
-    signed char tl=get_reg(i_regs->regmap,rt1[i]);
-    if(tl>=0&&rt1[i]!=0)
-      emit_readword((int)&reg_cop2c[copr],tl);
-  }
-  else if (opcode2[i]==6) // CTC2
-  {
-    signed char sl=get_reg(i_regs->regmap,rs1[i]);
-    switch(copr) {
-      case 4:
-      case 12:
-      case 20:
-      case 26:
-      case 27:
-      case 29:
-      case 30:
-        emit_signextend16(sl,temp);
-        break;
-      case 31:
-        //value = value & 0x7ffff000;
-        //if (value & 0x7f87e000) value |= 0x80000000;
-        emit_shrimm(sl,12,temp);
-        emit_shlimm(temp,12,temp);
-        emit_testimm(temp,0x7f000000);
-        emit_testeqimm(temp,0x00870000);
-        emit_testeqimm(temp,0x0000e000);
-        emit_orrne_imm(temp,0x80000000,temp);
-        break;
-      default:
-        temp=sl;
-        break;
-    }
-    emit_writeword(temp,(int)&reg_cop2c[copr]);
-    assert(sl>=0);
-  }
-}
-
-static void c2op_prologue(u_int op,u_int reglist)
-{
-  save_regs_all(reglist);
-#ifdef PCNT
-  emit_movimm(op,0);
-  emit_call((int)pcnt_gte_start);
-#endif
-  emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0); // cop2 regs
-}
-
-static void c2op_epilogue(u_int op,u_int reglist)
-{
-#ifdef PCNT
-  emit_movimm(op,0);
-  emit_call((int)pcnt_gte_end);
-#endif
-  restore_regs_all(reglist);
-}
-
-static void c2op_call_MACtoIR(int lm,int need_flags)
-{
-  if(need_flags)
-    emit_call((int)(lm?gteMACtoIR_lm1:gteMACtoIR_lm0));
-  else
-    emit_call((int)(lm?gteMACtoIR_lm1_nf:gteMACtoIR_lm0_nf));
-}
-
-static void c2op_call_rgb_func(void *func,int lm,int need_ir,int need_flags)
-{
-  emit_call((int)func);
-  // func is C code and trashes r0
-  emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0);
-  if(need_flags||need_ir)
-    c2op_call_MACtoIR(lm,need_flags);
-  emit_call((int)(need_flags?gteMACtoRGB:gteMACtoRGB_nf));
-}
-
-static void c2op_assemble(int i,struct regstat *i_regs)
-{
-  u_int c2op=source[i]&0x3f;
-  u_int hr,reglist_full=0,reglist;
-  int need_flags,need_ir;
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(i_regs->regmap[hr]>=0) reglist_full|=1<<hr;
-  }
-  reglist=reglist_full&CALLER_SAVE_REGS;
-
-  if (gte_handlers[c2op]!=NULL) {
-    need_flags=!(gte_unneeded[i+1]>>63); // +1 because of how liveness detection works
-    need_ir=(gte_unneeded[i+1]&0xe00)!=0xe00;
-    assem_debug("gte op %08x, unneeded %016llx, need_flags %d, need_ir %d\n",
-      source[i],gte_unneeded[i+1],need_flags,need_ir);
-    if(new_dynarec_hacks&NDHACK_GTE_NO_FLAGS)
-      need_flags=0;
-    int shift = (source[i] >> 19) & 1;
-    int lm = (source[i] >> 10) & 1;
-    switch(c2op) {
-#ifndef DRC_DBG
-      case GTE_MVMVA: {
-#ifdef HAVE_ARMV5
-        int v  = (source[i] >> 15) & 3;
-        int cv = (source[i] >> 13) & 3;
-        int mx = (source[i] >> 17) & 3;
-        reglist=reglist_full&(CALLER_SAVE_REGS|0xf0); // +{r4-r7}
-        c2op_prologue(c2op,reglist);
-        /* r4,r5 = VXYZ(v) packed; r6 = &MX11(mx); r7 = &CV1(cv) */
-        if(v<3)
-          emit_ldrd(v*8,0,4);
-        else {
-          emit_movzwl_indexed(9*4,0,4);  // gteIR
-          emit_movzwl_indexed(10*4,0,6);
-          emit_movzwl_indexed(11*4,0,5);
-          emit_orrshl_imm(6,16,4);
-        }
-        if(mx<3)
-          emit_addimm(0,32*4+mx*8*4,6);
-        else
-          emit_readword((int)&zeromem_ptr,6);
-        if(cv<3)
-          emit_addimm(0,32*4+(cv*8+5)*4,7);
-        else
-          emit_readword((int)&zeromem_ptr,7);
-#ifdef __ARM_NEON__
-        emit_movimm(source[i],1); // opcode
-        emit_call((int)gteMVMVA_part_neon);
-        if(need_flags) {
-          emit_movimm(lm,1);
-          emit_call((int)gteMACtoIR_flags_neon);
-        }
-#else
-        if(cv==3&&shift)
-          emit_call((int)gteMVMVA_part_cv3sh12_arm);
-        else {
-          emit_movimm(shift,1);
-          emit_call((int)(need_flags?gteMVMVA_part_arm:gteMVMVA_part_nf_arm));
-        }
-        if(need_flags||need_ir)
-          c2op_call_MACtoIR(lm,need_flags);
-#endif
-#else /* if not HAVE_ARMV5 */
-        c2op_prologue(c2op,reglist);
-        emit_movimm(source[i],1); // opcode
-        emit_writeword(1,(int)&psxRegs.code);
-        emit_call((int)(need_flags?gte_handlers[c2op]:gte_handlers_nf[c2op]));
-#endif
-        break;
-      }
-      case GTE_OP:
-        c2op_prologue(c2op,reglist);
-        emit_call((int)(shift?gteOP_part_shift:gteOP_part_noshift));
-        if(need_flags||need_ir) {
-          emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0);
-          c2op_call_MACtoIR(lm,need_flags);
-        }
-        break;
-      case GTE_DPCS:
-        c2op_prologue(c2op,reglist);
-        c2op_call_rgb_func(shift?gteDPCS_part_shift:gteDPCS_part_noshift,lm,need_ir,need_flags);
-        break;
-      case GTE_INTPL:
-        c2op_prologue(c2op,reglist);
-        c2op_call_rgb_func(shift?gteINTPL_part_shift:gteINTPL_part_noshift,lm,need_ir,need_flags);
-        break;
-      case GTE_SQR:
-        c2op_prologue(c2op,reglist);
-        emit_call((int)(shift?gteSQR_part_shift:gteSQR_part_noshift));
-        if(need_flags||need_ir) {
-          emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0);
-          c2op_call_MACtoIR(lm,need_flags);
-        }
-        break;
-      case GTE_DCPL:
-        c2op_prologue(c2op,reglist);
-        c2op_call_rgb_func(gteDCPL_part,lm,need_ir,need_flags);
-        break;
-      case GTE_GPF:
-        c2op_prologue(c2op,reglist);
-        c2op_call_rgb_func(shift?gteGPF_part_shift:gteGPF_part_noshift,lm,need_ir,need_flags);
-        break;
-      case GTE_GPL:
-        c2op_prologue(c2op,reglist);
-        c2op_call_rgb_func(shift?gteGPL_part_shift:gteGPL_part_noshift,lm,need_ir,need_flags);
-        break;
-#endif
-      default:
-        c2op_prologue(c2op,reglist);
-#ifdef DRC_DBG
-        emit_movimm(source[i],1); // opcode
-        emit_writeword(1,(int)&psxRegs.code);
-#endif
-        emit_call((int)(need_flags?gte_handlers[c2op]:gte_handlers_nf[c2op]));
-        break;
-    }
-    c2op_epilogue(c2op,reglist);
-  }
-}
-
-static void cop1_unusable(int i,struct regstat *i_regs)
-{
-  // XXX: should just just do the exception instead
-  if(!cop1_usable) {
-    int jaddr=(int)out;
-    emit_jmp(0);
-    add_stub(FP_STUB,jaddr,(int)out,i,0,(int)i_regs,is_delayslot,0);
-    cop1_usable=1;
-  }
-}
-
-static void cop1_assemble(int i,struct regstat *i_regs)
-{
-  cop1_unusable(i, i_regs);
-}
-
-static void fconv_assemble_arm(int i,struct regstat *i_regs)
-{
-  cop1_unusable(i, i_regs);
-}
-#define fconv_assemble fconv_assemble_arm
-
-static void fcomp_assemble(int i,struct regstat *i_regs)
-{
-  cop1_unusable(i, i_regs);
-}
-
-static void float_assemble(int i,struct regstat *i_regs)
-{
-  cop1_unusable(i, i_regs);
-}
-
-static void multdiv_assemble_arm(int i,struct regstat *i_regs)
-{
-  //  case 0x18: MULT
-  //  case 0x19: MULTU
-  //  case 0x1A: DIV
-  //  case 0x1B: DIVU
-  //  case 0x1C: DMULT
-  //  case 0x1D: DMULTU
-  //  case 0x1E: DDIV
-  //  case 0x1F: DDIVU
-  if(rs1[i]&&rs2[i])
-  {
-    if((opcode2[i]&4)==0) // 32-bit
-    {
-      if(opcode2[i]==0x18) // MULT
-      {
-        signed char m1=get_reg(i_regs->regmap,rs1[i]);
-        signed char m2=get_reg(i_regs->regmap,rs2[i]);
-        signed char hi=get_reg(i_regs->regmap,HIREG);
-        signed char lo=get_reg(i_regs->regmap,LOREG);
-        assert(m1>=0);
-        assert(m2>=0);
-        assert(hi>=0);
-        assert(lo>=0);
-        emit_smull(m1,m2,hi,lo);
-      }
-      if(opcode2[i]==0x19) // MULTU
-      {
-        signed char m1=get_reg(i_regs->regmap,rs1[i]);
-        signed char m2=get_reg(i_regs->regmap,rs2[i]);
-        signed char hi=get_reg(i_regs->regmap,HIREG);
-        signed char lo=get_reg(i_regs->regmap,LOREG);
-        assert(m1>=0);
-        assert(m2>=0);
-        assert(hi>=0);
-        assert(lo>=0);
-        emit_umull(m1,m2,hi,lo);
-      }
-      if(opcode2[i]==0x1A) // DIV
-      {
-        signed char d1=get_reg(i_regs->regmap,rs1[i]);
-        signed char d2=get_reg(i_regs->regmap,rs2[i]);
-        assert(d1>=0);
-        assert(d2>=0);
-        signed char quotient=get_reg(i_regs->regmap,LOREG);
-        signed char remainder=get_reg(i_regs->regmap,HIREG);
-        assert(quotient>=0);
-        assert(remainder>=0);
-        emit_movs(d1,remainder);
-        emit_movimm(0xffffffff,quotient);
-        emit_negmi(quotient,quotient); // .. quotient and ..
-        emit_negmi(remainder,remainder); // .. remainder for div0 case (will be negated back after jump)
-        emit_movs(d2,HOST_TEMPREG);
-        emit_jeq((int)out+52); // Division by zero
-        emit_negsmi(HOST_TEMPREG,HOST_TEMPREG);
-#ifdef HAVE_ARMV5
-        emit_clz(HOST_TEMPREG,quotient);
-        emit_shl(HOST_TEMPREG,quotient,HOST_TEMPREG);
-#else
-        emit_movimm(0,quotient);
-        emit_addpl_imm(quotient,1,quotient);
-        emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG);
-        emit_jns((int)out-2*4);
-#endif
-        emit_orimm(quotient,1<<31,quotient);
-        emit_shr(quotient,quotient,quotient);
-        emit_cmp(remainder,HOST_TEMPREG);
-        emit_subcs(remainder,HOST_TEMPREG,remainder);
-        emit_adcs(quotient,quotient,quotient);
-        emit_shrimm(HOST_TEMPREG,1,HOST_TEMPREG);
-        emit_jcc((int)out-16); // -4
-        emit_teq(d1,d2);
-        emit_negmi(quotient,quotient);
-        emit_test(d1,d1);
-        emit_negmi(remainder,remainder);
-      }
-      if(opcode2[i]==0x1B) // DIVU
-      {
-        signed char d1=get_reg(i_regs->regmap,rs1[i]); // dividend
-        signed char d2=get_reg(i_regs->regmap,rs2[i]); // divisor
-        assert(d1>=0);
-        assert(d2>=0);
-        signed char quotient=get_reg(i_regs->regmap,LOREG);
-        signed char remainder=get_reg(i_regs->regmap,HIREG);
-        assert(quotient>=0);
-        assert(remainder>=0);
-        emit_mov(d1,remainder);
-        emit_movimm(0xffffffff,quotient); // div0 case
-        emit_test(d2,d2);
-        emit_jeq((int)out+40); // Division by zero
-#ifdef HAVE_ARMV5
-        emit_clz(d2,HOST_TEMPREG);
-        emit_movimm(1<<31,quotient);
-        emit_shl(d2,HOST_TEMPREG,d2);
-#else
-        emit_movimm(0,HOST_TEMPREG);
-        emit_addpl_imm(HOST_TEMPREG,1,HOST_TEMPREG);
-        emit_lslpls_imm(d2,1,d2);
-        emit_jns((int)out-2*4);
-        emit_movimm(1<<31,quotient);
-#endif
-        emit_shr(quotient,HOST_TEMPREG,quotient);
-        emit_cmp(remainder,d2);
-        emit_subcs(remainder,d2,remainder);
-        emit_adcs(quotient,quotient,quotient);
-        emit_shrcc_imm(d2,1,d2);
-        emit_jcc((int)out-16); // -4
-      }
-    }
-    else // 64-bit
-      assert(0);
-  }
-  else
-  {
-    // Multiply by zero is zero.
-    // MIPS does not have a divide by zero exception.
-    // The result is undefined, we return zero.
-    signed char hr=get_reg(i_regs->regmap,HIREG);
-    signed char lr=get_reg(i_regs->regmap,LOREG);
-    if(hr>=0) emit_zeroreg(hr);
-    if(lr>=0) emit_zeroreg(lr);
-  }
-}
-#define multdiv_assemble multdiv_assemble_arm
-
-static void do_preload_rhash(int r) {
-  // Don't need this for ARM.  On x86, this puts the value 0xf8 into the
-  // register.  On ARM the hash can be done with a single instruction (below)
-}
-
-static void do_preload_rhtbl(int ht) {
-  emit_addimm(FP,(int)&mini_ht-(int)&dynarec_local,ht);
-}
-
-static void do_rhash(int rs,int rh) {
-  emit_andimm(rs,0xf8,rh);
-}
-
-static void do_miniht_load(int ht,int rh) {
-  assem_debug("ldr %s,[%s,%s]!\n",regname[rh],regname[ht],regname[rh]);
-  output_w32(0xe7b00000|rd_rn_rm(rh,ht,rh));
-}
-
-static void do_miniht_jump(int rs,int rh,int ht) {
-  emit_cmp(rh,rs);
-  emit_ldreq_indexed(ht,4,15);
-  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
-  emit_mov(rs,7);
-  emit_jmp(jump_vaddr_reg[7]);
-  #else
-  emit_jmp(jump_vaddr_reg[rs]);
-  #endif
-}
-
-static void do_miniht_insert(u_int return_address,int rt,int temp) {
-  #ifndef HAVE_ARMV7
-  emit_movimm(return_address,rt); // PC into link register
-  add_to_linker((int)out,return_address,1);
-  emit_pcreladdr(temp);
-  emit_writeword(rt,(int)&mini_ht[(return_address&0xFF)>>3][0]);
-  emit_writeword(temp,(int)&mini_ht[(return_address&0xFF)>>3][1]);
-  #else
-  emit_movw(return_address&0x0000FFFF,rt);
-  add_to_linker((int)out,return_address,1);
-  emit_pcreladdr(temp);
-  emit_writeword(temp,(int)&mini_ht[(return_address&0xFF)>>3][1]);
-  emit_movt(return_address&0xFFFF0000,rt);
-  emit_writeword(rt,(int)&mini_ht[(return_address&0xFF)>>3][0]);
-  #endif
-}
-
-static void wb_valid(signed char pre[],signed char entry[],u_int dirty_pre,u_int dirty,uint64_t is32_pre,uint64_t u,uint64_t uu)
-{
-  //if(dirty_pre==dirty) return;
-  int hr,reg;
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(hr!=EXCLUDE_REG) {
-      reg=pre[hr];
-      if(((~u)>>(reg&63))&1) {
-        if(reg>0) {
-          if(((dirty_pre&~dirty)>>hr)&1) {
-            if(reg>0&&reg<34) {
-              emit_storereg(reg,hr);
-              if( ((is32_pre&~uu)>>reg)&1 ) {
-                emit_sarimm(hr,31,HOST_TEMPREG);
-                emit_storereg(reg|64,HOST_TEMPREG);
-              }
-            }
-            else if(reg>=64) {
-              emit_storereg(reg,hr);
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-
-/* using strd could possibly help but you'd have to allocate registers in pairs
-static void wb_invalidate_arm(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,uint64_t u,uint64_t uu)
-{
-  int hr;
-  int wrote=-1;
-  for(hr=HOST_REGS-1;hr>=0;hr--) {
-    if(hr!=EXCLUDE_REG) {
-      if(pre[hr]!=entry[hr]) {
-        if(pre[hr]>=0) {
-          if((dirty>>hr)&1) {
-            if(get_reg(entry,pre[hr])<0) {
-              if(pre[hr]<64) {
-                if(!((u>>pre[hr])&1)) {
-                  if(hr<10&&(~hr&1)&&(pre[hr+1]<0||wrote==hr+1)) {
-                    if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
-                      emit_sarimm(hr,31,hr+1);
-                      emit_strdreg(pre[hr],hr);
-                    }
-                    else
-                      emit_storereg(pre[hr],hr);
-                  }else{
-                    emit_storereg(pre[hr],hr);
-                    if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
-                      emit_sarimm(hr,31,hr);
-                      emit_storereg(pre[hr]|64,hr);
-                    }
-                  }
-                }
-              }else{
-                if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
-                  emit_storereg(pre[hr],hr);
-                }
-              }
-              wrote=hr;
-            }
-          }
-        }
-      }
-    }
-  }
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(hr!=EXCLUDE_REG) {
-      if(pre[hr]!=entry[hr]) {
-        if(pre[hr]>=0) {
-          int nr;
-          if((nr=get_reg(entry,pre[hr]))>=0) {
-            emit_mov(hr,nr);
-          }
-        }
-      }
-    }
-  }
-}
-#define wb_invalidate wb_invalidate_arm
-*/
-
-static void mark_clear_cache(void *target)
-{
-  u_long offset = (char *)target - (char *)BASE_ADDR;
-  u_int mask = 1u << ((offset >> 12) & 31);
-  if (!(needs_clear_cache[offset >> 17] & mask)) {
-    char *start = (char *)((u_long)target & ~4095ul);
-    start_tcache_write(start, start + 4096);
-    needs_clear_cache[offset >> 17] |= mask;
-  }
-}
-
-// Clearing the cache is rather slow on ARM Linux, so mark the areas
-// that need to be cleared, and then only clear these areas once.
-static void do_clear_cache()
-{
-  int i,j;
-  for (i=0;i<(1<<(TARGET_SIZE_2-17));i++)
-  {
-    u_int bitmap=needs_clear_cache[i];
-    if(bitmap) {
-      u_int start,end;
-      for(j=0;j<32;j++)
-      {
-        if(bitmap&(1<<j)) {
-          start=(u_int)BASE_ADDR+i*131072+j*4096;
-          end=start+4095;
-          j++;
-          while(j<32) {
-            if(bitmap&(1<<j)) {
-              end+=4096;
-              j++;
-            }else{
-              end_tcache_write((void *)start,(void *)end);
-              break;
-            }
-          }
-        }
-      }
-      needs_clear_cache[i]=0;
-    }
-  }
-}
-
-// CPU-architecture-specific initialization
-static void arch_init() {
-}
-
-// vim:shiftwidth=2:expandtab
diff --git a/libpcsxcore/new_dynarec/arm/assem_arm.h b/libpcsxcore/new_dynarec/arm/assem_arm.h
deleted file mode 100644 (file)
index 1dcc55f..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef __ASSEM_ARM_H__
-#define __ASSEM_ARM_H__
-
-#define HOST_REGS 13
-#define HOST_CCREG 10
-#define HOST_BTREG 8
-#define EXCLUDE_REG 11
-
-#define HOST_IMM8 1
-#define HAVE_CMOV_IMM 1
-#define HAVE_CONDITIONAL_CALL 1
-#define RAM_SIZE 0x200000
-
-#define REG_SHIFT 2
-
-/* ARM calling convention:
-   r0-r3, r12: caller-save
-   r4-r11: callee-save */
-
-#define ARG1_REG 0
-#define ARG2_REG 1
-#define ARG3_REG 2
-#define ARG4_REG 3
-
-/* GCC register naming convention:
-   r10 = sl (base)
-   r11 = fp (frame pointer)
-   r12 = ip (scratch)
-   r13 = sp (stack pointer)
-   r14 = lr (link register)
-   r15 = pc (program counter) */
-
-#define FP 11
-#define LR 14
-#define HOST_TEMPREG 14
-
-// Note: FP is set to &dynarec_local when executing generated code.
-// Thus the local variables are actually global and not on the stack.
-
-extern char *invc_ptr;
-
-#define TARGET_SIZE_2 24 // 2^24 = 16 megabytes
-
-// Code generator target address
-#if   defined(BASE_ADDR_FIXED)
-  // "round" address helpful for debug
-  // this produces best code, but not many platforms allow it,
-  // only use if you are sure this range is always free
-  #define BASE_ADDR 0x1000000
-  #define translation_cache (char *)BASE_ADDR
-#elif defined(BASE_ADDR_DYNAMIC)
-  // for platforms that can't just use .bss buffer, like vita
-  // otherwise better to use the next option for closer branches
-  extern char *translation_cache;
-  #define BASE_ADDR (u_int)translation_cache
-#else
-  // using a static buffer in .bss
-  extern char translation_cache[1 << TARGET_SIZE_2];
-  #define BASE_ADDR (u_int)translation_cache
-#endif
-
-#endif /* __ASSEM_ARM_H__ */
diff --git a/libpcsxcore/new_dynarec/arm/linkage_offsets.h b/libpcsxcore/new_dynarec/arm/linkage_offsets.h
deleted file mode 100644 (file)
index c7abff0..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef __LINKAGE_OFFSETS_H__
-#define __LINKAGE_OFFSETS_H__
-
-#define LO_next_interupt       64
-#define LO_cycle_count         (LO_next_interupt + 4)
-#define LO_last_count          (LO_cycle_count + 4)
-#define LO_pending_exception   (LO_last_count + 4)
-#define LO_stop                        (LO_pending_exception + 4)
-#define LO_invc_ptr            (LO_stop + 4)
-#define LO_address             (LO_invc_ptr + 4)
-#define LO_psxRegs             (LO_address + 4)
-#define LO_reg                 (LO_psxRegs)
-#define LO_lo                  (LO_reg + 128)
-#define LO_hi                  (LO_lo + 4)
-#define LO_reg_cop0            (LO_hi + 4)
-#define LO_reg_cop2d           (LO_reg_cop0 + 128)
-#define LO_reg_cop2c           (LO_reg_cop2d + 128)
-#define LO_PC                  (LO_reg_cop2c + 128)
-#define LO_pcaddr              (LO_PC)
-#define LO_code                        (LO_PC + 4)
-#define LO_cycle               (LO_code + 4)
-#define LO_interrupt           (LO_cycle + 4)
-#define LO_intCycle            (LO_interrupt + 4)
-#define LO_psxRegs_end         (LO_intCycle + 256)
-#define LO_rcnts               (LO_psxRegs_end)
-#define LO_rcnts_end           (LO_rcnts + 7*4*4)
-#define LO_mem_rtab            (LO_rcnts_end)
-#define LO_mem_wtab            (LO_mem_rtab + 4)
-#define LO_psxH_ptr            (LO_mem_wtab + 4)
-#define LO_zeromem_ptr         (LO_psxH_ptr + 4)
-#define LO_inv_code_start      (LO_zeromem_ptr + 4)
-#define LO_inv_code_end                (LO_inv_code_start + 4)
-#define LO_branch_target       (LO_inv_code_end + 4)
-#define LO_scratch_buf_ptr     (LO_branch_target + 4)
-#define LO_align0              (LO_scratch_buf_ptr + 4)
-#define LO_mini_ht             (LO_align0 + 12)
-#define LO_restore_candidate   (LO_mini_ht + 256)
-#define LO_dynarec_local_size  (LO_restore_candidate + 512)
-
-#define LO_FCR0                        (LO_align0)
-#define LO_FCR31               (LO_align0)
-
-#define LO_cop2_to_scratch_buf (LO_scratch_buf_ptr - LO_reg_cop2d)
-
-#endif /* __LINKAGE_OFFSETS_H__ */
diff --git a/libpcsxcore/new_dynarec/assem_arm.c b/libpcsxcore/new_dynarec/assem_arm.c
new file mode 100644 (file)
index 0000000..3267cb6
--- /dev/null
@@ -0,0 +1,2417 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ *   Mupen64plus/PCSX - assem_arm.c                                        *
+ *   Copyright (C) 2009-2011 Ari64                                         *
+ *   Copyright (C) 2010-2021 Gražvydas "notaz" Ignotas                     *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#define FLAGLESS
+#include "../gte.h"
+#undef FLAGLESS
+#include "../gte_arm.h"
+#include "../gte_neon.h"
+#include "pcnt.h"
+#include "arm_features.h"
+
+#define unused __attribute__((unused))
+
+#ifdef DRC_DBG
+#pragma GCC diagnostic ignored "-Wunused-function"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#endif
+
+void indirect_jump_indexed();
+void indirect_jump();
+void do_interrupt();
+void jump_vaddr_r0();
+void jump_vaddr_r1();
+void jump_vaddr_r2();
+void jump_vaddr_r3();
+void jump_vaddr_r4();
+void jump_vaddr_r5();
+void jump_vaddr_r6();
+void jump_vaddr_r7();
+void jump_vaddr_r8();
+void jump_vaddr_r9();
+void jump_vaddr_r10();
+void jump_vaddr_r12();
+
+void * const jump_vaddr_reg[16] = {
+  jump_vaddr_r0,
+  jump_vaddr_r1,
+  jump_vaddr_r2,
+  jump_vaddr_r3,
+  jump_vaddr_r4,
+  jump_vaddr_r5,
+  jump_vaddr_r6,
+  jump_vaddr_r7,
+  jump_vaddr_r8,
+  jump_vaddr_r9,
+  jump_vaddr_r10,
+  0,
+  jump_vaddr_r12,
+  0,
+  0,
+  0
+};
+
+void invalidate_addr_r0();
+void invalidate_addr_r1();
+void invalidate_addr_r2();
+void invalidate_addr_r3();
+void invalidate_addr_r4();
+void invalidate_addr_r5();
+void invalidate_addr_r6();
+void invalidate_addr_r7();
+void invalidate_addr_r8();
+void invalidate_addr_r9();
+void invalidate_addr_r10();
+void invalidate_addr_r12();
+
+const u_int invalidate_addr_reg[16] = {
+  (int)invalidate_addr_r0,
+  (int)invalidate_addr_r1,
+  (int)invalidate_addr_r2,
+  (int)invalidate_addr_r3,
+  (int)invalidate_addr_r4,
+  (int)invalidate_addr_r5,
+  (int)invalidate_addr_r6,
+  (int)invalidate_addr_r7,
+  (int)invalidate_addr_r8,
+  (int)invalidate_addr_r9,
+  (int)invalidate_addr_r10,
+  0,
+  (int)invalidate_addr_r12,
+  0,
+  0,
+  0};
+
+static u_int needs_clear_cache[1<<(TARGET_SIZE_2-17)];
+
+/* Linker */
+
+static void set_jump_target(void *addr, void *target_)
+{
+  u_int target = (u_int)target_;
+  u_char *ptr = addr;
+  u_int *ptr2=(u_int *)ptr;
+  if(ptr[3]==0xe2) {
+    assert((target-(u_int)ptr2-8)<1024);
+    assert(((uintptr_t)addr&3)==0);
+    assert((target&3)==0);
+    *ptr2=(*ptr2&0xFFFFF000)|((target-(u_int)ptr2-8)>>2)|0xF00;
+    //printf("target=%x addr=%p insn=%x\n",target,addr,*ptr2);
+  }
+  else if(ptr[3]==0x72) {
+    // generated by emit_jno_unlikely
+    if((target-(u_int)ptr2-8)<1024) {
+      assert(((uintptr_t)addr&3)==0);
+      assert((target&3)==0);
+      *ptr2=(*ptr2&0xFFFFF000)|((target-(u_int)ptr2-8)>>2)|0xF00;
+    }
+    else if((target-(u_int)ptr2-8)<4096&&!((target-(u_int)ptr2-8)&15)) {
+      assert(((uintptr_t)addr&3)==0);
+      assert((target&3)==0);
+      *ptr2=(*ptr2&0xFFFFF000)|((target-(u_int)ptr2-8)>>4)|0xE00;
+    }
+    else *ptr2=(0x7A000000)|(((target-(u_int)ptr2-8)<<6)>>8);
+  }
+  else {
+    assert((ptr[3]&0x0e)==0xa);
+    *ptr2=(*ptr2&0xFF000000)|(((target-(u_int)ptr2-8)<<6)>>8);
+  }
+}
+
+// This optionally copies the instruction from the target of the branch into
+// the space before the branch.  Works, but the difference in speed is
+// usually insignificant.
+#if 0
+static void set_jump_target_fillslot(int addr,u_int target,int copy)
+{
+  u_char *ptr=(u_char *)addr;
+  u_int *ptr2=(u_int *)ptr;
+  assert(!copy||ptr2[-1]==0xe28dd000);
+  if(ptr[3]==0xe2) {
+    assert(!copy);
+    assert((target-(u_int)ptr2-8)<4096);
+    *ptr2=(*ptr2&0xFFFFF000)|(target-(u_int)ptr2-8);
+  }
+  else {
+    assert((ptr[3]&0x0e)==0xa);
+    u_int target_insn=*(u_int *)target;
+    if((target_insn&0x0e100000)==0) { // ALU, no immediate, no flags
+      copy=0;
+    }
+    if((target_insn&0x0c100000)==0x04100000) { // Load
+      copy=0;
+    }
+    if(target_insn&0x08000000) {
+      copy=0;
+    }
+    if(copy) {
+      ptr2[-1]=target_insn;
+      target+=4;
+    }
+    *ptr2=(*ptr2&0xFF000000)|(((target-(u_int)ptr2-8)<<6)>>8);
+  }
+}
+#endif
+
+/* Literal pool */
+static void add_literal(int addr,int val)
+{
+  assert(literalcount<sizeof(literals)/sizeof(literals[0]));
+  literals[literalcount][0]=addr;
+  literals[literalcount][1]=val;
+  literalcount++;
+}
+
+// from a pointer to external jump stub (which was produced by emit_extjump2)
+// find where the jumping insn is
+static void *find_extjump_insn(void *stub)
+{
+  int *ptr=(int *)(stub+4);
+  assert((*ptr&0x0fff0000)==0x059f0000); // ldr rx, [pc, #ofs]
+  u_int offset=*ptr&0xfff;
+  void **l_ptr=(void *)ptr+offset+8;
+  return *l_ptr;
+}
+
+// find where an external branch is linked to, using the address of its stub:
+// get address that insn one after stub loads (dyna_linker arg1),
+// treat it as a pointer to branch insn,
+// return addr where that branch jumps to
+static void *get_pointer(void *stub)
+{
+  //printf("get_pointer(%x)\n",(int)stub);
+  int *i_ptr=find_extjump_insn(stub);
+  assert((*i_ptr&0x0f000000)==0x0a000000); // b
+  return (u_char *)i_ptr+((*i_ptr<<8)>>6)+8;
+}
+
+// Find the "clean" entry point from a "dirty" entry point
+// by skipping past the call to verify_code
+static void *get_clean_addr(void *addr)
+{
+  signed int *ptr = addr;
+  #ifndef HAVE_ARMV7
+  ptr+=4;
+  #else
+  ptr+=6;
+  #endif
+  if((*ptr&0xFF000000)!=0xeb000000) ptr++;
+  assert((*ptr&0xFF000000)==0xeb000000); // bl instruction
+  ptr++;
+  if((*ptr&0xFF000000)==0xea000000) {
+    return (char *)ptr+((*ptr<<8)>>6)+8; // follow jump
+  }
+  return ptr;
+}
+
+static int verify_dirty(const u_int *ptr)
+{
+  #ifndef HAVE_ARMV7
+  u_int offset;
+  // get from literal pool
+  assert((*ptr&0xFFFF0000)==0xe59f0000);
+  offset=*ptr&0xfff;
+  u_int source=*(u_int*)((void *)ptr+offset+8);
+  ptr++;
+  assert((*ptr&0xFFFF0000)==0xe59f0000);
+  offset=*ptr&0xfff;
+  u_int copy=*(u_int*)((void *)ptr+offset+8);
+  ptr++;
+  assert((*ptr&0xFFFF0000)==0xe59f0000);
+  offset=*ptr&0xfff;
+  u_int len=*(u_int*)((void *)ptr+offset+8);
+  ptr++;
+  ptr++;
+  #else
+  // ARMv7 movw/movt
+  assert((*ptr&0xFFF00000)==0xe3000000);
+  u_int source=(ptr[0]&0xFFF)+((ptr[0]>>4)&0xF000)+((ptr[2]<<16)&0xFFF0000)+((ptr[2]<<12)&0xF0000000);
+  u_int copy=(ptr[1]&0xFFF)+((ptr[1]>>4)&0xF000)+((ptr[3]<<16)&0xFFF0000)+((ptr[3]<<12)&0xF0000000);
+  u_int len=(ptr[4]&0xFFF)+((ptr[4]>>4)&0xF000);
+  ptr+=6;
+  #endif
+  if((*ptr&0xFF000000)!=0xeb000000) ptr++;
+  assert((*ptr&0xFF000000)==0xeb000000); // bl instruction
+  //printf("verify_dirty: %x %x %x\n",source,copy,len);
+  return !memcmp((void *)source,(void *)copy,len);
+}
+
+// This doesn't necessarily find all clean entry points, just
+// guarantees that it's not dirty
+static int isclean(void *addr)
+{
+  #ifndef HAVE_ARMV7
+  u_int *ptr=((u_int *)addr)+4; // skip the literal-pool loads that precede the bl
+  #else
+  u_int *ptr=((u_int *)addr)+6; // skip the movw/movt sequence that precedes the bl
+  #endif
+  if((*ptr&0xFF000000)!=0xeb000000) ptr++;
+  if((*ptr&0xFF000000)!=0xeb000000) return 1; // bl instruction
+  if((int)ptr+((int)(*ptr<<8)>>6)+8==(int)verify_code) return 0; // cast to int so the 24-bit bl offset is sign-extended (backward calls)
+  if((int)ptr+((int)(*ptr<<8)>>6)+8==(int)verify_code_ds) return 0; // same decoding as get_clean_addr/get_pointer
+  return 1;
+}
+
+// get source that block at addr was compiled from (host pointers)
+static void get_bounds(void *addr, u_char **start, u_char **end)
+{
+  u_int *ptr = addr;
+  #ifndef HAVE_ARMV7
+  u_int offset;
+  // get from literal pool
+  assert((*ptr&0xFFFF0000)==0xe59f0000);
+  offset=*ptr&0xfff;
+  u_int source=*(u_int*)((void *)ptr+offset+8);
+  ptr++;
+  //assert((*ptr&0xFFFF0000)==0xe59f0000);
+  //offset=*ptr&0xfff;
+  //u_int copy=*(u_int*)((void *)ptr+offset+8);
+  ptr++;
+  assert((*ptr&0xFFFF0000)==0xe59f0000);
+  offset=*ptr&0xfff;
+  u_int len=*(u_int*)((void *)ptr+offset+8);
+  ptr++;
+  ptr++;
+  #else
+  // ARMv7 movw/movt
+  assert((*ptr&0xFFF00000)==0xe3000000);
+  u_int source=(ptr[0]&0xFFF)+((ptr[0]>>4)&0xF000)+((ptr[2]<<16)&0xFFF0000)+((ptr[2]<<12)&0xF0000000);
+  //u_int copy=(ptr[1]&0xFFF)+((ptr[1]>>4)&0xF000)+((ptr[3]<<16)&0xFFF0000)+((ptr[3]<<12)&0xF0000000);
+  u_int len=(ptr[4]&0xFFF)+((ptr[4]>>4)&0xF000);
+  ptr+=6;
+  #endif
+  if((*ptr&0xFF000000)!=0xeb000000) ptr++;
+  assert((*ptr&0xFF000000)==0xeb000000); // bl instruction
+  *start=(u_char *)source;
+  *end=(u_char *)source+len;
+}
+
+// Allocate a specific ARM register.
+static void alloc_arm_reg(struct regstat *cur,int i,signed char reg,int hr)
+{
+  int n;
+  int dirty=0;
+
+  // see if it's already allocated (and dealloc it)
+  for(n=0;n<HOST_REGS;n++)
+  {
+    if(n!=EXCLUDE_REG&&cur->regmap[n]==reg) {
+      dirty=(cur->dirty>>n)&1;
+      cur->regmap[n]=-1;
+    }
+  }
+
+  cur->regmap[hr]=reg;
+  cur->dirty&=~(1<<hr);
+  cur->dirty|=dirty<<hr;
+  cur->isconst&=~(1<<hr);
+}
+
+// Alloc cycle count into dedicated register
+static void alloc_cc(struct regstat *cur,int i)
+{
+  alloc_arm_reg(cur,i,CCREG,HOST_CCREG);
+}
+
+/* Assembler */
+
+static unused char regname[16][4] = {
+ "r0",
+ "r1",
+ "r2",
+ "r3",
+ "r4",
+ "r5",
+ "r6",
+ "r7",
+ "r8",
+ "r9",
+ "r10",
+ "fp",
+ "r12",
+ "sp",
+ "lr",
+ "pc"};
+
+static void output_w32(u_int word)
+{
+  *((u_int *)out)=word;
+  out+=4;
+}
+
+static u_int rd_rn_rm(u_int rd, u_int rn, u_int rm)
+{
+  assert(rd<16);
+  assert(rn<16);
+  assert(rm<16);
+  return((rn<<16)|(rd<<12)|rm);
+}
+
+static u_int rd_rn_imm_shift(u_int rd, u_int rn, u_int imm, u_int shift)
+{
+  assert(rd<16);
+  assert(rn<16);
+  assert(imm<256);
+  assert((shift&1)==0);
+  return((rn<<16)|(rd<<12)|(((32-shift)&30)<<7)|imm);
+}
+
+static u_int genimm(u_int imm,u_int *encoded)
+{
+  *encoded=0;
+  if(imm==0) return 1;
+  int i=32;
+  while(i>0)
+  {
+    if(imm<256) {
+      *encoded=((i&30)<<7)|imm;
+      return 1;
+    }
+    imm=(imm>>2)|(imm<<30);i-=2;
+  }
+  return 0;
+}
+
+static void genimm_checked(u_int imm,u_int *encoded)
+{
+  u_int ret=genimm(imm,encoded);
+  assert(ret);
+  (void)ret;
+}
+
+static u_int genjmp(u_int addr)
+{
+  if (addr < 3) return 0; // a branch that will be patched later
+  int offset = addr-(int)out-8;
+  if (offset < -33554432 || offset >= 33554432) {
+    SysPrintf("genjmp: out of range: %08x\n", offset);
+    abort();
+    return 0;
+  }
+  return ((u_int)offset>>2)&0xffffff;
+}
+
+static unused void emit_breakpoint(void)
+{
+  assem_debug("bkpt #0\n");
+  //output_w32(0xe1200070);
+  output_w32(0xe7f001f0);
+}
+
+static void emit_mov(int rs,int rt)
+{
+  assem_debug("mov %s,%s\n",regname[rt],regname[rs]);
+  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs));
+}
+
+static void emit_movs(int rs,int rt)
+{
+  assem_debug("movs %s,%s\n",regname[rt],regname[rs]);
+  output_w32(0xe1b00000|rd_rn_rm(rt,0,rs));
+}
+
+static void emit_add(int rs1,int rs2,int rt)
+{
+  assem_debug("add %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  output_w32(0xe0800000|rd_rn_rm(rt,rs1,rs2));
+}
+
+static void emit_adds(int rs1,int rs2,int rt)
+{
+  assem_debug("adds %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  output_w32(0xe0900000|rd_rn_rm(rt,rs1,rs2));
+}
+#define emit_adds_ptr emit_adds
+
+static void emit_adcs(int rs1,int rs2,int rt)
+{
+  assem_debug("adcs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  output_w32(0xe0b00000|rd_rn_rm(rt,rs1,rs2));
+}
+
+static void emit_neg(int rs, int rt)
+{
+  assem_debug("rsb %s,%s,#0\n",regname[rt],regname[rs]);
+  output_w32(0xe2600000|rd_rn_rm(rt,rs,0));
+}
+
+static void emit_sub(int rs1,int rs2,int rt)
+{
+  assem_debug("sub %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  output_w32(0xe0400000|rd_rn_rm(rt,rs1,rs2));
+}
+
+static void emit_zeroreg(int rt)
+{
+  assem_debug("mov %s,#0\n",regname[rt]);
+  output_w32(0xe3a00000|rd_rn_rm(rt,0,0));
+}
+
+static void emit_loadlp(u_int imm,u_int rt)
+{
+  add_literal((int)out,imm);
+  assem_debug("ldr %s,pc+? [=%x]\n",regname[rt],imm);
+  output_w32(0xe5900000|rd_rn_rm(rt,15,0));
+}
+
+static void emit_movw(u_int imm,u_int rt)
+{
+  assert(imm<65536);
+  assem_debug("movw %s,#%d (0x%x)\n",regname[rt],imm,imm);
+  output_w32(0xe3000000|rd_rn_rm(rt,0,0)|(imm&0xfff)|((imm<<4)&0xf0000));
+}
+
+static void emit_movt(u_int imm,u_int rt)
+{
+  assem_debug("movt %s,#%d (0x%x)\n",regname[rt],imm&0xffff0000,imm&0xffff0000);
+  output_w32(0xe3400000|rd_rn_rm(rt,0,0)|((imm>>16)&0xfff)|((imm>>12)&0xf0000));
+}
+
+static void emit_movimm(u_int imm,u_int rt)
+{
+  u_int armval;
+  if(genimm(imm,&armval)) {
+    assem_debug("mov %s,#%d\n",regname[rt],imm);
+    output_w32(0xe3a00000|rd_rn_rm(rt,0,0)|armval);
+  }else if(genimm(~imm,&armval)) {
+    assem_debug("mvn %s,#%d\n",regname[rt],imm);
+    output_w32(0xe3e00000|rd_rn_rm(rt,0,0)|armval);
+  }else if(imm<65536) {
+    #ifndef HAVE_ARMV7
+    assem_debug("mov %s,#%d\n",regname[rt],imm&0xFF00);
+    output_w32(0xe3a00000|rd_rn_imm_shift(rt,0,imm>>8,8));
+    assem_debug("add %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF);
+    output_w32(0xe2800000|rd_rn_imm_shift(rt,rt,imm&0xff,0));
+    #else
+    emit_movw(imm,rt);
+    #endif
+  }else{
+    #ifndef HAVE_ARMV7
+    emit_loadlp(imm,rt);
+    #else
+    emit_movw(imm&0x0000FFFF,rt);
+    emit_movt(imm&0xFFFF0000,rt);
+    #endif
+  }
+}
+
+static void emit_pcreladdr(u_int rt)
+{
+  assem_debug("add %s,pc,#?\n",regname[rt]);
+  output_w32(0xe2800000|rd_rn_rm(rt,15,0));
+}
+
+static void emit_loadreg(int r, int hr)
+{
+  if(r&64) {
+    SysPrintf("64bit load in 32bit mode!\n");
+    assert(0);
+    return;
+  }
+  if((r&63)==0)
+    emit_zeroreg(hr);
+  else {
+    int addr = (int)&psxRegs.GPR.r[r];
+    switch (r) {
+    //case HIREG: addr = &hi; break;
+    //case LOREG: addr = &lo; break;
+    case CCREG: addr = (int)&cycle_count; break;
+    case CSREG: addr = (int)&Status; break;
+    case INVCP: addr = (int)&invc_ptr; break;
+    case ROREG: addr = (int)&ram_offset; break;
+    default: assert(r < 34); break;
+    }
+    u_int offset = addr-(u_int)&dynarec_local;
+    assert(offset<4096);
+    assem_debug("ldr %s,fp+%d\n",regname[hr],offset);
+    output_w32(0xe5900000|rd_rn_rm(hr,FP,0)|offset);
+  }
+}
+
+static void emit_storereg(int r, int hr)
+{
+  if(r&64) {
+    SysPrintf("64bit store in 32bit mode!\n");
+    assert(0);
+    return;
+  }
+  int addr = (int)&psxRegs.GPR.r[r];
+  switch (r) {
+  //case HIREG: addr = &hi; break;
+  //case LOREG: addr = &lo; break;
+  case CCREG: addr = (int)&cycle_count; break;
+  default: assert(r < 34); break;
+  }
+  u_int offset = addr-(u_int)&dynarec_local;
+  assert(offset<4096);
+  assem_debug("str %s,fp+%d\n",regname[hr],offset);
+  output_w32(0xe5800000|rd_rn_rm(hr,FP,0)|offset);
+}
+
+static void emit_test(int rs, int rt)
+{
+  assem_debug("tst %s,%s\n",regname[rs],regname[rt]);
+  output_w32(0xe1100000|rd_rn_rm(0,rs,rt));
+}
+
+static void emit_testimm(int rs,int imm)
+{
+  u_int armval;
+  assem_debug("tst %s,#%d\n",regname[rs],imm);
+  genimm_checked(imm,&armval);
+  output_w32(0xe3100000|rd_rn_rm(0,rs,0)|armval);
+}
+
+static void emit_testeqimm(int rs,int imm)
+{
+  u_int armval;
+  assem_debug("tsteq %s,$%d\n",regname[rs],imm);
+  genimm_checked(imm,&armval);
+  output_w32(0x03100000|rd_rn_rm(0,rs,0)|armval);
+}
+
+static void emit_not(int rs,int rt)
+{
+  assem_debug("mvn %s,%s\n",regname[rt],regname[rs]);
+  output_w32(0xe1e00000|rd_rn_rm(rt,0,rs));
+}
+
+static void emit_and(u_int rs1,u_int rs2,u_int rt)
+{
+  assem_debug("and %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  output_w32(0xe0000000|rd_rn_rm(rt,rs1,rs2));
+}
+
+static void emit_or(u_int rs1,u_int rs2,u_int rt)
+{
+  assem_debug("orr %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  output_w32(0xe1800000|rd_rn_rm(rt,rs1,rs2));
+}
+
+static void emit_orrshl_imm(u_int rs,u_int imm,u_int rt)
+{
+  assert(rs<16);
+  assert(rt<16);
+  assert(imm<32);
+  assem_debug("orr %s,%s,%s,lsl #%d\n",regname[rt],regname[rt],regname[rs],imm);
+  output_w32(0xe1800000|rd_rn_rm(rt,rt,rs)|(imm<<7));
+}
+
+static void emit_orrshr_imm(u_int rs,u_int imm,u_int rt)
+{
+  assert(rs<16);
+  assert(rt<16);
+  assert(imm<32);
+  assem_debug("orr %s,%s,%s,lsr #%d\n",regname[rt],regname[rt],regname[rs],imm);
+  output_w32(0xe1800020|rd_rn_rm(rt,rt,rs)|(imm<<7));
+}
+
+static void emit_xor(u_int rs1,u_int rs2,u_int rt)
+{
+  assem_debug("eor %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  output_w32(0xe0200000|rd_rn_rm(rt,rs1,rs2));
+}
+
+static void emit_xorsar_imm(u_int rs1,u_int rs2,u_int imm,u_int rt)
+{
+  assem_debug("eor %s,%s,%s,asr #%d\n",regname[rt],regname[rs1],regname[rs2],imm);
+  output_w32(0xe0200040|rd_rn_rm(rt,rs1,rs2)|(imm<<7));
+}
+
+static void emit_addimm(u_int rs,int imm,u_int rt)
+{
+  assert(rs<16);
+  assert(rt<16);
+  if(imm!=0) {
+    u_int armval;
+    if(genimm(imm,&armval)) {
+      assem_debug("add %s,%s,#%d\n",regname[rt],regname[rs],imm);
+      output_w32(0xe2800000|rd_rn_rm(rt,rs,0)|armval);
+    }else if(genimm(-imm,&armval)) {
+      assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],-imm);
+      output_w32(0xe2400000|rd_rn_rm(rt,rs,0)|armval);
+    #ifdef HAVE_ARMV7
+    }else if(rt!=rs&&(u_int)imm<65536) {
+      emit_movw(imm&0x0000ffff,rt);
+      emit_add(rs,rt,rt);
+    }else if(rt!=rs&&(u_int)-imm<65536) {
+      emit_movw(-imm&0x0000ffff,rt);
+      emit_sub(rs,rt,rt);
+    #endif
+    }else if((u_int)-imm<65536) {
+      assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],(-imm)&0xFF00);
+      assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF);
+      output_w32(0xe2400000|rd_rn_imm_shift(rt,rs,(-imm)>>8,8));
+      output_w32(0xe2400000|rd_rn_imm_shift(rt,rt,(-imm)&0xff,0));
+    }else {
+      do {
+        int shift = (ffs(imm) - 1) & ~1;
+        int imm8 = imm & (0xff << shift);
+        genimm_checked(imm8,&armval);
+        assem_debug("add %s,%s,#0x%x\n",regname[rt],regname[rs],imm8);
+        output_w32(0xe2800000|rd_rn_rm(rt,rs,0)|armval);
+        rs = rt;
+        imm &= ~imm8;
+      }
+      while (imm != 0);
+    }
+  }
+  else if(rs!=rt) emit_mov(rs,rt);
+}
+
+static void emit_addimm_and_set_flags(int imm,int rt)
+{
+  assert(imm>-65536&&imm<65536);
+  u_int armval;
+  if(genimm(imm,&armval)) {
+    assem_debug("adds %s,%s,#%d\n",regname[rt],regname[rt],imm);
+    output_w32(0xe2900000|rd_rn_rm(rt,rt,0)|armval);
+  }else if(genimm(-imm,&armval)) {
+    assem_debug("subs %s,%s,#%d\n",regname[rt],regname[rt],-imm); // trace the operand actually encoded (-imm), as emit_addimm does
+    output_w32(0xe2500000|rd_rn_rm(rt,rt,0)|armval);
+  }else if(imm<0) {
+    assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF00);
+    assem_debug("subs %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF);
+    output_w32(0xe2400000|rd_rn_imm_shift(rt,rt,(-imm)>>8,8));
+    output_w32(0xe2500000|rd_rn_imm_shift(rt,rt,(-imm)&0xff,0));
+  }else{
+    assem_debug("add %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF00);
+    assem_debug("adds %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF);
+    output_w32(0xe2800000|rd_rn_imm_shift(rt,rt,imm>>8,8));
+    output_w32(0xe2900000|rd_rn_imm_shift(rt,rt,imm&0xff,0));
+  }
+}
+
+static void emit_addnop(u_int r)
+{
+  assert(r<16);
+  assem_debug("add %s,%s,#0 (nop)\n",regname[r],regname[r]);
+  output_w32(0xe2800000|rd_rn_rm(r,r,0));
+}
+
+static void emit_andimm(int rs,int imm,int rt)
+{
+  u_int armval;
+  if(imm==0) {
+    emit_zeroreg(rt);
+  }else if(genimm(imm,&armval)) {
+    assem_debug("and %s,%s,#%d\n",regname[rt],regname[rs],imm);
+    output_w32(0xe2000000|rd_rn_rm(rt,rs,0)|armval);
+  }else if(genimm(~imm,&armval)) {
+    assem_debug("bic %s,%s,#%d\n",regname[rt],regname[rs],imm);
+    output_w32(0xe3c00000|rd_rn_rm(rt,rs,0)|armval);
+  }else if(imm==65535) {
+    #ifndef HAVE_ARMV6
+    assem_debug("bic %s,%s,#FF000000\n",regname[rt],regname[rs]);
+    output_w32(0xe3c00000|rd_rn_rm(rt,rs,0)|0x4FF);
+    assem_debug("bic %s,%s,#00FF0000\n",regname[rt],regname[rt]);
+    output_w32(0xe3c00000|rd_rn_rm(rt,rt,0)|0x8FF);
+    #else
+    assem_debug("uxth %s,%s\n",regname[rt],regname[rs]);
+    output_w32(0xe6ff0070|rd_rn_rm(rt,0,rs));
+    #endif
+  }else{
+    assert(imm>0&&imm<65535);
+    #ifndef HAVE_ARMV7
+    assem_debug("mov r14,#%d\n",imm&0xFF00);
+    output_w32(0xe3a00000|rd_rn_imm_shift(HOST_TEMPREG,0,imm>>8,8));
+    assem_debug("add r14,r14,#%d\n",imm&0xFF);
+    output_w32(0xe2800000|rd_rn_imm_shift(HOST_TEMPREG,HOST_TEMPREG,imm&0xff,0));
+    #else
+    emit_movw(imm,HOST_TEMPREG);
+    #endif
+    assem_debug("and %s,%s,r14\n",regname[rt],regname[rs]);
+    output_w32(0xe0000000|rd_rn_rm(rt,rs,HOST_TEMPREG));
+  }
+}
+
+static void emit_orimm(int rs,int imm,int rt)
+{
+  u_int armval;
+  if(imm==0) {
+    if(rs!=rt) emit_mov(rs,rt);
+  }else if(genimm(imm,&armval)) {
+    assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rs],imm);
+    output_w32(0xe3800000|rd_rn_rm(rt,rs,0)|armval);
+  }else{
+    assert(imm>0&&imm<65536);
+    assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF00);
+    assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF); // second orr reads rt, matching the encoding below
+    output_w32(0xe3800000|rd_rn_imm_shift(rt,rs,imm>>8,8));
+    output_w32(0xe3800000|rd_rn_imm_shift(rt,rt,imm&0xff,0));
+  }
+}
+
+static void emit_xorimm(int rs,int imm,int rt)
+{
+  u_int armval;
+  if(imm==0) {
+    if(rs!=rt) emit_mov(rs,rt);
+  }else if(genimm(imm,&armval)) {
+    assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rs],imm);
+    output_w32(0xe2200000|rd_rn_rm(rt,rs,0)|armval);
+  }else{
+    assert(imm>0&&imm<65536);
+    assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF00);
+    assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF); // second eor reads rt, matching the encoding below
+    output_w32(0xe2200000|rd_rn_imm_shift(rt,rs,imm>>8,8));
+    output_w32(0xe2200000|rd_rn_imm_shift(rt,rt,imm&0xff,0));
+  }
+}
+
+static void emit_shlimm(int rs,u_int imm,int rt)
+{
+  assert(imm>0);
+  assert(imm<32);
+  //if(imm==1) ...
+  assem_debug("lsl %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|(imm<<7));
+}
+
+static void emit_lsls_imm(int rs,int imm,int rt)
+{
+  assert(imm>0);
+  assert(imm<32);
+  assem_debug("lsls %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  output_w32(0xe1b00000|rd_rn_rm(rt,0,rs)|(imm<<7));
+}
+
+static unused void emit_lslpls_imm(int rs,int imm,int rt)
+{
+  assert(imm>0);
+  assert(imm<32);
+  assem_debug("lslpls %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  output_w32(0x51b00000|rd_rn_rm(rt,0,rs)|(imm<<7));
+}
+
+static void emit_shrimm(int rs,u_int imm,int rt)
+{
+  assert(imm>0);
+  assert(imm<32);
+  assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x20|(imm<<7));
+}
+
+static void emit_sarimm(int rs,u_int imm,int rt)
+{
+  assert(imm>0);
+  assert(imm<32);
+  assem_debug("asr %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x40|(imm<<7));
+}
+
+static void emit_rorimm(int rs,u_int imm,int rt)
+{
+  assert(imm>0);
+  assert(imm<32);
+  assem_debug("ror %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x60|(imm<<7));
+}
+
+static void emit_signextend16(int rs,int rt)
+{
+  #ifndef HAVE_ARMV6
+  emit_shlimm(rs,16,rt);
+  emit_sarimm(rt,16,rt);
+  #else
+  assem_debug("sxth %s,%s\n",regname[rt],regname[rs]);
+  output_w32(0xe6bf0070|rd_rn_rm(rt,0,rs));
+  #endif
+}
+
+static void emit_signextend8(int rs,int rt)
+{
+  #ifndef HAVE_ARMV6
+  emit_shlimm(rs,24,rt);
+  emit_sarimm(rt,24,rt);
+  #else
+  assem_debug("sxtb %s,%s\n",regname[rt],regname[rs]);
+  output_w32(0xe6af0070|rd_rn_rm(rt,0,rs));
+  #endif
+}
+
+static void emit_shl(u_int rs,u_int shift,u_int rt)
+{
+  assert(rs<16);
+  assert(rt<16);
+  assert(shift<16);
+  //if(imm==1) ...
+  assem_debug("lsl %s,%s,%s\n",regname[rt],regname[rs],regname[shift]);
+  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x10|(shift<<8));
+}
+
+static void emit_shr(u_int rs,u_int shift,u_int rt)
+{
+  assert(rs<16);
+  assert(rt<16);
+  assert(shift<16);
+  assem_debug("lsr %s,%s,%s\n",regname[rt],regname[rs],regname[shift]);
+  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x30|(shift<<8));
+}
+
+static void emit_sar(u_int rs,u_int shift,u_int rt)
+{
+  assert(rs<16);
+  assert(rt<16);
+  assert(shift<16);
+  assem_debug("asr %s,%s,%s\n",regname[rt],regname[rs],regname[shift]);
+  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x50|(shift<<8));
+}
+
+static unused void emit_orrshl(u_int rs,u_int shift,u_int rt)
+{
+  assert(rs<16);
+  assert(rt<16);
+  assert(shift<16);
+  assem_debug("orr %s,%s,%s,lsl %s\n",regname[rt],regname[rt],regname[rs],regname[shift]);
+  output_w32(0xe1800000|rd_rn_rm(rt,rt,rs)|0x10|(shift<<8));
+}
+
+static unused void emit_orrshr(u_int rs,u_int shift,u_int rt)
+{
+  assert(rs<16);
+  assert(rt<16);
+  assert(shift<16);
+  assem_debug("orr %s,%s,%s,lsr %s\n",regname[rt],regname[rt],regname[rs],regname[shift]);
+  output_w32(0xe1800000|rd_rn_rm(rt,rt,rs)|0x30|(shift<<8));
+}
+
+static void emit_cmpimm(int rs,int imm)
+{
+  u_int armval;
+  if(genimm(imm,&armval)) {
+    assem_debug("cmp %s,#%d\n",regname[rs],imm);
+    output_w32(0xe3500000|rd_rn_rm(0,rs,0)|armval);
+  }else if(genimm(-imm,&armval)) {
+    assem_debug("cmn %s,#%d\n",regname[rs],-imm); // cmn encodes -imm, so trace that operand
+    output_w32(0xe3700000|rd_rn_rm(0,rs,0)|armval);
+  }else if(imm>0) {
+    assert(imm<65536);
+    emit_movimm(imm,HOST_TEMPREG);
+    assem_debug("cmp %s,r14\n",regname[rs]);
+    output_w32(0xe1500000|rd_rn_rm(0,rs,HOST_TEMPREG));
+  }else{
+    assert(imm>-65536);
+    emit_movimm(-imm,HOST_TEMPREG);
+    assem_debug("cmn %s,r14\n",regname[rs]);
+    output_w32(0xe1700000|rd_rn_rm(0,rs,HOST_TEMPREG));
+  }
+}
+
+static void emit_cmovne_imm(int imm,int rt)
+{
+  assem_debug("movne %s,#%d\n",regname[rt],imm);
+  u_int armval;
+  genimm_checked(imm,&armval);
+  output_w32(0x13a00000|rd_rn_rm(rt,0,0)|armval);
+}
+
+static void emit_cmovl_imm(int imm,int rt)
+{
+  assem_debug("movlt %s,#%d\n",regname[rt],imm);
+  u_int armval;
+  genimm_checked(imm,&armval);
+  output_w32(0xb3a00000|rd_rn_rm(rt,0,0)|armval);
+}
+
+static void emit_cmovb_imm(int imm,int rt)
+{
+  assem_debug("movcc %s,#%d\n",regname[rt],imm);
+  u_int armval;
+  genimm_checked(imm,&armval);
+  output_w32(0x33a00000|rd_rn_rm(rt,0,0)|armval);
+}
+
+static void emit_cmovae_imm(int imm,int rt)
+{
+  assem_debug("movcs %s,#%d\n",regname[rt],imm);
+  u_int armval;
+  genimm_checked(imm,&armval);
+  output_w32(0x23a00000|rd_rn_rm(rt,0,0)|armval);
+}
+
+static void emit_cmovs_imm(int imm,int rt)
+{
+  assem_debug("movmi %s,#%d\n",regname[rt],imm);
+  u_int armval;
+  genimm_checked(imm,&armval);
+  output_w32(0x43a00000|rd_rn_rm(rt,0,0)|armval);
+}
+
+static void emit_cmovne_reg(int rs,int rt)
+{
+  assem_debug("movne %s,%s\n",regname[rt],regname[rs]);
+  output_w32(0x11a00000|rd_rn_rm(rt,0,rs));
+}
+
+static void emit_cmovl_reg(int rs,int rt)
+{
+  assem_debug("movlt %s,%s\n",regname[rt],regname[rs]);
+  output_w32(0xb1a00000|rd_rn_rm(rt,0,rs));
+}
+
+static void emit_cmovb_reg(int rs,int rt)
+{
+  assem_debug("movcc %s,%s\n",regname[rt],regname[rs]);
+  output_w32(0x31a00000|rd_rn_rm(rt,0,rs));
+}
+
+static void emit_cmovs_reg(int rs,int rt)
+{
+  assem_debug("movmi %s,%s\n",regname[rt],regname[rs]);
+  output_w32(0x41a00000|rd_rn_rm(rt,0,rs));
+}
+
+static void emit_slti32(int rs,int imm,int rt)
+{
+  if(rs!=rt) emit_zeroreg(rt);
+  emit_cmpimm(rs,imm);
+  if(rs==rt) emit_movimm(0,rt);
+  emit_cmovl_imm(1,rt);
+}
+
+static void emit_sltiu32(int rs,int imm,int rt)
+{
+  if(rs!=rt) emit_zeroreg(rt);
+  emit_cmpimm(rs,imm);
+  if(rs==rt) emit_movimm(0,rt);
+  emit_cmovb_imm(1,rt);
+}
+
+static void emit_cmp(int rs,int rt)
+{
+  assem_debug("cmp %s,%s\n",regname[rs],regname[rt]);
+  output_w32(0xe1500000|rd_rn_rm(0,rs,rt));
+}
+
+static void emit_set_gz32(int rs, int rt)
+{
+  //assem_debug("set_gz32\n");
+  emit_cmpimm(rs,1);
+  emit_movimm(1,rt);
+  emit_cmovl_imm(0,rt);
+}
+
+static void emit_set_nz32(int rs, int rt)
+{
+  //assem_debug("set_nz32\n");
+  if(rs!=rt) emit_movs(rs,rt);
+  else emit_test(rs,rs);
+  emit_cmovne_imm(1,rt);
+}
+
+static void emit_set_if_less32(int rs1, int rs2, int rt)
+{
+  //assem_debug("set if less (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]);
+  if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt);
+  emit_cmp(rs1,rs2);
+  if(rs1==rt||rs2==rt) emit_movimm(0,rt);
+  emit_cmovl_imm(1,rt);
+}
+
+static void emit_set_if_carry32(int rs1, int rs2, int rt)
+{
+  //assem_debug("set if carry (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]);
+  if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt);
+  emit_cmp(rs1,rs2);
+  if(rs1==rt||rs2==rt) emit_movimm(0,rt);
+  emit_cmovb_imm(1,rt);
+}
+
+static int can_jump_or_call(const void *a)
+{
+  intptr_t offset = (u_char *)a - out - 8;
+  return (-33554432 <= offset && offset < 33554432);
+}
+
+static void emit_call(const void *a_)
+{
+  int a = (int)a_;
+  assem_debug("bl %x (%x+%x)%s\n",a,(int)out,a-(int)out-8,func_name(a_));
+  u_int offset=genjmp(a);
+  output_w32(0xeb000000|offset);
+}
+
+static void emit_jmp(const void *a_)
+{
+  int a = (int)a_;
+  assem_debug("b %x (%x+%x)%s\n",a,(int)out,a-(int)out-8,func_name(a_));
+  u_int offset=genjmp(a);
+  output_w32(0xea000000|offset);
+}
+
+static void emit_jne(const void *a_)
+{
+  int a = (int)a_;
+  assem_debug("bne %x\n",a);
+  u_int offset=genjmp(a);
+  output_w32(0x1a000000|offset);
+}
+
+static void emit_jeq(const void *a_)
+{
+  int a = (int)a_;
+  assem_debug("beq %x\n",a);
+  u_int offset=genjmp(a);
+  output_w32(0x0a000000|offset);
+}
+
+static void emit_js(const void *a_)
+{
+  int a = (int)a_;
+  assem_debug("bmi %x\n",a);
+  u_int offset=genjmp(a);
+  output_w32(0x4a000000|offset);
+}
+
+static void emit_jns(const void *a_)
+{
+  int a = (int)a_;
+  assem_debug("bpl %x\n",a);
+  u_int offset=genjmp(a);
+  output_w32(0x5a000000|offset);
+}
+
+static void emit_jl(const void *a_)
+{
+  int a = (int)a_;
+  assem_debug("blt %x\n",a);
+  u_int offset=genjmp(a);
+  output_w32(0xba000000|offset);
+}
+
+static void emit_jge(const void *a_)
+{
+  int a = (int)a_;
+  assem_debug("bge %x\n",a);
+  u_int offset=genjmp(a);
+  output_w32(0xaa000000|offset);
+}
+
+static void emit_jno(const void *a_)
+{
+  int a = (int)a_;
+  assem_debug("bvc %x\n",a);
+  u_int offset=genjmp(a);
+  output_w32(0x7a000000|offset);
+}
+
+static void emit_jc(const void *a_)
+{
+  int a = (int)a_;
+  assem_debug("bcs %x\n",a);
+  u_int offset=genjmp(a);
+  output_w32(0x2a000000|offset);
+}
+
+static void emit_jcc(const void *a_)
+{
+  int a = (int)a_;
+  assem_debug("bcc %x\n",a);
+  u_int offset=genjmp(a);
+  output_w32(0x3a000000|offset);
+}
+
+static unused void emit_callreg(u_int r)
+{
+  assert(r<15);
+  assem_debug("blx %s\n",regname[r]);
+  output_w32(0xe12fff30|r);
+}
+
+static void emit_jmpreg(u_int r)
+{
+  assem_debug("mov pc,%s\n",regname[r]);
+  output_w32(0xe1a00000|rd_rn_rm(15,0,r));
+}
+
+// Emit a function return: a register jump through r14.
+static void emit_ret(void)
+{
+  const u_int link_reg = 14;
+  emit_jmpreg(link_reg);
+}
+
+// Emit "ldr rt,[rs,#offset]".  offset must lie strictly between -4096 and
+// 4096; a negative offset is encoded as its magnitude with the alternate
+// (subtracting) opcode 0xe5100000 instead of 0xe5900000.
+static void emit_readword_indexed(int offset, int rs, int rt)
+{
+  u_int insn, mag;
+  assert(offset>-4096&&offset<4096);
+  assem_debug("ldr %s,%s+%d\n",regname[rt],regname[rs],offset);
+  if(offset<0) {
+    insn=0xe5100000;
+    mag=-offset;
+  }else{
+    insn=0xe5900000;
+    mag=offset;
+  }
+  output_w32(insn|rd_rn_rm(rt,rs,0)|mag);
+}
+
+// Emit "ldr rt,[rs1,rs2,lsl #2]": word load indexed by rs2 scaled by 4.
+// Also used as the pointer-sized table load via the alias below.
+static void emit_readword_dualindexedx4(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("ldr %s,%s,%s lsl #2\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = 0xe7900000 | rd_rn_rm(rt,rs1,rs2);
+  insn |= 0x100; // shift-amount field = 2
+  output_w32(insn);
+}
+#define emit_readptr_dualindexedx_ptrlen emit_readword_dualindexedx4
+
+// Emit "ldr rt,[rs1,rs2]" (opcode 0xe7900000): register-indexed word load.
+static void emit_ldr_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("ldr %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0xe7900000);
+}
+
+// Emit "ldrcc rt,[rs1,rs2]" (opcode 0x37900000): word load executed only
+// under the "cc" condition.
+static void emit_ldrcc_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("ldrcc %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0x37900000);
+}
+
+// Emit "ldrb rt,[rs1,rs2]" (opcode 0xe7d00000): register-indexed byte load.
+static void emit_ldrb_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("ldrb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0xe7d00000);
+}
+
+// Emit "ldrccb rt,[rs1,rs2]" (opcode 0x37d00000): byte load executed only
+// under the "cc" condition.
+static void emit_ldrccb_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("ldrccb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0x37d00000);
+}
+
+// Emit "ldrsb rt,[rs1,rs2]" (opcode 0xe19000d0): sign-extending byte load.
+static void emit_ldrsb_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("ldrsb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0xe19000d0);
+}
+
+// Emit "ldrccsb rt,[rs1,rs2]" (opcode 0x319000d0): sign-extending byte load
+// executed only under the "cc" condition.
+static void emit_ldrccsb_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("ldrccsb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0x319000d0);
+}
+
+// Emit "ldrh rt,[rs1,rs2]" (opcode 0xe19000b0): register-indexed halfword load.
+static void emit_ldrh_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("ldrh %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0xe19000b0);
+}
+
+// Emit "ldrcch rt,[rs1,rs2]" (opcode 0x319000b0): halfword load executed
+// only under the "cc" condition.
+static void emit_ldrcch_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("ldrcch %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0x319000b0);
+}
+
+// Emit "ldrsh rt,[rs1,rs2]" (opcode 0xe19000f0): sign-extending halfword load.
+static void emit_ldrsh_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("ldrsh %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0xe19000f0);
+}
+
+// Emit "ldrccsh rt,[rs1,rs2]" (opcode 0x319000f0): sign-extending halfword
+// load executed only under the "cc" condition.
+static void emit_ldrccsh_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("ldrccsh %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0x319000f0);
+}
+
+// Emit "str rt,[rs1,rs2]" (opcode 0xe7800000): register-indexed word store.
+static void emit_str_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("str %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0xe7800000);
+}
+
+// Emit "strb rt,[rs1,rs2]" (opcode 0xe7c00000): register-indexed byte store.
+static void emit_strb_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("strb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0xe7c00000);
+}
+
+// Emit "strh rt,[rs1,rs2]" (opcode 0xe18000b0): register-indexed halfword store.
+static void emit_strh_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("strh %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0xe18000b0);
+}
+
+// Emit "ldrsb rt,[rs,#offset]".  offset must lie strictly between -256 and
+// 256.  The 8-bit magnitude is split into two nibbles (bits 11:8 and 3:0);
+// negative offsets select opcode 0xe15000d0 instead of 0xe1d000d0.
+static void emit_movsbl_indexed(int offset, int rs, int rt)
+{
+  u_int insn, mag;
+  assert(offset>-256&&offset<256);
+  assem_debug("ldrsb %s,%s+%d\n",regname[rt],regname[rs],offset);
+  if(offset<0) {
+    insn=0xe15000d0;
+    mag=-offset;
+  }else{
+    insn=0xe1d000d0;
+    mag=offset;
+  }
+  insn|=rd_rn_rm(rt,rs,0);
+  insn|=((mag<<4)&0xf00)|(mag&0xf);
+  output_w32(insn);
+}
+
+// Emit "ldrsh rt,[rs,#offset]".  offset must lie strictly between -256 and
+// 256.  The 8-bit magnitude is split into two nibbles (bits 11:8 and 3:0);
+// negative offsets select opcode 0xe15000f0 instead of 0xe1d000f0.
+static void emit_movswl_indexed(int offset, int rs, int rt)
+{
+  u_int insn, mag;
+  assert(offset>-256&&offset<256);
+  assem_debug("ldrsh %s,%s+%d\n",regname[rt],regname[rs],offset);
+  if(offset<0) {
+    insn=0xe15000f0;
+    mag=-offset;
+  }else{
+    insn=0xe1d000f0;
+    mag=offset;
+  }
+  insn|=rd_rn_rm(rt,rs,0);
+  insn|=((mag<<4)&0xf00)|(mag&0xf);
+  output_w32(insn);
+}
+
+// Emit "ldrb rt,[rs,#offset]".  offset must lie strictly between -4096 and
+// 4096; a negative offset is encoded as its magnitude with opcode
+// 0xe5500000 instead of 0xe5d00000.
+static void emit_movzbl_indexed(int offset, int rs, int rt)
+{
+  u_int insn, mag;
+  assert(offset>-4096&&offset<4096);
+  assem_debug("ldrb %s,%s+%d\n",regname[rt],regname[rs],offset);
+  if(offset<0) {
+    insn=0xe5500000;
+    mag=-offset;
+  }else{
+    insn=0xe5d00000;
+    mag=offset;
+  }
+  output_w32(insn|rd_rn_rm(rt,rs,0)|mag);
+}
+
+// Emit "ldrh rt,[rs,#offset]".  offset must lie strictly between -256 and
+// 256.  The 8-bit magnitude is split into two nibbles (bits 11:8 and 3:0);
+// negative offsets select opcode 0xe15000b0 instead of 0xe1d000b0.
+static void emit_movzwl_indexed(int offset, int rs, int rt)
+{
+  u_int insn, mag;
+  assert(offset>-256&&offset<256);
+  assem_debug("ldrh %s,%s+%d\n",regname[rt],regname[rs],offset);
+  if(offset<0) {
+    insn=0xe15000b0;
+    mag=-offset;
+  }else{
+    insn=0xe1d000b0;
+    mag=offset;
+  }
+  insn|=rd_rn_rm(rt,rs,0);
+  insn|=((mag<<4)&0xf00)|(mag&0xf);
+  output_w32(insn);
+}
+
+// Emit "ldrd rt,[rs,#offset]".  offset must lie strictly between -256 and
+// 256.  The 8-bit magnitude is split into two nibbles (bits 11:8 and 3:0);
+// negative offsets select opcode 0xe14000d0 instead of 0xe1c000d0.
+static void emit_ldrd(int offset, int rs, int rt)
+{
+  u_int insn, mag;
+  assert(offset>-256&&offset<256);
+  assem_debug("ldrd %s,%s+%d\n",regname[rt],regname[rs],offset);
+  if(offset<0) {
+    insn=0xe14000d0;
+    mag=-offset;
+  }else{
+    insn=0xe1c000d0;
+    mag=offset;
+  }
+  insn|=rd_rn_rm(rt,rs,0);
+  insn|=((mag<<4)&0xf00)|(mag&0xf);
+  output_w32(insn);
+}
+
+// Emit "ldr rt,[fp,#offset]" where offset is the byte distance of addr from
+// dynarec_local.  addr must be at or above &dynarec_local and less than
+// 4096 bytes past it: the distance is computed as an unsigned value, so an
+// address below dynarec_local also trips the assert.
+static void emit_readword(void *addr, int rt)
+{
+  uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
+  assert(offset<4096);
+  // offset is a uintptr_t; cast so the argument matches the %d conversion
+  // (the value is known to be below 4096 here)
+  assem_debug("ldr %s,fp+%d\n",regname[rt],(int)offset);
+  output_w32(0xe5900000|rd_rn_rm(rt,FP,0)|offset);
+}
+#define emit_readptr emit_readword
+
+// Emit "str rt,[rs,#offset]".  offset must lie strictly between -4096 and
+// 4096; a negative offset is encoded as its magnitude with opcode
+// 0xe5000000 instead of 0xe5800000.
+static void emit_writeword_indexed(int rt, int offset, int rs)
+{
+  u_int insn, mag;
+  assert(offset>-4096&&offset<4096);
+  assem_debug("str %s,%s+%d\n",regname[rt],regname[rs],offset);
+  if(offset<0) {
+    insn=0xe5000000;
+    mag=-offset;
+  }else{
+    insn=0xe5800000;
+    mag=offset;
+  }
+  output_w32(insn|rd_rn_rm(rt,rs,0)|mag);
+}
+
+// Emit "strh rt,[rs,#offset]".  offset must lie strictly between -256 and
+// 256.  The 8-bit magnitude is split into two nibbles (bits 11:8 and 3:0);
+// negative offsets select opcode 0xe14000b0 instead of 0xe1c000b0.
+static void emit_writehword_indexed(int rt, int offset, int rs)
+{
+  u_int insn, mag;
+  assert(offset>-256&&offset<256);
+  assem_debug("strh %s,%s+%d\n",regname[rt],regname[rs],offset);
+  if(offset<0) {
+    insn=0xe14000b0;
+    mag=-offset;
+  }else{
+    insn=0xe1c000b0;
+    mag=offset;
+  }
+  insn|=rd_rn_rm(rt,rs,0);
+  insn|=((mag<<4)&0xf00)|(mag&0xf);
+  output_w32(insn);
+}
+
+// Emit "strb rt,[rs,#offset]".  offset must lie strictly between -4096 and
+// 4096; a negative offset is encoded as its magnitude with opcode
+// 0xe5400000 instead of 0xe5c00000.
+static void emit_writebyte_indexed(int rt, int offset, int rs)
+{
+  u_int insn, mag;
+  assert(offset>-4096&&offset<4096);
+  assem_debug("strb %s,%s+%d\n",regname[rt],regname[rs],offset);
+  if(offset<0) {
+    insn=0xe5400000;
+    mag=-offset;
+  }else{
+    insn=0xe5c00000;
+    mag=offset;
+  }
+  output_w32(insn|rd_rn_rm(rt,rs,0)|mag);
+}
+
+// Emit "strcc rt,[rs1,rs2]" (opcode 0x37800000): word store executed only
+// under the "cc" condition.
+static void emit_strcc_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("strcc %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0x37800000);
+}
+
+// Emit "strccb rt,[rs1,rs2]" (opcode 0x37c00000): byte store executed only
+// under the "cc" condition.
+static void emit_strccb_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("strccb %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0x37c00000);
+}
+
+// Emit "strcch rt,[rs1,rs2]" (opcode 0x318000b0): halfword store executed
+// only under the "cc" condition.
+static void emit_strcch_dualindexed(int rs1, int rs2, int rt)
+{
+  u_int insn;
+  assem_debug("strcch %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0x318000b0);
+}
+
+// Emit "str rt,[fp,#offset]" where offset is the byte distance of addr from
+// dynarec_local.  addr must be at or above &dynarec_local and less than
+// 4096 bytes past it: the distance is computed as an unsigned value, so an
+// address below dynarec_local also trips the assert.
+static void emit_writeword(int rt, void *addr)
+{
+  uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
+  assert(offset<4096);
+  // offset is a uintptr_t; cast so the argument matches the %d conversion
+  // (the value is known to be below 4096 here)
+  assem_debug("str %s,fp+%d\n",regname[rt],(int)offset);
+  output_w32(0xe5800000|rd_rn_rm(rt,FP,0)|offset);
+}
+
+// Emit "umull lo,hi,rs1,rs2" (opcode 0xe0800090): unsigned multiply of rs1
+// by rs2 with the result written to the hi/lo register pair.  All four
+// register numbers must be below 16.
+static void emit_umull(u_int rs1, u_int rs2, u_int hi, u_int lo)
+{
+  u_int insn = 0xe0800090;
+  assem_debug("umull %s, %s, %s, %s\n",regname[lo],regname[hi],regname[rs1],regname[rs2]);
+  assert(rs1<16);
+  assert(rs2<16);
+  assert(hi<16);
+  assert(lo<16);
+  insn |= hi<<16;
+  insn |= lo<<12;
+  insn |= rs2<<8;
+  insn |= rs1;
+  output_w32(insn);
+}
+
+// Emit "smull lo,hi,rs1,rs2" (opcode 0xe0c00090): signed multiply of rs1 by
+// rs2 with the result written to the hi/lo register pair.  All four
+// register numbers must be below 16.
+static void emit_smull(u_int rs1, u_int rs2, u_int hi, u_int lo)
+{
+  u_int insn = 0xe0c00090;
+  assem_debug("smull %s, %s, %s, %s\n",regname[lo],regname[hi],regname[rs1],regname[rs2]);
+  assert(rs1<16);
+  assert(rs2<16);
+  assert(hi<16);
+  assert(lo<16);
+  insn |= hi<<16;
+  insn |= lo<<12;
+  insn |= rs2<<8;
+  insn |= rs1;
+  output_w32(insn);
+}
+
+// Emit "clz rt,rs" (opcode 0xe16f0f10): count leading zeros of rs into rt.
+static void emit_clz(int rs,int rt)
+{
+  u_int insn;
+  assem_debug("clz %s,%s\n",regname[rt],regname[rs]);
+  insn = rd_rn_rm(rt,0,rs);
+  output_w32(insn | 0xe16f0f10);
+}
+
+// Emit "subcs rt,rs1,rs2" (opcode 0x20400000): rt = rs1 - rs2, executed
+// only under the "cs" condition.
+static void emit_subcs(int rs1,int rs2,int rt)
+{
+  u_int insn;
+  assem_debug("subcs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  insn = rd_rn_rm(rt,rs1,rs2);
+  output_w32(insn | 0x20400000);
+}
+
+// Emit "lsrcc rt,rs,#imm" (conditional mov with an LSR-shifted operand).
+// imm must be in the range 1..31.
+static void emit_shrcc_imm(int rs,u_int imm,int rt)
+{
+  u_int insn = 0x31a00000;
+  assert(imm>0);
+  assert(imm<32);
+  assem_debug("lsrcc %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  insn |= rd_rn_rm(rt,0,rs);
+  insn |= 0x20;     // shift type: lsr
+  insn |= imm<<7;   // shift amount
+  output_w32(insn);
+}
+
+// Emit "lsrne rt,rs,#imm" (conditional mov with an LSR-shifted operand).
+// imm must be in the range 1..31.
+static void emit_shrne_imm(int rs,u_int imm,int rt)
+{
+  u_int insn = 0x11a00000;
+  assert(imm>0);
+  assert(imm<32);
+  assem_debug("lsrne %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  insn |= rd_rn_rm(rt,0,rs);
+  insn |= 0x20;     // shift type: lsr
+  insn |= imm<<7;   // shift amount
+  output_w32(insn);
+}
+
+// Emit "rsbmi rt,rs,#0" (opcode 0x42600000): rt = 0 - rs, executed only
+// under the "mi" condition.
+static void emit_negmi(int rs, int rt)
+{
+  u_int insn;
+  assem_debug("rsbmi %s,%s,#0\n",regname[rt],regname[rs]);
+  insn = rd_rn_rm(rt,rs,0);
+  output_w32(insn | 0x42600000);
+}
+
+// Emit "rsbsmi rt,rs,#0" (opcode 0x42700000): the flag-setting form of
+// emit_negmi's instruction.
+static void emit_negsmi(int rs, int rt)
+{
+  u_int insn;
+  assem_debug("rsbsmi %s,%s,#0\n",regname[rt],regname[rs]);
+  insn = rd_rn_rm(rt,rs,0);
+  output_w32(insn | 0x42700000);
+}
+
+// Emit "bic rt,rs1,rs2 lsl <shift>": rs2 is shifted left by the amount held
+// in register 'shift' before being cleared out of rs1.
+static void emit_bic_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt)
+{
+  u_int insn = 0xe1C00000;
+  assem_debug("bic %s,%s,%s lsl %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]);
+  insn |= rd_rn_rm(rt,rs1,rs2);
+  insn |= 0x10;       // register-specified lsl
+  insn |= shift<<8;   // shift register number
+  output_w32(insn);
+}
+
+// Emit "bic rt,rs1,rs2 lsr <shift>": rs2 is shifted right by the amount
+// held in register 'shift' before being cleared out of rs1.
+static void emit_bic_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt)
+{
+  u_int insn = 0xe1C00000;
+  assem_debug("bic %s,%s,%s lsr %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]);
+  insn |= rd_rn_rm(rt,rs1,rs2);
+  insn |= 0x30;       // register-specified lsr
+  insn |= shift<<8;   // shift register number
+  output_w32(insn);
+}
+
+// Emit "teq rs,rt" (opcode 0xe1300000): flag-setting comparison of the two
+// registers; no destination register is written.
+static void emit_teq(int rs, int rt)
+{
+  u_int insn;
+  assem_debug("teq %s,%s\n",regname[rs],regname[rt]);
+  insn = rd_rn_rm(0,rs,rt);
+  output_w32(insn | 0xe1300000);
+}
+
+// Emit "rsb rt,rs,#imm" (opcode 0xe2600000): rt = imm - rs.  The immediate
+// is converted to its instruction encoding by genimm_checked().
+static unused void emit_rsbimm(int rs, int imm, int rt)
+{
+  u_int enc;
+  u_int insn;
+  genimm_checked(imm,&enc);
+  assem_debug("rsb %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  insn = 0xe2600000 | rd_rn_rm(rt,rs,0);
+  output_w32(insn | enc);
+}
+
+// Conditionally select one of two immediates, optimizing for small code size.
+// rt receives imm1, and is then changed to imm2 by an instruction that only
+// executes under the "ne" condition.  Strategies, in order of preference:
+//   1. mov imm1 + addne #(imm2-imm1), if that difference is encodable
+//   2. mov imm1 + subne #(imm1-imm2), if that difference is encodable
+//   3. without HAVE_ARMV7: mov imm1 + ldrne from the literal pool
+//      with HAVE_ARMV7: movw/movt of imm1, each followed by a movwne/movtne
+//      of imm2 when the corresponding 16-bit half differs
+// This will only be called if HAVE_CMOV_IMM is defined
+static void emit_cmov2imm_e_ne_compact(int imm1,int imm2,u_int rt)
+{
+  u_int enc;
+  if(genimm(imm2-imm1,&enc)) {
+    emit_movimm(imm1,rt);
+    assem_debug("addne %s,%s,#%d\n",regname[rt],regname[rt],imm2-imm1);
+    output_w32(0x12800000|rd_rn_rm(rt,rt,0)|enc);
+    return;
+  }
+  if(genimm(imm1-imm2,&enc)) {
+    emit_movimm(imm1,rt);
+    assem_debug("subne %s,%s,#%d\n",regname[rt],regname[rt],imm1-imm2);
+    output_w32(0x12400000|rd_rn_rm(rt,rt,0)|enc);
+    return;
+  }
+  #ifndef HAVE_ARMV7
+  emit_movimm(imm1,rt);
+  // register the load below for later literal pool placement
+  add_literal((int)out,imm2);
+  assem_debug("ldrne %s,pc+? [=%x]\n",regname[rt],imm2);
+  output_w32(0x15900000|rd_rn_rm(rt,15,0));
+  #else
+  // low halfword
+  emit_movw(imm1&0x0000FFFF,rt);
+  if((imm1&0xFFFF)!=(imm2&0xFFFF)) {
+    assem_debug("movwne %s,#%d (0x%x)\n",regname[rt],imm2&0xFFFF,imm2&0xFFFF);
+    output_w32(0x13000000|rd_rn_rm(rt,0,0)|(imm2&0xfff)|((imm2<<4)&0xf0000));
+  }
+  // high halfword
+  emit_movt(imm1&0xFFFF0000,rt);
+  if((imm1&0xFFFF0000)!=(imm2&0xFFFF0000)) {
+    assem_debug("movtne %s,#%d (0x%x)\n",regname[rt],imm2&0xffff0000,imm2&0xffff0000);
+    output_w32(0x13400000|rd_rn_rm(rt,0,0)|((imm2>>16)&0xfff)|((imm2>>12)&0xf0000));
+  }
+  #endif
+}
+
+// Special case for checking invalid_code: loads the byte at
+// [base + (r >> 12)] into HOST_TEMPREG and compares it against imm.
+// imm must be in 0..127 and r must be a valid register number (0..15).
+static void emit_cmpmem_indexedsr12_reg(int base,int r,int imm)
+{
+  u_int insn = 0xe7d00000;
+  assert(imm<128&&imm>=0);
+  assert(r>=0&&r<16);
+  assem_debug("ldrb lr,%s,%s lsr #12\n",regname[base],regname[r]);
+  insn |= rd_rn_rm(HOST_TEMPREG,base,r);
+  insn |= 0x620; // index shifted by "lsr #12"
+  output_w32(insn);
+  emit_cmpimm(HOST_TEMPREG,imm);
+}
+
+// Emit "blne" (opcode 0x1b000000): call address a only under the "ne"
+// condition, using the offset field from genjmp().
+static void emit_callne(int a)
+{
+  u_int disp;
+  assem_debug("blne %x\n",a);
+  disp = genjmp(a);
+  output_w32(disp | 0x1b000000);
+}
+
+// Used to preload hash table entries: emits "pld [r]" (opcode 0xf5d0f000).
+static unused void emit_prefetchreg(int r)
+{
+  u_int insn;
+  assem_debug("pld %s\n",regname[r]);
+  insn = rd_rn_rm(0,r,0);
+  output_w32(insn | 0xf5d0f000);
+}
+
+// Special case for mini_ht: emits "ldreq rt,[rs,#offset]" (opcode
+// 0x05900000), a word load executed only under the "eq" condition.
+// offset must be below 4096.
+static void emit_ldreq_indexed(int rs, u_int offset, int rt)
+{
+  u_int insn = 0x05900000;
+  assert(offset<4096);
+  assem_debug("ldreq %s,[%s, #%d]\n",regname[rt],regname[rs],offset);
+  insn |= rd_rn_rm(rt,rs,0);
+  output_w32(insn | offset);
+}
+
+// Emit "orrne rt,rs,#imm" (opcode 0x13800000): rt = rs | imm, executed only
+// under the "ne" condition.  imm is encoded by genimm_checked().
+static void emit_orrne_imm(int rs,int imm,int rt)
+{
+  u_int enc;
+  u_int insn;
+  genimm_checked(imm,&enc);
+  assem_debug("orrne %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  insn = 0x13800000 | rd_rn_rm(rt,rs,0);
+  output_w32(insn | enc);
+}
+
+// Emit "addpl rt,rs,#imm" (opcode 0x52800000): rt = rs + imm, executed only
+// under the "pl" condition.  imm is encoded by genimm_checked().
+static unused void emit_addpl_imm(int rs,int imm,int rt)
+{
+  u_int enc;
+  u_int insn;
+  genimm_checked(imm,&enc);
+  assem_debug("addpl %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  insn = 0x52800000 | rd_rn_rm(rt,rs,0);
+  output_w32(insn | enc);
+}
+
+// Emit "addvc pc,pc,#?" (opcode 0x72800000 with rd=rn=pc) with the
+// immediate field left at zero.  'a' is only used for the debug trace here;
+// presumably the immediate is filled in later when the jump target is
+// resolved -- TODO confirm against the code that patches jumps.
+static void emit_jno_unlikely(int a)
+{
+  u_int insn;
+  assem_debug("addvc pc,pc,#? (%x)\n",a);
+  insn = 0x72800000 | rd_rn_rm(15,15,0);
+  output_w32(insn);
+}
+
+// Emit "stmia fp,{reglist}" (opcode 0xe88b0000): store every register whose
+// bit is set in reglist to memory starting at fp.  Emits nothing when
+// reglist is empty.
+static void save_regs_all(u_int reglist)
+{
+  int reg;
+  if(reglist==0) return;
+  assem_debug("stmia fp,{");
+  for(reg=0;reg<16;reg++) {
+    if((reglist>>reg)&1)
+      assem_debug("r%d,",reg);
+  }
+  assem_debug("}\n");
+  output_w32(reglist|0xe88b0000);
+}
+
+// Emit "ldmia fp,{reglist}" (opcode 0xe89b0000): reload every register
+// whose bit is set in reglist from memory starting at fp.  Emits nothing
+// when reglist is empty.
+static void restore_regs_all(u_int reglist)
+{
+  int reg;
+  if(reglist==0) return;
+  assem_debug("ldmia fp,{");
+  for(reg=0;reg<16;reg++) {
+    if((reglist>>reg)&1)
+      assem_debug("r%d,",reg);
+  }
+  assem_debug("}\n");
+  output_w32(reglist|0xe89b0000);
+}
+
+// Save registers before a function call.  Only the registers in
+// CALLER_SAVE_REGS (the caller-save registers, r0-r3, r12) are stored;
+// any other bits in reglist are ignored.
+static void save_regs(u_int reglist)
+{
+  save_regs_all(reglist&CALLER_SAVE_REGS);
+}
+
+// Restore registers after a function call.  Counterpart of save_regs():
+// only the registers in CALLER_SAVE_REGS are reloaded.
+static void restore_regs(u_int reglist)
+{
+  restore_regs_all(reglist&CALLER_SAVE_REGS);
+}
+
+/* Stubs/epilogue */
+
+// Flush the pending literals to the output and patch the offset field of
+// each instruction that loads one.
+//   n == 0: flush unconditionally.
+//   n != 0: flush only if the distance from the first pending load to the
+//           current output position has reached 4096-n.
+// Literals with the same value as an earlier one in the list share that
+// earlier pool slot instead of getting a new word.
+static void literal_pool(int n)
+{
+  int idx;
+  if(literalcount==0) return;
+  if(n&&(int)out-literals[0][0]<4096-n) return;
+  for(idx=0;idx<literalcount;idx++)
+  {
+    u_int slot=(u_int)out;
+    int prev;
+    // look for an earlier literal with the same value
+    for(prev=0;prev<idx;prev++) {
+      if(literals[prev][1]==literals[idx][1]) {
+        slot=literals[prev][0];
+        break;
+      }
+    }
+    u_int *load_insn=(u_int *)literals[idx][0];
+    // the -8 accounts for the pc-relative base of the load
+    u_int disp=slot-(u_int)load_insn-8;
+    assert(disp<4096);
+    assert(!(disp&3));
+    *load_insn|=disp;
+    if(slot==(u_int)out) {
+      literals[idx][0]=slot; // remember for dupes
+      output_w32(literals[idx][1]);
+    }
+  }
+  literalcount=0;
+}
+
+// Like literal_pool(), but for use in the middle of straight-line code:
+// a branch is emitted in front of the pool and then pointed at the first
+// address after it.  The n threshold works the same way as in
+// literal_pool(); the pool itself is always flushed unconditionally once
+// the threshold check has passed.
+static void literal_pool_jumpover(int n)
+{
+  void *skip_branch;
+  if(literalcount==0) return;
+  if(n&&(int)out-literals[0][0]<4096-n) return;
+  skip_branch = out;
+  emit_jmp(0);
+  literal_pool(0);
+  set_jump_target(skip_branch, out);
+}
+
+// Emit an external jump stub.  The output is parsed by get_pointer and
+// find_extjump_insn, so the instruction sequence must not be reordered.
+//   addr   - location of the branch instruction this stub belongs to; must
+//            be inside the translation cache and its top byte must look
+//            like a branch opcode
+//   target - value loaded (via emit_loadlp) into r0
+//   linker - routine the stub far-jumps to; addr is passed in r1
+static void emit_extjump2(u_char *addr, u_int target, void *linker)
+{
+  u_char *insn_bytes=(u_char *)addr;
+  assert((insn_bytes[3]&0x0e)==0xa);
+  (void)insn_bytes;
+
+  emit_loadlp(target,0);
+  emit_loadlp((u_int)addr,1);
+  assert(addr>=ndrc->translation_cache&&addr<(ndrc->translation_cache+(1<<TARGET_SIZE_2)));
+#ifdef DEBUG_CYCLE_COUNT
+  // debug only: rebase the cycle counter on next_interupt
+  emit_readword(&last_count,ECX);
+  emit_add(HOST_CCREG,ECX,HOST_CCREG);
+  emit_readword(&next_interupt,ECX);
+  emit_writeword(HOST_CCREG,&Count);
+  emit_sub(HOST_CCREG,ECX,HOST_CCREG);
+  emit_writeword(ECX,&last_count);
+#endif
+  emit_far_jump(linker);
+}
+
+// Sanity check for an external jump stub at src: the second instruction
+// word must be a pc-relative load, "ldr rx, [pc, #ofs]" (condition and
+// destination register are not checked).
+static void check_extjump2(void *src)
+{
+  u_int *words = src;
+  assert((words[1] & 0x0fff0000) == 0x059f0000); // ldr rx, [pc, #ofs]
+  (void)words;
+}
+
+// Put rt_val into rt, potentially making use of rs, which is known to hold
+// rs_val.  Tried in order, first match wins:
+//   mov rt,#rt_val          if rt_val is encodable
+//   mvn rt,#~rt_val         if the complement is encodable
+//   add rt,rs,#(rt_val-rs_val) / sub rt,rs,#(rs_val-rt_val)
+//   emit_movimm(rt_val,rt)  as the general fallback
+static void emit_movimm_from(u_int rs_val,int rs,u_int rt_val,int rt)
+{
+  u_int enc;
+  int delta;
+  if(genimm(rt_val,&enc)) {
+    assem_debug("mov %s,#%d\n",regname[rt],rt_val);
+    output_w32(0xe3a00000|rd_rn_rm(rt,0,0)|enc);
+    return;
+  }
+  if(genimm(~rt_val,&enc)) {
+    assem_debug("mvn %s,#%d\n",regname[rt],rt_val);
+    output_w32(0xe3e00000|rd_rn_rm(rt,0,0)|enc);
+    return;
+  }
+  delta=rt_val-rs_val;
+  if(genimm(delta,&enc)) {
+    assem_debug("add %s,%s,#%d\n",regname[rt],regname[rs],delta);
+    output_w32(0xe2800000|rd_rn_rm(rt,rs,0)|enc);
+    return;
+  }
+  if(genimm(-delta,&enc)) {
+    assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],-delta);
+    output_w32(0xe2400000|rd_rn_rm(rt,rs,0)|enc);
+    return;
+  }
+  emit_movimm(rt_val,rt);
+}
+
+// Return 1 if emit_movimm_from() can do its job cheaply for this pair of
+// values: they are equal, or v2-v1 (or its negation) fits in 8 bits once
+// an even number of trailing zero bits has been shifted out.  Returns 0
+// otherwise.
+static int is_similar_value(u_int v1,u_int v2)
+{
+  u_int bits;
+  int delta;
+  if(v1==v2) return 1;
+  delta=v2-v1;
+  // positive difference
+  bits=delta;
+  while(bits!=0&&(bits&3)==0)
+    bits>>=2;
+  if(bits<0x100) return 1;
+  // negated difference
+  bits=-delta;
+  while(bits!=0&&(bits&3)==0)
+    bits>>=2;
+  return bits<0x100;
+}
+
+// Move a loaded value from rs to rt, adjusting it for the load type:
+// sign-extend for LOADB/LOADH, mask to 8/16 bits for LOADBU/LOADHU, plain
+// move (skipped when rs==rt) for LOADW.  Any other stub type asserts.
+static void mov_loadtype_adj(enum stub_type type,int rs,int rt)
+{
+  if(type==LOADB_STUB)
+    emit_signextend8(rs,rt);
+  else if(type==LOADBU_STUB)
+    emit_andimm(rs,0xff,rt);
+  else if(type==LOADH_STUB)
+    emit_signextend16(rs,rt);
+  else if(type==LOADHU_STUB)
+    emit_andimm(rs,0xffff,rt);
+  else if(type==LOADW_STUB) {
+    if(rs!=rt)
+      emit_mov(rs,rt);
+  }
+  else
+    assert(0);
+}
+
+#include "pcsxmem.h"
+#include "pcsxmem_inline.c"
+
+// Emit the out-of-line slow path for the memory read recorded in stubs[n].
+// The stub looks the address up in mem_rtab; when the conditional load
+// succeeds it branches straight back, otherwise it calls one of the
+// jump_handler_read* routines and moves the result into the target register.
+// Stub fields used: .a = instruction index, .b = host register holding the
+// address, .c = struct regstat pointer, .d = cycle adjustment,
+// .e = live host register list, .addr = branch to patch, .retaddr = return.
+static void do_readstub(int n)
+{
+  assem_debug("do_readstub %x\n",start+stubs[n].a*4);
+  literal_pool(256);
+  // point the branch in the main code at this stub
+  set_jump_target(stubs[n].addr, out);
+  enum stub_type type=stubs[n].type;
+  int i=stubs[n].a;
+  int rs=stubs[n].b;
+  const struct regstat *i_regs=(struct regstat *)stubs[n].c;
+  u_int reglist=stubs[n].e;
+  const signed char *i_regmap=i_regs->regmap;
+  int rt;
+  // coprocessor loads and LOADLR go through FTEMP rather than rt1
+  if(dops[i].itype==C1LS||dops[i].itype==C2LS||dops[i].itype==LOADLR) {
+    rt=get_reg(i_regmap,FTEMP);
+  }else{
+    rt=get_reg(i_regmap,dops[i].rt1);
+  }
+  assert(rs>=0);
+  int r,temp=-1,temp2=HOST_TEMPREG,regs_saved=0;
+  void *restore_jump = NULL;
+  reglist|=(1<<rs);
+  // look for a free scratch register among r0-r9 and r12 (mask 0x13ff)
+  for(r=0;r<=12;r++) {
+    if(((1<<r)&0x13ff)&&((1<<r)&reglist)==0) {
+      temp=r; break;
+    }
+  }
+  // the destination register will be overwritten, no need to preserve it
+  if(rt>=0&&dops[i].rt1!=0)
+    reglist&=~(1<<rt);
+  if(temp==-1) {
+    // nothing free: save registers up front and pick r0 or r2
+    save_regs(reglist);
+    regs_saved=1;
+    temp=(rs==0)?2:0;
+  }
+  // prefer r1 as the second scratch register when it is available
+  if((regs_saved||(reglist&2)==0)&&temp!=1&&rs!=1)
+    temp2=1;
+  // temp2 = mem_rtab[rs >> 12], then shift left by one with flags set;
+  // the conditional loads and the bcc below act on the resulting flags
+  emit_readword(&mem_rtab,temp);
+  emit_shrimm(rs,12,temp2);
+  emit_readword_dualindexedx4(temp,temp2,temp2);
+  emit_lsls_imm(temp2,1,temp2);
+  if(dops[i].itype==C1LS||dops[i].itype==C2LS||(rt>=0&&dops[i].rt1!=0)) {
+    switch(type) {
+      case LOADB_STUB:  emit_ldrccsb_dualindexed(temp2,rs,rt); break;
+      case LOADBU_STUB: emit_ldrccb_dualindexed(temp2,rs,rt); break;
+      case LOADH_STUB:  emit_ldrccsh_dualindexed(temp2,rs,rt); break;
+      case LOADHU_STUB: emit_ldrcch_dualindexed(temp2,rs,rt); break;
+      case LOADW_STUB:  emit_ldrcc_dualindexed(temp2,rs,rt); break;
+      default: assert(0);
+    }
+  }
+  if(regs_saved) {
+    restore_jump=out;
+    emit_jcc(0); // jump to reg restore
+  }
+  else
+    emit_jcc(stubs[n].retaddr); // return address
+
+  // handler path
+  if(!regs_saved)
+    save_regs(reglist);
+  void *handler=NULL;
+  if(type==LOADB_STUB||type==LOADBU_STUB)
+    handler=jump_handler_read8;
+  if(type==LOADH_STUB||type==LOADHU_STUB)
+    handler=jump_handler_read16;
+  if(type==LOADW_STUB)
+    handler=jump_handler_read32;
+  assert(handler);
+  pass_args(rs,temp2);
+  // r2 = cycle count plus this access's adjustment
+  int cc=get_reg(i_regmap,CCREG);
+  if(cc<0)
+    emit_loadreg(CCREG,2);
+  emit_addimm(cc<0?2:cc,(int)stubs[n].d,2);
+  emit_far_call(handler);
+  // the handler's result is taken from r0
+  if(dops[i].itype==C1LS||dops[i].itype==C2LS||(rt>=0&&dops[i].rt1!=0)) {
+    mov_loadtype_adj(type,0,rt);
+  }
+  if(restore_jump)
+    set_jump_target(restore_jump, out);
+  restore_regs(reglist);
+  emit_jmp(stubs[n].retaddr); // return address
+}
+
+// Emit a memory read for an address (addr) that is known at compile time.
+// Tried in order:
+//   1. pcsx_direct_read() handles it entirely;
+//   2. no handler is registered for the address: load straight from the
+//      host address returned by get_direct_memhandler();
+//   3. otherwise call the memory handler (the generic jump_handler_read*
+//      routine when pcsxmem_is_handler_dynamic() says so) and move its
+//      result from r0 into the target register.
+// 'adj' is added to the cycle count passed along; 'reglist' is the set of
+// live host registers to preserve around the call.
+static void inline_readstub(enum stub_type type, int i, u_int addr,
+  const signed char regmap[], int target, int adj, u_int reglist)
+{
+  int rs=get_reg(regmap,target);
+  int rt=get_reg(regmap,target);
+  if(rs<0) rs=get_reg(regmap,-1);
+  assert(rs>=0);
+  const int cc=get_reg(regmap,CCREG);
+  if(pcsx_direct_read(type,addr,adj,cc,target?rs:-1,rt))
+    return;
+
+  uintptr_t host_addr = 0;
+  void *handler = get_direct_memhandler(mem_rtab, addr, type, &host_addr);
+  // is there a real destination register to fill?
+  const int want_result = rt>=0&&dops[i].rt1!=0;
+
+  if (handler == NULL) {
+    if(!want_result)
+      return;
+    if(addr!=host_addr)
+      emit_movimm_from(addr,rs,host_addr,rs);
+    if(type==LOADB_STUB)       emit_movsbl_indexed(0,rs,rt);
+    else if(type==LOADBU_STUB) emit_movzbl_indexed(0,rs,rt);
+    else if(type==LOADH_STUB)  emit_movswl_indexed(0,rs,rt);
+    else if(type==LOADHU_STUB) emit_movzwl_indexed(0,rs,rt);
+    else if(type==LOADW_STUB)  emit_readword_indexed(0,rs,rt);
+    else                       assert(0);
+    return;
+  }
+
+  const u_int is_dynamic=pcsxmem_is_handler_dynamic(addr);
+  if(is_dynamic) {
+    switch(type) {
+      case LOADB_STUB:
+      case LOADBU_STUB: handler=jump_handler_read8; break;
+      case LOADH_STUB:
+      case LOADHU_STUB: handler=jump_handler_read16; break;
+      case LOADW_STUB:  handler=jump_handler_read32; break;
+      default:          break;
+    }
+  }
+
+  // call a memhandler
+  if(want_result)
+    reglist&=~(1<<rt);
+  save_regs(reglist);
+  // r0 = address
+  if(target==0)
+    emit_movimm(addr,0);
+  else if(rs!=0)
+    emit_mov(rs,0);
+  if(cc<0)
+    emit_loadreg(CCREG,2);
+  const int cc_src = cc<0?2:cc;
+  if(is_dynamic) {
+    // r1 = table entry for this page, r2 = adjusted cycle count
+    emit_movimm(((u_int *)mem_rtab)[addr>>12]<<1,1);
+    emit_addimm(cc_src,adj,2);
+  }
+  else {
+    // publish the current cycle count to Count before the call
+    emit_readword(&last_count,3);
+    emit_addimm(cc_src,adj,2);
+    emit_add(2,3,2);
+    emit_writeword(2,&Count);
+  }
+
+  emit_far_call(handler);
+
+  if(want_result)
+    mov_loadtype_adj(type,0,rt);
+  restore_regs(reglist);
+}
+
+// Emit the out-of-line slow path for the memory write recorded in stubs[n].
+// The stub looks the address up in mem_wtab; when the conditional store
+// succeeds it branches back, otherwise it calls one of the
+// jump_handler_write* routines, which return the new cycle count.
+// Stub fields used: .a = instruction index, .b = host register holding the
+// address, .c = struct regstat pointer, .d = cycle adjustment,
+// .e = live host register list, .addr = branch to patch, .retaddr = return.
+static void do_writestub(int n)
+{
+  assem_debug("do_writestub %x\n",start+stubs[n].a*4);
+  literal_pool(256);
+  // point the branch in the main code at this stub
+  set_jump_target(stubs[n].addr, out);
+  enum stub_type type=stubs[n].type;
+  int i=stubs[n].a;
+  int rs=stubs[n].b;
+  const struct regstat *i_regs=(struct regstat *)stubs[n].c;
+  u_int reglist=stubs[n].e;
+  const signed char *i_regmap=i_regs->regmap;
+  int rt,r;
+  // the value to store comes from FTEMP for coprocessor stores, rs2 otherwise
+  if(dops[i].itype==C1LS||dops[i].itype==C2LS) {
+    rt=get_reg(i_regmap,r=FTEMP);
+  }else{
+    rt=get_reg(i_regmap,r=dops[i].rs2);
+  }
+  assert(rs>=0);
+  assert(rt>=0);
+  int rtmp,temp=-1,temp2=HOST_TEMPREG,regs_saved=0;
+  void *restore_jump = NULL;
+  int reglist2=reglist|(1<<rs)|(1<<rt);
+  // look for a free scratch register among r0-r9 and r12 (mask 0x13ff)
+  for(rtmp=0;rtmp<=12;rtmp++) {
+    if(((1<<rtmp)&0x13ff)&&((1<<rtmp)&reglist2)==0) {
+      temp=rtmp; break;
+    }
+  }
+  if(temp==-1) {
+    // nothing free: save registers up front and take the first of r0-r3
+    // that is neither the address nor the value register
+    save_regs(reglist);
+    regs_saved=1;
+    for(rtmp=0;rtmp<=3;rtmp++)
+      if(rtmp!=rs&&rtmp!=rt)
+        {temp=rtmp;break;}
+  }
+  // prefer r3 as the second scratch register when it is available
+  if((regs_saved||(reglist2&8)==0)&&temp!=3&&rs!=3&&rt!=3)
+    temp2=3;
+  // temp2 = mem_wtab[rs >> 12], then shift left by one with flags set;
+  // the conditional stores and the bcc below act on the resulting flags
+  emit_readword(&mem_wtab,temp);
+  emit_shrimm(rs,12,temp2);
+  emit_readword_dualindexedx4(temp,temp2,temp2);
+  emit_lsls_imm(temp2,1,temp2);
+  switch(type) {
+    case STOREB_STUB: emit_strccb_dualindexed(temp2,rs,rt); break;
+    case STOREH_STUB: emit_strcch_dualindexed(temp2,rs,rt); break;
+    case STOREW_STUB: emit_strcc_dualindexed(temp2,rs,rt); break;
+    default:          assert(0);
+  }
+  if(regs_saved) {
+    restore_jump=out;
+    emit_jcc(0); // jump to reg restore
+  }
+  else
+    emit_jcc(stubs[n].retaddr); // return address (invcode check)
+
+  // handler path
+  if(!regs_saved)
+    save_regs(reglist);
+  void *handler=NULL;
+  switch(type) {
+    case STOREB_STUB: handler=jump_handler_write8; break;
+    case STOREH_STUB: handler=jump_handler_write16; break;
+    case STOREW_STUB: handler=jump_handler_write32; break;
+    default: assert(0);
+  }
+  assert(handler);
+  pass_args(rs,rt);
+  // r3 = shifted table entry
+  if(temp2!=3)
+    emit_mov(temp2,3);
+  // r2 = cycle count plus this access's adjustment
+  int cc=get_reg(i_regmap,CCREG);
+  if(cc<0)
+    emit_loadreg(CCREG,2);
+  emit_addimm(cc<0?2:cc,(int)stubs[n].d,2);
+  // returns new cycle_count
+  emit_far_call(handler);
+  // undo the adjustment on the count returned in r0
+  emit_addimm(0,-(int)stubs[n].d,cc<0?2:cc);
+  if(cc<0)
+    emit_storereg(CCREG,2);
+  if(restore_jump)
+    set_jump_target(restore_jump, out);
+  restore_regs(reglist);
+  emit_jmp(stubs[n].retaddr);
+}
+
+// Emit a memory write for an address (addr) that is known at compile time.
+// When get_direct_memhandler() reports no handler, the value is stored
+// straight to the returned host address.  Otherwise the handler's address
+// is passed in r3 to jump_handler_write_h, which returns the new cycle
+// count in r0.  'adj' is added to the cycle count for the call and taken
+// off again afterwards; 'reglist' is the set of live host registers to
+// preserve around the call.
+static void inline_writestub(enum stub_type type, int i, u_int addr,
+  const signed char regmap[], int target, int adj, u_int reglist)
+{
+  const int rs=get_reg(regmap,-1);
+  const int rt=get_reg(regmap,target);
+  assert(rs>=0);
+  assert(rt>=0);
+  uintptr_t host_addr = 0;
+  void *handler = get_direct_memhandler(mem_wtab, addr, type, &host_addr);
+
+  if (handler == NULL) {
+    // direct store
+    if(addr!=host_addr)
+      emit_movimm_from(addr,rs,host_addr,rs);
+    if(type==STOREB_STUB)      emit_writebyte_indexed(rt,0,rs);
+    else if(type==STOREH_STUB) emit_writehword_indexed(rt,0,rs);
+    else if(type==STOREW_STUB) emit_writeword_indexed(rt,0,rs);
+    else                       assert(0);
+    return;
+  }
+
+  // call a memhandler
+  save_regs(reglist);
+  pass_args(rs,rt);
+  const int cc=get_reg(regmap,CCREG);
+  if(cc<0)
+    emit_loadreg(CCREG,2);
+  const int cc_reg = cc<0?2:cc;
+  emit_addimm(cc_reg,adj,2);
+  emit_movimm((u_int)handler,3);
+  // returns new cycle_count
+  emit_far_call(jump_handler_write_h);
+  emit_addimm(0,-adj,cc_reg);
+  if(cc<0)
+    emit_storereg(CCREG,2);
+  restore_regs(reglist);
+}
+
+// Load the arguments for a verify_code-style call:
+//   r0 = arg0, r1 = source, r2 = copy, r3 = source_len.
+// Without HAVE_ARMV7 the three pointer/length values are loaded through
+// emit_loadlp; with it they are built from movw/movt pairs (source_len
+// with a single movw).
+// this output is parsed by verify_dirty, get_bounds, isclean, get_clean_addr
+// so the instruction sequence must stay exactly as it is.
+static void do_dirty_stub_emit_args(u_int arg0, u_int source_len)
+{
+  #ifndef HAVE_ARMV7
+  emit_loadlp((int)source, 1);
+  emit_loadlp((int)copy, 2);
+  emit_loadlp(source_len, 3);
+  #else
+  emit_movw(((u_int)source)&0x0000FFFF, 1);
+  emit_movw(((u_int)copy)&0x0000FFFF, 2);
+  emit_movt(((u_int)source)&0xFFFF0000, 1);
+  emit_movt(((u_int)copy)&0xFFFF0000, 2);
+  emit_movw(source_len, 3);
+  #endif
+  emit_movimm(arg0, 0);
+}
+
+// Emit the dirty-check stub for instruction i: call verify_code with the
+// block's arguments, reload registers for entry at i, then jump to the
+// compiled code for i.
+// Returns the entry point to use after verification: the address right
+// after the verify_code call, or instr_addr[i] itself when
+// load_regs_entry() emitted nothing.
+static void *do_dirty_stub(int i, u_int source_len)
+{
+  void *after_verify;
+  void *entry;
+  assem_debug("do_dirty_stub %x\n",start+i*4);
+  do_dirty_stub_emit_args(start + i*4, source_len);
+  emit_far_call(verify_code);
+  after_verify = out;
+  load_regs_entry(i);
+  if (out != after_verify)
+    entry = after_verify;
+  else
+    entry = instr_addr[i];
+  emit_jmp(instr_addr[i]);
+  return entry;
+}
+
+// Emit the dirty-check call used with verify_code_ds: same argument setup
+// as do_dirty_stub(), with start+1 passed as the first argument.
+static void do_dirty_stub_ds(u_int source_len)
+{
+  const u_int arg0 = start + 1;
+  do_dirty_stub_emit_args(arg0, source_len);
+  emit_far_call(verify_code_ds);
+}
+
+/* Special assem */
+
+// Common entry sequence for an inlined GTE (cop2) operation: save the
+// registers in reglist, emit the cop2 stall check, optionally start the
+// PCNT profiling counter, and leave r0 pointing at psxRegs.CP2D.r[0].
+static void c2op_prologue(u_int op, int i, const struct regstat *i_regs, u_int reglist)
+{
+  // offset of the cop2 data registers relative to fp (dynarec_local)
+  const int cop2_ofs = (u_char *)&psxRegs.CP2D.r[0] - (u_char *)&dynarec_local;
+  save_regs_all(reglist);
+  cop2_do_stall_check(op, i, i_regs, 0);
+#ifdef PCNT
+  emit_movimm(op, 0);
+  emit_far_call(pcnt_gte_start);
+#endif
+  emit_addimm(FP, cop2_ofs, 0); // cop2 regs
+}
+
+// Common exit sequence for an inlined GTE operation: optionally stop the
+// PCNT profiling counter, then restore the registers in reglist (the same
+// list that was handed to c2op_prologue()).
+static void c2op_epilogue(u_int op,u_int reglist)
+{
+#ifdef PCNT
+  // profiling hook, gets the op number in r0
+  emit_movimm(op,0);
+  emit_far_call(pcnt_gte_end);
+#endif
+  restore_regs_all(reglist);
+}
+
+// Emit a call to the gteMACtoIR variant selected by lm (lm0/lm1) and by
+// whether flags are needed (the _nf variants are used when they are not).
+static void c2op_call_MACtoIR(int lm,int need_flags)
+{
+  if(need_flags) {
+    if(lm)
+      emit_far_call(gteMACtoIR_lm1);
+    else
+      emit_far_call(gteMACtoIR_lm0);
+    return;
+  }
+  if(lm)
+    emit_far_call(gteMACtoIR_lm1_nf);
+  else
+    emit_far_call(gteMACtoIR_lm0_nf);
+}
+
+// Emit the call sequence shared by the colour GTE ops: call func, reload
+// r0 with the address of the cop2 data registers, run MACtoIR when either
+// the IR registers or the flags are needed, and finish with gteMACtoRGB
+// (the _nf variant when flags are not needed).
+static void c2op_call_rgb_func(void *func,int lm,int need_ir,int need_flags)
+{
+  const int cop2_ofs = (int)&psxRegs.CP2D.r[0]-(int)&dynarec_local;
+  emit_far_call(func);
+  // func is C code and trashes r0
+  emit_addimm(FP,cop2_ofs,0);
+  if(need_flags||need_ir)
+    c2op_call_MACtoIR(lm,need_flags);
+  if(need_flags)
+    emit_far_call(gteMACtoRGB);
+  else
+    emit_far_call(gteMACtoRGB_nf);
+}
+
+static void c2op_assemble(int i, const struct regstat *i_regs)
+{
+  u_int c2op = source[i] & 0x3f;
+  u_int reglist_full = get_host_reglist(i_regs->regmap);
+  u_int reglist = reglist_full & CALLER_SAVE_REGS;
+  int need_flags, need_ir;
+
+  if (gte_handlers[c2op]!=NULL) {
+    need_flags=!(gte_unneeded[i+1]>>63); // +1 because of how liveness detection works
+    need_ir=(gte_unneeded[i+1]&0xe00)!=0xe00;
+    assem_debug("gte op %08x, unneeded %016llx, need_flags %d, need_ir %d\n",
+      source[i],gte_unneeded[i+1],need_flags,need_ir);
+    if(HACK_ENABLED(NDHACK_GTE_NO_FLAGS))
+      need_flags=0;
+    int shift = (source[i] >> 19) & 1;
+    int lm = (source[i] >> 10) & 1;
+    switch(c2op) {
+#ifndef DRC_DBG
+      case GTE_MVMVA: {
+#ifdef HAVE_ARMV5
+        int v  = (source[i] >> 15) & 3;
+        int cv = (source[i] >> 13) & 3;
+        int mx = (source[i] >> 17) & 3;
+        reglist=reglist_full&(CALLER_SAVE_REGS|0xf0); // +{r4-r7}
+        c2op_prologue(c2op,i,i_regs,reglist);
+        /* r4,r5 = VXYZ(v) packed; r6 = &MX11(mx); r7 = &CV1(cv) */
+        if(v<3)
+          emit_ldrd(v*8,0,4);
+        else {
+          emit_movzwl_indexed(9*4,0,4);  // gteIR
+          emit_movzwl_indexed(10*4,0,6);
+          emit_movzwl_indexed(11*4,0,5);
+          emit_orrshl_imm(6,16,4);
+        }
+        if(mx<3)
+          emit_addimm(0,32*4+mx*8*4,6);
+        else
+          emit_readword(&zeromem_ptr,6);
+        if(cv<3)
+          emit_addimm(0,32*4+(cv*8+5)*4,7);
+        else
+          emit_readword(&zeromem_ptr,7);
+#ifdef __ARM_NEON__
+        emit_movimm(source[i],1); // opcode
+        emit_far_call(gteMVMVA_part_neon);
+        if(need_flags) {
+          emit_movimm(lm,1);
+          emit_far_call(gteMACtoIR_flags_neon);
+        }
+#else
+        if(cv==3&&shift)
+          emit_far_call((int)gteMVMVA_part_cv3sh12_arm);
+        else {
+          emit_movimm(shift,1);
+          emit_far_call((int)(need_flags?gteMVMVA_part_arm:gteMVMVA_part_nf_arm));
+        }
+        if(need_flags||need_ir)
+          c2op_call_MACtoIR(lm,need_flags);
+#endif
+#else /* if not HAVE_ARMV5 */
+        c2op_prologue(c2op,i,i_regs,reglist);
+        emit_movimm(source[i],1); // opcode
+        emit_writeword(1,&psxRegs.code);
+        emit_far_call(need_flags?gte_handlers[c2op]:gte_handlers_nf[c2op]);
+#endif
+        break;
+      }
+      case GTE_OP:
+        c2op_prologue(c2op,i,i_regs,reglist);
+        emit_far_call(shift?gteOP_part_shift:gteOP_part_noshift);
+        if(need_flags||need_ir) {
+          emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0);
+          c2op_call_MACtoIR(lm,need_flags);
+        }
+        break;
+      case GTE_DPCS:
+        c2op_prologue(c2op,i,i_regs,reglist);
+        c2op_call_rgb_func(shift?gteDPCS_part_shift:gteDPCS_part_noshift,lm,need_ir,need_flags);
+        break;
+      case GTE_INTPL:
+        c2op_prologue(c2op,i,i_regs,reglist);
+        c2op_call_rgb_func(shift?gteINTPL_part_shift:gteINTPL_part_noshift,lm,need_ir,need_flags);
+        break;
+      case GTE_SQR:
+        c2op_prologue(c2op,i,i_regs,reglist);
+        emit_far_call(shift?gteSQR_part_shift:gteSQR_part_noshift);
+        if(need_flags||need_ir) {
+          emit_addimm(FP,(int)&psxRegs.CP2D.r[0]-(int)&dynarec_local,0);
+          c2op_call_MACtoIR(lm,need_flags);
+        }
+        break;
+      case GTE_DCPL:
+        c2op_prologue(c2op,i,i_regs,reglist);
+        c2op_call_rgb_func(gteDCPL_part,lm,need_ir,need_flags);
+        break;
+      case GTE_GPF:
+        c2op_prologue(c2op,i,i_regs,reglist);
+        c2op_call_rgb_func(shift?gteGPF_part_shift:gteGPF_part_noshift,lm,need_ir,need_flags);
+        break;
+      case GTE_GPL:
+        c2op_prologue(c2op,i,i_regs,reglist);
+        c2op_call_rgb_func(shift?gteGPL_part_shift:gteGPL_part_noshift,lm,need_ir,need_flags);
+        break;
+#endif
+      default:
+        c2op_prologue(c2op,i,i_regs,reglist);
+#ifdef DRC_DBG
+        emit_movimm(source[i],1); // opcode
+        emit_writeword(1,&psxRegs.code);
+#endif
+        emit_far_call(need_flags?gte_handlers[c2op]:gte_handlers_nf[c2op]);
+        break;
+    }
+    c2op_epilogue(c2op,reglist);
+  }
+}
+
+// Emit the value fix-up for a write to cop2 control register 31: temp
+// receives sl with its low 12 bits cleared, and bit 31 is then set when any
+// of the tested bits is non-zero.  The reference behaviour is:
+static void c2op_ctc2_31_assemble(signed char sl, signed char temp)
+{
+  //value = value & 0x7ffff000;
+  //if (value & 0x7f87e000) value |= 0x80000000;
+  // shift right then left by 12 to drop the low 12 bits
+  emit_shrimm(sl,12,temp);
+  emit_shlimm(temp,12,temp);
+  // the three masks together cover 0x7f87e000; each later test is emitted
+  // through the "testeq" form, and the final orr through the "ne" form
+  // (presumably the mask is split because it does not fit one ARM
+  // immediate -- TODO confirm)
+  emit_testimm(temp,0x7f000000);
+  emit_testeqimm(temp,0x00870000);
+  emit_testeqimm(temp,0x0000e000);
+  emit_orrne_imm(temp,0x80000000,temp);
+}
+
+// Emit the conversion of one cop2 data register (copr) into a clamped
+// field held in temp, for use by c2op_mfc2_29_assemble(): the value is
+// shifted left by 16 with flags set, replaced by 0 under the "s"
+// condition, compared against and masked with 0xf80<<16, and replaced by
+// 0xf80<<16 under the "ae" condition.
+static void do_mfc2_31_one(u_int copr,signed char temp)
+{
+  const int limit = 0xf80<<16;
+  emit_readword(&reg_cop2d[copr],temp);
+  emit_lsls_imm(temp,16,temp);
+  emit_cmovs_imm(0,temp);
+  emit_cmpimm(temp,limit);
+  emit_andimm(temp,limit,temp);
+  emit_cmovae_imm(limit,temp);
+}
+
+// Emit the read of cop2 data register 29: the clamped fields derived from
+// cop2 data registers 9, 10 and 11 are packed into tl (shifted right by
+// 23, 18 and 13 bits respectively) and the result is also stored back to
+// reg_cop2d[29].  When no temporary is supplied (temp < 0) HOST_TEMPREG is
+// acquired for the duration and released at the end.
+static void c2op_mfc2_29_assemble(signed char tl, signed char temp)
+{
+  if (temp < 0) {
+    host_tempreg_acquire();
+    temp = HOST_TEMPREG;
+  }
+  // first field
+  do_mfc2_31_one(9,temp);
+  emit_shrimm(temp,7+16,tl);
+  // second field
+  do_mfc2_31_one(10,temp);
+  emit_orrshr_imm(temp,2+16,tl);
+  // third field
+  do_mfc2_31_one(11,temp);
+  emit_orrshr_imm(temp,-3+16,tl);
+  emit_writeword(tl,&reg_cop2d[29]);
+  if (temp == HOST_TEMPREG)
+    host_tempreg_release();
+}
+
+// Assemble MULT/MULTU/DIV/DIVU for guest instruction i using the host
+// registers recorded in i_regs->regmap. Results go to the host registers
+// mapped to HIREG/LOREG. The 64-bit variants are not supported (assert).
+//
+// The division loops use hand-counted relative branch targets (out+N /
+// out-N bytes), which assume every emitter call below produces exactly one
+// 4-byte instruction. Keep the counts in sync when editing the sequences.
+static void multdiv_assemble_arm(int i, const struct regstat *i_regs)
+{
+  //  case 0x18: MULT
+  //  case 0x19: MULTU
+  //  case 0x1A: DIV
+  //  case 0x1B: DIVU
+  //  case 0x1C: DMULT
+  //  case 0x1D: DMULTU
+  //  case 0x1E: DDIV
+  //  case 0x1F: DDIVU
+  if(dops[i].rs1&&dops[i].rs2)
+  {
+    if((dops[i].opcode2&4)==0) // 32-bit
+    {
+      if(dops[i].opcode2==0x18) // MULT
+      {
+        signed char m1=get_reg(i_regs->regmap,dops[i].rs1);
+        signed char m2=get_reg(i_regs->regmap,dops[i].rs2);
+        signed char hi=get_reg(i_regs->regmap,HIREG);
+        signed char lo=get_reg(i_regs->regmap,LOREG);
+        assert(m1>=0);
+        assert(m2>=0);
+        assert(hi>=0);
+        assert(lo>=0);
+        emit_smull(m1,m2,hi,lo);
+      }
+      if(dops[i].opcode2==0x19) // MULTU
+      {
+        signed char m1=get_reg(i_regs->regmap,dops[i].rs1);
+        signed char m2=get_reg(i_regs->regmap,dops[i].rs2);
+        signed char hi=get_reg(i_regs->regmap,HIREG);
+        signed char lo=get_reg(i_regs->regmap,LOREG);
+        assert(m1>=0);
+        assert(m2>=0);
+        assert(hi>=0);
+        assert(lo>=0);
+        emit_umull(m1,m2,hi,lo);
+      }
+      if(dops[i].opcode2==0x1A) // DIV
+      {
+        signed char d1=get_reg(i_regs->regmap,dops[i].rs1);
+        signed char d2=get_reg(i_regs->regmap,dops[i].rs2);
+        assert(d1>=0);
+        assert(d2>=0);
+        signed char quotient=get_reg(i_regs->regmap,LOREG);
+        signed char remainder=get_reg(i_regs->regmap,HIREG);
+        assert(quotient>=0);
+        assert(remainder>=0);
+        emit_movs(d1,remainder);
+        emit_movimm(0xffffffff,quotient);
+        emit_negmi(quotient,quotient); // .. quotient and ..
+        emit_negmi(remainder,remainder); // .. remainder for div0 case (will be negated back after jump)
+        emit_movs(d2,HOST_TEMPREG);
+        // Division by zero: jump to the final "test d1,d1" so that only the
+        // remainder is negated back. The normalisation step is 2 insns
+        // (ARMv5+, clz) or 4 insns (shift loop), hence the two offsets;
+        // with a single +52 the pre-ARMv5 path landed on "teq" instead and
+        // negated the quotient a second time.
+#ifdef HAVE_ARMV5
+        emit_jeq(out+52); // Division by zero
+#else
+        emit_jeq(out+60); // Division by zero
+#endif
+        emit_negsmi(HOST_TEMPREG,HOST_TEMPREG);
+#ifdef HAVE_ARMV5
+        emit_clz(HOST_TEMPREG,quotient);
+        emit_shl(HOST_TEMPREG,quotient,HOST_TEMPREG);
+#else
+        emit_movimm(0,quotient);
+        emit_addpl_imm(quotient,1,quotient);
+        emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG);
+        emit_jns(out-2*4); // back to the addpl above
+#endif
+        emit_orimm(quotient,1<<31,quotient);
+        emit_shr(quotient,quotient,quotient);
+        emit_cmp(remainder,HOST_TEMPREG);
+        emit_subcs(remainder,HOST_TEMPREG,remainder);
+        emit_adcs(quotient,quotient,quotient);
+        emit_shrimm(HOST_TEMPREG,1,HOST_TEMPREG);
+        emit_jcc(out-16); // -4
+        emit_teq(d1,d2);
+        emit_negmi(quotient,quotient);
+        emit_test(d1,d1);
+        emit_negmi(remainder,remainder);
+      }
+      if(dops[i].opcode2==0x1B) // DIVU
+      {
+        signed char d1=get_reg(i_regs->regmap,dops[i].rs1); // dividend
+        signed char d2=get_reg(i_regs->regmap,dops[i].rs2); // divisor
+        assert(d1>=0);
+        assert(d2>=0);
+        signed char quotient=get_reg(i_regs->regmap,LOREG);
+        signed char remainder=get_reg(i_regs->regmap,HIREG);
+        assert(quotient>=0);
+        assert(remainder>=0);
+        emit_mov(d1,remainder);
+        emit_movimm(0xffffffff,quotient); // div0 case
+        emit_test(d2,d2);
+        // Division by zero: skip the whole sequence. It is 9 insns long with
+        // clz (ARMv5+) and 11 insns with the shift loop; a single +40 made
+        // the pre-ARMv5 path jump into the middle of the subtract loop.
+#ifdef HAVE_ARMV5
+        emit_jeq(out+40); // Division by zero
+#else
+        emit_jeq(out+48); // Division by zero
+#endif
+#ifdef HAVE_ARMV5
+        emit_clz(d2,HOST_TEMPREG);
+        emit_movimm(1<<31,quotient);
+        emit_shl(d2,HOST_TEMPREG,d2);
+#else
+        emit_movimm(0,HOST_TEMPREG);
+        emit_addpl_imm(HOST_TEMPREG,1,HOST_TEMPREG);
+        emit_lslpls_imm(d2,1,d2);
+        emit_jns(out-2*4); // back to the addpl above
+        emit_movimm(1<<31,quotient);
+#endif
+        emit_shr(quotient,HOST_TEMPREG,quotient);
+        emit_cmp(remainder,d2);
+        emit_subcs(remainder,d2,remainder);
+        emit_adcs(quotient,quotient,quotient);
+        emit_shrcc_imm(d2,1,d2);
+        emit_jcc(out-16); // -4
+      }
+    }
+    else // 64-bit
+      assert(0);
+  }
+  else
+  {
+    // One of the operands is guest register 0.
+    // Multiply by zero is zero.
+    // MIPS does not have a divide by zero exception.
+    // The result is undefined, we return zero.
+    signed char hr=get_reg(i_regs->regmap,HIREG);
+    signed char lr=get_reg(i_regs->regmap,LOREG);
+    if(hr>=0) emit_zeroreg(hr);
+    if(lr>=0) emit_zeroreg(lr);
+  }
+}
+#define multdiv_assemble multdiv_assemble_arm
+
+// Emit a far jump to the jump_vaddr_reg[] entry for host register rs.
+static void do_jump_vaddr(int rs)
+{
+  emit_far_jump(jump_vaddr_reg[rs]);
+}
+
+// Intentionally empty on ARM; kept so callers can use the same sequence on
+// every host.
+static void do_preload_rhash(int r) {
+  // Don't need this for ARM.  On x86, this puts the value 0xf8 into the
+  // register.  On ARM the hash can be done with a single instruction (below)
+}
+
+// Emit code that puts the address of mini_ht into host register ht,
+// computed as FP plus the offset of mini_ht from dynarec_local.
+static void do_preload_rhtbl(int ht) {
+  emit_addimm(FP,(int)&mini_ht-(int)&dynarec_local,ht);
+}
+
+// Emit the mini hash table hash: rh = rs & 0xf8.
+static void do_rhash(int rs,int rh) {
+  emit_andimm(rs,0xf8,rh);
+}
+
+// Emit "ldr rh,[ht,rh]!": load the word at ht+rh into rh (the "!" form in
+// the debug text indicates base writeback into ht).
+static void do_miniht_load(int ht,int rh) {
+  assem_debug("ldr %s,[%s,%s]!\n",regname[rh],regname[ht],regname[rh]);
+  output_w32(0xe7b00000|rd_rn_rm(rh,ht,rh));
+}
+
+// Emit the mini hash table dispatch: compare the loaded key (rh) with the
+// target address (rs); on a match load register 15 (pc) from [ht,#4],
+// otherwise fall through to the generic do_jump_vaddr() path.
+static void do_miniht_jump(int rs,int rh,int ht) {
+  emit_cmp(rh,rs);
+  emit_ldreq_indexed(ht,4,15);
+  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
+  // with this hack the address is always passed on in r7
+  if(rs!=7)
+    emit_mov(rs,7);
+  rs=7;
+  #endif
+  do_jump_vaddr(rs);
+}
+
+// Emit code that fills the mini hash table slot selected by
+// (return_address&0xFF)>>3: word [0] receives return_address (also left in
+// rt), word [1] receives the value produced by emit_pcreladdr(temp).
+// add_to_linker() is called with the position of that pcreladdr insn;
+// presumably so it is patched with the compiled address later - confirm
+// against add_to_linker.
+static void do_miniht_insert(u_int return_address,int rt,int temp) {
+  #ifndef HAVE_ARMV7
+  emit_movimm(return_address,rt); // PC into link register
+  add_to_linker(out,return_address,1);
+  emit_pcreladdr(temp);
+  emit_writeword(rt,&mini_ht[(return_address&0xFF)>>3][0]);
+  emit_writeword(temp,&mini_ht[(return_address&0xFF)>>3][1]);
+  #else
+  // ARMv7: build return_address with a movw/movt pair, with the table
+  // writes placed between the two halves
+  emit_movw(return_address&0x0000FFFF,rt);
+  add_to_linker(out,return_address,1);
+  emit_pcreladdr(temp);
+  emit_writeword(temp,&mini_ht[(return_address&0xFF)>>3][1]);
+  emit_movt(return_address&0xFFFF0000,rt);
+  emit_writeword(rt,&mini_ht[(return_address&0xFF)>>3][0]);
+  #endif
+}
+
+// CPU-architecture-specific initialization.
+// Fills every trampoline slot in ndrc->tramp.ops with the same
+// "ldr pc, [pc, #offset]" instruction, where offset is the distance from the
+// ops array to the tramp.f array minus 8.
+static void arch_init(void)
+{
+  struct tramp_insns *const slots = ndrc->tramp.ops;
+  u_char *const slots_end = (u_char *)slots + sizeof(ndrc->tramp.ops);
+  const uintptr_t pc_rel = (u_char *)&ndrc->tramp.f - (u_char *)&ndrc->tramp.ops - 8;
+  struct tramp_insns *slot;
+
+  // must be word aligned and fit the 12-bit immediate of the load
+  assert((pc_rel & 3) == 0);
+  assert(pc_rel < 0x1000);
+
+  start_tcache_write(slots, slots_end);
+  for (slot = slots; slot < slots + ARRAY_SIZE(ndrc->tramp.ops); slot++)
+    slot->ldrpc = 0xe5900000 | rd_rn_rm(15,15,0) | pc_rel; // ldr pc, [=val]
+  end_tcache_write(slots, slots_end);
+}
+
+// vim:shiftwidth=2:expandtab
diff --git a/libpcsxcore/new_dynarec/assem_arm.h b/libpcsxcore/new_dynarec/assem_arm.h
new file mode 100644 (file)
index 0000000..75273aa
--- /dev/null
@@ -0,0 +1,44 @@
+// Host feature switches for the 32-bit ARM backend (consumed outside this
+// header).
+#define HOST_IMM8 1
+#define HAVE_CMOV_IMM 1
+#define HAVE_CONDITIONAL_CALL 1
+
+/* ARM calling convention:
+   r0-r3, r12: caller-save
+   r4-r11: callee-save */
+
+/* GCC register naming convention:
+   r10 = sl (base)
+   r11 = fp (frame pointer)
+   r12 = ip (scratch)
+   r13 = sp (stack pointer)
+   r14 = lr (link register)
+   r15 = pc (program counter) */
+
+// Register allocator setup: r0-r12 are candidates, r11 (FP below) is excluded.
+#define HOST_REGS 13
+#define HOST_CCREG 10
+#define HOST_BTREG 8
+#define EXCLUDE_REG 11
+
+// Note: FP is set to &dynarec_local when executing generated code.
+// Thus the local variables are actually global and not on the stack.
+#define FP 11
+#define LR 14
+// the scratch register is the link register (both are r14)
+#define HOST_TEMPREG 14
+
+// Bit mask of caller-saved registers: r0-r3 and r12 (0x100f); the __MACH__
+// variant additionally includes r9 (0x120f).
+#ifndef __MACH__
+#define CALLER_SAVE_REGS 0x100f
+#else
+#define CALLER_SAVE_REGS 0x120f
+#endif
+#define PREFERRED_REG_FIRST 4
+#define PREFERRED_REG_LAST  9
+
+extern char *invc_ptr;
+
+#define TARGET_SIZE_2 24 // 2^24 = 16 megabytes
+
+// One trampoline slot: a single instruction word, filled by arch_init()
+// with an "ldr pc, ..." instruction.
+struct tramp_insns
+{
+  u_int ldrpc;
+};
+
diff --git a/libpcsxcore/new_dynarec/assem_arm64.c b/libpcsxcore/new_dynarec/assem_arm64.c
new file mode 100644 (file)
index 0000000..0b49221
--- /dev/null
@@ -0,0 +1,2093 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ *   Mupen64plus/PCSX - assem_arm64.c                                      *
+ *   Copyright (C) 2009-2011 Ari64                                         *
+ *   Copyright (C) 2009-2018 Gillou68310                                   *
+ *   Copyright (C) 2021 notaz                                              *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#include "pcnt.h"
+#include "arm_features.h"
+
+// Marks definitions that may end up unreferenced (e.g. the name tables below,
+// which are only passed to assem_debug in this file).
+#define unused __attribute__((unused))
+
+// Defined outside this file (not visible here).
+void do_memhandler_pre();
+void do_memhandler_post();
+
+/* Linker */
+// Patch the branch-type instruction at addr so that it targets `target`.
+// Handles b (26-bit offset), b.cond and cbz/cbnz (19-bit offset) and adr
+// (21-bit byte offset); anything else aborts. Out-of-range targets assert.
+static void set_jump_target(void *addr, void *target)
+{
+  u_int *ptr = addr;
+  intptr_t offset = (u_char *)target - (u_char *)addr;
+
+  if ((*ptr&0xFC000000) == 0x14000000) { // b
+    assert(offset>=-134217728LL&&offset<134217728LL);
+    *ptr=(*ptr&0xFC000000)|((offset>>2)&0x3ffffff);
+  }
+  else if ((*ptr&0xff000000) == 0x54000000 // b.cond
+        || (*ptr&0x7e000000) == 0x34000000) { // cbz/cbnz
+    // Conditional branches are limited to +/- 1MB.
+    // Block max size is 256k, so branching beyond the +/- 1MB limit
+    // should only happen when jumping to an already compiled block (see add_jump_out);
+    // a workaround would be to do a trampoline jump via a stub at the end of the block.
+    assert(-1048576 <= offset && offset < 1048576);
+    // Keep bits 0-4: for cbz/cbnz they hold the whole 5-bit Rt field, so
+    // masking with 0x...0F would turn registers 16-30 into 0-14. For b.cond
+    // bit 4 is simply preserved as it was emitted.
+    *ptr=(*ptr&0xFF00001F)|(((offset>>2)&0x7ffff)<<5);
+  }
+  else if((*ptr&0x9f000000)==0x10000000) { // adr
+    // generated by do_miniht_insert
+    assert(offset>=-1048576LL&&offset<1048576LL);
+    // low 2 bits go to bits 29-30, the remaining 19 bits to bits 5-23
+    *ptr=(*ptr&0x9F00001F)|(offset&0x3)<<29|((offset>>2)&0x7ffff)<<5;
+  }
+  else
+    abort(); // should not happen
+}
+
+// Given a pointer to an external jump stub (as produced by emit_extjump2),
+// locate the instruction that jumps to it: the stub's third word must be an
+// adr, and its decoded offset is applied relative to that adr.
+static void *find_extjump_insn(void *stub)
+{
+  int *adr_insn = (int *)stub + 2;
+  int immhi, immlo, offset;
+
+  assert((*adr_insn&0x9f000000) == 0x10000000); // adr
+  immhi = ((signed int)(*adr_insn<<8)>>13)<<2; // sign-extended bits 5-23
+  immlo = (*adr_insn>>29)&0x3;                 // bits 29-30
+  offset = immhi|immlo;
+  return adr_insn + offset / 4;
+}
+
+// Find where an external branch is linked to, using the address of its stub:
+// get the address that the stub loads (dyna_linker arg1),
+// treat it as a pointer to a branch insn,
+// and return the address that branch jumps to.
+static void *get_pointer(void *stub)
+{
+  int *branch = find_extjump_insn(stub);
+  int words;
+
+  if ((*branch&0xfc000000) == 0x14000000)         // b
+    words = (signed int)(*branch<<6)>>6;
+  else if ((*branch&0xff000000) == 0x54000000     // b.cond
+        || (*branch&0x7e000000) == 0x34000000)    // cbz/cbnz
+    words = (signed int)(*branch<<8)>>13;
+  else {
+    assert(0);
+    return NULL;
+  }
+  return branch + words;
+}
+
+// Bind guest register `reg` to the specific host register `hr`.
+// Any other host register currently mapped to `reg` is unmapped, and its
+// dirty bit is carried over to hr; hr's const flag is cleared.
+static void alloc_arm_reg(struct regstat *cur,int i,signed char reg,int hr)
+{
+  int was_dirty=0;
+  int host;
+
+  // drop an existing mapping of reg, remembering whether it was dirty
+  for(host=0;host<HOST_REGS;host++)
+  {
+    if(host==EXCLUDE_REG) continue;
+    if(cur->regmap[host]!=reg) continue;
+    was_dirty=(cur->dirty>>host)&1;
+    cur->regmap[host]=-1;
+  }
+
+  cur->regmap[hr]=reg;
+  cur->dirty&=~(1<<hr);
+  cur->dirty|=was_dirty<<hr;
+  cur->isconst&=~(1<<hr);
+}
+
+// Alloc cycle count into dedicated register:
+// binds the guest CCREG to host register HOST_CCREG via alloc_arm_reg().
+static void alloc_cc(struct regstat *cur,int i)
+{
+  alloc_arm_reg(cur,i,CCREG,HOST_CCREG);
+}
+
+/* Special alloc */
+
+
+/* Assembler */
+
+// Names of the 32-bit register views, indexed by register number; used for
+// assem_debug output in this file.
+static unused const char *regname[32] = {
+  "w0",  "w1",  "w2",  "w3",  "w4",  "w5",  "w6",  "w7",
+  "w8",  "w9", "w10", "w11", "w12", "w13", "w14", "w15",
+ "ip0", "ip1", "w18", "w19", "w20", "w21", "w22", "w23",
+ "w24", "w25", "w26", "w27", "w28", "wfp", "wlr", "wsp"
+};
+
+// Names of the 64-bit register views, indexed by register number; used for
+// assem_debug output of the 64-bit emitters.
+static unused const char *regname64[32] = {
+  "x0",  "x1",  "x2",  "x3",  "x4",  "x5",  "x6",  "x7",
+  "x8",  "x9", "x10", "x11", "x12", "x13", "x14", "x15",
+ "ip0", "ip1", "x18", "x19", "x20", "x21", "x22", "x23",
+ "x24", "x25", "x26", "x27", "x28",  "fp",  "lr",  "sp"
+};
+
+// Condition codes in encoding order (COND_EQ == 0 ... COND_NV == 15); the
+// values are shifted into the condition field by the csel/csinc emitters.
+enum {
+  COND_EQ, COND_NE, COND_CS, COND_CC, COND_MI, COND_PL, COND_VS, COND_VC,
+  COND_HI, COND_LS, COND_GE, COND_LT, COND_GT, COND_LE, COND_AW, COND_NV
+};
+
+// Printable names for the condition codes above, same order.
+static unused const char *condname[16] = {
+  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
+  "hi", "ls", "ge", "lt", "gt", "le", "aw", "nv"
+};
+
+// Store one 32-bit word at the output pointer `out` and advance it by 4.
+static void output_w32(u_int word)
+{
+  *((u_int *)out) = word;
+  out += 4;
+}
+
+// Store one 64-bit value at the output pointer `out` and advance it by 8.
+static void output_w64(uint64_t dword)
+{
+  *((uint64_t *)out) = dword;
+  out+=8;
+}
+
+/*
+static u_int rm_rd(u_int rm, u_int rd)
+{
+  assert(rm < 31);
+  assert(rd < 31);
+  return (rm << 16) | rd;
+}
+*/
+
+// Encode Rn at bits 5-9 and Rd at bits 0-4; register 31 is rejected.
+static u_int rn_rd(u_int rn, u_int rd)
+{
+  u_int bits;
+
+  assert(rn < 31);
+  assert(rd < 31);
+  bits = rd;
+  bits |= rn << 5;
+  return bits;
+}
+
+// Encode Rm at bits 16-20, Rn at bits 5-9 and Rd at bits 0-4.
+// Register 31 is accepted in every position.
+static u_int rm_rn_rd(u_int rm, u_int rn, u_int rd)
+{
+  u_int bits;
+
+  assert(rm < 32);
+  assert(rn < 32);
+  assert(rd < 32);
+  bits = rd;
+  bits |= rn << 5;
+  bits |= rm << 16;
+  return bits;
+}
+
+// rm_rn_rd() plus a fourth register Ra at bits 10-14.
+static u_int rm_ra_rn_rd(u_int rm, u_int ra, u_int rn, u_int rd)
+{
+  u_int three_regs;
+
+  assert(ra < 32);
+  three_regs = rm_rn_rd(rm, rn, rd);
+  return three_regs | (ra << 10);
+}
+
+// Encode a 7-bit immediate at bits 15-21, Rt2 at bits 10-14, Rn at bits 5-9
+// and Rt at bits 0-4. Rt and Rt2 may not be register 31; Rn may.
+static u_int imm7_rt2_rn_rt(u_int imm7, u_int rt2, u_int rn, u_int rt)
+{
+  u_int bits;
+
+  assert(imm7 < 0x80);
+  assert(rt2 < 31);
+  assert(rn < 32);
+  assert(rt < 31);
+  bits = rt;
+  bits |= rn << 5;
+  bits |= rt2 << 10;
+  bits |= imm7 << 15;
+  return bits;
+}
+
+// rm_rn_rd() plus a 6-bit immediate (shift amount) at bits 10-15.
+static u_int rm_imm6_rn_rd(u_int rm, u_int imm6, u_int rn, u_int rd)
+{
+  u_int three_regs;
+
+  assert(imm6 <= 63);
+  three_regs = rm_rn_rd(rm, rn, rd);
+  return three_regs | (imm6 << 10);
+}
+
+// Encode a 16-bit immediate at bits 5-20 and Rd at bits 0-4;
+// register 31 is rejected.
+static u_int imm16_rd(u_int imm16, u_int rd)
+{
+  u_int bits;
+
+  assert(imm16 < 0x10000);
+  assert(rd < 31);
+  bits = rd;
+  bits |= imm16 << 5;
+  return bits;
+}
+
+// Encode a 12-bit immediate at bits 10-21, Rn at bits 5-9 and Rd at
+// bits 0-4. Register 31 is accepted for both registers.
+static u_int imm12_rn_rd(u_int imm12, u_int rn, u_int rd)
+{
+  u_int bits;
+
+  assert(imm12 < 0x1000);
+  assert(rn < 32);
+  assert(rd < 32);
+  bits = rd;
+  bits |= rn << 5;
+  bits |= imm12 << 10;
+  return bits;
+}
+
+// Encode a 9-bit immediate at bits 12-20, Rn at bits 5-9 and the transfer
+// register at bits 0-4; register 31 is rejected for both registers.
+static u_int imm9_rn_rt(u_int imm9, u_int rn, u_int rd)
+{
+  u_int bits;
+
+  assert(imm9 < 0x200);
+  assert(rn < 31);
+  assert(rd < 31);
+  bits = rd;
+  bits |= rn << 5;
+  bits |= imm9 << 12;
+  return bits;
+}
+
+// Encode a 19-bit immediate at bits 5-23 and Rt at bits 0-4;
+// register 31 is rejected.
+static u_int imm19_rt(u_int imm19, u_int rt)
+{
+  u_int bits;
+
+  assert(imm19 < 0x80000);
+  assert(rt < 31);
+  bits = rt;
+  bits |= imm19 << 5;
+  return bits;
+}
+
+// Encode the bitfield/logical-immediate operand layout: N at bit 22, immr at
+// bits 16-21, imms at bits 10-15, Rn at bits 5-9 and Rd at bits 0-4.
+static u_int n_immr_imms_rn_rd(u_int n, u_int immr, u_int imms, u_int rn, u_int rd)
+{
+  u_int bits;
+
+  assert(n < 2);
+  assert(immr < 0x40);
+  assert(imms < 0x40);
+  assert(rn < 32);
+  assert(rd < 32);
+  bits = rd;
+  bits |= rn << 5;
+  bits |= imms << 10;
+  bits |= immr << 16;
+  bits |= n << 22;
+  return bits;
+}
+
+// Return the 26-bit word offset field for an unconditional branch from the
+// current output position to addr. Addresses below 3 are placeholders and
+// yield 0; a target outside +/-128MB is reported and aborts.
+static u_int genjmp(const u_char *addr)
+{
+  intptr_t offset;
+
+  if ((uintptr_t)addr < 3)
+    return 0; // a branch that will be patched later
+
+  offset = addr - out;
+  if (-134217728 <= offset && offset <= 134217727)
+    return ((u_int)offset >> 2) & 0x03ffffff;
+
+  SysPrintf("%s: out of range: %p %lx\n", __func__, addr, offset);
+  abort();
+  return 0;
+}
+
+// Return the 19-bit word offset field for a conditional branch from the
+// current output position to addr. Addresses below 3 are placeholders and
+// yield 0; a target outside -1048576..1048572 bytes is reported and aborts.
+static u_int genjmpcc(const u_char *addr)
+{
+  intptr_t offset;
+
+  if ((uintptr_t)addr < 3)
+    return 0;
+
+  offset = addr - out;
+  if (-1048576 <= offset && offset <= 1048572)
+    return ((u_int)offset >> 2) & 0x7ffff;
+
+  SysPrintf("%s: out of range: %p %lx\n", __func__, addr, offset);
+  abort();
+  return 0;
+}
+
+// Returns 1 when value is a non-empty run of ones starting at bit 0
+// (0x1, 0x3, 0x7, ... 0xffffffff), otherwise 0.
+static uint32_t is_mask(u_int value)
+{
+  if (value == 0)
+    return 0;
+  // adding 1 to a low-order run of ones clears every bit of that run
+  return (value & (value + 1)) == 0;
+}
+
+// Returns true if the argument contains a non-empty sequence of ones
+// (possibly rotated, i.e. wrapping around bit 31) with the remainder zero.
+// All-zeros and all-ones are rejected.
+static uint32_t is_rotated_mask(u_int value)
+{
+  const u_int inverse = ~value;
+
+  if (value == 0 || inverse == 0)
+    return 0;
+  // filling the trailing zeros must give a low-order mask, either for the
+  // value itself (plain run) or for its complement (wrapped run)
+  return is_mask(value | (value - 1)) || is_mask(inverse | (inverse - 1));
+}
+
+// Compute the immr/imms fields of a 32-bit logical immediate for `value`,
+// which must be a single (possibly wrapped) run of ones; zero asserts and
+// any other unsupported pattern aborts.
+static void gen_logical_imm(u_int value, u_int *immr, u_int *imms)
+{
+  u_int inverse;
+  int lead, trail, run;
+
+  assert(value != 0);
+
+  // plain run: ones occupy bits trail .. 31-lead
+  if (is_mask((value - 1) | value)) {
+    lead = __builtin_clz(value);
+    trail = __builtin_ctz(value);
+    run = 32 - lead - trail;
+    *immr = (32 - trail) & 31;
+    *imms = run - 1;
+    return;
+  }
+
+  // wrapped run: measure the run of zeros instead
+  inverse = ~value;
+  if (!is_mask((inverse - 1) | inverse))
+    abort();
+  lead = __builtin_clz(inverse);
+  trail = __builtin_ctz(inverse);
+  run = 32 - lead - trail;
+  *immr = lead;
+  *imms = 31 - run;
+}
+
+// mov Wt,Ws: same opcode as emit_or with WZR as the first source.
+static void emit_mov(u_int rs, u_int rt)
+{
+  assem_debug("mov %s,%s\n", regname[rt], regname[rs]);
+  output_w32(0x2a000000 | rm_rn_rd(rs, WZR, rt));
+}
+
+// mov Xt,Xs: 64-bit form of emit_mov (opcode differs only in bit 31).
+static void emit_mov64(u_int rs, u_int rt)
+{
+  assem_debug("mov %s,%s\n", regname64[rt], regname64[rs]);
+  output_w32(0xaa000000 | rm_rn_rd(rs, WZR, rt));
+}
+
+// add Wt,Ws1,Ws2
+static void emit_add(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("add %s,%s,%s\n", regname[rt], regname[rs1], regname[rs2]);
+  output_w32(0x0b000000 | rm_rn_rd(rs2, rs1, rt));
+}
+
+// add Xt,Xs1,Xs2 (64-bit form of emit_add)
+static void emit_add64(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("add %s,%s,%s\n", regname64[rt], regname64[rs1], regname64[rs2]);
+  output_w32(0x8b000000 | rm_rn_rd(rs2, rs1, rt));
+}
+
+// adds Xt,Xs1,Xs2: 64-bit add that also sets the flags.
+static void emit_adds64(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("adds %s,%s,%s\n",regname64[rt],regname64[rs1],regname64[rs2]);
+  output_w32(0xab000000 | rm_rn_rd(rs2, rs1, rt));
+}
+// pointer-sized variant: pointers are 64-bit on this host
+#define emit_adds_ptr emit_adds64
+
+// neg Wt,Ws: same opcode as emit_sub with WZR as the minuend.
+static void emit_neg(u_int rs, u_int rt)
+{
+  assem_debug("neg %s,%s\n",regname[rt],regname[rs]);
+  output_w32(0x4b000000 | rm_rn_rd(rs, WZR, rt));
+}
+
+// sub Wt,Ws1,Ws2 (shift amount field fixed at 0)
+static void emit_sub(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("sub %s,%s,%s\n", regname[rt], regname[rs1], regname[rs2]);
+  output_w32(0x4b000000 | rm_imm6_rn_rd(rs2, 0, rs1, rt));
+}
+
+// sub Wt,Ws1,Ws2,asr #shift: subtract the arithmetically shifted Ws2.
+static void emit_sub_asrimm(u_int rs1, u_int rs2, u_int shift, u_int rt)
+{
+  assem_debug("sub %s,%s,%s,asr #%u\n",regname[rt],regname[rs1],regname[rs2],shift);
+  output_w32(0x4b800000 | rm_imm6_rn_rd(rs2, shift, rs1, rt));
+}
+
+// movz Wt,#imm: imm must fit in 16 bits (checked by imm16_rd).
+static void emit_movz(u_int imm, u_int rt)
+{
+  assem_debug("movz %s,#%#x\n", regname[rt], imm);
+  output_w32(0x52800000 | imm16_rd(imm, rt));
+}
+
+// movz Wt,#imm,lsl #16: imm is the 16-bit value for the upper halfword.
+static void emit_movz_lsl16(u_int imm, u_int rt)
+{
+  assem_debug("movz %s,#%#x,lsl #16\n", regname[rt], imm);
+  output_w32(0x52a00000 | imm16_rd(imm, rt));
+}
+
+// movn Wt,#imm: emit_movimm passes the inverted constant here.
+static void emit_movn(u_int imm, u_int rt)
+{
+  assem_debug("movn %s,#%#x\n", regname[rt], imm);
+  output_w32(0x12800000 | imm16_rd(imm, rt));
+}
+
+// movn Wt,#imm,lsl #16
+static void emit_movn_lsl16(u_int imm,u_int rt)
+{
+  assem_debug("movn %s,#%#x,lsl #16\n", regname[rt], imm);
+  output_w32(0x12a00000 | imm16_rd(imm, rt));
+}
+
+// movk Wt,#imm
+static void emit_movk(u_int imm,u_int rt)
+{
+  assem_debug("movk %s,#%#x\n", regname[rt], imm);
+  output_w32(0x72800000 | imm16_rd(imm, rt));
+}
+
+// movk Wt,#imm,lsl #16: used by emit_movimm to fill in the upper halfword
+// after a movz of the lower one.
+static void emit_movk_lsl16(u_int imm,u_int rt)
+{
+  assert(imm<65536);
+  assem_debug("movk %s,#%#x,lsl #16\n", regname[rt], imm);
+  output_w32(0x72a00000 | imm16_rd(imm, rt));
+}
+
+// Zero a register with "movz Wt,#0".
+static void emit_zeroreg(u_int rt)
+{
+  emit_movz(0, rt);
+}
+
+// Load a 32-bit constant into Wt using the shortest available form:
+// a single movz/movn (optionally shifted by 16), an orr with a logical
+// immediate, or as a last resort a movz + movk pair.
+static void emit_movimm(u_int imm, u_int rt)
+{
+  const u_int inverted = ~imm;
+
+  if (imm < 65536) {
+    emit_movz(imm, rt);
+    return;
+  }
+  if (inverted < 65536) {
+    emit_movn(inverted, rt);
+    return;
+  }
+  if ((imm&0xffff) == 0) {
+    emit_movz_lsl16(imm >> 16, rt);
+    return;
+  }
+  if ((inverted&0xffff) == 0) {
+    emit_movn_lsl16(inverted >> 16, rt);
+    return;
+  }
+  if (is_rotated_mask(imm)) {
+    u_int immr, imms;
+    gen_logical_imm(imm, &immr, &imms);
+    assem_debug("orr %s,wzr,#%#x\n", regname[rt], imm);
+    output_w32(0x32000000 | n_immr_imms_rn_rd(0, immr, imms, WZR, rt));
+    return;
+  }
+  // general case: low halfword first, then patch in the high one
+  emit_movz(imm & 0xffff, rt);
+  emit_movk_lsl16(imm >> 16, rt);
+}
+
+// ldr Wt,[FP,#offset]: load a 32-bit variable addressed relative to
+// dynarec_local. The offset must be word aligned and at most 16380,
+// otherwise the process aborts.
+static void emit_readword(void *addr, u_int rt)
+{
+  const uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
+
+  if ((offset & 3) || offset > 16380)
+    abort();
+  assem_debug("ldr %s,[x%d+%#lx]\n", regname[rt], FP, offset);
+  output_w32(0xb9400000 | imm12_rn_rd(offset >> 2, FP, rt));
+}
+
+// ldr Xt,[FP,#offset]: load a 64-bit variable addressed relative to
+// dynarec_local. The offset must be 8-byte aligned and at most 32760,
+// otherwise the process aborts.
+static void emit_readdword(void *addr, u_int rt)
+{
+  const uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
+
+  if ((offset & 7) || offset > 32760)
+    abort();
+  assem_debug("ldr %s,[x%d+%#lx]\n", regname64[rt], FP, offset);
+  output_w32(0xf9400000 | imm12_rn_rd(offset >> 3, FP, rt));
+}
+// pointer-sized variant: pointers are 64-bit on this host
+#define emit_readptr emit_readdword
+
+// ldrsh Wt,[FP,#offset]: load a sign-extended 16-bit variable addressed
+// relative to dynarec_local. The offset must be halfword aligned and at
+// most 8190.
+static void emit_readshword(void *addr, u_int rt)
+{
+  uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
+  if (!(offset & 1) && offset <= 8190) {
+    assem_debug("ldrsh %s,[x%d+%#lx]\n", regname[rt], FP, offset);
+    output_w32(0x79c00000 | imm12_rn_rd(offset >> 1, FP, rt));
+  }
+  else
+    // abort() like emit_readword/emit_readdword: assert(0) compiles out
+    // under NDEBUG and would silently drop the instruction
+    abort();
+}
+
+// Load guest register r into host register hr. Register 0 is materialised
+// as zero; CCREG, CSREG, INVCP and ROREG come from their dedicated
+// variables (the latter two as 64-bit loads); everything else below 34 is
+// read from psxRegs.GPR.r[].
+static void emit_loadreg(u_int r, u_int hr)
+{
+  void *src;
+  int wide = 0;
+
+  assert(r < 64);
+  if (r == 0) {
+    emit_zeroreg(hr);
+    return;
+  }
+
+  src = &psxRegs.GPR.r[r];
+  switch (r) {
+  //case HIREG: addr = &hi; break;
+  //case LOREG: addr = &lo; break;
+  case CCREG: src = &cycle_count; break;
+  case CSREG: src = &Status; break;
+  case INVCP: src = &invc_ptr; wide = 1; break;
+  case ROREG: src = &ram_offset; wide = 1; break;
+  default: assert(r < 34); break;
+  }
+
+  if (!wide) {
+    emit_readword(src, hr);
+    return;
+  }
+  emit_readdword(src, hr);
+}
+
+// str Wt,[FP,#offset]: store a 32-bit variable addressed relative to
+// dynarec_local. The offset must be word aligned and at most 16380.
+static void emit_writeword(u_int rt, void *addr)
+{
+  uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
+  if (!(offset & 3) && offset <= 16380) {
+    assem_debug("str %s,[x%d+%#lx]\n", regname[rt], FP, offset);
+    output_w32(0xb9000000 | imm12_rn_rd(offset >> 2, FP, rt));
+  }
+  else
+    // abort() like emit_writedword: assert(0) compiles out under NDEBUG
+    // and would silently drop the store
+    abort();
+}
+
+// str Xt,[FP,#offset]: store a 64-bit variable addressed relative to
+// dynarec_local. The offset must be 8-byte aligned and at most 32760,
+// otherwise the process aborts.
+static void emit_writedword(u_int rt, void *addr)
+{
+  const uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
+
+  if ((offset & 7) || offset > 32760)
+    abort();
+  assem_debug("str %s,[x%d+%#lx]\n", regname64[rt], FP, offset);
+  output_w32(0xf9000000 | imm12_rn_rd(offset >> 3, FP, rt));
+}
+
+// Store host register hr back to guest register r: CCREG goes to
+// cycle_count, any other register (must be below 34) to psxRegs.GPR.r[].
+static void emit_storereg(u_int r, u_int hr)
+{
+  void *dst;
+
+  assert(r < 64);
+  //case HIREG: addr = &hi; break;
+  //case LOREG: addr = &lo; break;
+  if (r == CCREG)
+    dst = &cycle_count;
+  else {
+    assert(r < 34);
+    dst = &psxRegs.GPR.r[r];
+  }
+  emit_writeword(hr, dst);
+}
+
+// tst Ws,Wt: flag-setting AND whose result is discarded (destination WZR).
+static void emit_test(u_int rs, u_int rt)
+{
+  assem_debug("tst %s,%s\n", regname[rs], regname[rt]);
+  output_w32(0x6a000000 | rm_rn_rd(rt, rs, WZR));
+}
+
+// tst Ws,#imm: imm must be encodable as a logical immediate, i.e. a single
+// (possibly rotated) run of ones; anything else asserts.
+static void emit_testimm(u_int rs, u_int imm)
+{
+  u_int immr, imms;
+  assem_debug("tst %s,#%#x\n", regname[rs], imm);
+  assert(is_rotated_mask(imm)); // good enough for PCSX
+  gen_logical_imm(imm, &immr, &imms);
+  output_w32(0x72000000 | n_immr_imms_rn_rd(0, immr, imms, rs, WZR));
+}
+
+// mvn Wt,Ws: bitwise NOT (first source is WZR).
+static void emit_not(u_int rs,u_int rt)
+{
+  assem_debug("mvn %s,%s\n",regname[rt],regname[rs]);
+  output_w32(0x2a200000 | rm_rn_rd(rs, WZR, rt));
+}
+
+// and Wt,Ws1,Ws2
+static void emit_and(u_int rs1,u_int rs2,u_int rt)
+{
+  assem_debug("and %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  output_w32(0x0a000000 | rm_rn_rd(rs2, rs1, rt));
+}
+
+// orr Wt,Ws1,Ws2
+static void emit_or(u_int rs1,u_int rs2,u_int rt)
+{
+  assem_debug("orr %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  output_w32(0x2a000000 | rm_rn_rd(rs2, rs1, rt));
+}
+
+// bic Wt,Ws1,Ws2: Ws1 with the bits of Ws2 cleared.
+static void emit_bic(u_int rs1,u_int rs2,u_int rt)
+{
+  assem_debug("bic %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  output_w32(0x0a200000 | rm_rn_rd(rs2, rs1, rt));
+}
+
+// orr Wt,Wt,Ws,lsl #imm: OR the left-shifted Ws into Wt (rt is both a
+// source and the destination).
+static void emit_orrshl_imm(u_int rs,u_int imm,u_int rt)
+{
+  assem_debug("orr %s,%s,%s,lsl #%d\n",regname[rt],regname[rt],regname[rs],imm);
+  output_w32(0x2a000000 | rm_imm6_rn_rd(rs, imm, rt, rt));
+}
+
+// orr Wt,Wt,Ws,lsr #imm: OR the logically right-shifted Ws into Wt.
+static void emit_orrshr_imm(u_int rs,u_int imm,u_int rt)
+{
+  assem_debug("orr %s,%s,%s,lsr #%d\n",regname[rt],regname[rt],regname[rs],imm);
+  output_w32(0x2a400000 | rm_imm6_rn_rd(rs, imm, rt, rt));
+}
+
+// bic Wt,Wt,Ws,asr #imm: clear in Wt the bits set in (Ws asr imm).
+static void emit_bicsar_imm(u_int rs,u_int imm,u_int rt)
+{
+  assem_debug("bic %s,%s,%s,asr #%d\n",regname[rt],regname[rt],regname[rs],imm);
+  output_w32(0x0aa00000 | rm_imm6_rn_rd(rs, imm, rt, rt));
+}
+
+// eor Wt,Ws1,Ws2
+static void emit_xor(u_int rs1,u_int rs2,u_int rt)
+{
+  assem_debug("eor %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  output_w32(0x4a000000 | rm_rn_rd(rs2, rs1, rt));
+}
+
+// eor Wt,Ws1,Ws2,asr #imm: XOR Ws1 with the arithmetically shifted Ws2.
+static void emit_xorsar_imm(u_int rs1, u_int rs2, u_int imm, u_int rt)
+{
+  assem_debug("eor %s,%s,%s,asr #%d\n",regname[rt],regname[rs1],regname[rs2],imm);
+  output_w32(0x4a800000 | rm_imm6_rn_rd(rs2, imm, rs1, rt));
+}
+
+// Add an immediate to rs, writing rt.
+//   s     - non-zero: emit the flag-setting form (adds/subs)
+//   is64  - non-zero: operate on the 64-bit registers
+//   imm   - the addend; "negative" values (two's complement in uintptr_t)
+//           are emitted as subtractions
+// Magnitudes below 4096 take one instruction, magnitudes below 2^24 take
+// up to two (high 12 bits first, then the low 12 bits); anything larger
+// aborts. In the two-instruction forms only the second one sets flags.
+static void emit_addimm_s(u_int s, u_int is64, u_int rs, uintptr_t imm, u_int rt)
+{
+  unused const char *st = s ? "s" : "";
+  // debug output uses the register view that matches the operation width
+  unused const char **names = is64 ? regname64 : regname;
+  s = s ? 0x20000000 : 0;
+  is64 = is64 ? 0x80000000 : 0;
+  if (imm < 4096) {
+    assem_debug("add%s %s,%s,%#lx\n", st, names[rt], names[rs], imm);
+    output_w32(0x11000000 | is64 | s | imm12_rn_rd(imm, rs, rt));
+  }
+  else if (-imm < 4096) {
+    assem_debug("sub%s %s,%s,%#lx\n", st, names[rt], names[rs], -imm);
+    output_w32(0x51000000 | is64 | s | imm12_rn_rd(-imm, rs, rt));
+  }
+  else if (imm < 16777216) {
+    // rt = rs + (imm & 0xfff000), then rt = rt + (imm & 0xfff)
+    assem_debug("add %s,%s,#%#lx\n",names[rt],names[rs],imm&0xfff000);
+    output_w32(0x11400000 | is64 | imm12_rn_rd(imm >> 12, rs, rt));
+    if ((imm & 0xfff) || s) {
+      assem_debug("add%s %s,%s,#%#lx\n",st,names[rt],names[rt],imm&0xfff);
+      output_w32(0x11000000 | is64 | s | imm12_rn_rd(imm & 0xfff, rt, rt));
+    }
+  }
+  else if (-imm < 16777216) {
+    // same split for the subtracting form
+    assem_debug("sub %s,%s,#%#lx\n",names[rt],names[rs],-imm&0xfff000);
+    output_w32(0x51400000 | is64 | imm12_rn_rd(-imm >> 12, rs, rt));
+    if ((imm & 0xfff) || s) {
+      assem_debug("sub%s %s,%s,#%#lx\n",st,names[rt],names[rt],-imm&0xfff);
+      output_w32(0x51000000 | is64 | s | imm12_rn_rd(-imm & 0xfff, rt, rt));
+    }
+  }
+  else
+    abort();
+}
+
+// 32-bit add of an immediate, flags untouched.
+static void emit_addimm(u_int rs, uintptr_t imm, u_int rt)
+{
+  emit_addimm_s(0, 0, rs, imm, rt);
+}
+
+// 64-bit add of an immediate, flags untouched.
+static void emit_addimm64(u_int rs, uintptr_t imm, u_int rt)
+{
+  emit_addimm_s(0, 1, rs, imm, rt);
+}
+
+// 32-bit in-place add of an immediate (rt += imm) using the flag-setting form.
+static void emit_addimm_and_set_flags(int imm, u_int rt)
+{
+  emit_addimm_s(1, 0, rt, imm, rt);
+}
+
+// Emit "op Wt,Ws,#imm" for op 0..3 = and, orr, eor, ands.
+// When imm is encodable as a logical immediate one instruction is emitted;
+// otherwise imm is first loaded into HOST_TEMPREG and the register form is
+// used. The scratch register is acquired/released around that, except
+// when rt is HOST_TEMPREG and rs is not.
+static void emit_logicop_imm(u_int op, u_int rs, u_int imm, u_int rt)
+{
+  const char *mnemonics[] = { "and", "orr", "eor", "ands" };
+  const char *mnemonic = mnemonics[op];
+  const u_int opc = op << 29;
+  int claim_temp;
+
+  (void)mnemonic;
+
+  if (is_rotated_mask(imm)) {
+    u_int immr, imms;
+    gen_logical_imm(imm, &immr, &imms);
+    assem_debug("%s %s,%s,#%#x\n", mnemonic, regname[rt], regname[rs], imm);
+    output_w32(opc | 0x12000000 | n_immr_imms_rn_rd(0, immr, imms, rs, rt));
+    return;
+  }
+
+  claim_temp = (rs == HOST_TEMPREG || rt != HOST_TEMPREG);
+  if (claim_temp)
+    host_tempreg_acquire();
+  emit_movimm(imm, HOST_TEMPREG);
+  assem_debug("%s %s,%s,%s\n", mnemonic, regname[rt], regname[rs], regname[HOST_TEMPREG]);
+  output_w32(opc | 0x0a000000 | rm_rn_rd(HOST_TEMPREG, rs, rt));
+  if (claim_temp)
+    host_tempreg_release();
+}
+
+// Wt = Ws & imm; masking with zero is emitted as a plain register clear.
+static void emit_andimm(u_int rs, u_int imm, u_int rt)
+{
+  if (imm != 0) {
+    emit_logicop_imm(0, rs, imm, rt);
+    return;
+  }
+  emit_zeroreg(rt);
+}
+
+// Wt = Ws | imm; with imm == 0 this degenerates to a move, which is
+// skipped entirely when source and destination are the same register.
+static void emit_orimm(u_int rs, u_int imm, u_int rt)
+{
+  if (imm != 0) {
+    emit_logicop_imm(1, rs, imm, rt);
+    return;
+  }
+  if (rs != rt)
+    emit_mov(rs, rt);
+}
+
+// Wt = Ws ^ imm; with imm == 0 this degenerates to a move, which is
+// skipped entirely when source and destination are the same register.
+static void emit_xorimm(u_int rs, u_int imm, u_int rt)
+{
+  if (imm != 0) {
+    emit_logicop_imm(2, rs, imm, rt);
+    return;
+  }
+  if (rs != rt)
+    emit_mov(rs, rt);
+}
+
+// sbfm Wt,Ws,#0,#imm: immr is fixed at 0, imm is passed as imms.
+static void emit_sbfm(u_int rs,u_int imm,u_int rt)
+{
+  assem_debug("sbfm %s,%s,#0,#%d\n",regname[rt],regname[rs],imm);
+  output_w32(0x13000000 | n_immr_imms_rn_rd(0, 0, imm, rs, rt));
+}
+
+// ubfm Wt,Ws,#0,#imm: immr is fixed at 0, imm is passed as imms.
+static void emit_ubfm(u_int rs,u_int imm,u_int rt)
+{
+  assem_debug("ubfm %s,%s,#0,#%d\n",regname[rt],regname[rs],imm);
+  output_w32(0x53000000 | n_immr_imms_rn_rd(0, 0, imm, rs, rt));
+}
+
+// lsl Wt,Ws,#imm (0 <= imm <= 31), encoded with the ubfm opcode using
+// immr = (32-imm) mod 32 and imms = 31-imm.
+static void emit_shlimm(u_int rs,u_int imm,u_int rt)
+{
+  assert(imm < 32);
+  assem_debug("lsl %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  // the "& 31" matters for imm == 0: immr would otherwise be 32, which
+  // sets immr bit 5 and is not a valid 32-bit encoding
+  output_w32(0x53000000 | n_immr_imms_rn_rd(0, (32-imm)&31, 31-imm, rs, rt));
+}
+
+// lsr Wt,Ws,#imm: ubfm opcode with immr = imm and imms = 31.
+static void emit_shrimm(u_int rs,u_int imm,u_int rt)
+{
+  assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  output_w32(0x53000000 | n_immr_imms_rn_rd(0, imm, 31, rs, rt));
+}
+
+// lsr Xt,Xs,#imm: 64-bit logical shift right (immr = imm, imms = 63; the
+// N bit is part of the opcode constant).
+static void emit_shrimm64(u_int rs,u_int imm,u_int rt)
+{
+  // 64-bit operation, so print the X register names like the other
+  // *64 emitters do
+  assem_debug("lsr %s,%s,#%d\n",regname64[rt],regname64[rs],imm);
+  output_w32(0xd3400000 | n_immr_imms_rn_rd(0, imm, 63, rs, rt));
+}
+
+// asr Wt,Ws,#imm: sbfm opcode with immr = imm and imms = 31.
+static void emit_sarimm(u_int rs,u_int imm,u_int rt)
+{
+  assem_debug("asr %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  output_w32(0x13000000 | n_immr_imms_rn_rd(0, imm, 31, rs, rt));
+}
+
+// ror Wt,Ws,#imm: rs is passed as both source operands, imm as the shift
+// amount.
+static void emit_rorimm(u_int rs,u_int imm,u_int rt)
+{
+  assem_debug("ror %s,%s,#%d\n",regname[rt],regname[rs],imm);
+  output_w32(0x13800000 | rm_imm6_rn_rd(rs, imm, rs, rt));
+}
+
+// sxth Wt,Ws: sign-extend the low 16 bits (sbfm opcode with immr = 0,
+// imms = 15).
+static void emit_signextend16(u_int rs, u_int rt)
+{
+  assem_debug("sxth %s,%s\n", regname[rt], regname[rs]);
+  output_w32(0x13000000 | n_immr_imms_rn_rd(0, 0, 15, rs, rt));
+}
+
+// lsl Wt,Ws,Wshift: shift left by a register amount.
+static void emit_shl(u_int rs,u_int rshift,u_int rt)
+{
+  assem_debug("lsl %s,%s,%s\n",regname[rt],regname[rs],regname[rshift]);
+  output_w32(0x1ac02000 | rm_rn_rd(rshift, rs, rt));
+}
+
+// lsr Wt,Ws,Wshift: logical shift right by a register amount.
+static void emit_shr(u_int rs,u_int rshift,u_int rt)
+{
+  assem_debug("lsr %s,%s,%s\n",regname[rt],regname[rs],regname[rshift]);
+  output_w32(0x1ac02400 | rm_rn_rd(rshift, rs, rt));
+}
+
+// asr Wt,Ws,Wshift: arithmetic shift right by a register amount.
+static void emit_sar(u_int rs,u_int rshift,u_int rt)
+{
+  assem_debug("asr %s,%s,%s\n",regname[rt],regname[rs],regname[rshift]);
+  output_w32(0x1ac02800 | rm_rn_rd(rshift, rs, rt));
+}
+
+// Compare Ws with a 32-bit immediate, choosing the shortest encoding:
+//   imm < 4096                      -> cmp Ws,#imm
+//   -imm < 4096                     -> cmn Ws,#-imm
+//   imm < 2^24 with low 12 bits 0   -> cmp Ws,#imm (shifted-immediate form)
+//   otherwise                       -> imm loaded into HOST_TEMPREG, cmp Ws,Wtemp
+static void emit_cmpimm(u_int rs, u_int imm)
+{
+  if (imm < 4096) {
+    assem_debug("cmp %s,%#x\n", regname[rs], imm);
+    output_w32(0x71000000 | imm12_rn_rd(imm, rs, WZR));
+  }
+  else if (-imm < 4096) {
+    // print the operand that is actually encoded (-imm), not imm
+    assem_debug("cmn %s,%#x\n", regname[rs], -imm);
+    output_w32(0x31000000 | imm12_rn_rd(-imm, rs, WZR));
+  }
+  else if (imm < 16777216 && !(imm & 0xfff)) {
+    assem_debug("cmp %s,#%#x\n", regname[rs], imm);
+    output_w32(0x71400000 | imm12_rn_rd(imm >> 12, rs, WZR));
+  }
+  else {
+    host_tempreg_acquire();
+    emit_movimm(imm, HOST_TEMPREG);
+    assem_debug("cmp %s,%s\n", regname[rs], regname[HOST_TEMPREG]);
+    output_w32(0x6b000000 | rm_rn_rd(HOST_TEMPREG, rs, WZR));
+    host_tempreg_release();
+  }
+}
+
+// Conditionally set Wt to the constant imm (0 or 1 only).
+//   imm == 0: csel  Wt,WZR,Wt,cond0  - Wt becomes 0 when cond0 holds
+//   imm == 1: csinc Wt,Wt,WZR,cond1  - Wt is kept when cond1 holds and
+//             becomes WZR+1 otherwise, so callers pass the inverse
+//             condition as cond1
+static void emit_cmov_imm(u_int cond0, u_int cond1, u_int imm, u_int rt)
+{
+  assert(imm == 0 || imm == 1);
+  assert(cond0 < 0x10);
+  assert(cond1 < 0x10);
+
+  if (imm == 0) {
+    assem_debug("csel %s,%s,%s,%s\n",regname[rt],regname[WZR],regname[rt],condname[cond0]);
+    output_w32(0x1a800000 | (cond0 << 12) | rm_rn_rd(rt, WZR, rt));
+    return;
+  }
+  assem_debug("csinc %s,%s,%s,%s\n",regname[rt],regname[rt],regname[WZR],condname[cond1]);
+  output_w32(0x1a800400 | (cond1 << 12) | rm_rn_rd(WZR, rt, rt));
+}
+
+// If "ne": Wt = imm (0 or 1). COND_EQ is the inverse needed by the csinc form.
+static void emit_cmovne_imm(u_int imm,u_int rt)
+{
+  emit_cmov_imm(COND_NE, COND_EQ, imm, rt);
+}
+
+// If "lt" (signed less): Wt = imm (0 or 1).
+static void emit_cmovl_imm(u_int imm,u_int rt)
+{
+  emit_cmov_imm(COND_LT, COND_GE, imm, rt);
+}
+
+// If "cc" (carry clear, i.e. unsigned below after a compare): Wt = imm (0 or 1).
+static void emit_cmovb_imm(int imm,u_int rt)
+{
+  emit_cmov_imm(COND_CC, COND_CS, imm, rt);
+}
+
+// csel Wt,Ws,Wt,eq: Wt = Ws when "eq", otherwise unchanged.
+static void emit_cmoveq_reg(u_int rs,u_int rt)
+{
+  assem_debug("csel %s,%s,%s,eq\n",regname[rt],regname[rs],regname[rt]);
+  output_w32(0x1a800000 | (COND_EQ << 12) | rm_rn_rd(rt, rs, rt));
+}
+
+// csel Wt,Ws,Wt,ne: Wt = Ws when "ne", otherwise unchanged.
+static void emit_cmovne_reg(u_int rs,u_int rt)
+{
+  assem_debug("csel %s,%s,%s,ne\n",regname[rt],regname[rs],regname[rt]);
+  output_w32(0x1a800000 | (COND_NE << 12) | rm_rn_rd(rt, rs, rt));
+}
+
+// csel Wt,Ws,Wt,lt: Wt = Ws when "lt", otherwise unchanged.
+static void emit_cmovl_reg(u_int rs,u_int rt)
+{
+  assem_debug("csel %s,%s,%s,lt\n",regname[rt],regname[rs],regname[rt]);
+  output_w32(0x1a800000 | (COND_LT << 12) | rm_rn_rd(rt, rs, rt));
+}
+
+// Emit "csel rt,rs,rt,cc": rt receives rs when carry is clear, else is
+// unchanged.
+static void emit_cmovb_reg(u_int rs,u_int rt)
+{
+  assem_debug("csel %s,%s,%s,cc\n",regname[rt],regname[rs],regname[rt]);
+  const u_int regs = rm_rn_rd(rt, rs, rt);
+  output_w32(0x1a800000 | (COND_CC << 12) | regs);
+}
+
+// Emit "csel rt,rs,rt,mi": rt receives rs when the MI (negative) condition
+// holds, else is unchanged.
+static void emit_cmovs_reg(u_int rs,u_int rt)
+{
+  assem_debug("csel %s,%s,%s,mi\n",regname[rt],regname[rs],regname[rt]);
+  const u_int regs = rm_rn_rd(rt, rs, rt);
+  output_w32(0x1a800000 | (COND_MI << 12) | regs);
+}
+
+// Emit "csinv rt,rs1,rs2,le": rt = rs1 when LE holds, otherwise ~rs2.
+static void emit_csinvle_reg(u_int rs1,u_int rs2,u_int rt)
+{
+  assem_debug("csinv %s,%s,%s,le\n",regname[rt],regname[rs1],regname[rs2]);
+  const u_int regs = rm_rn_rd(rs2, rs1, rt);
+  output_w32(0x5a800000 | (COND_LE << 12) | regs);
+}
+
+// rt = 1 if rs is less than imm (LT condition after the compare), else 0.
+// When rs and rt are the same register, rt is cleared only after the
+// compare so the source value is not lost.
+static void emit_slti32(u_int rs,int imm,u_int rt)
+{
+  const int in_place = (rs == rt);
+
+  if (!in_place)
+    emit_zeroreg(rt);
+  emit_cmpimm(rs, imm);
+  if (in_place)
+    emit_movimm(0, rt);
+  emit_cmovl_imm(1, rt);
+}
+
+// rt = 1 if the compare of rs against imm leaves carry clear, else 0.
+// When rs and rt are the same register, rt is cleared only after the
+// compare so the source value is not lost.
+static void emit_sltiu32(u_int rs,int imm,u_int rt)
+{
+  const int in_place = (rs == rt);
+
+  if (!in_place)
+    emit_zeroreg(rt);
+  emit_cmpimm(rs, imm);
+  if (in_place)
+    emit_movimm(0, rt);
+  emit_cmovb_imm(1, rt);
+}
+
+// Emit "cmp rs,rt": flag-setting register compare with the result
+// discarded into WZR.
+static void emit_cmp(u_int rs,u_int rt)
+{
+  assem_debug("cmp %s,%s\n",regname[rs],regname[rt]);
+  const u_int regs = rm_rn_rd(rt, rs, WZR);
+  output_w32(0x6b000000 | regs);
+}
+
+// rt = 1 if rs is greater than zero (signed), else 0.
+// Implemented as: compare rs with 1, preload rt with 1, then zero rt when
+// the compare reported "less than".
+static void emit_set_gz32(u_int rs, u_int rt)
+{
+  emit_cmpimm(rs, 1);     // flags first: rs may alias rt
+  emit_movimm(1, rt);
+  emit_cmovl_imm(0, rt);
+}
+
+// rt = 1 if rs is non-zero, else 0.
+// rt starts as a copy of rs (so it is already 0 in the zero case) and is
+// replaced by 1 when the test of rs reports "not equal".
+static void emit_set_nz32(u_int rs, u_int rt)
+{
+  if (rt != rs)
+    emit_mov(rs, rt);
+  emit_test(rs, rs);
+  emit_cmovne_imm(1, rt);
+}
+
+// rt = 1 if rs1 < rs2 (LT condition after "cmp rs1,rs2"), else 0.
+// If rt aliases one of the sources it is cleared only after the compare.
+static void emit_set_if_less32(u_int rs1, u_int rs2, u_int rt)
+{
+  const int aliased = (rs1 == rt) || (rs2 == rt);
+
+  if (!aliased)
+    emit_zeroreg(rt);
+  emit_cmp(rs1, rs2);
+  if (aliased)
+    emit_movimm(0, rt);
+  emit_cmovl_imm(1, rt);
+}
+
+// rt = 1 if "cmp rs1,rs2" leaves carry clear (COND_CC), else 0.
+// If rt aliases one of the sources it is cleared only after the compare.
+static void emit_set_if_carry32(u_int rs1, u_int rs2, u_int rt)
+{
+  const int aliased = (rs1 == rt) || (rs2 == rt);
+
+  if (!aliased)
+    emit_zeroreg(rt);
+  emit_cmp(rs1, rs2);
+  if (aliased)
+    emit_movimm(0, rt);
+  emit_cmovb_imm(1, rt);
+}
+
+// Return 1 if address a lies within -134217728..134217727 bytes of the
+// current output position (the range emit_call accepts), else 0.
+static int can_jump_or_call(const void *a)
+{
+  const intptr_t reach_back = -134217728;
+  const intptr_t reach_fwd = 134217727;
+  const intptr_t dist = (u_char *)a - out;
+
+  if (dist < reach_back)
+    return 0;
+  if (dist > reach_fwd)
+    return 0;
+  return 1;
+}
+
+// Emit "bl a".  The target must be 4-byte aligned relative to the output
+// position; if it is out of direct branch range the process is aborted.
+static void emit_call(const void *a)
+{
+  const intptr_t rel = (u_char *)a - out;
+
+  assem_debug("bl %p (%p+%lx)%s\n", a, out, rel, func_name(a));
+  assert(!(rel & 3));
+  if (!can_jump_or_call(a))
+    abort();
+  // 26-bit word offset
+  output_w32(0x94000000 | ((rel >> 2) & 0x03ffffff));
+}
+
+// Emit an unconditional "b a"; the offset field comes from genjmp().
+static void emit_jmp(const void *a)
+{
+  assem_debug("b %p (%p+%lx)%s\n", a, out, (u_char *)a - out, func_name(a));
+  const u_int imm26 = genjmp(a);
+  output_w32(imm26 | 0x14000000);
+}
+
+// Emit "b.ne a"; the offset field comes from genjmpcc().
+static void emit_jne(const void *a)
+{
+  assem_debug("bne %p\n", a);
+  const u_int disp = genjmpcc(a);
+  output_w32(0x54000000 | COND_NE | (disp << 5));
+}
+
+// Emit "b.eq a"; the offset field comes from genjmpcc().
+static void emit_jeq(const void *a)
+{
+  assem_debug("beq %p\n", a);
+  const u_int disp = genjmpcc(a);
+  output_w32(0x54000000 | COND_EQ | (disp << 5));
+}
+
+// Emit "b.mi a" (branch if negative); the offset comes from genjmpcc().
+static void emit_js(const void *a)
+{
+  assem_debug("bmi %p\n", a);
+  const u_int disp = genjmpcc(a);
+  output_w32(0x54000000 | COND_MI | (disp << 5));
+}
+
+// Emit "b.pl a" (branch if not negative); the offset comes from genjmpcc().
+static void emit_jns(const void *a)
+{
+  assem_debug("bpl %p\n", a);
+  const u_int disp = genjmpcc(a);
+  output_w32(0x54000000 | COND_PL | (disp << 5));
+}
+
+// Emit "b.lt a"; the offset field comes from genjmpcc().
+static void emit_jl(const void *a)
+{
+  assem_debug("blt %p\n", a);
+  const u_int disp = genjmpcc(a);
+  output_w32(0x54000000 | COND_LT | (disp << 5));
+}
+
+// Emit "b.ge a"; the offset field comes from genjmpcc().
+static void emit_jge(const void *a)
+{
+  assem_debug("bge %p\n", a);
+  const u_int disp = genjmpcc(a);
+  output_w32(0x54000000 | COND_GE | (disp << 5));
+}
+
+// Emit "b.vc a" (branch if no overflow); the offset comes from genjmpcc().
+static void emit_jno(const void *a)
+{
+  assem_debug("bvc %p\n", a);
+  const u_int disp = genjmpcc(a);
+  output_w32(0x54000000 | COND_VC | (disp << 5));
+}
+
+// Emit "b.cs a" (branch if carry set); the offset comes from genjmpcc().
+static void emit_jc(const void *a)
+{
+  assem_debug("bcs %p\n", a);
+  const u_int disp = genjmpcc(a);
+  output_w32(0x54000000 | COND_CS | (disp << 5));
+}
+
+// Emit cbz/cbnz on register r with target a.
+//  isnz - non-zero selects cbnz, zero selects cbz
+//  is64 - non-zero tests the 64-bit register, zero the 32-bit one
+static void emit_cb(u_int isnz, u_int is64, const void *a, u_int r)
+{
+  u_int opcode = 0x34000000;
+
+  assem_debug("cb%sz %s,%p\n", isnz?"n":"", is64?regname64[r]:regname[r], a);
+  const u_int disp = genjmpcc(a);
+  if (is64)
+    opcode |= 0x80000000;
+  if (isnz)
+    opcode |= 0x01000000;
+  output_w32(opcode | imm19_rt(disp, r));
+}
+
+// Emit a 32-bit "cbz r,a".
+static void emit_cbz(const void *a, u_int r)
+{
+  const u_int want_nonzero = 0;
+  const u_int want_64bit = 0;
+  emit_cb(want_nonzero, want_64bit, a, r);
+}
+
+// Emit "br r": indirect jump through 64-bit register r.
+static void emit_jmpreg(u_int r)
+{
+  assem_debug("br %s\n", regname64[r]);
+  const u_int regs = rm_rn_rd(0, r, 0);
+  output_w32(0xd61f0000 | regs);
+}
+
+// Emit "ret r".  The trace omits the register name when r is LR.
+static void emit_retreg(u_int r)
+{
+  assem_debug("ret %s\n", r == LR ? "" : regname64[r]);
+  const u_int regs = rm_rn_rd(0, r, 0);
+  output_w32(0xd65f0000 | regs);
+}
+
+// Emit a plain "ret" through the link register.
+static void emit_ret(void)
+{
+  const u_int link = LR;
+  emit_retreg(link);
+}
+
+// Emit "adr x<rt>,addr".  addr must be within -1048576..1048575 bytes of
+// the output position and rt must be below 31.
+static void emit_adr(void *addr, u_int rt)
+{
+  const intptr_t rel = (u_char *)addr - out;
+
+  assert(-1048576 <= rel && rel < 1048576);
+  assert(rt < 31);
+  assem_debug("adr x%d,#%#lx\n", rt, rel);
+
+  // the offset is split into a 2-bit low part and a 19-bit high part
+  const intptr_t lo2 = rel & 0x3;
+  const intptr_t hi19 = (rel >> 2) & 0x7ffff;
+  output_w32(0x10000000 | (lo2 << 29) | (hi19 << 5) | rt);
+}
+
+// Emit "adrp rt,addr": the distance is measured between the 4 KiB pages of
+// addr and of the output position, and must fit in +/-4 GiB.  rt must be
+// below 31.
+static void emit_adrp(void *addr, u_int rt)
+{
+  const intptr_t target_page = (intptr_t)addr & ~0xfffl;
+  const intptr_t here_page = (intptr_t)out & ~0xfffl;
+  const intptr_t delta = target_page - here_page;
+
+  assert(-4294967296l <= delta && delta < 4294967296l);
+  assert(rt < 31);
+
+  const intptr_t pages = delta >> 12;
+  assem_debug("adrp %s,#%#lx(000)\n",regname64[rt],pages);
+
+  // the page count is split into a 2-bit low part and a 19-bit high part
+  const intptr_t lo2 = pages & 0x3;
+  const intptr_t hi19 = (pages >> 2) & 0x7ffff;
+  output_w32(0x90000000 | (lo2 << 29) | (hi19 << 5) | rt);
+}
+
+// Emit "ldur rt,[rs+offset]" (32-bit load); offset must be in -256..255.
+static void emit_readword_indexed(int offset, u_int rs, u_int rt)
+{
+  assem_debug("ldur %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
+  assert(-256 <= offset && offset < 256);
+  const u_int imm9 = offset & 0x1ff;
+  output_w32(0xb8400000 | imm9_rn_rt(imm9, rs, rt));
+}
+
+// Emit "strb rt,[rs1,rs2]": byte store, 64-bit base rs1 plus index rs2.
+static void emit_strb_dualindexed(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("strb %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
+  const u_int regs = rm_rn_rd(rs2, rs1, rt);
+  output_w32(0x38204800 | regs);
+}
+
+// Emit "strh rt,[rs1,rs2]": half-word store, 64-bit base rs1 plus index rs2.
+static void emit_strh_dualindexed(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("strh %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
+  const u_int regs = rm_rn_rd(rs2, rs1, rt);
+  output_w32(0x78204800 | regs);
+}
+
+// Emit "str rt,[rs1,rs2]": word store, 64-bit base rs1 plus index rs2.
+static void emit_str_dualindexed(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("str %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
+  const u_int regs = rm_rn_rd(rs2, rs1, rt);
+  output_w32(0xb8204800 | regs);
+}
+
+// Emit "ldr rt,[rs1,rs2,uxtw #3]": 64-bit load from base rs1 plus the
+// 32-bit index rs2 scaled by 8.
+static void emit_readdword_dualindexedx8(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("ldr %s, [%s,%s, uxtw #3]\n",regname64[rt],regname64[rs1],regname[rs2]);
+  const u_int regs = rm_rn_rd(rs2, rs1, rt);
+  output_w32(0xf8605800 | regs);
+}
+// pointer-sized table loads use the 64-bit variant on this target
+#define emit_readptr_dualindexedx_ptrlen emit_readdword_dualindexedx8
+
+// Emit "ldrb rt,[rs1,rs2]": zero-extending byte load, base rs1 + index rs2.
+static void emit_ldrb_dualindexed(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("ldrb %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
+  const u_int regs = rm_rn_rd(rs2, rs1, rt);
+  output_w32(0x38604800 | regs);
+}
+
+// Emit "ldrsb rt,[rs1,rs2]": sign-extending byte load, base rs1 + index rs2.
+static void emit_ldrsb_dualindexed(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("ldrsb %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
+  const u_int regs = rm_rn_rd(rs2, rs1, rt);
+  output_w32(0x38a04800 | regs);
+}
+
+// Emit "ldrh rt,[rs1,rs2,uxtw]": zero-extending half-word load,
+// base rs1 + index rs2.
+static void emit_ldrh_dualindexed(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("ldrh %s, [%s,%s, uxtw]\n",regname[rt],regname64[rs1],regname[rs2]);
+  const u_int regs = rm_rn_rd(rs2, rs1, rt);
+  output_w32(0x78604800 | regs);
+}
+
+// Emit "ldrsh rt,[rs1,rs2,uxtw]": sign-extending half-word load,
+// base rs1 + index rs2.
+static void emit_ldrsh_dualindexed(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("ldrsh %s, [%s,%s, uxtw]\n",regname[rt],regname64[rs1],regname[rs2]);
+  const u_int regs = rm_rn_rd(rs2, rs1, rt);
+  output_w32(0x78a04800 | regs);
+}
+
+// Emit "ldr rt,[rs1,rs2,uxtw]": 32-bit load, base rs1 + index rs2.
+static void emit_ldr_dualindexed(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("ldr %s, [%s,%s, uxtw]\n",regname[rt],regname64[rs1],regname[rs2]);
+  const u_int regs = rm_rn_rd(rs2, rs1, rt);
+  output_w32(0xb8604800 | regs);
+}
+
+// Emit "ldursb rt,[rs+offset]" (sign-extending byte load); offset must be
+// in -256..255.
+static void emit_movsbl_indexed(int offset, u_int rs, u_int rt)
+{
+  assem_debug("ldursb %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
+  assert(-256 <= offset && offset < 256);
+  const u_int imm9 = offset & 0x1ff;
+  output_w32(0x38c00000 | imm9_rn_rt(imm9, rs, rt));
+}
+
+// Emit "ldursh rt,[rs+offset]" (sign-extending half-word load); offset must
+// be in -256..255.
+static void emit_movswl_indexed(int offset, u_int rs, u_int rt)
+{
+  assem_debug("ldursh %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
+  assert(-256 <= offset && offset < 256);
+  const u_int imm9 = offset & 0x1ff;
+  output_w32(0x78c00000 | imm9_rn_rt(imm9, rs, rt));
+}
+
+// Emit "ldurb rt,[rs+offset]" (zero-extending byte load); offset must be
+// in -256..255.
+static void emit_movzbl_indexed(int offset, u_int rs, u_int rt)
+{
+  assem_debug("ldurb %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
+  assert(-256 <= offset && offset < 256);
+  const u_int imm9 = offset & 0x1ff;
+  output_w32(0x38400000 | imm9_rn_rt(imm9, rs, rt));
+}
+
+// Emit "ldurh rt,[rs+offset]" (zero-extending half-word load); offset must
+// be in -256..255.
+static void emit_movzwl_indexed(int offset, u_int rs, u_int rt)
+{
+  assem_debug("ldurh %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
+  assert(-256 <= offset && offset < 256);
+  const u_int imm9 = offset & 0x1ff;
+  output_w32(0x78400000 | imm9_rn_rt(imm9, rs, rt));
+}
+
+// Store the 32-bit register rt at [rs + offset].
+//  - word-aligned offsets in 0..16380 use "str" with a scaled 12-bit
+//    immediate
+//  - other offsets in -256..255 use "stur" with a 9-bit immediate
+//  - anything else is a caller bug and asserts
+// The base register is traced with its 64-bit name, matching the half-word
+// and byte store emitters.
+static void emit_writeword_indexed(u_int rt, int offset, u_int rs)
+{
+  if (!(offset & 3) && (u_int)offset <= 16380) {
+    assem_debug("str %s,[%s+%#x]\n", regname[rt], regname64[rs], offset);
+    output_w32(0xb9000000 | imm12_rn_rd(offset >> 2, rs, rt));
+  }
+  else if (-256 <= offset && offset < 256) {
+    assem_debug("stur %s,[%s+%#x]\n", regname[rt], regname64[rs], offset);
+    output_w32(0xb8000000 | imm9_rn_rt(offset & 0x1ff, rs, rt));
+  }
+  else
+    assert(0);
+}
+
+// Store the low half-word of rt at [rs + offset].
+// Even offsets in 0..8190 use "strh" with a scaled 12-bit immediate; other
+// offsets in -256..255 use "sturh"; anything else asserts.
+static void emit_writehword_indexed(u_int rt, int offset, u_int rs)
+{
+  const int fits_scaled = (offset & 1) == 0 && (u_int)offset <= 8190;
+  const int fits_unscaled = offset >= -256 && offset < 256;
+
+  if (fits_scaled) {
+    assem_debug("strh %s,[%s+%#x]\n", regname[rt], regname64[rs], offset);
+    output_w32(0x79000000 | imm12_rn_rd(offset >> 1, rs, rt));
+    return;
+  }
+  if (fits_unscaled) {
+    assem_debug("sturh %s,[%s+%#x]\n", regname[rt], regname64[rs], offset);
+    output_w32(0x78000000 | imm9_rn_rt(offset & 0x1ff, rs, rt));
+    return;
+  }
+  assert(0);
+}
+
+// Store the low byte of rt at [rs + offset].
+// Offsets in 0..4095 use "strb" with a 12-bit immediate; other offsets in
+// -256..255 use "sturb"; anything else asserts.
+static void emit_writebyte_indexed(u_int rt, int offset, u_int rs)
+{
+  const int fits_unsigned = (u_int)offset < 4096;
+  const int fits_unscaled = offset >= -256 && offset < 256;
+
+  if (fits_unsigned) {
+    assem_debug("strb %s,[%s+%#x]\n", regname[rt], regname64[rs], offset);
+    output_w32(0x39000000 | imm12_rn_rd(offset, rs, rt));
+    return;
+  }
+  if (fits_unscaled) {
+    assem_debug("sturb %s,[%s+%#x]\n", regname[rt], regname64[rs], offset);
+    output_w32(0x38000000 | imm9_rn_rt(offset & 0x1ff, rs, rt));
+    return;
+  }
+  assert(0);
+}
+
+// Emit "umull rt,rs1,rs2": unsigned 32x32 multiply into the 64-bit rt.
+static void emit_umull(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("umull %s,%s,%s\n",regname64[rt],regname[rs1],regname[rs2]);
+  const u_int regs = rm_ra_rn_rd(rs2, WZR, rs1, rt);
+  output_w32(0x9ba00000 | regs);
+}
+
+// Emit "smull rt,rs1,rs2": signed 32x32 multiply into the 64-bit rt.
+static void emit_smull(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("smull %s,%s,%s\n",regname64[rt],regname[rs1],regname[rs2]);
+  const u_int regs = rm_ra_rn_rd(rs2, WZR, rs1, rt);
+  output_w32(0x9b200000 | regs);
+}
+
+// Emit "msub rt,rs1,rs2,rs3": rt = rs3 - rs1 * rs2 (32-bit).
+static void emit_msub(u_int rs1, u_int rs2, u_int rs3, u_int rt)
+{
+  assem_debug("msub %s,%s,%s,%s\n",regname[rt],regname[rs1],regname[rs2],regname[rs3]);
+  const u_int regs = rm_ra_rn_rd(rs2, rs3, rs1, rt);
+  output_w32(0x1b008000 | regs);
+}
+
+// Emit "sdiv rt,rs1,rs2": signed 32-bit division rs1 / rs2.
+static void emit_sdiv(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("sdiv %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  const u_int regs = rm_rn_rd(rs2, rs1, rt);
+  output_w32(0x1ac00c00 | regs);
+}
+
+// Emit "udiv rt,rs1,rs2": unsigned 32-bit division rs1 / rs2.
+static void emit_udiv(u_int rs1, u_int rs2, u_int rt)
+{
+  assem_debug("udiv %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
+  const u_int regs = rm_rn_rd(rs2, rs1, rt);
+  output_w32(0x1ac00800 | regs);
+}
+
+// Emit "clz rt,rs": count leading zero bits of the 32-bit rs.
+static void emit_clz(u_int rs, u_int rt)
+{
+  assem_debug("clz %s,%s\n",regname[rt],regname[rs]);
+  const u_int regs = rn_rd(rs, rt);
+  output_w32(0x5ac01000 | regs);
+}
+
+// Special case used for checking invalid_code: loads the byte at
+// rbase[r >> 12] into the scratch register and compares it with imm.
+// Only the flags survive; HOST_TEMPREG is held for the duration.
+static void emit_cmpmem_indexedsr12_reg(u_int rbase, u_int r, u_int imm)
+{
+  const u_int scratch = HOST_TEMPREG;
+
+  host_tempreg_acquire();
+  emit_shrimm(r, 12, scratch);
+  assem_debug("ldrb %s,[%s,%s,uxtw]\n",regname[scratch],regname64[rbase],regname[scratch]);
+  output_w32(0x38604800 | rm_rn_rd(scratch, rbase, scratch));
+  emit_cmpimm(scratch, imm);
+  host_tempreg_release();
+}
+
+// Helper for loadlr_assemble: rt = rs1 & ~(rs2 << shift), where shift is a
+// register.  rs2 is overwritten with the shifted mask.
+static void emit_bic_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt)
+{
+  emit_shl(rs2, shift, rs2);  // rs2 <<= shift (destroys rs2)
+  emit_bic(rs1, rs2, rt);
+}
+
+// rt = rs1 & ~(rs2 >> shift), where shift is a register.  rs2 is
+// overwritten with the shifted mask.
+static void emit_bic_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt)
+{
+  emit_shr(rs2, shift, rs2);  // rs2 >>= shift (destroys rs2)
+  emit_bic(rs1, rs2, rt);
+}
+
+// Emit a 64-bit "ldr (literal)" into register rt with offset field ofs.
+// Callers emit it with ofs 0 and patch the offset later (see set_loadlp).
+static void emit_loadlp_ofs(u_int ofs, u_int rt)
+{
+  const u_int fields = imm19_rt(ofs, rt);
+  output_w32(0x58000000 | fields);
+}
+
+// Emit ldr/str of register rt at [x<rn> + ofs] using the scaled unsigned
+// immediate form.
+//  is_st - non-zero for a store, zero for a load
+//  is64  - non-zero for a 64-bit access, zero for 32-bit
+//  ofs   - byte offset; must be a multiple of the access size
+static void emit_ldst(int is_st, int is64, u_int rt, u_int rn, u_int ofs)
+{
+  unused const char *mnem = is_st ? "st" : "ld";
+  unused char width = is64 ? 'x' : 'w';
+  assem_debug("%sr %c%d,[x%d,#%#x]\n", mnem, width, rt, rn, ofs);
+
+  const int scale = is64 ? 3 : 2;   // log2 of the access size
+  assert((ofs & ((1 << scale) - 1)) == 0);
+
+  u_int opcode = 0xb9000000;
+  if (!is_st)
+    opcode |= 0x00400000;
+  if (is64)
+    opcode |= 0x40000000;
+  output_w32(opcode | imm12_rn_rd(ofs >> scale, rn, rt));
+}
+
+// Emit ldp/stp of the register pair rt1,rt2 at [x<rn> + ofs].
+//  is_st - non-zero for a store, zero for a load
+//  is64  - non-zero for 64-bit registers, zero for 32-bit
+//  ofs   - byte offset; must be a multiple of the register size and, once
+//          scaled, fit the signed 7-bit field (-64..63)
+static void emit_ldstp(int is_st, int is64, u_int rt1, u_int rt2, u_int rn, int ofs)
+{
+  unused const char *mnem = is_st ? "st" : "ld";
+  unused char width = is64 ? 'x' : 'w';
+  assem_debug("%sp %c%d,%c%d,[x%d,#%#x]\n", mnem, width, rt1, width, rt2, rn, ofs);
+
+  const int scale = is64 ? 3 : 2;   // log2 of the register size
+  assert((ofs & ((1 << scale) - 1)) == 0);
+
+  int imm7 = ofs >> scale;
+  assert(-64 <= imm7 && imm7 <= 63);
+  imm7 &= 0x7f;
+
+  u_int opcode = 0x29000000;
+  if (!is_st)
+    opcode |= 0x00400000;
+  if (is64)
+    opcode |= 0x80000000;
+  output_w32(opcode | imm7_rt2_rn_rt(imm7, rt2, rn, rt1));
+}
+
+// Store (is_store != 0) or load every register whose bit is set in reglist
+// to/from consecutive 8-byte stack slots starting at SP + SSP_CALLEE_REGS.
+// Registers are moved in pairs where possible; an odd one out uses a
+// single load/store.
+static void save_load_regs_all(int is_store, u_int reglist)
+{
+  u_int pending[2];
+  int npending = 0;
+  int slot_ofs = 0;
+  u_int reg = 0;
+
+  while (reglist != 0) {
+    if (reglist & 1) {
+      pending[npending++] = reg;
+      if (npending == 2) {
+        emit_ldstp(is_store, 1, pending[0], pending[1], SP, SSP_CALLEE_REGS + slot_ofs);
+        slot_ofs += 8 * 2;
+        npending = 0;
+      }
+    }
+    reg++;
+    reglist >>= 1;
+  }
+
+  if (npending != 0) {
+    emit_ldst(is_store, 1, pending[0], SP, SSP_CALLEE_REGS + slot_ofs);
+    slot_ofs += 8;
+  }
+  assert(slot_ofs <= SSP_CALLER_REGS);
+}
+
+// Spill registers ahead of a function call.  Only the members of reglist
+// that are also in CALLER_SAVE_REGS are stored.
+static void save_regs(u_int reglist)
+{
+  const u_int to_save = reglist & CALLER_SAVE_REGS;
+  save_load_regs_all(1, to_save);
+}
+
+// Reload registers after a function call; the counterpart of save_regs(),
+// using the same CALLER_SAVE_REGS filter.
+static void restore_regs(u_int reglist)
+{
+  const u_int to_load = reglist & CALLER_SAVE_REGS;
+  save_load_regs_all(0, to_load);
+}
+
+/* Stubs/epilogue */
+
+// No-op in this backend: nothing is emitted.  The body only references
+// `literals` so the symbol counts as used.
+static void literal_pool(int n)
+{
+  (void)n;
+  (void)literals;
+}
+
+// No-op in this backend: nothing is emitted.
+static void literal_pool_jumpover(int n)
+{
+  (void)n;
+}
+
+// Emit an external-jump stub.  The layout is parsed by get_pointer and
+// find_extjump_insn, so the instruction sequence must not change:
+//   movz/movk  w0 <- target (32-bit guest address)
+//   adr        x1 <- addr   (the branch instruction to be patched)
+//   far jump to linker
+// addr must point at a "b" or "b.cond" instruction.
+static void emit_extjump2(u_char *addr, u_int target, void *linker)
+{
+  const u_char top_byte = addr[3];
+  assert(((top_byte&0xfc)==0x14) || ((top_byte&0xff)==0x54)); // b or b.cond
+
+  const u_int target_lo = target & 0xffff;
+  const u_int target_hi = target >> 16;
+  emit_movz(target_lo, 0);
+  emit_movk_lsl16(target_hi, 0);
+
+  // addr lies inside the block being compiled (max 256k), so the
+  // +/-1MB reach of adr is not expected to be exceeded
+  emit_adr(addr, 1);
+  emit_far_jump(linker);
+}
+
+// Sanity check: src must start with the "movz w0,#val" that opens an
+// emit_extjump2 stub.
+static void check_extjump2(void *src)
+{
+  const u_int *insn = src;
+  (void)insn; // keeps release builds (assert compiled out) warning-free
+  assert((insn[0] & 0xffe0001f) == 0x52800000); // movz r0, #val
+}
+
+// Load rt_val into rt, deriving it from rs (known to hold rs_val) when a
+// single instruction can do so: add/sub of a small or 4K-aligned delta,
+// bitwise not, or xor with an encodable mask.  Otherwise falls back to a
+// plain immediate load.
+static void emit_movimm_from(u_int rs_val, u_int rs, u_int rt_val, u_int rt)
+{
+  const int delta = rt_val - rs_val;
+  const int fits_imm12 = -4096 < delta && delta < 4096;
+  const int fits_imm12_lsl12 =
+    -16777216 < delta && delta < 16777216 && !(delta & 0xfff);
+
+  if (fits_imm12 || fits_imm12_lsl12) {
+    emit_addimm(rs, delta, rt);
+    return;
+  }
+  if (rt_val == ~rs_val) {
+    emit_not(rs, rt);
+    return;
+  }
+  if (is_rotated_mask(rs_val ^ rt_val)) {
+    emit_xorimm(rs, rs_val ^ rt_val, rt);
+    return;
+  }
+  emit_movimm(rt_val, rt);
+}
+
+// Return non-zero if emit_movimm_from() can derive one value from the
+// other cheaply (same conditions as its add / not / xor cases).
+static int is_similar_value(u_int v1, u_int v2)
+{
+  const int delta = v1 - v2;
+
+  if (-4096 < delta && delta < 4096)
+    return 1;
+  if (-16777216 < delta && delta < 16777216 && !(delta & 0xfff))
+    return 1;
+  if (v1 == ~v2)
+    return 1;
+  return is_rotated_mask(v1 ^ v2);
+}
+
+// 64-bit variant of emit_movimm_from(): load rt_val into rt.  Values that
+// fit in 32 bits are delegated to emit_movimm_from(); larger ones are built
+// from 16-bit pieces with movz + movk.
+static void emit_movimm_from64(u_int rs_val, u_int rs, uintptr_t rt_val, u_int rt)
+{
+  if (rt_val < 0x100000000ull) {
+    emit_movimm_from(rs_val, rs, rt_val, rt);
+    return;
+  }
+
+  // just move the whole thing. At least on Linux all addresses
+  // seem to be 48bit, so 3 insns - not great not terrible
+  const uintptr_t part0 = rt_val & 0xffff;
+  const uintptr_t part1 = (rt_val >> 16) & 0xffff;
+  const uintptr_t part2 = (rt_val >> 32) & 0xffff;
+  const uintptr_t part3 = (rt_val >> 48) & 0xffff;
+
+  assem_debug("movz %s,#%#lx\n", regname64[rt], part0);
+  output_w32(0xd2800000 | imm16_rd(part0, rt));
+  assem_debug("movk %s,#%#lx,lsl #16\n", regname64[rt], part1);
+  output_w32(0xf2a00000 | imm16_rd(part1, rt));
+  assem_debug("movk %s,#%#lx,lsl #32\n", regname64[rt], part2);
+  output_w32(0xf2c00000 | imm16_rd(part2, rt));
+
+  // the top piece is only needed when bits 48..63 are non-zero
+  if ((rt_val >> 48) == 0)
+    return;
+  assem_debug("movk %s,#%#lx,lsl #48\n", regname64[rt], part3);
+  output_w32(0xf2e00000 | imm16_rd(part3, rt));
+}
+
+// Move the values in host registers a0 and a1 into the first two argument
+// registers (x0 and x1), handling the cases where they overlap.
+// May trash x2 (used as the swap temporary).
+//
+// The parameters are unsigned, so the "argument present" guards compare
+// them as signed values: a caller passing -1 ("no register") gets that
+// argument skipped instead of emitting a move from an invalid register.
+static void pass_args64(u_int a0, u_int a1)
+{
+  if(a0==1&&a1==0) {
+    // must swap
+    emit_mov64(a0,2); emit_mov64(a1,1); emit_mov64(2,0);
+  }
+  else if(a0!=0&&a1==0) {
+    // a1 lives in x0: move it out before x0 is overwritten
+    emit_mov64(a1,1);
+    if ((int)a0>=0) emit_mov64(a0,0);
+  }
+  else {
+    if((int)a0>=0&&a0!=0) emit_mov64(a0,0);
+    if((int)a1>=0&&a1!=1) emit_mov64(a1,1);
+  }
+}
+
+// Copy rs to rt while narrowing/extending to the width implied by the stub
+// type: sign-extend for LOADB/LOADH, zero-extend for LOADBU/LOADHU and the
+// byte/half-word stores, plain move (skipped if rs == rt) for word
+// accesses.  Any other type asserts.
+static void loadstore_extend(enum stub_type type, u_int rs, u_int rt)
+{
+  if (type == LOADB_STUB)
+    emit_sbfm(rs, 7, rt);
+  else if (type == LOADBU_STUB || type == STOREB_STUB)
+    emit_ubfm(rs, 7, rt);
+  else if (type == LOADH_STUB)
+    emit_sbfm(rs, 15, rt);
+  else if (type == LOADHU_STUB || type == STOREH_STUB)
+    emit_ubfm(rs, 15, rt);
+  else if (type == LOADW_STUB || type == STOREW_STUB) {
+    if (rs != rt)
+      emit_mov(rs, rt);
+  }
+  else
+    assert(0);
+}
+
+#include "pcsxmem.h"
+//#include "pcsxmem_inline.c"
+
+static void do_readstub(int n) /* Emit the out-of-line slow path of load stub n.
+ *
+ * The branch recorded in stubs[n].addr is pointed at the code emitted here.
+ * That code loads the mem_rtab entry for the address held in host register
+ * rs (index rs >> 12) and doubles it with a flag-setting add:
+ *  - no carry: the load is done directly from [doubled entry + rs];
+ *  - carry:    jump_handler_read8/16/32 is called with the address, the
+ *              doubled entry and an adjusted cycle count as arguments, and
+ *              the value left in register 0 is extended into rt.
+ * Both paths finish with a jump to stubs[n].retaddr.
+ */
+{
+  assem_debug("do_readstub %x\n",start+stubs[n].a*4);
+  set_jump_target(stubs[n].addr, out);
+  enum stub_type type = stubs[n].type;
+  int i = stubs[n].a; // instruction index within the block
+  int rs = stubs[n].b; // host register holding the address
+  const struct regstat *i_regs = (void *)stubs[n].c;
+  u_int reglist = stubs[n].e;
+  const signed char *i_regmap = i_regs->regmap;
+  int rt;
+  if(dops[i].itype==C1LS||dops[i].itype==C2LS||dops[i].itype==LOADLR) {
+    rt=get_reg(i_regmap,FTEMP);
+  }else{
+    rt=get_reg(i_regmap,dops[i].rt1);
+  }
+  assert(rs>=0);
+  int r,temp=-1,temp2=HOST_TEMPREG,regs_saved=0;
+  void *restore_jump = NULL, *handler_jump = NULL;
+  reglist|=(1<<rs); // the address register must not be picked as scratch
+  for (r = 0; r < HOST_CCREG; r++) { // look for a register outside reglist
+    if (r != EXCLUDE_REG && ((1 << r) & reglist) == 0) {
+      temp = r;
+      break;
+    }
+  }
+  if(rt>=0&&dops[i].rt1!=0)
+    reglist&=~(1<<rt); // the destination is overwritten anyway: no save/restore
+  if(temp==-1) { // nothing free: spill up front and use a saved register
+    save_regs(reglist);
+    regs_saved=1;
+    temp=(rs==0)?2:0;
+  }
+  if((regs_saved||(reglist&2)==0)&&temp!=1&&rs!=1)
+    temp2=1; // register 1 is usable, prefer it over HOST_TEMPREG
+  emit_readdword(&mem_rtab,temp);
+  emit_shrimm(rs,12,temp2);
+  emit_readdword_dualindexedx8(temp,temp2,temp2);
+  emit_adds64(temp2,temp2,temp2); // doubles the entry; carry out selects the handler path
+  handler_jump=out;
+  emit_jc(0); // target patched by set_jump_target() below
+  if(dops[i].itype==C1LS||dops[i].itype==C2LS||(rt>=0&&dops[i].rt1!=0)) {
+    switch(type) {
+      case LOADB_STUB:  emit_ldrsb_dualindexed(temp2,rs,rt); break;
+      case LOADBU_STUB: emit_ldrb_dualindexed(temp2,rs,rt); break;
+      case LOADH_STUB:  emit_ldrsh_dualindexed(temp2,rs,rt); break;
+      case LOADHU_STUB: emit_ldrh_dualindexed(temp2,rs,rt); break;
+      case LOADW_STUB:  emit_ldr_dualindexed(temp2,rs,rt); break;
+      default:          assert(0);
+    }
+  }
+  if(regs_saved) {
+    restore_jump=out;
+    emit_jmp(0); // jump to reg restore
+  }
+  else
+    emit_jmp(stubs[n].retaddr); // return address
+  set_jump_target(handler_jump, out);
+
+  if(!regs_saved)
+    save_regs(reglist);
+  void *handler=NULL;
+  if(type==LOADB_STUB||type==LOADBU_STUB)
+    handler=jump_handler_read8;
+  if(type==LOADH_STUB||type==LOADHU_STUB)
+    handler=jump_handler_read16;
+  if(type==LOADW_STUB)
+    handler=jump_handler_read32;
+  assert(handler);
+  pass_args64(rs,temp2);
+  int cc=get_reg(i_regmap,CCREG);
+  if(cc<0)
+    emit_loadreg(CCREG,2); // cycle count is not in a host register: load it
+  emit_addimm(cc<0?2:cc,(int)stubs[n].d,2); // register 2 = cycle count + stubs[n].d
+  emit_far_call(handler);
+  // (no cycle reload after read)
+  if(dops[i].itype==C1LS||dops[i].itype==C2LS||(rt>=0&&dops[i].rt1!=0)) {
+    loadstore_extend(type,0,rt); // result comes back in register 0
+  }
+  if(restore_jump)
+    set_jump_target(restore_jump, out);
+  restore_regs(reglist);
+  emit_jmp(stubs[n].retaddr);
+}
+
+static void inline_readstub(enum stub_type type, int i, u_int addr, /* Emit, inline, a load from the address addr given as a value.
+ *
+ * get_direct_memhandler() decides the strategy: when it returns NULL the
+ * load is emitted directly against the host address it reported; otherwise
+ * registers are saved and a handler is called (one of jump_handler_read*
+ * when pcsxmem_is_handler_dynamic() says so, else the returned handler
+ * after do_memhandler_pre).  The result is extended from register 0 into
+ * the host register mapped to `target`.
+ *  adj     - added to the cycle count passed in register 2
+ *  reglist - host registers to preserve around the call
+ */
+  const signed char regmap[], int target, int adj, u_int reglist)
+{
+  int rs=get_reg(regmap,target);
+  int rt=get_reg(regmap,target);
+  if(rs<0) rs=get_reg(regmap,-1); // no register for target: use a temporary
+  assert(rs>=0);
+  u_int is_dynamic=0;
+  uintptr_t host_addr = 0;
+  void *handler;
+  int cc=get_reg(regmap,CCREG);
+  //if(pcsx_direct_read(type,addr,adj,cc,target?rs:-1,rt))
+  //  return;
+  handler = get_direct_memhandler(mem_rtab, addr, type, &host_addr);
+  if (handler == NULL) { // direct access: no handler involved
+    if(rt<0||dops[i].rt1==0)
+      return; // nowhere to put the result, so nothing is emitted
+    if (addr != host_addr)
+      emit_movimm_from64(addr, rs, host_addr, rs);
+    switch(type) {
+      case LOADB_STUB:  emit_movsbl_indexed(0,rs,rt); break;
+      case LOADBU_STUB: emit_movzbl_indexed(0,rs,rt); break;
+      case LOADH_STUB:  emit_movswl_indexed(0,rs,rt); break;
+      case LOADHU_STUB: emit_movzwl_indexed(0,rs,rt); break;
+      case LOADW_STUB:  emit_readword_indexed(0,rs,rt); break;
+      default:          assert(0);
+    }
+    return;
+  }
+  is_dynamic = pcsxmem_is_handler_dynamic(addr);
+  if (is_dynamic) { // go through the generic jump handlers instead
+    if(type==LOADB_STUB||type==LOADBU_STUB)
+      handler=jump_handler_read8;
+    if(type==LOADH_STUB||type==LOADHU_STUB)
+      handler=jump_handler_read16;
+    if(type==LOADW_STUB)
+      handler=jump_handler_read32;
+  }
+
+  // call a memhandler
+  if(rt>=0&&dops[i].rt1!=0)
+    reglist&=~(1<<rt); // destination is overwritten: no need to preserve it
+  save_regs(reglist);
+  if(target==0)
+    emit_movimm(addr,0);
+  else if(rs!=0)
+    emit_mov(rs,0);
+  if(cc<0)
+    emit_loadreg(CCREG,2);
+  emit_addimm(cc<0?2:cc,adj,2); // register 2 = cycle count + adj
+  if(is_dynamic) {
+    uintptr_t l1 = ((uintptr_t *)mem_rtab)[addr>>12] << 1; // table entry, doubled
+    emit_adrp((void *)l1, 1);
+    emit_addimm64(1, l1 & 0xfff, 1); // adrp gives the page, add the low 12 bits
+  }
+  else
+    emit_far_call(do_memhandler_pre);
+
+  emit_far_call(handler);
+
+  // (no cycle reload after read)
+  if(rt>=0&&dops[i].rt1!=0)
+    loadstore_extend(type, 0, rt);
+  restore_regs(reglist);
+}
+
+static void do_writestub(int n) /* Emit the out-of-line slow path of store stub n.
+ *
+ * The branch recorded in stubs[n].addr is pointed at the code emitted here.
+ * That code loads the mem_wtab entry for the address held in host register
+ * rs (index rs >> 12) and doubles it with a flag-setting add:
+ *  - no carry: the store is done directly to [doubled entry + rs];
+ *  - carry:    jump_handler_write8/16/32 is called; the doubled entry is
+ *              passed in register 3 and the adjusted cycle count in
+ *              register 2.  The handler returns the new cycle count, which
+ *              is written back (minus the adjustment) afterwards.
+ * Both paths finish with a jump to stubs[n].retaddr.
+ */
+{
+  assem_debug("do_writestub %x\n",start+stubs[n].a*4);
+  set_jump_target(stubs[n].addr, out);
+  enum stub_type type=stubs[n].type;
+  int i=stubs[n].a; // instruction index within the block
+  int rs=stubs[n].b; // host register holding the address
+  struct regstat *i_regs=(struct regstat *)stubs[n].c;
+  u_int reglist=stubs[n].e;
+  signed char *i_regmap=i_regs->regmap;
+  int rt,r;
+  if(dops[i].itype==C1LS||dops[i].itype==C2LS) {
+    rt=get_reg(i_regmap,r=FTEMP);
+  }else{
+    rt=get_reg(i_regmap,r=dops[i].rs2);
+  }
+  assert(rs>=0);
+  assert(rt>=0);
+  int rtmp,temp=-1,temp2,regs_saved=0;
+  void *restore_jump = NULL, *handler_jump = NULL;
+  int reglist2=reglist|(1<<rs)|(1<<rt);
+  for (rtmp = 0; rtmp < HOST_CCREG; rtmp++) { // NOTE(review): tests reglist, not reglist2 - presumably callers already include rs/rt in reglist; confirm
+    if (rtmp != EXCLUDE_REG && ((1 << rtmp) & reglist) == 0) {
+      temp = rtmp;
+      break;
+    }
+  }
+  if(temp==-1) { // nothing free: spill up front and use one of registers 0..3
+    save_regs(reglist);
+    regs_saved=1;
+    for(rtmp=0;rtmp<=3;rtmp++)
+      if(rtmp!=rs&&rtmp!=rt)
+        {temp=rtmp;break;}
+  }
+  if((regs_saved||(reglist2&8)==0)&&temp!=3&&rs!=3&&rt!=3)
+    temp2=3; // register 3 is where the handler path wants the entry anyway
+  else {
+    host_tempreg_acquire();
+    temp2=HOST_TEMPREG;
+  }
+  emit_readdword(&mem_wtab,temp);
+  emit_shrimm(rs,12,temp2);
+  emit_readdword_dualindexedx8(temp,temp2,temp2);
+  emit_adds64(temp2,temp2,temp2); // doubles the entry; carry out selects the handler path
+  handler_jump=out;
+  emit_jc(0); // target patched by set_jump_target() below
+  switch(type) {
+    case STOREB_STUB: emit_strb_dualindexed(temp2,rs,rt); break;
+    case STOREH_STUB: emit_strh_dualindexed(temp2,rs,rt); break;
+    case STOREW_STUB: emit_str_dualindexed(temp2,rs,rt); break;
+    default:          assert(0);
+  }
+  if(regs_saved) {
+    restore_jump=out;
+    emit_jmp(0); // jump to reg restore
+  }
+  else
+    emit_jmp(stubs[n].retaddr); // return address (invcode check)
+  set_jump_target(handler_jump, out);
+
+  if(!regs_saved)
+    save_regs(reglist);
+  void *handler=NULL;
+  switch(type) {
+    case STOREB_STUB: handler=jump_handler_write8; break;
+    case STOREH_STUB: handler=jump_handler_write16; break;
+    case STOREW_STUB: handler=jump_handler_write32; break;
+    default:          assert(0);
+  }
+  assert(handler);
+  pass_args(rs,rt);
+  if(temp2!=3) { // entry was kept in HOST_TEMPREG: move it to register 3
+    emit_mov64(temp2,3);
+    host_tempreg_release();
+  }
+  int cc=get_reg(i_regmap,CCREG);
+  if(cc<0)
+    emit_loadreg(CCREG,2);
+  emit_addimm(cc<0?2:cc,(int)stubs[n].d,2); // register 2 = cycle count + stubs[n].d
+  // returns new cycle_count
+  emit_far_call(handler);
+  emit_addimm(0,-(int)stubs[n].d,cc<0?2:cc); // undo the adjustment on the returned count
+  if(cc<0)
+    emit_storereg(CCREG,2);
+  if(restore_jump)
+    set_jump_target(restore_jump, out);
+  restore_regs(reglist);
+  emit_jmp(stubs[n].retaddr);
+}
+
+static void inline_writestub(enum stub_type type, int i, u_int addr, /* Emit, inline, a store to the address addr given as a value.
+ *
+ * get_direct_memhandler() decides the strategy: when it returns NULL the
+ * store is emitted directly against the host address it reported;
+ * otherwise registers are saved and the returned handler is called between
+ * do_memhandler_pre and do_memhandler_post, with the value in register 0
+ * and the adjusted cycle count in register 2.  The cycle count returned in
+ * register 0 is written back minus adj.
+ *  target  - guest register whose host mapping holds the value to store
+ *  reglist - host registers to preserve around the call
+ */
+  const signed char regmap[], int target, int adj, u_int reglist)
+{
+  int rs = get_reg(regmap,-1); // temporary register used for the address
+  int rt = get_reg(regmap,target);
+  assert(rs >= 0);
+  assert(rt >= 0);
+  uintptr_t host_addr = 0;
+  void *handler = get_direct_memhandler(mem_wtab, addr, type, &host_addr);
+  if (handler == NULL) { // direct access: no handler involved
+    if (addr != host_addr)
+      emit_movimm_from64(addr, rs, host_addr, rs);
+    switch (type) {
+      case STOREB_STUB: emit_writebyte_indexed(rt, 0, rs); break;
+      case STOREH_STUB: emit_writehword_indexed(rt, 0, rs); break;
+      case STOREW_STUB: emit_writeword_indexed(rt, 0, rs); break;
+      default:          assert(0);
+    }
+    return;
+  }
+
+  // call a memhandler
+  save_regs(reglist);
+  emit_writeword(rs, &address); // some handlers still need it
+  loadstore_extend(type, rt, 0); // value to store goes to register 0, narrowed to the access width
+  int cc, cc_use;
+  cc = cc_use = get_reg(regmap, CCREG);
+  if (cc < 0)
+    emit_loadreg(CCREG, (cc_use = 2)); // cycle count not in a host register: use register 2
+  emit_addimm(cc_use, adj, 2);
+
+  emit_far_call(do_memhandler_pre);
+  emit_far_call(handler);
+  emit_far_call(do_memhandler_post);
+  emit_addimm(0, -adj, cc_use); // register 0 holds the returned count: undo the adjustment
+  if (cc < 0)
+    emit_storereg(CCREG, cc_use);
+  restore_regs(reglist);
+}
+
+// Runtime helper called from the dirty stub: byte-compares `size` bytes of
+// the original code at `source` with its saved `copy`.  Returns the memcmp
+// result, i.e. 0 when the two are identical.
+static int verify_code_arm64(const void *source, const void *copy, u_int size)
+{
+  return memcmp(source, copy, size);
+}
+
+// this output is parsed by verify_dirty, get_bounds, isclean, get_clean_addr; keep it in sync with assert_dirty_stub (9 instructions if each far call is one instruction)
+static void do_dirty_stub_base(u_int vaddr, u_int source_len)
+{
+  assert(source_len <= MAXBLOCK*4);
+  emit_loadlp_ofs(0, 0); // ldr x0, source (offset patched later by set_loadlp)
+  emit_loadlp_ofs(0, 1); // ldr x1, copy (offset patched later by set_loadlp)
+  emit_movz(source_len, 2); // movz w2, #source_len
+  emit_far_call(verify_code_arm64);
+  void *jmp = out;
+  emit_cbz(0, 0); // result 0 (unchanged): skip the recompile path below
+  emit_movz(vaddr & 0xffff, 0);
+  emit_movk_lsl16(vaddr >> 16, 0);
+  emit_far_call(get_addr);
+  emit_jmpreg(0); // br x0: continue at the address get_addr returned
+  set_jump_target(jmp, out);
+}
+
+// Assert that ptr points at a dirty stub as laid out by
+// do_dirty_stub_base(): two literal loads, the length movz, and the
+// "br x0" at instruction index 8.
+static void assert_dirty_stub(const u_int *ptr)
+{
+  const u_int *insn = ptr;
+  (void)insn; // keeps release builds (assert compiled out) warning-free
+
+  assert((insn[0] & 0xff00001f) == 0x58000000); // ldr x0, source
+  assert((insn[1] & 0xff00001f) == 0x58000001); // ldr x1, copy
+  assert((insn[2] & 0xffe0001f) == 0x52800002); // movz w2, #source_len
+  assert( insn[8]               == 0xd61f0000); // br x0
+}
+
+// Patch the "ldr (literal)" instruction at loadl so that it loads from lit.
+// The instruction must still have a zero offset field; lit must be 4-byte
+// aligned relative to it and less than 0x100000 bytes ahead.
+static void set_loadlp(u_int *loadl, void *lit)
+{
+  const uintptr_t dist = (u_char *)lit - (u_char *)loadl;
+
+  assert((*loadl & ~0x1f) == 0x58000000);
+  assert((dist & 3) == 0);
+  assert(dist < 0x100000);
+
+  const uintptr_t words = dist >> 2;
+  *loadl |= words << 5;
+}
+
+// Emit the two 64-bit literals (source, then copy) used by a dirty stub
+// and patch the stub's two literal loads (loadlps[0], loadlps[1]) to
+// reference them.
+static void do_dirty_stub_emit_literals(u_int *loadlps)
+{
+  const uintptr_t values[2] = { (uintptr_t)source, (uintptr_t)copy };
+  int k;
+
+  for (k = 0; k < 2; k++) {
+    set_loadlp(&loadlps[k], out);
+    output_w64(values[k]);
+  }
+}
+
+// Emit the dirty stub for instruction i: the verification sequence, the
+// register loads for entering at i, a jump to the instruction's compiled
+// code, and finally the literals.  Returns the entry point that skips the
+// verification; if no register loads were emitted this is simply
+// instr_addr[i].
+static void *do_dirty_stub(int i, u_int source_len)
+{
+  assem_debug("do_dirty_stub %x\n",start+i*4);
+  u_int *stub_head = (void *)out;
+  do_dirty_stub_base(start + i*4, source_len);
+
+  void *const after_check = out;
+  load_regs_entry(i);
+  void *entry = (out == after_check) ? instr_addr[i] : after_check;
+
+  emit_jmp(instr_addr[i]);
+  do_dirty_stub_emit_literals(stub_head);
+  return entry;
+}
+
+// Emit a dirty stub for address start + 1, with the literals placed inline
+// and a branch that jumps over them.
+static void do_dirty_stub_ds(u_int source_len)
+{
+  u_int *stub_head = (void *)out;
+  do_dirty_stub_base(start + 1, source_len);
+
+  // branch across the two 8-byte literals; the target is re-set below once
+  // the literals have actually been written
+  void *skip_branch = out;
+  emit_jmp(out + 8*2);
+  do_dirty_stub_emit_literals(stub_head);
+  set_jump_target(skip_branch, out);
+}
+
+// Decode the 64-bit "ldr (literal)" instruction at i and return the value
+// stored at the location it refers to.
+static uint64_t get_from_ldr_literal(const u_int *i)
+{
+  assert((i[0] & 0xff000000) == 0x58000000);
+
+  // move the offset field to the top of the word, then shift back down as
+  // a signed value so the field's sign bit is propagated
+  signed int words = i[0] << 8;
+  words >>= 5+8;
+
+  const u_int *literal = i + words;
+  return *(uint64_t *)literal;
+}
+
+// Return the 16-bit immediate of the movz instruction at i.
+static uint64_t get_from_movz(const u_int *i)
+{
+  const u_int insn = i[0];
+
+  assert((insn & 0x7fe00000) == 0x52800000);
+  return (insn >> 5) & 0xffff;
+}
+
+// Given a "dirty" entry point, return the matching "clean" one: the first
+// instruction after the 9-instruction verification sequence emitted by
+// do_dirty_stub_base().
+static void *get_clean_addr(u_int *addr)
+{
+  assert_dirty_stub(addr);
+  u_int *past_stub = &addr[9];
+  return past_stub;
+}
+
+static int verify_dirty(const u_int *ptr) /* Re-check the block guarded by the dirty stub at ptr: decode the source
+ * pointer, copy pointer and length from the stub and compare the two
+ * buffers.  Returns non-zero when they are identical. */
+{
+  const void *source, *copy;
+  u_int len;
+  assert_dirty_stub(ptr);
+  source = (void *)get_from_ldr_literal(&ptr[0]); // ldr x0, source
+  copy   = (void *)get_from_ldr_literal(&ptr[1]); // ldr x1, copy
+  len = get_from_movz(&ptr[2]);                   // movz w2, #source_len
+  return !memcmp(source, copy, len);
+}
+
+// Return 0 if addr points at a dirty stub, 1 otherwise.  A dirty stub is
+// recognised by its leading "ldr (literal)", which is not emitted anywhere
+// else.
+static int isclean(void *addr)
+{
+  const u_int *insn = addr;
+
+  if ((*insn >> 24) != 0x58) // the only place ldr (literal) is used
+    return 1;
+
+  assert_dirty_stub(insn);
+  return 0;
+}
+
+// get source that block at addr was compiled from (host pointers); addr must point at a dirty stub
+static void get_bounds(void *addr, u_char **start, u_char **end)
+{
+  const u_int *ptr = addr;
+  assert_dirty_stub(ptr);
+  *start = (u_char *)get_from_ldr_literal(&ptr[0]); // ldr x0, source
+  *end = *start + get_from_movz(&ptr[2]);           // movz w2, #source_len
+}
+
+/* Special assem */
+
+// Code emitted ahead of a GTE handler call: saves the registers in
+// reglist, runs the COP2 stall check for instruction i, optionally starts
+// the PCNT counter, and leaves the address of the cop2 data registers in
+// register 0.
+static void c2op_prologue(u_int op, int i, const struct regstat *i_regs, u_int reglist)
+{
+  save_load_regs_all(1, reglist);
+  cop2_do_stall_check(op, i, i_regs, 0);
+#ifdef PCNT
+  emit_movimm(op, 0);
+  emit_far_call(pcnt_gte_start);
+#endif
+  // register 0 = FP + offset of psxRegs.CP2D.r[0] inside dynarec_local
+  emit_addimm64(FP,
+    (u_char *)&psxRegs.CP2D.r[0] - (u_char *)&dynarec_local,
+    0);
+}
+
+// Code emitted after a GTE handler call: optionally stops the PCNT
+// counter, then reloads the registers saved by c2op_prologue().
+static void c2op_epilogue(u_int op,u_int reglist)
+{
+#ifdef PCNT
+  emit_movimm(op, 0);
+  emit_far_call(pcnt_gte_end);
+#endif
+  const int is_store = 0;
+  save_load_regs_all(is_store, reglist);
+}
+
+// Assemble the COP2 (GTE) operation at instruction i as a call to its C
+// handler.  Nothing is emitted when no handler exists for the opcode.
+// The flag-less handler variant is used when the flags are not needed
+// afterwards or when the NDHACK_GTE_NO_FLAGS hack is enabled.
+static void c2op_assemble(int i, const struct regstat *i_regs)
+{
+  const u_int c2op = source[i] & 0x3f;
+  u_int mapped = 0;
+  u_int hr;
+
+  // every host register with a mapping, limited to the caller-saved ones
+  for (hr = 0; hr < HOST_REGS; hr++) {
+    if (i_regs->regmap[hr] >= 0)
+      mapped |= 1 << hr;
+  }
+  const u_int reglist = mapped & CALLER_SAVE_REGS;
+
+  if (gte_handlers[c2op] == NULL)
+    return;
+
+  // +1 because of how liveness detection works
+  int need_flags = !(gte_unneeded[i+1]>>63);
+  int need_ir = (gte_unneeded[i+1]&0xe00)!=0xe00;
+  assem_debug("gte op %08x, unneeded %016lx, need_flags %d, need_ir %d\n",
+    source[i],gte_unneeded[i+1],need_flags,need_ir);
+  if (HACK_ENABLED(NDHACK_GTE_NO_FLAGS))
+    need_flags = 0;
+  (void)need_ir;
+
+  c2op_prologue(c2op, i, i_regs, reglist);
+  emit_movimm(source[i],1); // opcode
+  emit_writeword(1,&psxRegs.code);
+  emit_far_call(need_flags?gte_handlers[c2op]:gte_handlers_nf[c2op]);
+  c2op_epilogue(c2op,reglist);
+}
+
+// CTC2 to control register 31: compute into temp the equivalent of
+//   value = value & 0x7ffff000;
+//   if (value & 0x7f87e000) value |= 0x80000000;
+// where value is in sl.  NOTE: this path is marked untested and asserts
+// unconditionally at code-generation time.
+static void c2op_ctc2_31_assemble(signed char sl, signed char temp)
+{
+  const int scratch = HOST_TEMPREG;
+
+  // flags: are any of the summarised bits set?
+  emit_andimm(sl, 0x7fffe000, temp);
+  emit_testimm(temp, 0xff87ffff);
+
+  // temp = masked value; scratch = the same with bit 31 set
+  emit_andimm(sl, 0x7ffff000, temp);
+  host_tempreg_acquire();
+  emit_orimm(temp, 0x80000000, scratch);
+  emit_cmovne_reg(scratch, temp);
+  host_tempreg_release();
+
+  assert(0); // testing needed
+}
+
+// Load the signed half-word reg_cop2d[copr] into temp and clamp it for
+// packing: the bic-with-sign step clears temp when it is negative, values
+// above 0xf80 are replaced by all-ones, and the result is masked with
+// 0xf80.
+static void do_mfc2_31_one(u_int copr,signed char temp)
+{
+  const u_int limit = 0xf80;
+
+  emit_readshword(&reg_cop2d[copr],temp);
+  emit_bicsar_imm(temp,31,temp);
+  emit_cmpimm(temp,limit);
+  emit_csinvle_reg(temp,WZR,temp); // if (temp > 0xf80) temp = ~0;
+  emit_andimm(temp,limit,temp);
+}
+
+// MFC2 from data register 29: build the packed value in tl from the
+// clamped contents of cop2 data registers 9, 10 and 11 (shifted right by
+// 7, right by 2 and left by 3 respectively) and store it to reg_cop2d[29].
+// If temp is negative, HOST_TEMPREG is borrowed as the working register.
+static void c2op_mfc2_29_assemble(signed char tl, signed char temp)
+{
+  if (temp < 0) {
+    host_tempreg_acquire();
+    temp = HOST_TEMPREG;
+  }
+
+  do_mfc2_31_one(9,temp);
+  emit_shrimm(temp,7,tl);
+
+  do_mfc2_31_one(10,temp);
+  emit_orrshr_imm(temp,2,tl);
+
+  do_mfc2_31_one(11,temp);
+  emit_orrshl_imm(temp,3,tl);
+
+  emit_writeword(tl,&reg_cop2d[29]);
+
+  if (temp == HOST_TEMPREG)
+    host_tempreg_release();
+}
+
+// Emits code that leaves the quotient the R3000A produces for a signed
+// division by zero in host register 'rt': -1 (0xffffffff) when the numerator
+// is non-negative, +1 when it is negative.
+// 'rt' must not be the host register holding the numerator.
+static void multdiv_div0_quotient_arm64(signed char numerator, signed char rt)
+{
+  // rt = -1 - 2*(numerator asr 31); the shifted term is 0 or -1
+  emit_movimm(~0,rt);
+  emit_sub_asrimm(rt,numerator,31,rt);
+  emit_sub_asrimm(rt,numerator,31,rt);
+}
+
+// Emits code for MULT/MULTU/DIV/DIVU (dops[i].opcode2 0x18..0x1B).
+//   i      - index of the instruction being assembled
+//   i_regs - register state; its regmap is queried for the host registers
+//            holding rs1, rs2, HIREG and LOREG
+// When both source operands are non-zero guest registers the full operation
+// is emitted; otherwise (one operand is guest register 0) the constant result
+// is emitted directly.
+static void multdiv_assemble_arm64(int i, const struct regstat *i_regs)
+{
+  //  case 0x18: MULT
+  //  case 0x19: MULTU
+  //  case 0x1A: DIV
+  //  case 0x1B: DIVU
+  if(dops[i].rs1&&dops[i].rs2)
+  {
+    switch(dops[i].opcode2)
+    {
+    case 0x18: // MULT
+    case 0x19: // MULTU
+      {
+        signed char m1=get_reg(i_regs->regmap,dops[i].rs1);
+        signed char m2=get_reg(i_regs->regmap,dops[i].rs2);
+        signed char hi=get_reg(i_regs->regmap,HIREG);
+        signed char lo=get_reg(i_regs->regmap,LOREG);
+        assert(m1>=0);
+        assert(m2>=0);
+        assert(hi>=0);
+        assert(lo>=0);
+
+        if(dops[i].opcode2==0x18) // MULT
+          emit_smull(m1,m2,hi);
+        else                 // MULTU
+          emit_umull(m1,m2,hi);
+
+        // the product is produced in 'hi'; copy it to 'lo', then move the
+        // upper 32 bits down into 'hi'
+        emit_mov(hi,lo);
+        emit_shrimm64(hi,32,hi);
+        break;
+      }
+    case 0x1A: // DIV
+    case 0x1B: // DIVU
+      {
+        signed char numerator=get_reg(i_regs->regmap,dops[i].rs1);
+        signed char denominator=get_reg(i_regs->regmap,dops[i].rs2);
+        signed char quotient=get_reg(i_regs->regmap,LOREG);
+        signed char remainder=get_reg(i_regs->regmap,HIREG);
+        assert(numerator>=0);
+        assert(denominator>=0);
+        assert(quotient>=0);
+        assert(remainder>=0);
+
+        if (dops[i].opcode2 == 0x1A) // DIV
+          emit_sdiv(numerator,denominator,quotient);
+        else                    // DIVU
+          emit_udiv(numerator,denominator,quotient);
+        emit_msub(quotient,denominator,numerator,remainder);
+
+        // div 0 quotient (remainder is already correct)
+        // The previous code passed host register 0 as the minuend of the
+        // subtraction, so the quotient depended on whatever w0 held.
+        host_tempreg_acquire();
+        if (dops[i].opcode2 == 0x1A) // DIV
+          multdiv_div0_quotient_arm64(numerator,HOST_TEMPREG);
+        else
+          emit_movimm(~0,HOST_TEMPREG);
+        emit_test(denominator,denominator);
+        emit_cmoveq_reg(HOST_TEMPREG,quotient);
+        host_tempreg_release();
+        break;
+      }
+    default:
+      assert(0);
+    }
+  }
+  else
+  {
+    // at least one operand is guest register 0: results are known up front;
+    // HI/LO may be unmapped (negative), in which case nothing is emitted
+    signed char hr=get_reg(i_regs->regmap,HIREG);
+    signed char lr=get_reg(i_regs->regmap,LOREG);
+    if ((dops[i].opcode2==0x1A || dops[i].opcode2==0x1B) && dops[i].rs2==0) // div 0
+    {
+      if (dops[i].rs1) {
+        signed char numerator = get_reg(i_regs->regmap, dops[i].rs1);
+        assert(numerator >= 0);
+        if (hr >= 0)
+          emit_mov(numerator,hr); // remainder = numerator
+        if (lr >= 0) {
+          if (dops[i].opcode2 == 0x1A) // DIV
+            multdiv_div0_quotient_arm64(numerator,lr);
+          else
+            emit_movimm(~0,lr);
+        }
+      }
+      else {
+        // 0 / 0
+        if (hr >= 0) emit_zeroreg(hr);
+        if (lr >= 0) emit_movimm(~0,lr);
+      }
+    }
+    else
+    {
+      // Multiply by zero is zero.
+      if (hr >= 0) emit_zeroreg(hr);
+      if (lr >= 0) emit_zeroreg(lr);
+    }
+  }
+}
+#define multdiv_assemble multdiv_assemble_arm64
+
+// Emits an indirect jump to the code for the guest address held in host
+// register 'rs': the address is moved to host register 0 (if not already
+// there), get_addr_ht is called, and the emitted code then jumps through
+// host register 0.
+static void do_jump_vaddr(u_int rs)
+{
+  if (rs != 0)
+    emit_mov(rs, 0);
+  emit_far_call(get_addr_ht);
+  emit_jmpreg(0);
+}
+
+// Intentionally emits nothing; 'r' is unused. Kept so that callers can invoke
+// the same hook on every backend.
+static void do_preload_rhash(u_int r) {
+  // Don't need this for ARM.  On x86, this puts the value 0xf8 into the
+  // register.  On ARM the hash can be done with a single instruction (below)
+}
+
+// Emits code that loads the address of mini_ht into host register 'ht',
+// computed as FP plus mini_ht's offset from dynarec_local.
+static void do_preload_rhtbl(u_int ht) {
+  emit_addimm64(FP, (u_char *)&mini_ht - (u_char *)&dynarec_local, ht);
+}
+
+// Emits code that computes the mini hash of the address in 'rs' into 'rh':
+// rh = rs & 0xf8.
+static void do_rhash(u_int rs,u_int rh) {
+  emit_andimm(rs, 0xf8, rh);
+}
+
+// Emits code that advances 'ht' by the hash in 'rh' and then loads the entry
+// at [ht] into 'rh' (presumably the stored guest address that
+// do_miniht_jump compares against).
+// NOTE(review): the hash (rs & 0xf8) is added unscaled, while
+// do_miniht_insert() indexes mini_ht as [(addr&0xFF)>>3][0..1] and writes
+// slot [1] with a 64-bit store -- confirm the byte stride matches the
+// mini_ht element size on this target.
+static void do_miniht_load(int ht, u_int rh) {
+  emit_add64(ht, rh, ht);
+  emit_ldst(0, 0, rh, ht, 0);
+}
+
+// Emits the mini hash table dispatch: compares the loaded entry 'rh' with the
+// target address 'rs'. On mismatch the emitted code falls back to
+// do_jump_vaddr(rs); on match it loads the 64-bit value at [ht, #8] into 'ht'
+// and jumps to it.
+static void do_miniht_jump(u_int rs, u_int rh, u_int ht) {
+  emit_cmp(rh, rs);
+  void *jaddr = out; // location of the conditional branch, patched below
+  emit_jeq(0);
+  do_jump_vaddr(rs);
+
+  set_jump_target(jaddr, out);
+  assem_debug("ldr %s,[%s,#8]\n",regname64[ht], regname64[ht]);
+  // hand-encoded 64-bit load; the immediate is in units of 8 bytes (8 >> 3)
+  output_w32(0xf9400000 | imm12_rn_rd(8 >> 3, ht, ht));
+  emit_jmpreg(ht);
+}
+
+// Emits code that records 'return_address' in the mini hash table: slot [0]
+// of entry (return_address&0xFF)>>3 receives the 32-bit guest address (built
+// in 'rt' with a movz/movk pair) and slot [1] receives a code address loaded
+// into 'temp' by an adr instruction that is registered with add_to_linker.
+// parsed by set_jump_target?
+static void do_miniht_insert(u_int return_address,u_int rt,int temp) {
+  emit_movz_lsl16((return_address>>16)&0xffff,rt);
+  emit_movk(return_address&0xffff,rt);
+  add_to_linker(out,return_address,1);
+  emit_adr(out,temp); // placeholder target; presumably patched when linked
+  emit_writedword(temp,&mini_ht[(return_address&0xFF)>>3][1]);
+  emit_writeword(rt,&mini_ht[(return_address&0xFF)>>3][0]);
+}
+
+// Makes code written to [start, end) visible to instruction fetch.
+// GCC's __clear_cache is deliberately not used: it caches the icache/dcache
+// line sizes, and those can differ between the cores of a big.LITTLE system.
+static void clear_cache_arm64(char *start, char *end)
+{
+  // smallest line sizes reported by any core this has run on so far
+  static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff;
+  const uint64_t limit = (uint64_t)end;
+  uint64_t ctr_el0, line;
+  size_t isize, dsize;
+
+  __asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0));
+  isize = 4 << (ctr_el0 & 0xf);         // bits [3:0]
+  dsize = 4 << ((ctr_el0 >> 16) & 0xf); // bits [19:16]
+
+  // clamp to (and remember) the global minimum cache line size
+  if (isize > icache_line_size)
+    isize = icache_line_size;
+  if (dsize > dcache_line_size)
+    dsize = dcache_line_size;
+  icache_line_size = isize;
+  dcache_line_size = dsize;
+
+  // With CTR_EL0.IDC (bit 28) set, no data cache clean to the Point of
+  // Unification is needed for instruction to data coherence.
+  if (!(ctr_el0 & (1 << 28))) {
+    line = (uint64_t)start & ~(uint64_t)(dsize - 1);
+    while (line < limit) {
+      // "civac" rather than "cvau": the suggested workaround for
+      // Cortex-A53 errata 819472, 826319, 827319 and 824069.
+      __asm__ volatile("dc civac, %0" : : "r"(line) : "memory");
+      line += dsize;
+    }
+  }
+  __asm__ volatile("dsb ish" : : : "memory");
+
+  // With CTR_EL0.DIC (bit 29) set, no instruction cache invalidation to the
+  // Point of Unification is needed for instruction to data coherence.
+  if (!(ctr_el0 & (1 << 29))) {
+    line = (uint64_t)start & ~(uint64_t)(isize - 1);
+    while (line < limit) {
+      __asm__ volatile("ic ivau, %0" : : "r"(line) : "memory");
+      line += isize;
+    }
+
+    __asm__ volatile("dsb ish" : : : "memory");
+  }
+
+  __asm__ volatile("isb" : : : "memory");
+}
+
+// CPU-architecture-specific initialization: fills every trampoline slot in
+// ndrc->tramp.ops with a two-instruction sequence that loads x17 from a
+// pc-relative literal and branches to it.
+static void arch_init(void)
+{
+  struct tramp_insns *tramp_ops = ndrc->tramp.ops;
+  u_char *ops_end = (u_char *)tramp_ops + sizeof(ndrc->tramp.ops);
+  // byte distance from an ops[] slot to the matching entry in tramp.f
+  uintptr_t lit_offset = (u_char *)&ndrc->tramp.f - (u_char *)&ndrc->tramp.ops;
+  size_t n = 0;
+
+  // the literal offset is encoded in units of 4 bytes
+  assert((lit_offset & 3) == 0);
+
+  start_tcache_write(tramp_ops, ops_end);
+  while (n < ARRAY_SIZE(ndrc->tramp.ops)) {
+    tramp_ops[n].ldr = 0x58000000 | imm19_rt(lit_offset >> 2, 17); // ldr x17, [=val]
+    tramp_ops[n].br  = 0xd61f0000 | rm_rn_rd(0, 17, 0);            // br x17
+    n++;
+  }
+  end_tcache_write(tramp_ops, ops_end);
+}
+
+// vim:shiftwidth=2:expandtab
diff --git a/libpcsxcore/new_dynarec/assem_arm64.h b/libpcsxcore/new_dynarec/assem_arm64.h
new file mode 100644 (file)
index 0000000..c5fcadf
--- /dev/null
@@ -0,0 +1,49 @@
+#define HOST_IMM8 1
+
+/* calling convention:
+   r0 -r17: caller-save
+   r19-r29: callee-save */
+
+#define HOST_REGS 29
+#define HOST_BTREG 27
+#define EXCLUDE_REG -1
+
+#define SP 31
+#define WZR SP
+#define XZR SP
+
+#define LR 30
+#define HOST_TEMPREG LR
+
+// Note: FP is set to &dynarec_local when executing generated code.
+// Thus the local variables are actually global and not on the stack.
+#define FP 29
+#define rFP x29
+
+#define HOST_CCREG 28
+#define rCC w28
+
+#define CALLER_SAVE_REGS 0x0007ffff
+#define PREFERRED_REG_FIRST 19
+#define PREFERRED_REG_LAST  27
+
+// stack space
+#define SSP_CALLEE_REGS (8*12)
+#define SSP_CALLER_REGS (8*20)
+#define SSP_ALL (SSP_CALLEE_REGS+SSP_CALLER_REGS)
+
+#define TARGET_SIZE_2 24 // 2^24 = 16 megabytes
+
+#ifndef __ASSEMBLER__
+
+extern char *invc_ptr;
+
+struct tramp_insns
+{
+  u_int ldr;
+  u_int br;
+};
+
+static void clear_cache_arm64(char *start, char *end);
+
+#endif // !__ASSEMBLER__
similarity index 87%
rename from libpcsxcore/new_dynarec/backends/psx/emu_if.c
rename to libpcsxcore/new_dynarec/emu_if.c
index e9fa607..bbcd756 100644 (file)
@@ -9,19 +9,16 @@
 
 #include "emu_if.h"
 #include "pcsxmem.h"
-#include "../../../psxhle.h"
-#include "../../../r3000a.h"
-#include "../../../cdrom.h"
-#include "../../../psxdma.h"
-#include "../../../mdec.h"
-#include "../../../gte_arm.h"
-#include "../../../gte_neon.h"
-
-#include "../../../gte.h"
-
+#include "../psxhle.h"
+#include "../psxinterpreter.h"
+#include "../r3000a.h"
+#include "../cdrom.h"
+#include "../psxdma.h"
+#include "../mdec.h"
+#include "../gte_arm.h"
+#include "../gte_neon.h"
 #define FLAGLESS
-#include "../../../gte.h"
-#undef  FLAGLESS
+#include "../gte.h"
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
 
@@ -29,7 +26,6 @@
 #define evprintf(...)
 
 char invalid_code[0x100000];
-static u32 scratch_buf[8*8*2] __attribute__((aligned(64)));
 u32 event_cycles[PSXINT_COUNT];
 
 static void schedule_timeslice(void)
@@ -189,12 +185,15 @@ void new_dyna_freeze(void *f, int mode)
                if (bytes != size)
                        return;
 
-               new_dynarec_load_blocks(addrs, size);
+               if (psxCpu != &psxInt)
+                       new_dynarec_load_blocks(addrs, size);
        }
 
        //printf("drc: %d block info entries %s\n", size/8, mode ? "saved" : "loaded");
 }
 
+#if !defined(DRC_DISABLE) && !defined(LIGHTREC)
+
 /* GTE stuff */
 void *gte_handlers[64];
 
@@ -220,15 +219,6 @@ const char *gte_regnames[64] = {
        NULL  , NULL   , NULL   , NULL  , NULL , "GPF"  , "GPL"  , "NCCT", // 38
 };
 
-/* from gte.txt.. not sure if this is any good. */
-const char gte_cycletab[64] = {
-       /*   1   2   3   4   5   6   7   8   9   a   b   c   d   e   f */
-        0, 15,  0,  0,  0,  0,  8,  0,  0,  0,  0,  0,  6,  0,  0,  0,
-        8,  8,  8, 19, 13,  0, 44,  0,  0,  0,  0, 17, 11,  0, 14,  0,
-       30,  0,  0,  0,  0,  0,  0,  0,  5,  8, 17,  0,  0,  5,  6,  0,
-       23,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5,  5, 39,
-};
-
 #define GCBIT(x) \
        (1ll << (32+x))
 #define GDBIT(x) \
@@ -307,6 +297,7 @@ const uint64_t gte_reg_writes[64] = {
 
 static int ari64_init()
 {
+       static u32 scratch_buf[8*8*2] __attribute__((aligned(64)));
        extern void (*psxCP2[64])();
        extern void psxNULL();
        extern unsigned char *out;
@@ -335,13 +326,12 @@ static int ari64_init()
 #ifdef DRC_DBG
        memcpy(gte_handlers_nf, gte_handlers, sizeof(gte_handlers_nf));
 #endif
-   
        psxH_ptr = psxH;
        zeromem_ptr = zero_mem;
        scratch_buf_ptr = scratch_buf;
 
        SysPrintf("Mapped (RAM/scrp/ROM/LUTs/TC):\n");
-       SysPrintf("%08x/%08x/%08x/%08x/%08x\n",
+       SysPrintf("%p/%p/%p/%p/%p\n",
                psxM, psxH, psxR, mem_rtab, out);
 
        return 0;
@@ -365,7 +355,7 @@ static void ari64_execute_until()
        evprintf("ari64_execute %08x, %u->%u (%d)\n", psxRegs.pc,
                psxRegs.cycle, next_interupt, next_interupt - psxRegs.cycle);
 
-       new_dyna_start();
+       new_dyna_start(dynarec_local);
 
        evprintf("ari64_execute end %08x, %u->%u (%d)\n", psxRegs.pc,
                psxRegs.cycle, next_interupt, next_interupt - psxRegs.cycle);
@@ -398,23 +388,37 @@ static void ari64_clear(u32 addr, u32 size)
                        invalidate_block(start);
 }
 
-#ifdef ICACHE_EMULATION
 static void ari64_notify(int note, void *data) {
        /*
-       To change once we have proper icache emulation
+       Should be fixed when ARM dynarec has proper icache emulation.
        switch (note)
        {
                case R3000ACPU_NOTIFY_CACHE_UNISOLATED:
-                       ari64_clear(0, 0x200000/4);
                        break;
                case R3000ACPU_NOTIFY_CACHE_ISOLATED:
-               // Sent from psxDma3().
+               Sent from psxDma3().
                case R3000ACPU_NOTIFY_DMA3_EXE_LOAD:
                default:
                        break;
-       }*/
+       }
+       */
+}
+
+/*
+ * Applies the current configuration to the recompiler: forwards to
+ * intApplyConfig(), mirrors Config.DisableStalls into the NDHACK_NO_STALLS
+ * bit of new_dynarec_hacks, and calls new_dynarec_clear_full() when the
+ * cycle multiplier or the hack flags differ from their *_old values.
+ */
+static void ari64_apply_config()
+{
+       intApplyConfig();
+
+       if (Config.DisableStalls)
+               new_dynarec_hacks |= NDHACK_NO_STALLS;
+       else
+               new_dynarec_hacks &= ~NDHACK_NO_STALLS;
+
+       /* presumably already-compiled blocks depend on these settings */
+       if (cycle_multiplier != cycle_multiplier_old
+           || new_dynarec_hacks != new_dynarec_hacks_old)
+       {
+               new_dynarec_clear_full();
+       }
 }
-#endif
 
 static void ari64_shutdown()
 {
@@ -422,44 +426,28 @@ static void ari64_shutdown()
        new_dyna_pcsx_mem_shutdown();
 }
 
-extern void intExecute();
-extern void intExecuteT();
-extern void intExecuteBlock();
-extern void intExecuteBlockT();
-#ifndef DRC_DBG
-#define intExecuteT intExecute
-#define intExecuteBlockT intExecuteBlock
-#endif
-
 R3000Acpu psxRec = {
        ari64_init,
        ari64_reset,
-#ifndef DRC_DISABLE
        ari64_execute,
        ari64_execute_until,
-#else
-       intExecuteT,
-       intExecuteBlockT,
-#endif
        ari64_clear,
-#ifdef ICACHE_EMULATION
        ari64_notify,
-#endif
+       ari64_apply_config,
        ari64_shutdown
 };
 
-// TODO: rm
-#ifndef DRC_DBG
-void do_insn_trace() {}
-void do_insn_cmp() {}
-#endif
+#else // if DRC_DISABLE
 
-#ifdef DRC_DISABLE
 unsigned int address;
 int pending_exception, stop;
-u32 next_interupt;
+unsigned int next_interupt;
 int new_dynarec_did_compile;
 int cycle_multiplier;
+int cycle_multiplier_override;
+int cycle_multiplier_old;
+int new_dynarec_hacks_pergame;
+int new_dynarec_hacks_old;
 int new_dynarec_hacks;
 void *psxH_ptr;
 void *zeromem_ptr;
@@ -467,8 +455,8 @@ u8 zero_mem[0x1000];
 unsigned char *out;
 void *mem_rtab;
 void *scratch_buf_ptr;
-void new_dynarec_init() { (void)ari64_execute; }
-void new_dyna_start() {}
+void new_dynarec_init() {}
+void new_dyna_start(void *context) {}
 void new_dynarec_cleanup() {}
 void new_dynarec_clear_full() {}
 void invalidate_all_pages() {}
@@ -485,7 +473,9 @@ void new_dynarec_load_blocks(const void *save, int size) {}
 
 #include <stddef.h>
 static FILE *f;
-extern u32 last_io_addr;
+u32 irq_test_cycle;
+u32 handler_cycle;
+u32 last_io_addr;
 
 static void dump_mem(const char *fname, void *mem, size_t size)
 {
@@ -511,11 +501,10 @@ static u32 memcheck_read(u32 a)
        return *(u32 *)(psxM + (a & 0x1ffffc));
 }
 
+#if 0
 void do_insn_trace(void)
 {
        static psxRegisters oldregs;
-       static u32 old_io_addr = (u32)-1;
-       static u32 old_io_data = 0xbad0c0de;
        static u32 event_cycles_o[PSXINT_COUNT];
        u32 *allregs_p = (void *)&psxRegs;
        u32 *allregs_o = (void *)&oldregs;
@@ -539,27 +528,27 @@ void do_insn_trace(void)
        // log event changes
        for (i = 0; i < PSXINT_COUNT; i++) {
                if (event_cycles[i] != event_cycles_o[i]) {
-                       byte = 0xfc;
+                       byte = 0xf8;
                        fwrite(&byte, 1, 1, f);
                        fwrite(&i, 1, 1, f);
                        fwrite(&event_cycles[i], 1, 4, f);
                        event_cycles_o[i] = event_cycles[i];
                }
        }
-       // log last io
-       if (old_io_addr != last_io_addr) {
-               byte = 0xfd;
-               fwrite(&byte, 1, 1, f);
-               fwrite(&last_io_addr, 1, 4, f);
-               old_io_addr = last_io_addr;
+       #define SAVE_IF_CHANGED(code_, name_) { \
+               static u32 old_##name_ = 0xbad0c0de; \
+               if (old_##name_ != name_) { \
+                       byte = code_; \
+                       fwrite(&byte, 1, 1, f); \
+                       fwrite(&name_, 1, 4, f); \
+                       old_##name_ = name_; \
+               } \
        }
+       SAVE_IF_CHANGED(0xfb, irq_test_cycle);
+       SAVE_IF_CHANGED(0xfc, handler_cycle);
+       SAVE_IF_CHANGED(0xfd, last_io_addr);
        io_data = memcheck_read(last_io_addr);
-       if (old_io_data != io_data) {
-               byte = 0xfe;
-               fwrite(&byte, 1, 1, f);
-               fwrite(&io_data, 1, 4, f);
-               old_io_data = io_data;
-       }
+       SAVE_IF_CHANGED(0xfe, io_data);
        byte = 0xff;
        fwrite(&byte, 1, 1, f);
 
@@ -572,6 +561,7 @@ void do_insn_trace(void)
        }
 #endif
 }
+#endif
 
 static const char *regnames[offsetof(psxRegisters, intCycle) / 4] = {
        "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
@@ -620,12 +610,15 @@ void breakme() {}
 
 void do_insn_cmp(void)
 {
+       extern int last_count;
        static psxRegisters rregs;
        static u32 mem_addr, mem_val;
+       static u32 irq_test_cycle_intr;
+       static u32 handler_cycle_intr;
        u32 *allregs_p = (void *)&psxRegs;
        u32 *allregs_e = (void *)&rregs;
        static u32 ppc, failcount;
-       int i, ret, bad = 0, which_event = -1;
+       int i, ret, bad = 0, fatal = 0, which_event = -1;
        u32 ev_cycles = 0;
        u8 code;
 
@@ -640,11 +633,17 @@ void do_insn_cmp(void)
                if (code == 0xff)
                        break;
                switch (code) {
-               case 0xfc:
+               case 0xf8:
                        which_event = 0;
                        fread(&which_event, 1, 1, f);
                        fread(&ev_cycles, 1, 4, f);
                        continue;
+               case 0xfb:
+                       fread(&irq_test_cycle_intr, 1, 4, f);
+                       continue;
+               case 0xfc:
+                       fread(&handler_cycle_intr, 1, 4, f);
+                       continue;
                case 0xfd:
                        fread(&mem_addr, 1, 4, f);
                        continue;
@@ -652,23 +651,43 @@ void do_insn_cmp(void)
                        fread(&mem_val, 1, 4, f);
                        continue;
                }
+               assert(code < offsetof(psxRegisters, intCycle) / 4);
                fread(&allregs_e[code], 1, 4, f);
        }
 
        if (ret <= 0) {
                printf("EOF?\n");
-               goto end;
+               exit(1);
        }
 
        psxRegs.code = rregs.code; // don't care
-       psxRegs.cycle = rregs.cycle;
+       psxRegs.cycle += last_count;
+       //psxRegs.cycle = rregs.cycle;
        psxRegs.CP0.r[9] = rregs.CP0.r[9]; // Count
 
        //if (psxRegs.cycle == 166172) breakme();
 
-       if (memcmp(&psxRegs, &rregs, offsetof(psxRegisters, intCycle)) == 0 &&
-                       mem_val == memcheck_read(mem_addr)
-          ) {
+       if (which_event >= 0 && event_cycles[which_event] != ev_cycles) {
+               printf("bad ev_cycles #%d: %08x %08x\n", which_event, event_cycles[which_event], ev_cycles);
+               fatal = 1;
+       }
+
+       if (irq_test_cycle > irq_test_cycle_intr) {
+               printf("bad irq_test_cycle: %u %u\n", irq_test_cycle, irq_test_cycle_intr);
+               fatal = 1;
+       }
+
+       if (handler_cycle != handler_cycle_intr) {
+               printf("bad handler_cycle: %u %u\n", handler_cycle, handler_cycle_intr);
+               fatal = 1;
+       }
+
+       if (mem_val != memcheck_read(mem_addr)) {
+               printf("bad mem @%08x: %08x %08x\n", mem_addr, memcheck_read(mem_addr), mem_val);
+               fatal = 1;
+       }
+
+       if (!fatal && !memcmp(&psxRegs, &rregs, offsetof(psxRegisters, intCycle))) {
                failcount = 0;
                goto ok;
        }
@@ -677,20 +696,12 @@ void do_insn_cmp(void)
                if (allregs_p[i] != allregs_e[i]) {
                        miss_log_add(i, allregs_p[i], allregs_e[i], psxRegs.pc, psxRegs.cycle);
                        bad++;
+                       if (i > 32+2)
+                               fatal = 1;
                }
        }
 
-       if (mem_val != memcheck_read(mem_addr)) {
-               printf("bad mem @%08x: %08x %08x\n", mem_addr, memcheck_read(mem_addr), mem_val);
-               goto end;
-       }
-
-       if (which_event >= 0 && event_cycles[which_event] != ev_cycles) {
-               printf("bad ev_cycles #%d: %08x %08x\n", which_event, event_cycles[which_event], ev_cycles);
-               goto end;
-       }
-
-       if (psxRegs.pc == rregs.pc && bad < 6 && failcount < 32) {
+       if (!fatal && psxRegs.pc == rregs.pc && bad < 6 && failcount < 32) {
                static int last_mcycle;
                if (last_mcycle != psxRegs.cycle >> 20) {
                        printf("%u\n", psxRegs.cycle);
@@ -700,7 +711,6 @@ void do_insn_cmp(void)
                goto ok;
        }
 
-end:
        for (i = 0; i < miss_log_len; i++, miss_log_i = (miss_log_i + 1) & miss_log_mask)
                printf("bad %5s: %08x %08x, pc=%08x, cycle %u\n",
                        regnames[miss_log[miss_log_i].reg], miss_log[miss_log_i].val,
@@ -714,7 +724,7 @@ end:
        dump_mem("/mnt/ntz/dev/pnd/tmp/psxregs.dump", psxH, 0x10000);
        exit(1);
 ok:
-       psxRegs.cycle = rregs.cycle + 2; // sync timing
+       //psxRegs.cycle = rregs.cycle + 2; // sync timing
        ppc = psxRegs.pc;
 }
 
similarity index 86%
rename from libpcsxcore/new_dynarec/backends/psx/emu_if.h
rename to libpcsxcore/new_dynarec/emu_if.h
index e5396ef..30cb9ef 100644 (file)
@@ -1,8 +1,5 @@
-#ifndef __EMU_IF_H__
-#define __EMU_IF_H__
-
-#include "../../new_dynarec.h"
-#include "../../../r3000a.h"
+#include "new_dynarec.h"
+#include "../r3000a.h"
 
 extern char invalid_code[0x100000];
 
@@ -10,8 +7,7 @@ extern char invalid_code[0x100000];
 #define EAX 0
 #define ECX 1
 
-/* same as psxRegs */
-extern int reg[];
+extern int dynarec_local[];
 
 /* same as psxRegs.GPR.n.* */
 extern int hi, lo;
@@ -56,13 +52,9 @@ extern int reg_cop2d[], reg_cop2c[];
 extern void *gte_handlers[64];
 extern void *gte_handlers_nf[64];
 extern const char *gte_regnames[64];
-extern const char gte_cycletab[64];
 extern const uint64_t gte_reg_reads[64];
 extern const uint64_t gte_reg_writes[64];
 
-/* dummy */
-extern int FCR0, FCR31;
-
 /* mem */
 extern void *mem_rtab;
 extern void *mem_wtab;
@@ -89,10 +81,11 @@ extern void *zeromem_ptr;
 extern void *scratch_buf_ptr;
 
 // same as invalid_code, just a region for ram write checks (inclusive)
+// (psx/guest address range)
 extern u32 inv_code_start, inv_code_end;
 
 /* cycles/irqs */
-extern u32 next_interupt;
+extern unsigned int next_interupt;
 extern int pending_exception;
 
 /* called by drc */
@@ -100,14 +93,6 @@ void pcsx_mtc0(u32 reg, u32 val);
 void pcsx_mtc0_ds(u32 reg, u32 val);
 
 /* misc */
-extern const void (*psxHLEt[8])();
-
 extern void SysPrintf(const char *fmt, ...);
 
-#ifdef RAM_FIXED
-#define rdram ((u_int)0x80000000)
-#else
-#define rdram ((u_int)psxM)
-#endif
-
-#endif /* __EMU_IF_H__ */
+#define rdram ((u_char *)psxM)
similarity index 91%
rename from libpcsxcore/new_dynarec/arm/linkage_arm.S
rename to libpcsxcore/new_dynarec/linkage_arm.S
index 269eb99..6371731 100644 (file)
  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 #include "arm_features.h"
-#include "../new_dynarec_config.h"
+#include "new_dynarec_config.h"
 #include "linkage_offsets.h"
 
 
 #ifdef __MACH__
 #define dynarec_local          ESYM(dynarec_local)
-#define add_link               ESYM(add_link)
+#define add_jump_out           ESYM(add_jump_out)
 #define new_recompile_block    ESYM(new_recompile_block)
 #define get_addr               ESYM(get_addr)
 #define get_addr_ht            ESYM(get_addr_ht)
 #define clean_blocks           ESYM(clean_blocks)
 #define gen_interupt           ESYM(gen_interupt)
-#define psxException           ESYM(psxException)
-#define execI                  ESYM(execI)
 #define invalidate_addr                ESYM(invalidate_addr)
+#define gteCheckStallRaw       ESYM(gteCheckStallRaw)
 #endif
 
        .bss
@@ -59,12 +58,13 @@ DRC_VAR(cycle_count, 4)
 DRC_VAR(last_count, 4)
 DRC_VAR(pending_exception, 4)
 DRC_VAR(stop, 4)
-DRC_VAR(invc_ptr, 4)
+DRC_VAR(branch_target, 4)
 DRC_VAR(address, 4)
+@DRC_VAR(align0, 4) /* unused/alignment */
 DRC_VAR(psxRegs, LO_psxRegs_end - LO_psxRegs)
 
 /* psxRegs */
-DRC_VAR(reg, 128)
+@DRC_VAR(reg, 128)
 DRC_VAR(lo, 4)
 DRC_VAR(hi, 4)
 DRC_VAR(reg_cop0, 128)
@@ -77,21 +77,18 @@ DRC_VAR(pcaddr, 4)
 @DRC_VAR(intCycle, 256)
 
 DRC_VAR(rcnts, 7*4*4)
+DRC_VAR(inv_code_start, 4)
+DRC_VAR(inv_code_end, 4)
 DRC_VAR(mem_rtab, 4)
 DRC_VAR(mem_wtab, 4)
 DRC_VAR(psxH_ptr, 4)
 DRC_VAR(zeromem_ptr, 4)
-DRC_VAR(inv_code_start, 4)
-DRC_VAR(inv_code_end, 4)
-DRC_VAR(branch_target, 4)
+DRC_VAR(invc_ptr, 4)
 DRC_VAR(scratch_buf_ptr, 4)
-@DRC_VAR(align0, 12) /* unused/alignment */
+DRC_VAR(ram_offset, 4)
 DRC_VAR(mini_ht, 256)
 DRC_VAR(restore_candidate, 512)
 
-/* unused */
-DRC_VAR(FCR0, 4)
-DRC_VAR(FCR31, 4)
 
 #ifdef TEXRELS_FORBIDDEN
        .data
@@ -180,7 +177,7 @@ ptr_hash_table:
        orrcs   r2, r6, #2048
        ldr     r5, [r3, r2, lsl #2]
        lsl     r12, r12, #8
-       add     r6, r1, r12, asr #6
+       add     r6, r1, r12, asr #6  /* old target */
        mov     r8, #0
        /* jump_in lookup */
 1:
@@ -200,7 +197,7 @@ ptr_hash_table:
 
        mov     r5, r1
        mov     r1, r6
-       bl      add_link
+       bl      add_jump_out
        sub     r2, r8, r5
        and     r1, r7, #0xff000000
        lsl     r2, r2, #6
@@ -221,8 +218,8 @@ ptr_hash_table:
        ldr     r5, [r3, r2, lsl #2]
        ldr     r7, [r6, r4]!
        teq     r7, r0
-       ldreq   pc, [r6, #4]
-       ldr     r7, [r6, #8]
+       ldreq   pc, [r6, #8]
+       ldr     r7, [r6, #4]
        teq     r7, r0
        ldreq   pc, [r6, #12]
        /* jump_dirty lookup */
@@ -237,10 +234,10 @@ ptr_hash_table:
        ldr     r1, [r4, #8]
        /* hash_table insert */
        ldr     r2, [r6]
-       ldr     r3, [r6, #4]
+       ldr     r3, [r6, #8]
        str     r0, [r6]
-       str     r1, [r6, #4]
-       str     r2, [r6, #8]
+       str     r1, [r6, #8]
+       str     r2, [r6, #4]
        str     r3, [r6, #12]
        mov     pc, r1
 8:
@@ -382,8 +379,8 @@ FUNCTION(jump_vaddr):
        and     r2, r3, r2, lsr #12
        ldr     r2, [r1, r2]!
        teq     r2, r0
-       ldreq   pc, [r1, #4]
-       ldr     r2, [r1, #8]
+       ldreq   pc, [r1, #8]
+       ldr     r2, [r1, #4]
        teq     r2, r0
        ldreq   pc, [r1, #12]
        str     r10, [fp, #LO_cycle_count]
@@ -395,8 +392,7 @@ FUNCTION(jump_vaddr):
        .align  2
 
 FUNCTION(verify_code_ds):
-       str     r8, [fp, #LO_branch_target]
-FUNCTION(verify_code_vm):
+       str     r8, [fp, #LO_branch_target]  @ preserve HOST_BTREG?
 FUNCTION(verify_code):
        /* r1 = source */
        /* r2 = target */
@@ -431,7 +427,7 @@ FUNCTION(verify_code):
        bl      get_addr
        mov     pc, r0
        .size   verify_code, .-verify_code
-       .size   verify_code_vm, .-verify_code_vm
+       .size   verify_code_ds, .-verify_code_ds
 
        .align  2
 FUNCTION(cc_interrupt):
@@ -479,14 +475,6 @@ FUNCTION(cc_interrupt):
        b       .E1
        .size   cc_interrupt, .-cc_interrupt
 
-       .align  2
-FUNCTION(do_interrupt):
-       ldr     r0, [fp, #LO_pcaddr]
-       bl      get_addr_ht
-       add     r10, r10, #2
-       mov     pc, r0
-       .size   do_interrupt, .-do_interrupt
-
        .align  2
 FUNCTION(fp_exception):
        mov     r2, #0x10000000
@@ -523,19 +511,9 @@ FUNCTION(jump_syscall):
        .size   jump_syscall, .-jump_syscall
        .align  2
 
-       .align  2
-FUNCTION(jump_syscall_hle):
-       str     r0, [fp, #LO_pcaddr] /* PC must be set to EPC for psxException */
-       ldr     r2, [fp, #LO_last_count]
-       mov     r1, #0    /* in delay slot */
-       add     r2, r2, r10
-       mov     r0, #0x20 /* cause */
-       str     r2, [fp, #LO_cycle] /* PCSX cycle counter */
-       bl      psxException
-
        /* note: psxException might do recursive recompiler call from it's HLE code,
         * so be ready for this */
-pcsx_return:
+FUNCTION(jump_to_new_pc):
        ldr     r1, [fp, #LO_next_interupt]
        ldr     r10, [fp, #LO_cycle]
        ldr     r0, [fp, #LO_pcaddr]
@@ -543,27 +521,7 @@ pcsx_return:
        str     r1, [fp, #LO_last_count]
        bl      get_addr_ht
        mov     pc, r0
-       .size   jump_syscall_hle, .-jump_syscall_hle
-
-       .align  2
-FUNCTION(jump_hlecall):
-       ldr     r2, [fp, #LO_last_count]
-       str     r0, [fp, #LO_pcaddr]
-       add     r2, r2, r10
-       adr     lr, pcsx_return
-       str     r2, [fp, #LO_cycle] /* PCSX cycle counter */
-       bx      r1
-       .size   jump_hlecall, .-jump_hlecall
-
-       .align  2
-FUNCTION(jump_intcall):
-       ldr     r2, [fp, #LO_last_count]
-       str     r0, [fp, #LO_pcaddr]
-       add     r2, r2, r10
-       adr     lr, pcsx_return
-       str     r2, [fp, #LO_cycle] /* PCSX cycle counter */
-       b       execI
-       .size   jump_hlecall, .-jump_hlecall
+       .size   jump_to_new_pc, .-jump_to_new_pc
 
        .align  2
 FUNCTION(new_dyna_leave):
@@ -658,7 +616,7 @@ invalidate_addr_call:
 FUNCTION(new_dyna_start):
        /* ip is stored to conform EABI alignment */
        stmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
-       load_varadr fp, dynarec_local
+       mov     fp, r0 /* dynarec_local */
        ldr     r0, [fp, #LO_pcaddr]
        bl      get_addr_ht
        ldr     r1, [fp, #LO_next_interupt]
@@ -703,6 +661,13 @@ FUNCTION(jump_handler_read32):
        pcsx_read_mem ldrcc, 2
 
 
+.macro memhandler_post
+       ldr     r0, [fp, #LO_next_interupt]
+       ldr     r2, [fp, #LO_cycle]        @ memhandlers can modify cc, like dma
+       str     r0, [fp, #LO_last_count]
+       sub     r0, r2, r0
+.endm
+
 .macro pcsx_write_mem wrtop tab_shift
        /* r0 = address, r1 = data, r2 = cycles, r3 = handler_tab */
        lsl     r12,r0, #20
@@ -710,7 +675,7 @@ FUNCTION(jump_handler_read32):
        ldr     r3, [r3, r12, lsl #2]
        str     r0, [fp, #LO_address]      @ some handlers still need it..
        lsls    r3, #1
-       mov     r0, r2                                @ cycle return in case of direct store
+       mov     r0, r2                     @ cycle return in case of direct store
 .if \tab_shift == 1
        lsl     r12, #1
        \wrtop  r1, [r3, r12]
@@ -721,15 +686,14 @@ FUNCTION(jump_handler_read32):
        ldr     r12, [fp, #LO_last_count]
        mov     r0, r1
        add     r2, r2, r12
-       push    {r2, lr}
        str     r2, [fp, #LO_cycle]
+
+       str     lr, [fp, #LO_saved_lr]
        blx     r3
+       ldr     lr, [fp, #LO_saved_lr]
 
-       ldr     r0, [fp, #LO_next_interupt]
-       pop     {r2, r3}
-       str     r0, [fp, #LO_last_count]
-       sub     r0, r2, r0
-       bx      r3
+       memhandler_post
+       bx      lr
 .endm
 
 FUNCTION(jump_handler_write8):
@@ -749,15 +713,14 @@ FUNCTION(jump_handler_write_h):
        str     r0, [fp, #LO_address]      @ some handlers still need it..
        add     r2, r2, r12
        mov     r0, r1
-       push    {r2, lr}
        str     r2, [fp, #LO_cycle]
+
+       str     lr, [fp, #LO_saved_lr]
        blx     r3
+       ldr     lr, [fp, #LO_saved_lr]
 
-       ldr     r0, [fp, #LO_next_interupt]
-       pop     {r2, r3}
-       str     r0, [fp, #LO_last_count]
-       sub     r0, r2, r0
-       bx      r3
+       memhandler_post
+       bx      lr
 
 FUNCTION(jump_handle_swl):
        /* r0 = address, r1 = data, r2 = cycles */
@@ -863,4 +826,16 @@ FUNCTION(rcnt2_read_count_m1):
        lsr     r0, #16                 @ /= 8
        bx      lr
 
+/*
+ * Wrapper around gteCheckStallRaw: stores (cycles + last_count) to the
+ * LO_cycle slot, calls gteCheckStallRaw(op_cycles, fp + LO_psxRegs) and adds
+ * its return value to r10 (presumably the generated code's cycle counter).
+ * lr is preserved across the call in the LO_saved_lr slot.
+ */
+FUNCTION(call_gteStall):
+       /* r0 = op_cycles, r1 = cycles */
+       ldr     r2, [fp, #LO_last_count]
+       str     lr, [fp, #LO_saved_lr]
+       add     r1, r1, r2
+       str     r1, [fp, #LO_cycle]
+       add     r1, fp, #LO_psxRegs        @ second argument
+       bl      gteCheckStallRaw
+       ldr     lr, [fp, #LO_saved_lr]
+       add     r10, r10, r0               @ add the returned value
+       bx      lr
+
 @ vim:filetype=armasm
diff --git a/libpcsxcore/new_dynarec/linkage_arm64.S b/libpcsxcore/new_dynarec/linkage_arm64.S
new file mode 100644 (file)
index 0000000..5e9626f
--- /dev/null
@@ -0,0 +1,414 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ *   linkage_arm64.S for PCSX                                              *
+ *   Copyright (C) 2009-2011 Ari64                                         *
+ *   Copyright (C) 2021 notaz                                              *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#include "arm_features.h"
+#include "new_dynarec_config.h"
+#include "assem_arm64.h"
+#include "linkage_offsets.h"
+
+#if (LO_mem_wtab & 7) /* table pointers are loaded with 64-bit ldr; offset must be 8-byte aligned */
+#error misaligned pointers
+#endif
+
+.bss
+       .align  4
+       .global dynarec_local
+       .type   dynarec_local, %object
+       .size   dynarec_local, LO_dynarec_local_size
+dynarec_local:
+       .space  LO_dynarec_local_size
+
+#define DRC_VAR_(name, vname, size_) \
+       vname = dynarec_local + LO_##name; \
+       .global vname; \
+       .type   vname, %object; \
+       .size   vname, size_
+
+#define DRC_VAR(name, size_) \
+       DRC_VAR_(name, ESYM(name), size_)
+
+DRC_VAR(next_interupt, 4)
+DRC_VAR(cycle_count, 4)
+DRC_VAR(last_count, 4)
+DRC_VAR(pending_exception, 4)
+DRC_VAR(stop, 4)
+DRC_VAR(branch_target, 4)
+DRC_VAR(address, 4)
+#DRC_VAR(align0, 16) /* unused/alignment */
+DRC_VAR(psxRegs, LO_psxRegs_end - LO_psxRegs)
+
+/* psxRegs */
+#DRC_VAR(reg, 128)
+DRC_VAR(lo, 4)
+DRC_VAR(hi, 4)
+DRC_VAR(reg_cop0, 128)
+DRC_VAR(reg_cop2d, 128)
+DRC_VAR(reg_cop2c, 128)
+DRC_VAR(pcaddr, 4)
+#DRC_VAR(code, 4)
+#DRC_VAR(cycle, 4)
+#DRC_VAR(interrupt, 4)
+#DRC_VAR(intCycle, 256)
+
+DRC_VAR(rcnts, 7*4*4)
+DRC_VAR(inv_code_start, 4)
+DRC_VAR(inv_code_end, 4)
+DRC_VAR(mem_rtab, 8)
+DRC_VAR(mem_wtab, 8)
+DRC_VAR(psxH_ptr, 8)
+DRC_VAR(invc_ptr, 8)
+DRC_VAR(zeromem_ptr, 8)
+DRC_VAR(scratch_buf_ptr, 8)
+DRC_VAR(ram_offset, 8)
+DRC_VAR(mini_ht, PTRSZ*32*2) /* same span as LO_restore_candidate - LO_mini_ht */
+DRC_VAR(restore_candidate, 512)
+
+
+       .text
+       .align  2
+
+/* r0 = virtual target address */
+/* r1 = instruction to patch */
+.macro dyna_linker_main
+       /* XXX TODO: should be able to do better than this... */
+       bl      get_addr_ht
+       br      x0
+.endm
+
+
+FUNCTION(dyna_linker):
+       /* r0 = virtual target address */
+       /* r1 = instruction to patch */
+       dyna_linker_main
+       .size   dyna_linker, .-dyna_linker
+
+FUNCTION(exec_pagefault):
+       /* r0 = instruction pointer */
+       /* r1 = fault address */
+       /* r2 = cause */
+       bl      abort
+       .size   exec_pagefault, .-exec_pagefault
+
+/* Special dynamic linker for the case where a page fault
+   may occur in a branch delay slot */
+FUNCTION(dyna_linker_ds):
+       /* r0 = virtual target address */
+       /* r1 = instruction to patch */
+       dyna_linker_main
+       .size   dyna_linker_ds, .-dyna_linker_ds
+
+       .align  2
+FUNCTION(cc_interrupt):
+       ldr     w0, [rFP, #LO_last_count]
+       mov     w2, #0x1fc
+       add     rCC, w0, rCC
+       str     wzr, [rFP, #LO_pending_exception]
+       and     w2, w2, rCC, lsr #17
+       add     x3, rFP, #LO_restore_candidate
+       str     rCC, [rFP, #LO_cycle]           /* PCSX cycles */
+#      str     rCC, [rFP, #LO_reg_cop0+36]     /* Count */
+       ldr     w19, [x3, w2, uxtw]
+       mov     x21, lr
+       cbnz    w19, 4f
+1:
+       bl      gen_interupt
+       mov     lr, x21
+       ldr     rCC, [rFP, #LO_cycle]
+       ldr     w0, [rFP, #LO_next_interupt]
+       ldr     w1, [rFP, #LO_pending_exception]
+       ldr     w2, [rFP, #LO_stop]
+       str     w0, [rFP, #LO_last_count]
+       sub     rCC, rCC, w0
+       cbnz    w2, new_dyna_leave
+       cbnz    w1, 2f
+       ret
+2:
+       ldr     w0, [rFP, #LO_pcaddr]
+       bl      get_addr_ht
+       br      x0
+4:
+       /* Move 'dirty' blocks to the 'clean' list */
+       lsl     w20, w2, #3
+       str     wzr, [x3, w2, uxtw]
+5:
+       mov     w0, w20
+       add     w20, w20, #1
+       tbz     w19, #0, 6f
+       bl      clean_blocks
+6:
+       lsr     w19, w19, #1
+       tst     w20, #31
+       bne     5b
+       b       1b
+       .size   cc_interrupt, .-cc_interrupt
+
+       .align  2
+FUNCTION(fp_exception):
+       mov     w2, #0x10000000
+0:
+       ldr     w1, [rFP, #LO_reg_cop0+48] /* Status */
+       mov     w3, #0x80000000
+       str     w0, [rFP, #LO_reg_cop0+56] /* EPC */
+       orr     w1, w1, #2
+       add     w2, w2, #0x2c
+       str     w1, [rFP, #LO_reg_cop0+48] /* Status */
+       str     w2, [rFP, #LO_reg_cop0+52] /* Cause */
+       add     w0, w3, #0x80
+       bl      get_addr_ht
+       br      x0
+       .size   fp_exception, .-fp_exception
+       .align  2
+FUNCTION(fp_exception_ds):
+       mov     w2, #0x90000000 /* Set high bit if delay slot */
+       b       0b
+       .size   fp_exception_ds, .-fp_exception_ds
+
+       .align  2
+FUNCTION(jump_syscall):
+       ldr     w1, [rFP, #LO_reg_cop0+48] /* Status */
+       mov     w3, #0x80000000
+       str     w0, [rFP, #LO_reg_cop0+56] /* EPC */
+       orr     w1, w1, #2
+       mov     w2, #0x20
+       str     w1, [rFP, #LO_reg_cop0+48] /* Status */
+       str     w2, [rFP, #LO_reg_cop0+52] /* Cause */
+       add     w0, w3, #0x80
+       bl      get_addr_ht
+       br      x0
+       .size   jump_syscall, .-jump_syscall
+       .align  2
+
+       /* note: psxException might do recursive recompiler call from its HLE code,
+        * so be ready for this */
+FUNCTION(jump_to_new_pc):
+       ldr     w1, [rFP, #LO_next_interupt]
+       ldr     rCC, [rFP, #LO_cycle]
+       ldr     w0, [rFP, #LO_pcaddr]
+       sub     rCC, rCC, w1
+       str     w1, [rFP, #LO_last_count]
+       bl      get_addr_ht
+       br      x0
+       .size   jump_to_new_pc, .-jump_to_new_pc
+
+       /* stack must be aligned by 16, and include space for save_regs() use */
+       .align  2
+FUNCTION(new_dyna_start):
+       stp     x29, x30, [sp, #-SSP_ALL]!
+       ldr     w1,  [x0, #LO_next_interupt]
+       ldr     w2,  [x0, #LO_cycle]
+       stp     x19, x20, [sp, #16*1]
+       stp     x21, x22, [sp, #16*2]
+       stp     x23, x24, [sp, #16*3]
+       stp     x25, x26, [sp, #16*4]
+       stp     x27, x28, [sp, #16*5]
+       mov     rFP, x0
+       ldr     w0,  [rFP, #LO_pcaddr]
+       str     w1,  [rFP, #LO_last_count]
+       sub     rCC, w2, w1
+       bl      get_addr_ht
+       br      x0
+       .size   new_dyna_start, .-new_dyna_start
+
+       .align  2
+FUNCTION(new_dyna_leave):
+       ldr     w0,  [rFP, #LO_last_count]
+       add     rCC, rCC, w0
+       str     rCC, [rFP, #LO_cycle]
+       ldp     x19, x20, [sp, #16*1]
+       ldp     x21, x22, [sp, #16*2]
+       ldp     x23, x24, [sp, #16*3]
+       ldp     x25, x26, [sp, #16*4]
+       ldp     x27, x28, [sp, #16*5]
+       ldp     x29, x30, [sp], #SSP_ALL
+       ret
+       .size   new_dyna_leave, .-new_dyna_leave
+
+/* --------------------------------------- */
+
+.align 2
+
+.macro memhandler_pre
+       /* w0 = addr/data, x1 = rhandler, w2 = cycles, x3 = whandler */
+       ldr     w4, [rFP, #LO_last_count]
+       add     w4, w4, w2
+       str     w4, [rFP, #LO_cycle]
+.endm
+
+.macro memhandler_post
+       ldr     w0, [rFP, #LO_next_interupt]
+       ldr     w2, [rFP, #LO_cycle]        // memhandlers can modify cc, like dma
+       str     w0, [rFP, #LO_last_count]
+       sub     w0, w2, w0
+.endm
+
+FUNCTION(do_memhandler_pre):
+       memhandler_pre
+       ret
+
+FUNCTION(do_memhandler_post):
+       memhandler_post
+       ret
+
+.macro pcsx_read_mem readop tab_shift
+       /* w0 = address, x1 = handler_tab, w2 = cycles */
+       ubfm    w4, w0, #\tab_shift, #11
+       ldr     x3, [x1, w4, uxtw #3]
+       adds    x3, x3, x3
+       bcs     0f
+       \readop w0, [x3, w4, uxtw #\tab_shift]
+       ret
+0:
+       stp     xzr, x30, [sp, #-16]!
+       memhandler_pre
+       blr     x3
+.endm
+
+FUNCTION(jump_handler_read8):
+       add     x1, x1, #0x1000/4*8 + 0x1000/2*8  /* shift to r8 part */
+       pcsx_read_mem ldrb, 0
+       b       handler_read_end
+
+FUNCTION(jump_handler_read16):
+       add     x1, x1, #0x1000/4*8               /* shift to r16 part */
+       pcsx_read_mem ldrh, 1
+       b       handler_read_end
+
+FUNCTION(jump_handler_read32):
+       pcsx_read_mem ldr, 2
+
+handler_read_end:
+       ldp     xzr, x30, [sp], #16
+       ret
+
+.macro pcsx_write_mem wrtop movop tab_shift
+       /* w0 = address, w1 = data, w2 = cycles, x3 = handler_tab */
+       ubfm    w4, w0, #\tab_shift, #11
+       ldr     x3, [x3, w4, uxtw #3]
+       adds    x3, x3, x3
+       bcs     0f
+       mov     w0, w2                    /* cycle return */
+       \wrtop  w1, [x3, w4, uxtw #\tab_shift]
+       ret
+0:
+       stp     xzr, x30, [sp, #-16]!
+       str     w0, [rFP, #LO_address]    /* some handlers still need it... */
+       \movop  w0, w1
+       memhandler_pre
+       blr     x3
+.endm
+
+FUNCTION(jump_handler_write8):
+       add     x3, x3, #0x1000/4*8 + 0x1000/2*8  /* shift to w8 part */
+       pcsx_write_mem strb uxtb 0
+       b       handler_write_end
+
+FUNCTION(jump_handler_write16):
+       add     x3, x3, #0x1000/4*8               /* shift to w16 part */
+       pcsx_write_mem strh uxth 1
+       b       handler_write_end
+
+FUNCTION(jump_handler_write32):
+       pcsx_write_mem str mov 2
+
+handler_write_end:
+       memhandler_post
+       ldp     xzr, x30, [sp], #16
+       ret
+
+FUNCTION(jump_handle_swl):
+       /* w0 = address, w1 = data, w2 = cycles */
+       ldr     x3, [rFP, #LO_mem_wtab]
+       orr     w4, wzr, w0, lsr #12
+       ldr     x3, [x3, w4, uxtw #3]
+       adds    x3, x3, x3
+       bcs     4f
+       add     x3, x0, x3
+       mov     w0, w2
+       tbz     x3, #1, 10f     // & 2
+       tbz     x3, #0, 2f      // & 1
+3:
+       stur    w1, [x3, #-3]
+       ret
+2:
+       lsr     w2, w1, #8
+       lsr     w1, w1, #24
+       sturh   w2, [x3, #-2]
+       strb    w1, [x3]
+       ret
+10:
+       tbz     x3, #0, 0f      // & 1
+1:
+       lsr     w1, w1, #16
+       sturh   w1, [x3, #-1]
+       ret
+0:
+       lsr     w2, w1, #24
+       strb    w2, [x3]
+       ret
+4:
+       mov     w0, w2          // todo
+       bl      abort
+       ret
+
+FUNCTION(jump_handle_swr):
+       /* w0 = address, w1 = data, w2 = cycles */
+       ldr     x3, [rFP, #LO_mem_wtab]
+       orr     w4, wzr, w0, lsr #12
+       ldr     x3, [x3, w4, uxtw #3]
+       adds    x3, x3, x3
+       bcs     4f
+       add     x3, x0, x3
+       mov     w0, w2
+       tbz     x3, #1, 10f     // & 2
+       tbz     x3, #0, 2f      // & 1
+3:
+       strb    w1, [x3]
+       ret
+2:
+       strh    w1, [x3]
+       ret
+10:
+       tbz     x3, #0, 0f      // & 1
+1:
+       lsr     w2, w1, #8
+       strb    w1, [x3]
+       sturh   w2, [x3, #1]
+       ret
+0:
+       str     w1, [x3]
+       ret
+4:
+       mov     w0, w2          // todo
+       bl      abort
+       ret
+
+FUNCTION(call_gteStall):
+       /* w0 = op_cycles, w1 = cycles; w0 is passed through unchanged to gteCheckStallRaw */
+       ldr     w2, [rFP, #LO_last_count]
+       str     lr, [rFP, #LO_saved_lr]     // bl below overwrites lr, keep a copy
+       add     w1, w1, w2                  // cycles + last_count
+       str     w1, [rFP, #LO_cycle]        // written to the LO_cycle slot before the call
+       add     x1, rFP, #LO_psxRegs        // second argument: address of psxRegs
+       bl      gteCheckStallRaw
+       ldr     lr, [rFP, #LO_saved_lr]
+       add     rCC, rCC, w0                // add the value returned in w0 to rCC
+       ret
+
diff --git a/libpcsxcore/new_dynarec/linkage_offsets.h b/libpcsxcore/new_dynarec/linkage_offsets.h
new file mode 100644 (file)
index 0000000..e9bb3ab
--- /dev/null
@@ -0,0 +1,45 @@
+
+#define PTRSZ __SIZEOF_POINTER__
+
+#define LO_next_interupt       64
+#define LO_cycle_count         (LO_next_interupt + 4)
+#define LO_last_count          (LO_cycle_count + 4)
+#define LO_pending_exception   (LO_last_count + 4)
+#define LO_stop                        (LO_pending_exception + 4)
+#define LO_branch_target       (LO_stop + 4)
+#define LO_address             (LO_branch_target + 4)
+#define LO_align0              (LO_address + 4)
+#define LO_psxRegs             (LO_align0 + 4)
+#define LO_reg                 (LO_psxRegs)
+#define LO_lo                  (LO_reg + 128)
+#define LO_hi                  (LO_lo + 4)
+#define LO_reg_cop0            (LO_hi + 4)
+#define LO_reg_cop2d           (LO_reg_cop0 + 128)
+#define LO_reg_cop2c           (LO_reg_cop2d + 128)
+#define LO_PC                  (LO_reg_cop2c + 128)
+#define LO_pcaddr              (LO_PC)
+#define LO_code                        (LO_PC + 4)
+#define LO_cycle               (LO_code + 4)
+#define LO_interrupt           (LO_cycle + 4)
+#define LO_intCycle            (LO_interrupt + 4)
+#define LO_gteBusyCycle                (LO_intCycle + 256)
+#define LO_muldivBusyCycle     (LO_gteBusyCycle + 4)
+#define LO_psxRegs_reserved    (LO_muldivBusyCycle + 4)
+#define LO_psxRegs_end         (LO_psxRegs_reserved + 4*2)
+#define LO_rcnts               (LO_psxRegs_end)
+#define LO_rcnts_end           (LO_rcnts + 7*4*4)
+#define LO_inv_code_start      (LO_rcnts_end)
+#define LO_inv_code_end                (LO_inv_code_start + 4)
+#define LO_mem_rtab            (LO_inv_code_end + 4)
+#define LO_mem_wtab            (LO_mem_rtab + PTRSZ)
+#define LO_psxH_ptr            (LO_mem_wtab + PTRSZ)
+#define LO_zeromem_ptr         (LO_psxH_ptr + PTRSZ)
+#define LO_invc_ptr            (LO_zeromem_ptr + PTRSZ)
+#define LO_scratch_buf_ptr     (LO_invc_ptr + PTRSZ)
+#define LO_saved_lr            (LO_scratch_buf_ptr + PTRSZ)
+#define LO_ram_offset          (LO_saved_lr + PTRSZ)
+#define LO_mini_ht             (LO_ram_offset + PTRSZ)
+#define LO_restore_candidate   (LO_mini_ht + PTRSZ*32*2)
+#define LO_dynarec_local_size  (LO_restore_candidate + 512)
+
+#define LO_cop2_to_scratch_buf (LO_scratch_buf_ptr - LO_reg_cop2d)
index 52deb85..27d9d46 100644 (file)
 #ifdef VITA
 #include <psp2/kernel/sysmem.h>
 static int sceBlock;
-int getVMBlock();
 #endif
 
 #include "new_dynarec_config.h"
-#include "backends/psx/emu_if.h" //emulator interface
+#include "../psxhle.h"
+#include "../psxinterpreter.h"
+#include "../gte.h"
+#include "emu_if.h" // emulator interface
+
+#define noinline __attribute__((noinline,noclone))
+#ifndef ARRAY_SIZE /* element count of a true array (not a pointer) */
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+#ifndef min
+#define min(a, b) ((b) < (a) ? (b) : (a))
+#endif
+#ifndef max
+#define max(a, b) ((b) > (a) ? (b) : (a))
+#endif
 
 //#define DISASM
-//#define assem_debug printf
-//#define inv_debug printf
+//#define ASSEM_PRINT
+
+#ifdef ASSEM_PRINT
+#define assem_debug printf
+#else
 #define assem_debug(...)
+#endif
+//#define inv_debug printf
 #define inv_debug(...)
 
 #ifdef __i386__
-#include "x86/assem_x86.h"
+#include "assem_x86.h"
 #endif
 #ifdef __x86_64__
-#include "x64/assem_x64.h"
+#include "assem_x64.h"
 #endif
 #ifdef __arm__
-#include "arm/assem_arm.h"
+#include "assem_arm.h"
 #endif
-
-#ifdef VITA
-int _newlib_vm_size_user = 1 << TARGET_SIZE_2;
+#ifdef __aarch64__
+#include "assem_arm64.h"
 #endif
 
+#define RAM_SIZE 0x200000
 #define MAXBLOCK 4096
 #define MAX_OUTPUT_BLOCK_SIZE 262144
 
+struct ndrc_mem
+{
+  u_char translation_cache[1 << TARGET_SIZE_2];
+  struct
+  {
+    struct tramp_insns ops[2048 / sizeof(struct tramp_insns)];
+    const void *f[2048 / sizeof(void *)];
+  } tramp;
+};
+
+#ifdef BASE_ADDR_DYNAMIC
+static struct ndrc_mem *ndrc;
+#else
+static struct ndrc_mem ndrc_ __attribute__((aligned(4096)));
+static struct ndrc_mem *ndrc = &ndrc_;
+#endif
+
+// stubs
+enum stub_type {
+  CC_STUB = 1,
+  FP_STUB = 2,
+  LOADB_STUB = 3,
+  LOADH_STUB = 4,
+  LOADW_STUB = 5,
+  LOADD_STUB = 6,
+  LOADBU_STUB = 7,
+  LOADHU_STUB = 8,
+  STOREB_STUB = 9,
+  STOREH_STUB = 10,
+  STOREW_STUB = 11,
+  STORED_STUB = 12,
+  STORELR_STUB = 13,
+  INVCODE_STUB = 14,
+};
+
 struct regstat
 {
-  signed char regmap_entry[HOST_REGS];
+  signed char regmap_entry[HOST_REGS]; // pre-insn + loop preloaded regs?
   signed char regmap[HOST_REGS];
-  uint64_t was32;
-  uint64_t is32;
   uint64_t wasdirty;
   uint64_t dirty;
   uint64_t u;
-  uint64_t uu;
-  u_int wasconst;
-  u_int isconst;
+  u_int wasconst;                // before; for example 'lw r2, (r2)' wasconst is true
+  u_int isconst;                 //  ... but isconst is false when r2 is known
   u_int loadedconst;             // host regs that have constants loaded
   u_int waswritten;              // MIPS regs that were used as store base before
 };
@@ -86,9 +136,53 @@ struct ll_entry
   struct ll_entry *next;
 };
 
+struct ht_entry
+{
+  u_int vaddr[2];
+  void *tcaddr[2];
+};
+
+struct code_stub
+{
+  enum stub_type type;
+  void *addr;
+  void *retaddr;
+  u_int a;
+  uintptr_t b;
+  uintptr_t c;
+  u_int d;
+  u_int e;
+};
+
+struct link_entry
+{
+  void *addr;
+  u_int target;
+  u_int ext;
+};
+
+static struct decoded_insn
+{
+  u_char itype;
+  u_char opcode;
+  u_char opcode2;
+  u_char rs1;
+  u_char rs2;
+  u_char rt1;
+  u_char rt2;
+  u_char lt1;
+  u_char bt:1;
+  u_char ooo:1;
+  u_char is_ds:1;
+  u_char is_jump:1;
+  u_char is_ujump:1;
+  u_char is_load:1;
+  u_char is_store:1;
+} dops[MAXBLOCK];
+
   // used by asm:
   u_char *out;
-  u_int hash_table[65536][4]  __attribute__((aligned(16)));
+  struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
   struct ll_entry *jump_dirty[4096];
 
@@ -96,19 +190,6 @@ struct ll_entry
   static u_int start;
   static u_int *source;
   static char insn[MAXBLOCK][10];
-  static u_char itype[MAXBLOCK];
-  static u_char opcode[MAXBLOCK];
-  static u_char opcode2[MAXBLOCK];
-  static u_char bt[MAXBLOCK];
-  static u_char rs1[MAXBLOCK];
-  static u_char rs2[MAXBLOCK];
-  static u_char rt1[MAXBLOCK];
-  static u_char rt2[MAXBLOCK];
-  static u_char us1[MAXBLOCK];
-  static u_char us2[MAXBLOCK];
-  static u_char dep1[MAXBLOCK];
-  static u_char dep2[MAXBLOCK];
-  static u_char lt1[MAXBLOCK];
   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
   static uint64_t gte_rt[MAXBLOCK];
   static uint64_t gte_unneeded[MAXBLOCK];
@@ -119,16 +200,14 @@ struct ll_entry
   static u_int smrv_weak_next;
   static int imm[MAXBLOCK];
   static u_int ba[MAXBLOCK];
-  static char likely[MAXBLOCK];
-  static char is_ds[MAXBLOCK];
-  static char ooo[MAXBLOCK];
   static uint64_t unneeded_reg[MAXBLOCK];
-  static uint64_t unneeded_reg_upper[MAXBLOCK];
   static uint64_t branch_unneeded_reg[MAXBLOCK];
-  static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
+  // pre-instruction [i], excluding loop-preload regs?
   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
-  static uint64_t current_constmap[HOST_REGS];
-  static uint64_t constmap[MAXBLOCK][HOST_REGS];
+  // contains 'real' consts at [i] insn, but may differ from what's actually
+  // loaded in host reg as 'final' value is always loaded, see get_final_value()
+  static uint32_t current_constmap[HOST_REGS];
+  static uint32_t constmap[MAXBLOCK][HOST_REGS];
   static struct regstat regs[MAXBLOCK];
   static struct regstat branch_regs[MAXBLOCK];
   static signed char minimum_free_regs[MAXBLOCK];
@@ -137,35 +216,41 @@ struct ll_entry
   static u_int will_dirty[MAXBLOCK];
   static int ccadj[MAXBLOCK];
   static int slen;
-  static u_int instr_addr[MAXBLOCK];
-  static u_int link_addr[MAXBLOCK][3];
+  static void *instr_addr[MAXBLOCK];
+  static struct link_entry link_addr[MAXBLOCK];
   static int linkcount;
-  static u_int stubs[MAXBLOCK*3][8];
+  static struct code_stub stubs[MAXBLOCK*3];
   static int stubcount;
   static u_int literals[1024][2];
   static int literalcount;
   static int is_delayslot;
-  static int cop1_usable;
   static char shadow[1048576]  __attribute__((aligned(16)));
   static void *copy;
   static int expirep;
   static u_int stop_after_jal;
-#ifndef RAM_FIXED
-  static u_int ram_offset;
-#else
-  static const u_int ram_offset=0;
-#endif
+  static u_int f1_hack; // 0 - off, ~0 - capture address, else addr
 
   int new_dynarec_hacks;
+  int new_dynarec_hacks_pergame;
+  int new_dynarec_hacks_old;
   int new_dynarec_did_compile;
+
+  #define HACK_ENABLED(x) ((new_dynarec_hacks | new_dynarec_hacks_pergame) & (x))
+
+  extern int cycle_count; // ... until end of the timeslice, counts -N -> 0
+  extern int last_count;  // last absolute target, often = next_interupt
+  extern int pcaddr;
+  extern int pending_exception;
+  extern int branch_target;
+  extern uintptr_t ram_offset;
+  extern uintptr_t mini_ht[32][2];
   extern u_char restore_candidate[512];
-  extern int cycle_count;
 
   /* registers that may be allocated */
   /* 1-31 gpr */
-#define HIREG 32 // hi
-#define LOREG 33 // lo
-#define FSREG 34 // FPU status (FCSR)
+#define LOREG 32 // lo
+#define HIREG 33 // hi
+//#define FSREG 34 // FPU status (FCSR)
 #define CSREG 35 // Coprocessor status
 #define CCREG 36 // Cycle count
 #define INVCP 37 // Pointer to invalid_code
@@ -204,10 +289,10 @@ struct ll_entry
 #define COP0 15   // Coprocessor 0
 #define COP1 16   // Coprocessor 1
 #define C1LS 17   // Coprocessor 1 load/store
-#define FJUMP 18  // Conditional branch (floating point)
-#define FLOAT 19  // Floating point unit
-#define FCONV 20  // Convert integer to float
-#define FCOMP 21  // Floating point compare (sets FSREG)
+//#define FJUMP 18  // Conditional branch (floating point)
+//#define FLOAT 19  // Floating point unit
+//#define FCONV 20  // Convert integer to float
+//#define FCOMP 21  // Floating point compare (sets FSREG)
 #define SYSCALL 22// SYSCALL
 #define OTHER 23  // Other
 #define SPAN 24   // Branch/delay slot spans 2 pages
@@ -218,29 +303,16 @@ struct ll_entry
 #define C2OP 29   // Coprocessor 2 operation
 #define INTCALL 30// Call interpreter to handle rare corner cases
 
-  /* stubs */
-#define CC_STUB 1
-#define FP_STUB 2
-#define LOADB_STUB 3
-#define LOADH_STUB 4
-#define LOADW_STUB 5
-#define LOADD_STUB 6
-#define LOADBU_STUB 7
-#define LOADHU_STUB 8
-#define STOREB_STUB 9
-#define STOREH_STUB 10
-#define STOREW_STUB 11
-#define STORED_STUB 12
-#define STORELR_STUB 13
-#define INVCODE_STUB 14
-
   /* branch codes */
 #define TAKEN 1
 #define NOTTAKEN 2
 #define NULLDS 3
 
+#define DJT_1 (void *)1l // no function, just a label in assem_debug log
+#define DJT_2 (void *)2l
+
 // asm linkage
-int new_recompile_block(int addr);
+int new_recompile_block(u_int addr);
 void *get_addr_ht(u_int vaddr);
 void invalidate_block(u_int block);
 void invalidate_addr(u_int addr);
@@ -248,31 +320,39 @@ void remove_hash(int vaddr);
 void dyna_linker();
 void dyna_linker_ds();
 void verify_code();
-void verify_code_vm();
 void verify_code_ds();
 void cc_interrupt();
 void fp_exception();
 void fp_exception_ds();
-void jump_syscall_hle();
-void jump_hlecall();
-void jump_intcall();
+void jump_to_new_pc();
+void call_gteStall();
 void new_dyna_leave();
 
 // Needed by assembler
-static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
-static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
-static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
-static void load_all_regs(signed char i_regmap[]);
-static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
+static void wb_register(signed char r, const signed char regmap[], uint64_t dirty);
+static void wb_dirtys(const signed char i_regmap[], uint64_t i_dirty);
+static void wb_needed_dirtys(const signed char i_regmap[], uint64_t i_dirty, int addr);
+static void load_all_regs(const signed char i_regmap[]);
+static void load_needed_regs(const signed char i_regmap[], const signed char next_regmap[]);
 static void load_regs_entry(int t);
-static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
+static void load_all_consts(const signed char regmap[], u_int dirty, int i);
+static u_int get_host_reglist(const signed char *regmap);
 
-static int verify_dirty(u_int *ptr);
+static int verify_dirty(const u_int *ptr);
 static int get_final_value(int hr, int i, int *value);
-static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e);
-static void add_to_linker(int addr,int target,int ext);
-
-static int tracedebug=0;
+static void add_stub(enum stub_type type, void *addr, void *retaddr,
+  u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
+static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
+  int i, int addr_reg, const struct regstat *i_regs, int ccadj, u_int reglist);
+static void add_to_linker(void *addr, u_int target, int ext);
+static void *emit_fastpath_cmp_jump(int i, const struct regstat *i_regs,
+  int addr, int *offset_reg, int *addr_reg_override);
+static void *get_direct_memhandler(void *table, u_int addr,
+  enum stub_type type, uintptr_t *addr_host);
+static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u_int reglist);
+static void pass_args(int a0, int a1);
+static void emit_far_jump(const void *f);
+static void emit_far_call(const void *f);
 
 static void mprotect_w_x(void *start, void *end, int is_x)
 {
@@ -301,7 +381,7 @@ static void start_tcache_write(void *start, void *end)
 
 static void end_tcache_write(void *start, void *end)
 {
-#ifdef __arm__
+#if defined(__arm__) || defined(__aarch64__)
   size_t len = (char *)end - (char *)start;
   #if   defined(__BLACKBERRY_QNX__)
   msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
@@ -311,6 +391,10 @@ static void end_tcache_write(void *start, void *end)
   sceKernelSyncVMDomain(sceBlock, start, len);
   #elif defined(_3DS)
   ctr_flush_invalidate_cache();
+  #elif defined(__aarch64__)
+  // as of 2021, __clear_cache() is still broken on arm64
+  // so here is a custom one :(
+  clear_cache_arm64(start, end);
   #else
   __clear_cache(start, end);
   #endif
@@ -323,8 +407,8 @@ static void end_tcache_write(void *start, void *end)
 static void *start_block(void)
 {
   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
-  if (end > (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2))
-    end = (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2);
+  if (end > ndrc->translation_cache + sizeof(ndrc->translation_cache))
+    end = ndrc->translation_cache + sizeof(ndrc->translation_cache);
   start_tcache_write(out, end);
   return out;
 }
@@ -334,16 +418,68 @@ static void end_block(void *start)
   end_tcache_write(start, out);
 }
 
+// also takes care of w^x mappings when patching code
+static u_int needs_clear_cache[1<<(TARGET_SIZE_2-17)];
+
+static void mark_clear_cache(void *target) // flag the 4 KiB page holding target for do_clear_cache()
+{
+  uintptr_t offset = (u_char *)target - ndrc->translation_cache; // byte offset inside the translation cache
+  u_int mask = 1u << ((offset >> 12) & 31); // bit for this 4 KiB page; 32 pages per bitmap word
+  if (!(needs_clear_cache[offset >> 17] & mask)) { // only act the first time a page is marked
+    char *start = (char *)((uintptr_t)target & ~4095l); // round target down to its 4 KiB page start
+    start_tcache_write(start, start + 4095); // begin a tcache write over that page
+    needs_clear_cache[offset >> 17] |= mask; // one bitmap word per 128 KiB of cache
+  }
+}
+
+// Clearing the cache is rather slow on ARM Linux, so mark the areas
+// that need to be cleared, and then only clear these areas once.
+static void do_clear_cache(void)
+{
+  int i, j;
+  for (i = 0; i < (1<<(TARGET_SIZE_2-17)); i++) // one bitmap word per 128 KiB of cache
+  {
+    u_int bitmap = needs_clear_cache[i];
+    if (!bitmap)
+      continue;
+    for (j = 0; j < 32; j++) // one bit per 4 KiB page
+    {
+      u_char *start, *end;
+      if (!(bitmap & (1u<<j))) // unsigned shift: 1<<31 would overflow a signed int
+        continue;
+
+      start = ndrc->translation_cache + i*131072 + j*4096;
+      end = start + 4095;
+      for (j++; j < 32; j++) { // extend the range over consecutive marked pages
+        if (!(bitmap & (1u<<j)))
+          break;
+        end += 4096;
+      }
+      end_tcache_write(start, end); // one call per contiguous run of marked pages
+    }
+    needs_clear_cache[i] = 0;
+  }
+}
+
 //#define DEBUG_CYCLE_COUNT 1
 
 #define NO_CYCLE_PENALTY_THR 12
 
-int cycle_multiplier; // 100 for 1.0
+int cycle_multiplier = CYCLE_MULT_DEFAULT; // 100 for 1.0
+int cycle_multiplier_override;
+int cycle_multiplier_old;
+static int cycle_multiplier_active;
 
 static int CLOCK_ADJUST(int x)
 {
-  int s=(x>>31)|1;
-  return (x * cycle_multiplier + s * 50) / 100;
+  int m = cycle_multiplier_active;
+  int s = (x >> 31) | 1;
+  return (x * m + s * 50) / 100;
+}
+
+static int ds_writes_rjump_rs(int i)
+{
+  return dops[i].rs1 != 0 && (dops[i].rs1 == dops[i+1].rt1 || dops[i].rs1 == dops[i+1].rt2);
 }
 
 static u_int get_page(u_int vaddr)
@@ -362,71 +498,72 @@ static u_int get_vpage(u_int vaddr)
   return get_page(vaddr);
 }
 
+static struct ht_entry *hash_table_get(u_int vaddr)
+{
+  return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
+}
+
+static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr) // insert vaddr -> tcaddr into a 2-slot bin
+{
+  ht_bin->vaddr[1] = ht_bin->vaddr[0];   // old slot 0 moves to slot 1; the old slot 1 is overwritten
+  ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
+  ht_bin->vaddr[0] = vaddr;              // new mapping always goes into slot 0
+  ht_bin->tcaddr[0] = tcaddr;
+}
+
+// Ari64's original check; relies on unsigned 32-bit wraparound of the scaled distance from 'out'
+static int doesnt_expire_soon(void *tcaddr)
+{
+  u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2); // scaled so 1<<TARGET_SIZE_2 bytes span 2^32
+  return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2))); // nonzero if past the threshold
+}
+
 // Get address from virtual address
 // This is called from the recompiled JR/JALR instructions
-void *get_addr(u_int vaddr)
+void noinline *get_addr(u_int vaddr)
 {
-  struct ll_entry *head = NULL;
-  u_int page            = get_page(vaddr);
-  u_int vpage           = get_vpage(vaddr);
+  u_int page=get_page(vaddr);
+  u_int vpage=get_vpage(vaddr);
+  struct ll_entry *head;
   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
   head=jump_in[page];
-  while(head!=NULL)
-  {
-    if(head->vaddr==vaddr)
-    {
-      //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
-      u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
-      ht_bin[3]=ht_bin[1];
-      ht_bin[2]=ht_bin[0];
-      ht_bin[1]=(u_int)head->addr;
-      ht_bin[0]=vaddr;
+  while(head!=NULL) {
+    if(head->vaddr==vaddr) {
+  //printf("TRACE: count=%d next=%d (get_addr match %x: %p)\n",Count,next_interupt,vaddr,head->addr);
+      hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
       return head->addr;
     }
     head=head->next;
   }
   head=jump_dirty[vpage];
-  while(head!=NULL)
-  {
-    if(head->vaddr==vaddr)
-    {
-      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
+  while(head!=NULL) {
+    if(head->vaddr==vaddr) {
+      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %p)\n",Count,next_interupt,vaddr,head->addr);
       // Don't restore blocks which are about to expire from the cache
-      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
-        if(verify_dirty(head->addr))
-        {
-          //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
-          invalid_code[vaddr>>12]=0;
-          inv_code_start=inv_code_end=~0;
-          if(vpage<2048)
-          {
-            restore_candidate[vpage>>3]|=1<<(vpage&7);
-          }
-          else
-          {
-            restore_candidate[page>>3]|=1<<(page&7);
-          }
-          u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
+      if (doesnt_expire_soon(head->addr))
+      if (verify_dirty(head->addr)) {
+        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
+        invalid_code[vaddr>>12]=0;
+        inv_code_start=inv_code_end=~0;
+        if(vpage<2048) {
+          restore_candidate[vpage>>3]|=1<<(vpage&7);
+        }
+        else restore_candidate[page>>3]|=1<<(page&7);
+        struct ht_entry *ht_bin = hash_table_get(vaddr);
+        if (ht_bin->vaddr[0] == vaddr)
+          ht_bin->tcaddr[0] = head->addr; // Replace existing entry
+        else
+          hash_table_add(ht_bin, vaddr, head->addr);
 
-          if(ht_bin[0]==vaddr)
-            ht_bin[1]=(u_int)head->addr; // Replace existing entry
-          else
-          {
-            ht_bin[3]=ht_bin[1];
-            ht_bin[2]=ht_bin[0];
-            ht_bin[1]=(int)head->addr;
-            ht_bin[0]=vaddr;
-          }
-          return head->addr;
-        }
+        return head->addr;
+      }
     }
     head=head->next;
   }
   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
   int r=new_recompile_block(vaddr);
-  if(r==0)
-    return get_addr(vaddr);
-  // Execute in unmapped page, generate pagefault exception
+  if(r==0) return get_addr(vaddr);
+  // Execute in unmapped page, generate pagefault exception
   Status|=2;
   Cause=(vaddr<<31)|0x8;
   EPC=(vaddr&1)?vaddr-5:vaddr;
@@ -435,14 +572,13 @@ void *get_addr(u_int vaddr)
   EntryHi=BadVAddr&0xFFFFE000;
   return get_addr_ht(0x80000000);
 }
-
 // Look up address in hash table first
 void *get_addr_ht(u_int vaddr)
 {
   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
-  u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
-  if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
-  if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
+  const struct ht_entry *ht_bin = hash_table_get(vaddr); // hash bin for vaddr; slots 0 and 1 are probed below
+  if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0]; // hit in slot 0: return its stored code address
+  if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1]; // hit in slot 1; on a miss fall through to get_addr()
   return get_addr(vaddr);
 }
 
@@ -452,7 +588,7 @@ void clear_all_regs(signed char regmap[])
   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
 }
 
-signed char get_reg(signed char regmap[],int r)
+static signed char get_reg(const signed char regmap[],int r)
 {
   int hr;
   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
@@ -460,7 +596,7 @@ signed char get_reg(signed char regmap[],int r)
 }
 
 // Find a register that is available for two consecutive cycles
-signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
+static signed char get_reg2(signed char regmap1[], const signed char regmap2[], int r)
 {
   int hr;
   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
@@ -491,24 +627,7 @@ void dirty_reg(struct regstat *cur,signed char reg)
   }
 }
 
-// If we dirty the lower half of a 64 bit register which is now being
-// sign-extended, we need to dump the upper half.
-// Note: Do this only after completion of the instruction, because
-// some instructions may need to read the full 64-bit value even if
-// overwriting it (eg SLTI, DSRA32).
-static void flush_dirty_uppers(struct regstat *cur)
-{
-  int hr,reg;
-  for (hr=0;hr<HOST_REGS;hr++) {
-    if((cur->dirty>>hr)&1) {
-      reg=cur->regmap[hr];
-      if(reg>=64)
-        if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
-    }
-  }
-}
-
-void set_const(struct regstat *cur,signed char reg,uint64_t value)
+static void set_const(struct regstat *cur, signed char reg, uint32_t value)
 {
   int hr;
   if(!reg) return;
@@ -517,14 +636,10 @@ void set_const(struct regstat *cur,signed char reg,uint64_t value)
       cur->isconst|=1<<hr;
       current_constmap[hr]=value;
     }
-    else if((cur->regmap[hr]^64)==reg) {
-      cur->isconst|=1<<hr;
-      current_constmap[hr]=value>>32;
-    }
   }
 }
 
-void clear_const(struct regstat *cur,signed char reg)
+static void clear_const(struct regstat *cur, signed char reg)
 {
   int hr;
   if(!reg) return;
@@ -535,7 +650,7 @@ void clear_const(struct regstat *cur,signed char reg)
   }
 }
 
-int is_const(struct regstat *cur,signed char reg)
+static int is_const(struct regstat *cur, signed char reg)
 {
   int hr;
   if(reg<0) return 0;
@@ -547,7 +662,8 @@ int is_const(struct regstat *cur,signed char reg)
   }
   return 0;
 }
-uint64_t get_const(struct regstat *cur,signed char reg)
+
+static uint32_t get_const(struct regstat *cur, signed char reg)
 {
   int hr;
   if(!reg) return 0;
@@ -557,7 +673,7 @@ uint64_t get_const(struct regstat *cur,signed char reg)
     }
   }
   SysPrintf("Unknown constant in r%d\n",reg);
-  exit(1);
+  abort();
 }
 
 // Least soon needed registers
@@ -573,7 +689,7 @@ void lsn(u_char hsn[], int i, int *preferred_reg)
       j=slen-i-1;
       break;
     }
-    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
+    if (dops[i+j].is_ujump)
     {
       // Don't go past an unconditonal jump
       j++;
@@ -582,22 +698,23 @@ void lsn(u_char hsn[], int i, int *preferred_reg)
   }
   for(;j>=0;j--)
   {
-    if(rs1[i+j]) hsn[rs1[i+j]]=j;
-    if(rs2[i+j]) hsn[rs2[i+j]]=j;
-    if(rt1[i+j]) hsn[rt1[i+j]]=j;
-    if(rt2[i+j]) hsn[rt2[i+j]]=j;
-    if(itype[i+j]==STORE || itype[i+j]==STORELR) {
+    if(dops[i+j].rs1) hsn[dops[i+j].rs1]=j;
+    if(dops[i+j].rs2) hsn[dops[i+j].rs2]=j;
+    if(dops[i+j].rt1) hsn[dops[i+j].rt1]=j;
+    if(dops[i+j].rt2) hsn[dops[i+j].rt2]=j;
+    if(dops[i+j].itype==STORE || dops[i+j].itype==STORELR) {
       // Stores can allocate zero
-      hsn[rs1[i+j]]=j;
-      hsn[rs2[i+j]]=j;
+      hsn[dops[i+j].rs1]=j;
+      hsn[dops[i+j].rs2]=j;
     }
+    if (ram_offset && (dops[i+j].is_load || dops[i+j].is_store))
+      hsn[ROREG] = j;
     // On some architectures stores need invc_ptr
     #if defined(HOST_IMM8)
-    if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
-      hsn[INVCP]=j;
-    }
+    if (dops[i+j].is_store)
+      hsn[INVCP] = j;
     #endif
-    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
+    if(i+j>=0&&(dops[i+j].itype==UJUMP||dops[i+j].itype==CJUMP||dops[i+j].itype==SJUMP))
     {
       hsn[CCREG]=j;
       b=j;
@@ -612,37 +729,37 @@ void lsn(u_char hsn[], int i, int *preferred_reg)
       j=7-b;if(t+j>=slen) j=slen-t-1;
       for(;j>=0;j--)
       {
-        if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
-        if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
-        //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
-        //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
+        if(dops[t+j].rs1) if(hsn[dops[t+j].rs1]>j+b+2) hsn[dops[t+j].rs1]=j+b+2;
+        if(dops[t+j].rs2) if(hsn[dops[t+j].rs2]>j+b+2) hsn[dops[t+j].rs2]=j+b+2;
+        //if(dops[t+j].rt1) if(hsn[dops[t+j].rt1]>j+b+2) hsn[dops[t+j].rt1]=j+b+2;
+        //if(dops[t+j].rt2) if(hsn[dops[t+j].rt2]>j+b+2) hsn[dops[t+j].rt2]=j+b+2;
       }
     }
     // TODO: preferred register based on backward branch
   }
   // Delay slot should preferably not overwrite branch conditions or cycle count
-  if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
-    if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
-    if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
+  if (i > 0 && dops[i-1].is_jump) {
+    if(dops[i-1].rs1) if(hsn[dops[i-1].rs1]>1) hsn[dops[i-1].rs1]=1;
+    if(dops[i-1].rs2) if(hsn[dops[i-1].rs2]>1) hsn[dops[i-1].rs2]=1;
     hsn[CCREG]=1;
     // ...or hash tables
     hsn[RHASH]=1;
     hsn[RHTBL]=1;
   }
   // Coprocessor load/store needs FTEMP, even if not declared
-  if(itype[i]==C1LS||itype[i]==C2LS) {
+  if(dops[i].itype==C2LS) {
     hsn[FTEMP]=0;
   }
   // Load L/R also uses FTEMP as a temporary register
-  if(itype[i]==LOADLR) {
+  if(dops[i].itype==LOADLR) {
     hsn[FTEMP]=0;
   }
   // Also SWL/SWR/SDL/SDR
-  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
+  if(dops[i].opcode==0x2a||dops[i].opcode==0x2e||dops[i].opcode==0x2c||dops[i].opcode==0x2d) {
     hsn[FTEMP]=0;
   }
   // Don't remove the miniht registers
-  if(itype[i]==UJUMP||itype[i]==RJUMP)
+  if(dops[i].itype==UJUMP||dops[i].itype==RJUMP)
   {
     hsn[RHASH]=0;
     hsn[RHTBL]=0;
@@ -656,7 +773,7 @@ int needed_again(int r, int i)
   int b=-1;
   int rn=10;
 
-  if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
+  if (i > 0 && dops[i-1].is_ujump)
   {
     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
       return 0; // Don't need any registers if exiting the block
@@ -667,46 +784,27 @@ int needed_again(int r, int i)
       j=slen-i-1;
       break;
     }
-    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
+    if (dops[i+j].is_ujump)
     {
       // Don't go past an unconditonal jump
       j++;
       break;
     }
-    if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
+    if(dops[i+j].itype==SYSCALL||dops[i+j].itype==HLECALL||dops[i+j].itype==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
     {
       break;
     }
   }
   for(;j>=1;j--)
   {
-    if(rs1[i+j]==r) rn=j;
-    if(rs2[i+j]==r) rn=j;
+    if(dops[i+j].rs1==r) rn=j;
+    if(dops[i+j].rs2==r) rn=j;
     if((unneeded_reg[i+j]>>r)&1) rn=10;
-    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
+    if(i+j>=0&&(dops[i+j].itype==UJUMP||dops[i+j].itype==CJUMP||dops[i+j].itype==SJUMP))
     {
       b=j;
     }
   }
-  /*
-  if(b>=0)
-  {
-    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
-    {
-      // Follow first branch
-      int o=rn;
-      int t=(ba[i+b]-start)>>2;
-      j=7-b;if(t+j>=slen) j=slen-t-1;
-      for(;j>=0;j--)
-      {
-        if(!((unneeded_reg[t+j]>>r)&1)) {
-          if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
-          if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
-        }
-        else rn=o;
-      }
-    }
-  }*/
   if(rn<10) return 1;
   (void)b;
   return 0;
@@ -723,7 +821,7 @@ int loop_reg(int i, int r, int hr)
       j=slen-i-1;
       break;
     }
-    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
+    if (dops[i+j].is_ujump)
     {
       // Don't go past an unconditonal jump
       j++;
@@ -732,14 +830,14 @@ int loop_reg(int i, int r, int hr)
   }
   k=0;
   if(i>0){
-    if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
+    if(dops[i-1].itype==UJUMP||dops[i-1].itype==CJUMP||dops[i-1].itype==SJUMP)
       k--;
   }
   for(;k<j;k++)
   {
-    if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
-    if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
-    if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
+    assert(r < 64);
+    if((unneeded_reg[i+k]>>r)&1) return hr;
+    if(i+k>=0&&(dops[i+k].itype==UJUMP||dops[i+k].itype==CJUMP||dops[i+k].itype==SJUMP))
     {
       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
       {
@@ -762,8 +860,8 @@ void alloc_all(struct regstat *cur,int i)
 
   for(hr=0;hr<HOST_REGS;hr++) {
     if(hr!=EXCLUDE_REG) {
-      if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
-         ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
+      if(((cur->regmap[hr]&63)!=dops[i].rs1)&&((cur->regmap[hr]&63)!=dops[i].rs2)&&
+         ((cur->regmap[hr]&63)!=dops[i].rt1)&&((cur->regmap[hr]&63)!=dops[i].rt2))
       {
         cur->regmap[hr]=-1;
         cur->dirty&=~(1<<hr);
@@ -778,16 +876,123 @@ void alloc_all(struct regstat *cur,int i)
   }
 }
 
+#ifndef NDEBUG
+static int host_tempreg_in_use; // set by host_tempreg_acquire(), cleared by host_tempreg_release()
+
+static void host_tempreg_acquire(void) // asserts the flag is clear (no acquire without a release), then sets it
+{
+  assert(!host_tempreg_in_use);
+  host_tempreg_in_use = 1;
+}
+
+static void host_tempreg_release(void) // clears the flag unconditionally (no check that it was set)
+{
+  host_tempreg_in_use = 0;
+}
+#else
+static void host_tempreg_acquire(void) {} // NDEBUG build: tracking is compiled out
+static void host_tempreg_release(void) {} // NDEBUG build: tracking is compiled out
+#endif
+
+#ifdef ASSEM_PRINT
+extern void gen_interupt();
+extern void do_insn_cmp();
+#define FUNCNAME(f) { f, " " #f } // table initializer: address of f plus its name prefixed with one space
+static const struct {
+  void *addr;
+  const char *name;
+} function_names[] = { // address -> name table searched linearly by func_name()
+  FUNCNAME(cc_interrupt),
+  FUNCNAME(gen_interupt),
+  FUNCNAME(get_addr_ht),
+  FUNCNAME(get_addr),
+  FUNCNAME(jump_handler_read8),
+  FUNCNAME(jump_handler_read16),
+  FUNCNAME(jump_handler_read32),
+  FUNCNAME(jump_handler_write8),
+  FUNCNAME(jump_handler_write16),
+  FUNCNAME(jump_handler_write32),
+  FUNCNAME(invalidate_addr),
+  FUNCNAME(jump_to_new_pc),
+  FUNCNAME(call_gteStall),
+  FUNCNAME(new_dyna_leave),
+  FUNCNAME(pcsx_mtc0),
+  FUNCNAME(pcsx_mtc0_ds),
+#ifdef DRC_DBG
+  FUNCNAME(do_insn_cmp),
+#endif
+#ifdef __arm__
+  FUNCNAME(verify_code),
+#endif
+};
+
+static const char *func_name(const void *a) // returns the table name for address a, or "" when a is not listed
+{
+  int i;
+  for (i = 0; i < sizeof(function_names)/sizeof(function_names[0]); i++)
+    if (function_names[i].addr == a)
+      return function_names[i].name;
+  return "";
+}
+#else
+#define func_name(x) "" // without ASSEM_PRINT every lookup expands to an empty string
+#endif
+
 #ifdef __i386__
-#include "x86/assem_x86.c"
+#include "assem_x86.c"
 #endif
 #ifdef __x86_64__
-#include "x64/assem_x64.c"
+#include "assem_x64.c"
 #endif
 #ifdef __arm__
-#include "arm/assem_arm.c"
+#include "assem_arm.c"
+#endif
+#ifdef __aarch64__
+#include "assem_arm64.c"
 #endif
 
+static void *get_trampoline(const void *f) // returns &ndrc->tramp.ops[i] for the slot whose tramp.f[i] holds f, registering f if needed
+{
+  size_t i;
+
+  for (i = 0; i < ARRAY_SIZE(ndrc->tramp.f); i++) { // stop at the slot already holding f, or at the first NULL slot
+    if (ndrc->tramp.f[i] == f || ndrc->tramp.f[i] == NULL)
+      break;
+  }
+  if (i == ARRAY_SIZE(ndrc->tramp.f)) { // loop ran off the end: no match and no NULL slot, so report and abort
+    SysPrintf("trampoline table is full, last func %p\n", f);
+    abort();
+  }
+  if (ndrc->tramp.f[i] == NULL) { // unused slot: store f, bracketed by start/end_tcache_write over that one entry
+    start_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
+    ndrc->tramp.f[i] = f;
+    end_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
+  }
+  return &ndrc->tramp.ops[i];
+}
+
+static void emit_far_jump(const void *f) // emits a jump to f, going through a trampoline when f fails can_jump_or_call()
+{
+  if (can_jump_or_call(f)) { // f accepted by can_jump_or_call(): jump to it directly
+    emit_jmp(f);
+    return;
+  }
+
+  f = get_trampoline(f); // otherwise jump to the trampoline entry that get_trampoline() returns for f
+  emit_jmp(f);
+}
+
+static void emit_far_call(const void *f) // emits a call to f, going through a trampoline when f fails can_jump_or_call()
+{
+  if (can_jump_or_call(f)) { // f accepted by can_jump_or_call(): call it directly
+    emit_call(f);
+    return;
+  }
+
+  f = get_trampoline(f); // otherwise call the trampoline entry that get_trampoline() returns for f
+  emit_call(f);
+}
+
 // Add virtual address mapping to linked list
 void ll_add(struct ll_entry **head,int vaddr,void *addr)
 {
@@ -811,39 +1016,39 @@ void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr
 // but don't return addresses which are about to expire from the cache
 void *check_addr(u_int vaddr)
 {
-  u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
-  if(ht_bin[0]==vaddr) {
-    if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
-      if(isclean(ht_bin[1])) return (void *)ht_bin[1];
-  }
-  if(ht_bin[2]==vaddr) {
-    if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
-      if(isclean(ht_bin[3])) return (void *)ht_bin[3];
+  struct ht_entry *ht_bin = hash_table_get(vaddr);
+  size_t i;
+  for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
+    if (ht_bin->vaddr[i] == vaddr)
+      if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
+        if (isclean(ht_bin->tcaddr[i]))
+          return ht_bin->tcaddr[i];
   }
   u_int page=get_page(vaddr);
   struct ll_entry *head;
   head=jump_in[page];
-  while(head!=NULL) {
-    if(head->vaddr==vaddr) {
-      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
+  while (head != NULL) {
+    if (head->vaddr == vaddr) {
+      if (doesnt_expire_soon(head->addr)) {
         // Update existing entry with current address
-        if(ht_bin[0]==vaddr) {
-          ht_bin[1]=(int)head->addr;
+        if (ht_bin->vaddr[0] == vaddr) {
+          ht_bin->tcaddr[0] = head->addr;
           return head->addr;
         }
-        if(ht_bin[2]==vaddr) {
-          ht_bin[3]=(int)head->addr;
+        if (ht_bin->vaddr[1] == vaddr) {
+          ht_bin->tcaddr[1] = head->addr;
           return head->addr;
         }
         // Insert into hash table with low priority.
         // Don't evict existing entries, as they are probably
         // addresses that are being accessed frequently.
-        if(ht_bin[0]==-1) {
-          ht_bin[1]=(int)head->addr;
-          ht_bin[0]=vaddr;
-        }else if(ht_bin[2]==-1) {
-          ht_bin[3]=(int)head->addr;
-          ht_bin[2]=vaddr;
+        if (ht_bin->vaddr[0] == -1) {
+          ht_bin->vaddr[0] = vaddr;
+          ht_bin->tcaddr[0] = head->addr;
+        }
+        else if (ht_bin->vaddr[1] == -1) {
+          ht_bin->vaddr[1] = vaddr;
+          ht_bin->tcaddr[1] = head->addr;
         }
         return head->addr;
       }
@@ -856,25 +1061,29 @@ void *check_addr(u_int vaddr)
 void remove_hash(int vaddr)
 {
   //printf("remove hash: %x\n",vaddr);
-  u_int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
-  if(ht_bin[2]==vaddr) {
-    ht_bin[2]=ht_bin[3]=-1;
+  struct ht_entry *ht_bin = hash_table_get(vaddr); // hash bin that may hold vaddr in slot 0 and/or slot 1
+  if (ht_bin->vaddr[1] == vaddr) { // slot 1 holds vaddr: mark it empty (-1 / NULL)
+    ht_bin->vaddr[1] = -1;
+    ht_bin->tcaddr[1] = NULL;
   }
-  if(ht_bin[0]==vaddr) {
-    ht_bin[0]=ht_bin[2];
-    ht_bin[1]=ht_bin[3];
-    ht_bin[2]=ht_bin[3]=-1;
+  if (ht_bin->vaddr[0] == vaddr) { // slot 0 holds vaddr: copy slot 1 into slot 0, then mark slot 1 empty
+    ht_bin->vaddr[0] = ht_bin->vaddr[1];
+    ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
+    ht_bin->vaddr[1] = -1;
+    ht_bin->tcaddr[1] = NULL;
   }
 }
 
-void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
+static void ll_remove_matching_addrs(struct ll_entry **head,
+  uintptr_t base_offs_s, int shift)
 {
   struct ll_entry *next;
   while(*head) {
-    if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
-       ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
+    uintptr_t o1 = (u_char *)(*head)->addr - ndrc->translation_cache;
+    uintptr_t o2 = o1 - MAX_OUTPUT_BLOCK_SIZE;
+    if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s)
     {
-      inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
+      inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr);
       remove_hash((*head)->vaddr);
       next=(*head)->next;
       free(*head);
@@ -903,27 +1112,27 @@ void ll_clear(struct ll_entry **head)
 }
 
 // Dereference the pointers and remove if it matches
-static void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
+static void ll_kill_pointers(struct ll_entry *head,
+  uintptr_t base_offs_s, int shift) // base_offs_s is compared against (translation-cache offset >> shift)
 {
   while(head) {
-    int ptr=get_pointer(head->addr);
-    inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
-    if(((ptr>>shift)==(addr>>shift)) ||
-       (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
+    u_char *ptr = get_pointer(head->addr);
+    uintptr_t o1 = ptr - ndrc->translation_cache; // offset of ptr from the start of the translation cache
+    uintptr_t o2 = o1 - MAX_OUTPUT_BLOCK_SIZE; // the same offset moved back by MAX_OUTPUT_BLOCK_SIZE
+    inv_debug("EXP: Lookup pointer to %p at %p (%x)\n",ptr,head->addr,head->vaddr);
+    if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s) // either shifted offset matches base_offs_s
     {
-      inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
+      inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr);
       void *host_addr=find_extjump_insn(head->addr);
-      #ifdef __arm__
-        mark_clear_cache(host_addr);
-      #endif
-      set_jump_target((int)host_addr,(int)head->addr);
+      mark_clear_cache(host_addr); // now called on every target, no longer only under __arm__
+      set_jump_target(host_addr, head->addr); // retarget the jump insn found at host_addr to head->addr
     }
     head=head->next;
   }
 }
 
 // This is called when we write to a compiled block (see do_invstub)
-void invalidate_page(u_int page)
+static void invalidate_page(u_int page)
 {
   struct ll_entry *head;
   struct ll_entry *next;
@@ -939,12 +1148,10 @@ void invalidate_page(u_int page)
   head=jump_out[page];
   jump_out[page]=0;
   while(head!=NULL) {
-    inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
+    inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr);
     void *host_addr=find_extjump_insn(head->addr);
-    #ifdef __arm__
-      mark_clear_cache(host_addr);
-    #endif
-    set_jump_target((int)host_addr,(int)head->addr);
+    mark_clear_cache(host_addr);
+    set_jump_target(host_addr, head->addr); // point back to dyna_linker
     next=head->next;
     free(head);
     head=next;
@@ -959,26 +1166,21 @@ static void invalidate_block_range(u_int block, u_int first, u_int last)
   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
   assert(last<page+5);
   // Invalidate the adjacent pages if a block crosses a 4K boundary
-  while(first<page)
-  {
+  while(first<page) {
     invalidate_page(first);
     first++;
   }
-  for(first=page+1;first<last;first++)
-  {
+  for(first=page+1;first<last;first++) {
     invalidate_page(first);
   }
-
-#ifdef __arm__
   do_clear_cache();
-#endif
 
   // Don't trap writes
   invalid_code[block]=1;
 
-#ifdef USE_MINI_HT
+  #ifdef USE_MINI_HT
   memset(mini_ht,-1,sizeof(mini_ht));
-#endif
+  #endif
 }
 
 void invalidate_block(u_int block)
@@ -986,24 +1188,21 @@ void invalidate_block(u_int block)
   u_int page=get_page(block<<12);
   u_int vpage=get_vpage(block<<12);
   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
+  //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
   u_int first,last;
   first=last=page;
   struct ll_entry *head;
   head=jump_dirty[vpage];
   //printf("page=%d vpage=%d\n",page,vpage);
-  while(head!=NULL)
-  {
-    u_int start,end;
-    if(vpage>2047||(head->vaddr>>12)==block)
-    { // Ignore vaddr hash collision
-      get_bounds((int)head->addr,&start,&end);
-      //printf("start: %x end: %x\n",start,end);
-      if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE)
-      {
-        if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page)
-        {
-          if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
-          if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
+  while(head!=NULL) {
+    if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
+      u_char *start, *end;
+      get_bounds(head->addr, &start, &end);
+      //printf("start: %p end: %p\n", start, end);
+      if (page < 2048 && start >= rdram && end < rdram+RAM_SIZE) {
+        if (((start-rdram)>>12) <= page && ((end-1-rdram)>>12) >= page) {
+          if ((((start-rdram)>>12)&2047) < first) first = ((start-rdram)>>12)&2047;
+          if ((((end-1-rdram)>>12)&2047) > last)  last = ((end-1-rdram)>>12)&2047;
         }
       }
     }
@@ -1034,12 +1233,11 @@ void invalidate_addr(u_int addr)
     }
     for(;pg1<=page;pg1++) {
       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
-        u_int start,end;
-        get_bounds((int)head->addr,&start,&end);
-        if(ram_offset) {
-          start-=ram_offset;
-          end-=ram_offset;
-        }
+        u_char *start_h, *end_h;
+        u_int start, end;
+        get_bounds(head->addr, &start_h, &end_h);
+        start = (uintptr_t)start_h - ram_offset;
+        end = (uintptr_t)end_h - ram_offset;
         if(start<=addr_main&&addr_main<end) {
           if(start<addr_min) addr_min=start;
           if(end>addr_max) addr_max=end;
@@ -1078,30 +1276,37 @@ void invalidate_all_pages(void)
   for(page=0;page<4096;page++)
     invalidate_page(page);
   for(page=0;page<1048576;page++)
-  {
-    if(!invalid_code[page])
-    {
+    if(!invalid_code[page]) {
       restore_candidate[(page&2047)>>3]|=1<<(page&7);
       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
     }
-  }
-
-#ifdef USE_MINI_HT
+  #ifdef USE_MINI_HT
   memset(mini_ht,-1,sizeof(mini_ht));
-#endif
+  #endif
+  do_clear_cache();
+}
+
+static void do_invstub(int n) // emits the code for stub n: save regs, call invalidate_addr, restore regs, jump to retaddr
+{
+  literal_pool(20);
+  u_int reglist=stubs[n].a; // stubs[n].a is the register list passed to save_regs()/restore_regs()
+  set_jump_target(stubs[n].addr, out); // point the jump recorded at stubs[n].addr to the current output position
+  save_regs(reglist);
+  if(stubs[n].b!=0) emit_mov(stubs[n].b,0); // NOTE(review): presumably moves host reg b into reg 0 as the call argument - confirm emit_mov operand order
+  emit_far_call(invalidate_addr);
+  restore_regs(reglist);
+  emit_jmp(stubs[n].retaddr); // return address
 }
 
 // Add an entry to jump_out after making a link
-void add_link(u_int vaddr,void *src)
+// src should point to code emitted by emit_extjump2(); it is passed to check_extjump2() below
+void add_jump_out(u_int vaddr,void *src) // records (vaddr, src) in the jump_out list of vaddr's page
 {
   u_int page=get_page(vaddr);
-  inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
-  int *ptr=(int *)(src+4);
-  assert((*ptr&0x0fff0000)==0x059f0000);
-  (void)ptr;
+  inv_debug("add_jump_out: %p -> %x (%d)\n",src,vaddr,page);
+  check_extjump2(src); // NOTE(review): presumably validates that src is emit_extjump2() output, replacing the old ARM-specific assert - confirm
   ll_add(jump_out+page,vaddr,src);
-  //int ptr=get_pointer(src);
-  //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
+  //inv_debug("add_jump_out:  to %p\n",get_pointer(src));
 }
 
 // If a code block was found to be unmodified (bit was set in
@@ -1113,50 +1318,37 @@ void clean_blocks(u_int page)
   struct ll_entry *head;
   inv_debug("INV: clean_blocks page=%d\n",page);
   head=jump_dirty[page];
-  while(head!=NULL)
-  {
-    if(!invalid_code[head->vaddr>>12])
-    {
+  while(head!=NULL) {
+    if(!invalid_code[head->vaddr>>12]) {
       // Don't restore blocks which are about to expire from the cache
-      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
-      {
-        u_int start,end;
-        if(verify_dirty(head->addr))
-        {
-          //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
+      if (doesnt_expire_soon(head->addr)) {
+        if(verify_dirty(head->addr)) {
+          u_char *start, *end;
+          //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr);
           u_int i;
           u_int inv=0;
-          get_bounds((int)head->addr,&start,&end);
-          if(start-(u_int)rdram<RAM_SIZE)
-          {
-            for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++)
-            {
+          get_bounds(head->addr, &start, &end);
+          if (start - rdram < RAM_SIZE) {
+            for (i = (start-rdram+0x80000000)>>12; i <= (end-1-rdram+0x80000000)>>12; i++) {
               inv|=invalid_code[i];
             }
           }
-          else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE)
-          {
+          else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
             inv=1;
           }
-          if(!inv)
-          {
-            void * clean_addr=(void *)get_clean_addr((int)head->addr);
-            if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
-            {
+          if(!inv) {
+            void *clean_addr = get_clean_addr(head->addr);
+            if (doesnt_expire_soon(clean_addr)) {
               u_int ppage=page;
-              inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
+              inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr);
               //printf("page=%x, addr=%x\n",page,head->vaddr);
               //assert(head->vaddr>>12==(page|0x80000));
               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
-              u_int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
-              if(ht_bin[0]==head->vaddr)
-              {
-                ht_bin[1]=(u_int)clean_addr; // Replace existing entry
-              }
-              if(ht_bin[2]==head->vaddr)
-              {
-                ht_bin[3]=(u_int)clean_addr; // Replace existing entry
-              }
+              struct ht_entry *ht_bin = hash_table_get(head->vaddr);
+              if (ht_bin->vaddr[0] == head->vaddr)
+                ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
+              if (ht_bin->vaddr[1] == head->vaddr)
+                ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
             }
           }
         }
@@ -1166,326 +1358,443 @@ void clean_blocks(u_int page)
   }
 }
 
-static void mov_alloc(struct regstat *current,int i)
+/* Register allocation */
+
+// Give register reg a slot in cur->regmap; returns early if reg is unneeded or already mapped.
+// Note: registers are allocated clean (unmodified state); if you intend to modify the register, you must call dirty_reg().
+static void alloc_reg(struct regstat *cur,int i,signed char reg)
 {
-  // Note: Don't need to actually alloc the source registers
-  if((~current->is32>>rs1[i])&1)
+  int r,hr;
+  int preferred_reg = PREFERRED_REG_FIRST
+    + reg % (PREFERRED_REG_LAST - PREFERRED_REG_FIRST + 1);
+  if (reg == CCREG) preferred_reg = HOST_CCREG;
+  if (reg == PTEMP || reg == FTEMP) preferred_reg = 12; // NOTE(review): hard-coded host register number - confirm it is valid on every target
+  assert(PREFERRED_REG_FIRST != EXCLUDE_REG && EXCLUDE_REG != HOST_REGS);
+
+  // Don't allocate unused registers
+  if((cur->u>>reg)&1) return;
+
+  // see if it's already allocated
+  for(hr=0;hr<HOST_REGS;hr++)
   {
-    //alloc_reg64(current,i,rs1[i]);
-    alloc_reg64(current,i,rt1[i]);
-    current->is32&=~(1LL<<rt1[i]);
-  }
-  else
+    if(cur->regmap[hr]==reg) return;
+  }
+
+  // Keep the same mapping if the register was already allocated in a loop
+  preferred_reg = loop_reg(i,reg,preferred_reg);
+
+  // Try to allocate the preferred register
+  if(cur->regmap[preferred_reg]==-1) {
+    cur->regmap[preferred_reg]=reg;
+    cur->dirty&=~(1<<preferred_reg);
+    cur->isconst&=~(1<<preferred_reg);
+    return;
+  }
+  r=cur->regmap[preferred_reg];
+  assert(r < 64);
+  if((cur->u>>r)&1) { // current occupant is unneeded: take over its slot
+    cur->regmap[preferred_reg]=reg;
+    cur->dirty&=~(1<<preferred_reg);
+    cur->isconst&=~(1<<preferred_reg);
+    return;
+  }
+
+  // Clear any unneeded registers
+  // We try to keep the mapping consistent, if possible, because it
+  // makes branches easier (especially loops).  So we try to allocate
+  // first (see above) before removing old mappings.  If this is not
+  // possible then go ahead and clear out the registers that are no
+  // longer needed.
+  for(hr=0;hr<HOST_REGS;hr++)
   {
-    //alloc_reg(current,i,rs1[i]);
-    alloc_reg(current,i,rt1[i]);
-    current->is32|=(1LL<<rt1[i]);
+    r=cur->regmap[hr];
+    if(r>=0) {
+      assert(r < 64);
+      if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;} // the break means only the first unneeded mapping is dropped
+    }
   }
-  clear_const(current,rs1[i]);
-  clear_const(current,rt1[i]);
-  dirty_reg(current,rt1[i]);
-}
 
-void shiftimm_alloc(struct regstat *current,int i)
-{
-  if(opcode2[i]<=0x3) // SLL/SRL/SRA
-  {
-    if(rt1[i]) {
-      if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
-      else lt1[i]=rs1[i];
-      alloc_reg(current,i,rt1[i]);
-      current->is32|=1LL<<rt1[i];
-      dirty_reg(current,rt1[i]);
-      if(is_const(current,rs1[i])) {
-        int v=get_const(current,rs1[i]);
-        if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
-        if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
-        if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
+  // Try to allocate any available register, but prefer
+  // registers that have not been used recently.
+  if (i > 0) {
+    for (hr = PREFERRED_REG_FIRST; ; ) {
+      if (cur->regmap[hr] < 0) {
+        int oldreg = regs[i-1].regmap[hr];
+        if (oldreg < 0 || (oldreg != dops[i-1].rs1 && oldreg != dops[i-1].rs2
+             && oldreg != dops[i-1].rt1 && oldreg != dops[i-1].rt2))
+        {
+          cur->regmap[hr]=reg;
+          cur->dirty&=~(1<<hr);
+          cur->isconst&=~(1<<hr);
+          return;
+        }
       }
-      else clear_const(current,rt1[i]);
+      hr++;
+      if (hr == EXCLUDE_REG) // EXCLUDE_REG is never handed out
+        hr++;
+      if (hr == HOST_REGS)
+        hr = 0;
+      if (hr == PREFERRED_REG_FIRST) // wrapped around: every candidate was tried
+        break;
     }
   }
-  else
-  {
-    clear_const(current,rs1[i]);
-    clear_const(current,rt1[i]);
-  }
 
-  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
-  {
-    if(rt1[i]) {
-      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
-      alloc_reg64(current,i,rt1[i]);
-      current->is32&=~(1LL<<rt1[i]);
-      dirty_reg(current,rt1[i]);
+  // Try to allocate any available register
+  for (hr = PREFERRED_REG_FIRST; ; ) {
+    if (cur->regmap[hr] < 0) {
+      cur->regmap[hr]=reg;
+      cur->dirty&=~(1<<hr);
+      cur->isconst&=~(1<<hr);
+      return;
     }
+    hr++;
+    if (hr == EXCLUDE_REG)
+      hr++;
+    if (hr == HOST_REGS)
+      hr = 0;
+    if (hr == PREFERRED_REG_FIRST)
+      break;
   }
-  if(opcode2[i]==0x3c) // DSLL32
-  {
-    if(rt1[i]) {
-      if(rs1[i]) alloc_reg(current,i,rs1[i]);
-      alloc_reg64(current,i,rt1[i]);
-      current->is32&=~(1LL<<rt1[i]);
-      dirty_reg(current,rt1[i]);
+
+  // Ok, now we have to evict someone
+  // Pick a register we hopefully won't need soon
+  u_char hsn[MAXREG+1];
+  memset(hsn,10,sizeof(hsn));
+  int j;
+  lsn(hsn,i,&preferred_reg);
+  //printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",cur->regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]);
+  //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
+  if(i>0) {
+    // Don't evict the cycle count at entry points, otherwise the entry
+    // stub will have to write it.
+    if(dops[i].bt&&hsn[CCREG]>2) hsn[CCREG]=2;
+    if (i>1 && hsn[CCREG] > 2 && dops[i-2].is_jump) hsn[CCREG]=2;
+    for(j=10;j>=3;j--)
+    {
+      // Alloc preferred register if available
+      if(hsn[r=cur->regmap[preferred_reg]&63]==j) {
+        for(hr=0;hr<HOST_REGS;hr++) {
+          // Evict both parts of a 64-bit register
+          if((cur->regmap[hr]&63)==r) {
+            cur->regmap[hr]=-1;
+            cur->dirty&=~(1<<hr);
+            cur->isconst&=~(1<<hr);
+          }
+        }
+        cur->regmap[preferred_reg]=reg;
+        return;
+      }
+      for(r=1;r<=MAXREG;r++)
+      {
+        if(hsn[r]==j&&r!=dops[i-1].rs1&&r!=dops[i-1].rs2&&r!=dops[i-1].rt1&&r!=dops[i-1].rt2) {
+          for(hr=0;hr<HOST_REGS;hr++) {
+            if(hr!=HOST_CCREG||j<hsn[CCREG]) {
+              if(cur->regmap[hr]==r) {
+                cur->regmap[hr]=reg;
+                cur->dirty&=~(1<<hr);
+                cur->isconst&=~(1<<hr);
+                return;
+              }
+            }
+          }
+        }
+      }
     }
   }
-  if(opcode2[i]==0x3e) // DSRL32
+  for(j=10;j>=0;j--) // last resort: take any mapped register, highest hsn value first
   {
-    if(rt1[i]) {
-      alloc_reg64(current,i,rs1[i]);
-      if(imm[i]==32) {
-        alloc_reg64(current,i,rt1[i]);
-        current->is32&=~(1LL<<rt1[i]);
-      } else {
-        alloc_reg(current,i,rt1[i]);
-        current->is32|=1LL<<rt1[i];
+    for(r=1;r<=MAXREG;r++)
+    {
+      if(hsn[r]==j) {
+        for(hr=0;hr<HOST_REGS;hr++) {
+          if(cur->regmap[hr]==r) {
+            cur->regmap[hr]=reg;
+            cur->dirty&=~(1<<hr);
+            cur->isconst&=~(1<<hr);
+            return;
+          }
+        }
       }
-      dirty_reg(current,rt1[i]);
     }
   }
-  if(opcode2[i]==0x3f) // DSRA32
-  {
-    if(rt1[i]) {
-      alloc_reg64(current,i,rs1[i]);
-      alloc_reg(current,i,rt1[i]);
-      current->is32|=1LL<<rt1[i];
-      dirty_reg(current,rt1[i]);
-    }
+  SysPrintf("This shouldn't happen (alloc_reg)");abort();
+}
+
+// Allocate a temporary register.  This is done without regard to
+// dirty status or whether the register we request is on the unneeded list.
+// Note: This will only allocate one register, even if called multiple times (an existing mapping of reg is reused).
+static void alloc_reg_temp(struct regstat *cur,int i,signed char reg)
+{
+  int r,hr;
+  int preferred_reg = -1; // only handed to lsn() below; this function never picks a slot by preference
+
+  // see if it's already allocated
+  for(hr=0;hr<HOST_REGS;hr++)
+  {
+    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==reg) return;
+  }
+
+  // Try to allocate any available register
+  for(hr=HOST_REGS-1;hr>=0;hr--) { // scans from the highest host register down
+    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
+      cur->regmap[hr]=reg;
+      cur->dirty&=~(1<<hr);
+      cur->isconst&=~(1<<hr);
+      return;
+    }
+  }
+
+  // Find an unneeded register
+  for(hr=HOST_REGS-1;hr>=0;hr--)
+  {
+    r=cur->regmap[hr];
+    if(r>=0) {
+      assert(r < 64);
+      if((cur->u>>r)&1) {
+        if(i==0||((unneeded_reg[i-1]>>r)&1)) { // must also be unneeded at the previous instruction
+          cur->regmap[hr]=reg;
+          cur->dirty&=~(1<<hr);
+          cur->isconst&=~(1<<hr);
+          return;
+        }
+      }
+    }
+  }
+
+  // Ok, now we have to evict someone
+  // Pick a register we hopefully won't need soon
+  // TODO: we might want to follow unconditional jumps here
+  // TODO: get rid of dupe code and make this into a function
+  u_char hsn[MAXREG+1];
+  memset(hsn,10,sizeof(hsn));
+  int j;
+  lsn(hsn,i,&preferred_reg);
+  //printf("hsn: %d %d %d %d %d %d %d\n",hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
+  if(i>0) {
+    // Don't evict the cycle count at entry points, otherwise the entry
+    // stub will have to write it.
+    if(dops[i].bt&&hsn[CCREG]>2) hsn[CCREG]=2;
+    if (i>1 && hsn[CCREG] > 2 && dops[i-2].is_jump) hsn[CCREG]=2;
+    for(j=10;j>=3;j--)
+    {
+      for(r=1;r<=MAXREG;r++)
+      {
+        if(hsn[r]==j&&r!=dops[i-1].rs1&&r!=dops[i-1].rs2&&r!=dops[i-1].rt1&&r!=dops[i-1].rt2) {
+          for(hr=0;hr<HOST_REGS;hr++) {
+            if(hr!=HOST_CCREG||hsn[CCREG]>2) { // HOST_CCREG is left alone while hsn[CCREG] <= 2
+              if(cur->regmap[hr]==r) {
+                cur->regmap[hr]=reg;
+                cur->dirty&=~(1<<hr);
+                cur->isconst&=~(1<<hr);
+                return;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  for(j=10;j>=0;j--) // last resort: take any mapped register, highest hsn value first
+  {
+    for(r=1;r<=MAXREG;r++)
+    {
+      if(hsn[r]==j) {
+        for(hr=0;hr<HOST_REGS;hr++) {
+          if(cur->regmap[hr]==r) {
+            cur->regmap[hr]=reg;
+            cur->dirty&=~(1<<hr);
+            cur->isconst&=~(1<<hr);
+            return;
+          }
+        }
+      }
+    }
+  }
+  SysPrintf("This shouldn't happen");abort();
+}
+
+static void mov_alloc(struct regstat *current,int i) // allocates only the target rt1; adds the cycle counter when rs1 is HI/LO
+{
+  if (dops[i].rs1 == HIREG || dops[i].rs1 == LOREG) {
+    alloc_cc(current,i); // for stalls
+    dirty_reg(current,CCREG);
+  }
+
+  // Note: Don't need to actually alloc the source registers
+  //alloc_reg(current,i,dops[i].rs1);
+  alloc_reg(current,i,dops[i].rt1);
+
+  clear_const(current,dops[i].rs1); // constant tracking is dropped for both source and target
+  clear_const(current,dops[i].rt1);
+  dirty_reg(current,dops[i].rt1);
+}
+
+static void shiftimm_alloc(struct regstat *current,int i) // immediate shifts: allocates rs1/rt1 and folds the result when rs1 is a known constant
+{
+  if(dops[i].opcode2<=0x3) // SLL/SRL/SRA
+  {
+    if(dops[i].rt1) {
+      if(dops[i].rs1&&needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
+      else dops[i].lt1=dops[i].rs1;
+      alloc_reg(current,i,dops[i].rt1);
+      dirty_reg(current,dops[i].rt1);
+      if(is_const(current,dops[i].rs1)) {
+        int v=get_const(current,dops[i].rs1);
+        if(dops[i].opcode2==0x00) set_const(current,dops[i].rt1,v<<imm[i]);
+        if(dops[i].opcode2==0x02) set_const(current,dops[i].rt1,(u_int)v>>imm[i]); // unsigned cast: zero-filling shift
+        if(dops[i].opcode2==0x03) set_const(current,dops[i].rt1,v>>imm[i]);
+      }
+      else clear_const(current,dops[i].rt1);
+    }
+  }
+  else
+  {
+    clear_const(current,dops[i].rs1);
+    clear_const(current,dops[i].rt1);
+  }
+
+  if(dops[i].opcode2>=0x38&&dops[i].opcode2<=0x3b) // DSLL/DSRL/DSRA
+  {
+    assert(0); // the 64-bit shift opcodes below are all rejected
+  }
+  if(dops[i].opcode2==0x3c) // DSLL32
+  {
+    assert(0);
+  }
+  if(dops[i].opcode2==0x3e) // DSRL32
+  {
+    assert(0);
+  }
+  if(dops[i].opcode2==0x3f) // DSRA32
+  {
+    assert(0);
   }
 }
 
-void shift_alloc(struct regstat *current,int i)
+static void shift_alloc(struct regstat *current,int i) // register-operand shifts: SLLV/SRLV/SRAV only, the 64-bit forms assert
 {
-  if(rt1[i]) {
-    if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
+  if(dops[i].rt1) {
+    if(dops[i].opcode2<=0x07) // SLLV/SRLV/SRAV
     {
-      if(rs1[i]) alloc_reg(current,i,rs1[i]);
-      if(rs2[i]) alloc_reg(current,i,rs2[i]);
-      alloc_reg(current,i,rt1[i]);
-      if(rt1[i]==rs2[i]) {
+      if(dops[i].rs1) alloc_reg(current,i,dops[i].rs1);
+      if(dops[i].rs2) alloc_reg(current,i,dops[i].rs2);
+      alloc_reg(current,i,dops[i].rt1);
+      if(dops[i].rt1==dops[i].rs2) { // target is the same register as rs2: reserve a temporary
         alloc_reg_temp(current,i,-1);
         minimum_free_regs[i]=1;
       }
-      current->is32|=1LL<<rt1[i];
     } else { // DSLLV/DSRLV/DSRAV
-      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
-      if(rs2[i]) alloc_reg(current,i,rs2[i]);
-      alloc_reg64(current,i,rt1[i]);
-      current->is32&=~(1LL<<rt1[i]);
-      if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
-      {
-        alloc_reg_temp(current,i,-1);
-        minimum_free_regs[i]=1;
-      }
+      assert(0); // 64-bit variants are rejected
     }
-    clear_const(current,rs1[i]);
-    clear_const(current,rs2[i]);
-    clear_const(current,rt1[i]);
-    dirty_reg(current,rt1[i]);
+    clear_const(current,dops[i].rs1);
+    clear_const(current,dops[i].rs2);
+    clear_const(current,dops[i].rt1);
+    dirty_reg(current,dops[i].rt1);
   }
 }
 
-void alu_alloc(struct regstat *current,int i)
+static void alu_alloc(struct regstat *current,int i) // ADD/SUB, SLT/SLTU and AND/OR/XOR/NOR; the 64-bit DADD/DSUB family asserts
 {
-  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
-    if(rt1[i]) {
-      if(rs1[i]&&rs2[i]) {
-        alloc_reg(current,i,rs1[i]);
-        alloc_reg(current,i,rs2[i]);
+  if(dops[i].opcode2>=0x20&&dops[i].opcode2<=0x23) { // ADD/ADDU/SUB/SUBU
+    if(dops[i].rt1) {
+      if(dops[i].rs1&&dops[i].rs2) {
+        alloc_reg(current,i,dops[i].rs1);
+        alloc_reg(current,i,dops[i].rs2);
       }
       else {
-        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
-        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
+        if(dops[i].rs1&&needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
+        if(dops[i].rs2&&needed_again(dops[i].rs2,i)) alloc_reg(current,i,dops[i].rs2);
       }
-      alloc_reg(current,i,rt1[i]);
+      alloc_reg(current,i,dops[i].rt1);
     }
-    current->is32|=1LL<<rt1[i];
   }
-  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
-    if(rt1[i]) {
-      if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
-      {
-        alloc_reg64(current,i,rs1[i]);
-        alloc_reg64(current,i,rs2[i]);
-        alloc_reg(current,i,rt1[i]);
-      } else {
-        alloc_reg(current,i,rs1[i]);
-        alloc_reg(current,i,rs2[i]);
-        alloc_reg(current,i,rt1[i]);
-      }
+  if(dops[i].opcode2==0x2a||dops[i].opcode2==0x2b) { // SLT/SLTU
+    if(dops[i].rt1) {
+      alloc_reg(current,i,dops[i].rs1); // both sources are requested without the needed_again() check used above
+      alloc_reg(current,i,dops[i].rs2);
+      alloc_reg(current,i,dops[i].rt1);
     }
-    current->is32|=1LL<<rt1[i];
   }
-  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
-    if(rt1[i]) {
-      if(rs1[i]&&rs2[i]) {
-        alloc_reg(current,i,rs1[i]);
-        alloc_reg(current,i,rs2[i]);
+  if(dops[i].opcode2>=0x24&&dops[i].opcode2<=0x27) { // AND/OR/XOR/NOR
+    if(dops[i].rt1) {
+      if(dops[i].rs1&&dops[i].rs2) {
+        alloc_reg(current,i,dops[i].rs1);
+        alloc_reg(current,i,dops[i].rs2);
       }
       else
       {
-        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
-        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
-      }
-      alloc_reg(current,i,rt1[i]);
-      if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
-      {
-        if(!((current->uu>>rt1[i])&1)) {
-          alloc_reg64(current,i,rt1[i]);
-        }
-        if(get_reg(current->regmap,rt1[i]|64)>=0) {
-          if(rs1[i]&&rs2[i]) {
-            alloc_reg64(current,i,rs1[i]);
-            alloc_reg64(current,i,rs2[i]);
-          }
-          else
-          {
-            // Is is really worth it to keep 64-bit values in registers?
-            #ifdef NATIVE_64BIT
-            if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
-            if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
-            #endif
-          }
-        }
-        current->is32&=~(1LL<<rt1[i]);
-      } else {
-        current->is32|=1LL<<rt1[i];
+        if(dops[i].rs1&&needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
+        if(dops[i].rs2&&needed_again(dops[i].rs2,i)) alloc_reg(current,i,dops[i].rs2);
       }
+      alloc_reg(current,i,dops[i].rt1);
     }
   }
-  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
-    if(rt1[i]) {
-      if(rs1[i]&&rs2[i]) {
-        if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
-          alloc_reg64(current,i,rs1[i]);
-          alloc_reg64(current,i,rs2[i]);
-          alloc_reg64(current,i,rt1[i]);
-        } else {
-          alloc_reg(current,i,rs1[i]);
-          alloc_reg(current,i,rs2[i]);
-          alloc_reg(current,i,rt1[i]);
-        }
-      }
-      else {
-        alloc_reg(current,i,rt1[i]);
-        if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
-          // DADD used as move, or zeroing
-          // If we have a 64-bit source, then make the target 64 bits too
-          if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
-            if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
-            alloc_reg64(current,i,rt1[i]);
-          } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
-            if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
-            alloc_reg64(current,i,rt1[i]);
-          }
-          if(opcode2[i]>=0x2e&&rs2[i]) {
-            // DSUB used as negation - 64-bit result
-            // If we have a 32-bit register, extend it to 64 bits
-            if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
-            alloc_reg64(current,i,rt1[i]);
-          }
-        }
-      }
-      if(rs1[i]&&rs2[i]) {
-        current->is32&=~(1LL<<rt1[i]);
-      } else if(rs1[i]) {
-        current->is32&=~(1LL<<rt1[i]);
-        if((current->is32>>rs1[i])&1)
-          current->is32|=1LL<<rt1[i];
-      } else if(rs2[i]) {
-        current->is32&=~(1LL<<rt1[i]);
-        if((current->is32>>rs2[i])&1)
-          current->is32|=1LL<<rt1[i];
-      } else {
-        current->is32|=1LL<<rt1[i];
-      }
-    }
+  if(dops[i].opcode2>=0x2c&&dops[i].opcode2<=0x2f) { // DADD/DADDU/DSUB/DSUBU
+    assert(0); // 64-bit arithmetic is rejected
   }
-  clear_const(current,rs1[i]);
-  clear_const(current,rs2[i]);
-  clear_const(current,rt1[i]);
-  dirty_reg(current,rt1[i]);
+  clear_const(current,dops[i].rs1);
+  clear_const(current,dops[i].rs2);
+  clear_const(current,dops[i].rt1);
+  dirty_reg(current,dops[i].rt1);
 }
 
-void imm16_alloc(struct regstat *current,int i)
+static void imm16_alloc(struct regstat *current,int i) // immediate ALU ops and LUI; keeps constant tracking for ANDI/ORI/XORI, ADDI/ADDIU and LUI
 {
-  if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
-  else lt1[i]=rs1[i];
-  if(rt1[i]) alloc_reg(current,i,rt1[i]);
-  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
-    current->is32&=~(1LL<<rt1[i]);
-    if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
-      // TODO: Could preserve the 32-bit flag if the immediate is zero
-      alloc_reg64(current,i,rt1[i]);
-      alloc_reg64(current,i,rs1[i]);
-    }
-    clear_const(current,rs1[i]);
-    clear_const(current,rt1[i]);
+  if(dops[i].rs1&&needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
+  else dops[i].lt1=dops[i].rs1;
+  if(dops[i].rt1) alloc_reg(current,i,dops[i].rt1);
+  if(dops[i].opcode==0x18||dops[i].opcode==0x19) { // DADDI/DADDIU
+    assert(0); // 64-bit immediates are rejected
   }
-  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
-    if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
-    current->is32|=1LL<<rt1[i];
-    clear_const(current,rs1[i]);
-    clear_const(current,rt1[i]);
+  else if(dops[i].opcode==0x0a||dops[i].opcode==0x0b) { // SLTI/SLTIU
+    clear_const(current,dops[i].rs1);
+    clear_const(current,dops[i].rt1);
   }
-  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
-    if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
-      if(rs1[i]!=rt1[i]) {
-        if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
-        alloc_reg64(current,i,rt1[i]);
-        current->is32&=~(1LL<<rt1[i]);
-      }
+  else if(dops[i].opcode>=0x0c&&dops[i].opcode<=0x0e) { // ANDI/ORI/XORI
+    if(is_const(current,dops[i].rs1)) {
+      int v=get_const(current,dops[i].rs1);
+      if(dops[i].opcode==0x0c) set_const(current,dops[i].rt1,v&imm[i]);
+      if(dops[i].opcode==0x0d) set_const(current,dops[i].rt1,v|imm[i]);
+      if(dops[i].opcode==0x0e) set_const(current,dops[i].rt1,v^imm[i]);
     }
-    else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
-    if(is_const(current,rs1[i])) {
-      int v=get_const(current,rs1[i]);
-      if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
-      if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
-      if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
-    }
-    else clear_const(current,rt1[i]);
+    else clear_const(current,dops[i].rt1);
   }
-  else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
-    if(is_const(current,rs1[i])) {
-      int v=get_const(current,rs1[i]);
-      set_const(current,rt1[i],v+imm[i]);
+  else if(dops[i].opcode==0x08||dops[i].opcode==0x09) { // ADDI/ADDIU
+    if(is_const(current,dops[i].rs1)) {
+      int v=get_const(current,dops[i].rs1);
+      set_const(current,dops[i].rt1,v+imm[i]);
     }
-    else clear_const(current,rt1[i]);
-    current->is32|=1LL<<rt1[i];
+    else clear_const(current,dops[i].rt1);
   }
   else {
-    set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
-    current->is32|=1LL<<rt1[i];
+    set_const(current,dops[i].rt1,imm[i]<<16); // LUI - NOTE(review): shifts imm[i] with no cast, unlike the removed line; confirm imm[i] cannot be negative here
   }
-  dirty_reg(current,rt1[i]);
+  dirty_reg(current,dops[i].rt1);
 }
 
-void load_alloc(struct regstat *current,int i)
+static void load_alloc(struct regstat *current,int i)
 {
-  clear_const(current,rt1[i]);
-  //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
-  if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
-  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
-  if(rt1[i]&&!((current->u>>rt1[i])&1)) {
-    alloc_reg(current,i,rt1[i]);
-    assert(get_reg(current->regmap,rt1[i])>=0);
-    if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
+  clear_const(current,dops[i].rt1);
+  //if(dops[i].rs1!=dops[i].rt1&&needed_again(dops[i].rs1,i)) clear_const(current,dops[i].rs1); // Does this help or hurt?
+  if(!dops[i].rs1) current->u&=~1LL; // Allow allocating r0 if it's the source register
+  if (needed_again(dops[i].rs1, i))
+    alloc_reg(current, i, dops[i].rs1);
+  if (ram_offset)
+    alloc_reg(current, i, ROREG);
+  if(dops[i].rt1&&!((current->u>>dops[i].rt1)&1)) {
+    alloc_reg(current,i,dops[i].rt1);
+    assert(get_reg(current->regmap,dops[i].rt1)>=0);
+    if(dops[i].opcode==0x27||dops[i].opcode==0x37) // LWU/LD
     {
-      current->is32&=~(1LL<<rt1[i]);
-      alloc_reg64(current,i,rt1[i]);
+      assert(0);
     }
-    else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
+    else if(dops[i].opcode==0x1A||dops[i].opcode==0x1B) // LDL/LDR
     {
-      current->is32&=~(1LL<<rt1[i]);
-      alloc_reg64(current,i,rt1[i]);
-      alloc_all(current,i);
-      alloc_reg64(current,i,FTEMP);
-      minimum_free_regs[i]=HOST_REGS;
+      assert(0);
     }
-    else current->is32|=1LL<<rt1[i];
-    dirty_reg(current,rt1[i]);
+    dirty_reg(current,dops[i].rt1);
     // LWL/LWR need a temporary register for the old value
-    if(opcode[i]==0x22||opcode[i]==0x26)
+    if(dops[i].opcode==0x22||dops[i].opcode==0x26)
     {
       alloc_reg(current,i,FTEMP);
       alloc_reg_temp(current,i,-1);
@@ -1496,36 +1805,35 @@ void load_alloc(struct regstat *current,int i)
   {
     // Load to r0 or unneeded register (dummy load)
     // but we still need a register to calculate the address
-    if(opcode[i]==0x22||opcode[i]==0x26)
+    if(dops[i].opcode==0x22||dops[i].opcode==0x26)
     {
       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
     }
     alloc_reg_temp(current,i,-1);
     minimum_free_regs[i]=1;
-    if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
+    if(dops[i].opcode==0x1A||dops[i].opcode==0x1B) // LDL/LDR
     {
-      alloc_all(current,i);
-      alloc_reg64(current,i,FTEMP);
-      minimum_free_regs[i]=HOST_REGS;
+      assert(0);
     }
   }
 }
 
 void store_alloc(struct regstat *current,int i)
 {
-  clear_const(current,rs2[i]);
-  if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
-  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
-  alloc_reg(current,i,rs2[i]);
-  if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
-    alloc_reg64(current,i,rs2[i]);
-    if(rs2[i]) alloc_reg(current,i,FTEMP);
-  }
+  clear_const(current,dops[i].rs2);
+  if(!(dops[i].rs2)) current->u&=~1LL; // Allow allocating r0 if necessary
+  if(needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
+  alloc_reg(current,i,dops[i].rs2);
+  if(dops[i].opcode==0x2c||dops[i].opcode==0x2d||dops[i].opcode==0x3f) { // 64-bit SDL/SDR/SD
+    assert(0);
+  }
+  if (ram_offset)
+    alloc_reg(current, i, ROREG);
   #if defined(HOST_IMM8)
   // On CPUs without 32-bit immediates we need a pointer to invalid_code
-  else alloc_reg(current,i,INVCP);
+  alloc_reg(current, i, INVCP);
   #endif
-  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWL/SDL/SDR
+  if(dops[i].opcode==0x2a||dops[i].opcode==0x2e||dops[i].opcode==0x2c||dops[i].opcode==0x2d) { // SWL/SWR/SDL/SDR
     alloc_reg(current,i,FTEMP);
   }
   // We need a temporary register for address generation
@@ -1535,31 +1843,20 @@ void store_alloc(struct regstat *current,int i)
 
 void c1ls_alloc(struct regstat *current,int i)
 {
-  //clear_const(current,rs1[i]); // FIXME
-  clear_const(current,rt1[i]);
-  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
+  clear_const(current,dops[i].rt1); // after this change only rt1 constness is cleared and CSREG is allocated; no FTEMP, INVCP or address temporary
   alloc_reg(current,i,CSREG); // Status
-  alloc_reg(current,i,FTEMP);
-  if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
-    alloc_reg64(current,i,FTEMP);
-  }
-  #if defined(HOST_IMM8)
-  // On CPUs without 32-bit immediates we need a pointer to invalid_code
-  else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
-    alloc_reg(current,i,INVCP);
-  #endif
-  // We need a temporary register for address generation
-  alloc_reg_temp(current,i,-1);
 }
 
 void c2ls_alloc(struct regstat *current,int i)
 {
-  clear_const(current,rt1[i]);
-  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
+  clear_const(current,dops[i].rt1);
+  if(needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
   alloc_reg(current,i,FTEMP);
+  if (ram_offset)
+    alloc_reg(current, i, ROREG);
   #if defined(HOST_IMM8)
   // On CPUs without 32-bit immediates we need a pointer to invalid_code
-  if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
+  if (dops[i].opcode == 0x3a) // SWC2
     alloc_reg(current,i,INVCP);
   #endif
   // We need a temporary register for address generation
@@ -1578,39 +1875,25 @@ void multdiv_alloc(struct regstat *current,int i)
   //  case 0x1D: DMULTU
   //  case 0x1E: DDIV
   //  case 0x1F: DDIVU
-  clear_const(current,rs1[i]);
-  clear_const(current,rs2[i]);
-  if(rs1[i]&&rs2[i])
+  clear_const(current,dops[i].rs1);
+  clear_const(current,dops[i].rs2);
+  alloc_cc(current,i); // for stalls
+  if(dops[i].rs1&&dops[i].rs2)
   {
-    if((opcode2[i]&4)==0) // 32-bit
+    if((dops[i].opcode2&4)==0) // 32-bit
     {
       current->u&=~(1LL<<HIREG);
       current->u&=~(1LL<<LOREG);
       alloc_reg(current,i,HIREG);
       alloc_reg(current,i,LOREG);
-      alloc_reg(current,i,rs1[i]);
-      alloc_reg(current,i,rs2[i]);
-      current->is32|=1LL<<HIREG;
-      current->is32|=1LL<<LOREG;
+      alloc_reg(current,i,dops[i].rs1);
+      alloc_reg(current,i,dops[i].rs2);
       dirty_reg(current,HIREG);
       dirty_reg(current,LOREG);
     }
     else // 64-bit
     {
-      current->u&=~(1LL<<HIREG);
-      current->u&=~(1LL<<LOREG);
-      current->uu&=~(1LL<<HIREG);
-      current->uu&=~(1LL<<LOREG);
-      alloc_reg64(current,i,HIREG);
-      //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
-      alloc_reg64(current,i,rs1[i]);
-      alloc_reg64(current,i,rs2[i]);
-      alloc_all(current,i);
-      current->is32&=~(1LL<<HIREG);
-      current->is32&=~(1LL<<LOREG);
-      dirty_reg(current,HIREG);
-      dirty_reg(current,LOREG);
-      minimum_free_regs[i]=HOST_REGS;
+      assert(0);
     }
   }
   else
@@ -1620,8 +1903,6 @@ void multdiv_alloc(struct regstat *current,int i)
     // The result is undefined, we return zero.
     alloc_reg(current,i,HIREG);
     alloc_reg(current,i,LOREG);
-    current->is32|=1LL<<HIREG;
-    current->is32|=1LL<<LOREG;
     dirty_reg(current,HIREG);
     dirty_reg(current,LOREG);
   }
@@ -1630,21 +1911,20 @@ void multdiv_alloc(struct regstat *current,int i)
 
 void cop0_alloc(struct regstat *current,int i)
 {
-  if(opcode2[i]==0) // MFC0
+  if(dops[i].opcode2==0) // MFC0
   {
-    if(rt1[i]) {
-      clear_const(current,rt1[i]);
+    if(dops[i].rt1) {
+      clear_const(current,dops[i].rt1);
       alloc_all(current,i);
-      alloc_reg(current,i,rt1[i]);
-      current->is32|=1LL<<rt1[i];
-      dirty_reg(current,rt1[i]);
+      alloc_reg(current,i,dops[i].rt1);
+      dirty_reg(current,dops[i].rt1);
     }
   }
-  else if(opcode2[i]==4) // MTC0
+  else if(dops[i].opcode2==4) // MTC0
   {
-    if(rs1[i]){
-      clear_const(current,rs1[i]);
-      alloc_reg(current,i,rs1[i]);
+    if(dops[i].rs1){
+      clear_const(current,dops[i].rs1);
+      alloc_reg(current,i,dops[i].rs1);
       alloc_all(current,i);
     }
     else {
@@ -1656,72 +1936,45 @@ void cop0_alloc(struct regstat *current,int i)
   else
   {
     // TLBR/TLBWI/TLBWR/TLBP/ERET
-    assert(opcode2[i]==0x10);
+    assert(dops[i].opcode2==0x10);
     alloc_all(current,i);
   }
   minimum_free_regs[i]=HOST_REGS;
 }
 
-void cop1_alloc(struct regstat *current,int i)
+static void cop2_alloc(struct regstat *current,int i) // COP2 register moves (MFC2/CFC2 and MTC2/CTC2); always reserves one temporary
 {
-  alloc_reg(current,i,CSREG); // Load status
-  if(opcode2[i]<3) // MFC1/DMFC1/CFC1
+  if (dops[i].opcode2 < 3) // MFC2/CFC2
   {
-    if(rt1[i]){
-      clear_const(current,rt1[i]);
-      if(opcode2[i]==1) {
-        alloc_reg64(current,i,rt1[i]); // DMFC1
-        current->is32&=~(1LL<<rt1[i]);
-      }else{
-        alloc_reg(current,i,rt1[i]); // MFC1/CFC1
-        current->is32|=1LL<<rt1[i];
-      }
-      dirty_reg(current,rt1[i]);
+    alloc_cc(current,i); // for stalls
+    dirty_reg(current,CCREG);
+    if(dops[i].rt1){
+      clear_const(current,dops[i].rt1);
+      alloc_reg(current,i,dops[i].rt1);
+      dirty_reg(current,dops[i].rt1);
     }
-    alloc_reg_temp(current,i,-1);
   }
-  else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
+  else if (dops[i].opcode2 > 3) // MTC2/CTC2
   {
-    if(rs1[i]){
-      clear_const(current,rs1[i]);
-      if(opcode2[i]==5)
-        alloc_reg64(current,i,rs1[i]); // DMTC1
-      else
-        alloc_reg(current,i,rs1[i]); // MTC1/CTC1
-      alloc_reg_temp(current,i,-1);
+    if(dops[i].rs1){
+      clear_const(current,dops[i].rs1);
+      alloc_reg(current,i,dops[i].rs1);
     }
     else {
       current->u&=~1LL;
       alloc_reg(current,i,0);
-      alloc_reg_temp(current,i,-1);
     }
   }
-  minimum_free_regs[i]=1;
-}
-void fconv_alloc(struct regstat *current,int i)
-{
-  alloc_reg(current,i,CSREG); // Load status
-  alloc_reg_temp(current,i,-1);
-  minimum_free_regs[i]=1;
-}
-void float_alloc(struct regstat *current,int i)
-{
-  alloc_reg(current,i,CSREG); // Load status
   alloc_reg_temp(current,i,-1);
   minimum_free_regs[i]=1;
 }
+
 void c2op_alloc(struct regstat *current,int i)
 {
+  alloc_cc(current,i); // for stalls
+  dirty_reg(current,CCREG); // CCREG is flagged dirty because it will be modified
   alloc_reg_temp(current,i,-1);
 }
-void fcomp_alloc(struct regstat *current,int i)
-{
-  alloc_reg(current,i,CSREG); // Load status
-  alloc_reg(current,i,FSREG); // Load flags
-  dirty_reg(current,FSREG); // Flag will be modified
-  alloc_reg_temp(current,i,-1);
-  minimum_free_regs[i]=1;
-}
 
 void syscall_alloc(struct regstat *current,int i)
 {
@@ -1734,17 +1987,15 @@ void syscall_alloc(struct regstat *current,int i)
 
 void delayslot_alloc(struct regstat *current,int i)
 {
-  switch(itype[i])
-  {
+  switch(dops[i].itype) {
     case UJUMP:
     case CJUMP:
     case SJUMP:
     case RJUMP:
-    case FJUMP:
     case SYSCALL:
     case HLECALL:
     case SPAN:
-      assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
+      assem_debug("jump in the delay slot.  this shouldn't happen.\n");//abort();
       SysPrintf("Disabled speculative precompilation\n");
       stop_after_jal=1;
       break;
@@ -1778,8 +2029,9 @@ void delayslot_alloc(struct regstat *current,int i)
       cop0_alloc(current,i);
       break;
     case COP1:
+      break;
     case COP2:
-      cop1_alloc(current,i);
+      cop2_alloc(current,i);
       break;
     case C1LS:
       c1ls_alloc(current,i);
@@ -1787,15 +2039,6 @@ void delayslot_alloc(struct regstat *current,int i)
     case C2LS:
       c2ls_alloc(current,i);
       break;
-    case FCONV:
-      fconv_alloc(current,i);
-      break;
-    case FLOAT:
-      float_alloc(current,i);
-      break;
-    case FCOMP:
-      fcomp_alloc(current,i);
-      break;
     case C2OP:
       c2op_alloc(current,i);
       break;
@@ -1812,309 +2055,174 @@ static void pagespan_alloc(struct regstat *current,int i)
   alloc_all(current,i);
   alloc_cc(current,i);
   dirty_reg(current,CCREG);
-  if(opcode[i]==3) // JAL
+  if(dops[i].opcode==3) // JAL
   {
     alloc_reg(current,i,31);
     dirty_reg(current,31);
   }
-  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
-  {
-    alloc_reg(current,i,rs1[i]);
-    if (rt1[i]!=0) {
-      alloc_reg(current,i,rt1[i]);
-      dirty_reg(current,rt1[i]);
-    }
-  }
-  if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
+  if(dops[i].opcode==0&&(dops[i].opcode2&0x3E)==8) // JR/JALR
   {
-    if(rs1[i]) alloc_reg(current,i,rs1[i]);
-    if(rs2[i]) alloc_reg(current,i,rs2[i]);
-    if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
-    {
-      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
-      if(rs2[i]) alloc_reg64(current,i,rs2[i]);
+    alloc_reg(current,i,dops[i].rs1);
+    if (dops[i].rt1!=0) {
+      alloc_reg(current,i,dops[i].rt1);
+      dirty_reg(current,dops[i].rt1);
     }
   }
-  else
-  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
+  if((dops[i].opcode&0x2E)==4) // BEQ/BNE/BEQL/BNEL
   {
-    if(rs1[i]) alloc_reg(current,i,rs1[i]);
-    if(!((current->is32>>rs1[i])&1))
-    {
-      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
-    }
+    if(dops[i].rs1) alloc_reg(current,i,dops[i].rs1);
+    if(dops[i].rs2) alloc_reg(current,i,dops[i].rs2);
   }
   else
-  if(opcode[i]==0x11) // BC1
+  if((dops[i].opcode&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
   {
-    alloc_reg(current,i,FSREG);
-    alloc_reg(current,i,CSREG);
+    if(dops[i].rs1) alloc_reg(current,i,dops[i].rs1);
   }
   //else ...
 }
 
-static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
+static void add_stub(enum stub_type type, void *addr, void *retaddr,
+  u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
 {
-  stubs[stubcount][0]=type;
-  stubs[stubcount][1]=addr;
-  stubs[stubcount][2]=retaddr;
-  stubs[stubcount][3]=a;
-  stubs[stubcount][4]=b;
-  stubs[stubcount][5]=c;
-  stubs[stubcount][6]=d;
-  stubs[stubcount][7]=e;
+  assert(stubcount < ARRAY_SIZE(stubs));
+  stubs[stubcount].type = type;
+  stubs[stubcount].addr = addr;
+  stubs[stubcount].retaddr = retaddr;
+  stubs[stubcount].a = a;
+  stubs[stubcount].b = b;
+  stubs[stubcount].c = c;
+  stubs[stubcount].d = d;
+  stubs[stubcount].e = e;
   stubcount++;
 }
 
+static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
+  int i, int addr_reg, const struct regstat *i_regs, int ccadj, u_int reglist)
+{
+  add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
+}
+
 // Write out a single register
-void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
+static void wb_register(signed char r, const signed char regmap[], uint64_t dirty)
 {
   int hr;
   for(hr=0;hr<HOST_REGS;hr++) {
     if(hr!=EXCLUDE_REG) {
       if((regmap[hr]&63)==r) {
         if((dirty>>hr)&1) {
-          if(regmap[hr]<64) {
-            emit_storereg(r,hr);
-          }else{
-            emit_storereg(r|64,hr);
-          }
+          assert(regmap[hr]<64);
+          emit_storereg(r,hr);
         }
       }
     }
   }
 }
 
-#if 0
-static int mchecksum(void)
+static void wb_valid(signed char pre[],signed char entry[],u_int dirty_pre,u_int dirty,uint64_t u)
 {
-  //if(!tracedebug) return 0;
-  int i;
-  int sum=0;
-  for(i=0;i<2097152;i++) {
-    unsigned int temp=sum;
-    sum<<=1;
-    sum|=(~temp)>>31;
-    sum^=((u_int *)rdram)[i];
+  //if(dirty_pre==dirty) return;
+  int hr,reg;
+  for(hr=0;hr<HOST_REGS;hr++) {
+    if(hr!=EXCLUDE_REG) {
+      reg=pre[hr];
+      if(((~u)>>(reg&63))&1) {
+        if(reg>0) {
+          if(((dirty_pre&~dirty)>>hr)&1) {
+            if(reg>0&&reg<34) {
+              emit_storereg(reg,hr);
+            }
+            else if(reg>=64) {
+              assert(0);
+            }
+          }
+        }
+      }
+    }
   }
-  return sum;
-}
-
-static int rchecksum(void)
-{
-  int i;
-  int sum=0;
-  for(i=0;i<64;i++)
-    sum^=((u_int *)reg)[i];
-  return sum;
-}
-
-static void rlist(void)
-{
-  int i;
-  printf("TRACE: ");
-  for(i=0;i<32;i++)
-    printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
-  printf("\n");
-}
-
-static void enabletrace(void)
-{
-  tracedebug=1;
 }
 
-static void memdebug(int i)
+// trashes r2
+static void pass_args(int a0, int a1)
 {
-  //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
-  //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
-  //rlist();
-  //if(tracedebug) {
-  //if(Count>=-2084597794) {
-  if((signed int)Count>=-2084597794&&(signed int)Count<0) {
-  //if(0) {
-    printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
-    //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
-    //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
-    rlist();
-    #ifdef __i386__
-    printf("TRACE: %x\n",(&i)[-1]);
-    #endif
-    #ifdef __arm__
-    int j;
-    printf("TRACE: %x \n",(&j)[10]);
-    printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
-    #endif
-    //fflush(stdout);
+  if(a0==1&&a1==0) {
+    // must swap
+    emit_mov(a0,2); emit_mov(a1,1); emit_mov(2,0);
+  }
+  else if(a0!=0&&a1==0) {
+    emit_mov(a1,1);
+    if (a0>=0) emit_mov(a0,0);
+  }
+  else {
+    if(a0>=0&&a0!=0) emit_mov(a0,0);
+    if(a1>=0&&a1!=1) emit_mov(a1,1);
   }
-  //printf("TRACE: %x\n",(&i)[-1]);
 }
-#endif
 
-void alu_assemble(int i,struct regstat *i_regs)
+static void alu_assemble(int i, const struct regstat *i_regs)
 {
-  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
-    if(rt1[i]) {
+  if(dops[i].opcode2>=0x20&&dops[i].opcode2<=0x23) { // ADD/ADDU/SUB/SUBU
+    if(dops[i].rt1) {
       signed char s1,s2,t;
-      t=get_reg(i_regs->regmap,rt1[i]);
+      t=get_reg(i_regs->regmap,dops[i].rt1);
       if(t>=0) {
-        s1=get_reg(i_regs->regmap,rs1[i]);
-        s2=get_reg(i_regs->regmap,rs2[i]);
-        if(rs1[i]&&rs2[i]) {
+        s1=get_reg(i_regs->regmap,dops[i].rs1);
+        s2=get_reg(i_regs->regmap,dops[i].rs2);
+        if(dops[i].rs1&&dops[i].rs2) {
           assert(s1>=0);
           assert(s2>=0);
-          if(opcode2[i]&2) emit_sub(s1,s2,t);
+          if(dops[i].opcode2&2) emit_sub(s1,s2,t);
           else emit_add(s1,s2,t);
         }
-        else if(rs1[i]) {
+        else if(dops[i].rs1) {
           if(s1>=0) emit_mov(s1,t);
-          else emit_loadreg(rs1[i],t);
+          else emit_loadreg(dops[i].rs1,t);
         }
-        else if(rs2[i]) {
+        else if(dops[i].rs2) {
           if(s2>=0) {
-            if(opcode2[i]&2) emit_neg(s2,t);
+            if(dops[i].opcode2&2) emit_neg(s2,t);
             else emit_mov(s2,t);
           }
           else {
-            emit_loadreg(rs2[i],t);
-            if(opcode2[i]&2) emit_neg(t,t);
+            emit_loadreg(dops[i].rs2,t);
+            if(dops[i].opcode2&2) emit_neg(t,t);
           }
         }
         else emit_zeroreg(t);
       }
     }
   }
-  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
-    if(rt1[i]) {
-      signed char s1l,s2l,s1h,s2h,tl,th;
-      tl=get_reg(i_regs->regmap,rt1[i]);
-      th=get_reg(i_regs->regmap,rt1[i]|64);
-      if(tl>=0) {
-        s1l=get_reg(i_regs->regmap,rs1[i]);
-        s2l=get_reg(i_regs->regmap,rs2[i]);
-        s1h=get_reg(i_regs->regmap,rs1[i]|64);
-        s2h=get_reg(i_regs->regmap,rs2[i]|64);
-        if(rs1[i]&&rs2[i]) {
-          assert(s1l>=0);
-          assert(s2l>=0);
-          if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
-          else emit_adds(s1l,s2l,tl);
-          if(th>=0) {
-            #ifdef INVERTED_CARRY
-            if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
-            #else
-            if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
-            #endif
-            else emit_add(s1h,s2h,th);
-          }
-        }
-        else if(rs1[i]) {
-          if(s1l>=0) emit_mov(s1l,tl);
-          else emit_loadreg(rs1[i],tl);
-          if(th>=0) {
-            if(s1h>=0) emit_mov(s1h,th);
-            else emit_loadreg(rs1[i]|64,th);
-          }
-        }
-        else if(rs2[i]) {
-          if(s2l>=0) {
-            if(opcode2[i]&2) emit_negs(s2l,tl);
-            else emit_mov(s2l,tl);
-          }
-          else {
-            emit_loadreg(rs2[i],tl);
-            if(opcode2[i]&2) emit_negs(tl,tl);
-          }
-          if(th>=0) {
-            #ifdef INVERTED_CARRY
-            if(s2h>=0) emit_mov(s2h,th);
-            else emit_loadreg(rs2[i]|64,th);
-            if(opcode2[i]&2) {
-              emit_adcimm(-1,th); // x86 has inverted carry flag
-              emit_not(th,th);
-            }
-            #else
-            if(opcode2[i]&2) {
-              if(s2h>=0) emit_rscimm(s2h,0,th);
-              else {
-                emit_loadreg(rs2[i]|64,th);
-                emit_rscimm(th,0,th);
-              }
-            }else{
-              if(s2h>=0) emit_mov(s2h,th);
-              else emit_loadreg(rs2[i]|64,th);
-            }
-            #endif
-          }
-        }
-        else {
-          emit_zeroreg(tl);
-          if(th>=0) emit_zeroreg(th);
-        }
-      }
-    }
+  if(dops[i].opcode2>=0x2c&&dops[i].opcode2<=0x2f) { // DADD/DADDU/DSUB/DSUBU
+    assert(0);
   }
-  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
-    if(rt1[i]) {
-      signed char s1l,s1h,s2l,s2h,t;
-      if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
+  if(dops[i].opcode2==0x2a||dops[i].opcode2==0x2b) { // SLT/SLTU
+    if(dops[i].rt1) {
+      signed char s1l,s2l,t;
       {
-        t=get_reg(i_regs->regmap,rt1[i]);
-        //assert(t>=0);
-        if(t>=0) {
-          s1l=get_reg(i_regs->regmap,rs1[i]);
-          s1h=get_reg(i_regs->regmap,rs1[i]|64);
-          s2l=get_reg(i_regs->regmap,rs2[i]);
-          s2h=get_reg(i_regs->regmap,rs2[i]|64);
-          if(rs2[i]==0) // rx<r0
-          {
-            assert(s1h>=0);
-            if(opcode2[i]==0x2a) // SLT
-              emit_shrimm(s1h,31,t);
-            else // SLTU (unsigned can not be less than zero)
-              emit_zeroreg(t);
-          }
-          else if(rs1[i]==0) // r0<rx
-          {
-            assert(s2h>=0);
-            if(opcode2[i]==0x2a) // SLT
-              emit_set_gz64_32(s2h,s2l,t);
-            else // SLTU (set if not zero)
-              emit_set_nz64_32(s2h,s2l,t);
-          }
-          else {
-            assert(s1l>=0);assert(s1h>=0);
-            assert(s2l>=0);assert(s2h>=0);
-            if(opcode2[i]==0x2a) // SLT
-              emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
-            else // SLTU
-              emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
-          }
-        }
-      } else {
-        t=get_reg(i_regs->regmap,rt1[i]);
+        t=get_reg(i_regs->regmap,dops[i].rt1);
         //assert(t>=0);
         if(t>=0) {
-          s1l=get_reg(i_regs->regmap,rs1[i]);
-          s2l=get_reg(i_regs->regmap,rs2[i]);
-          if(rs2[i]==0) // rx<r0
+          s1l=get_reg(i_regs->regmap,dops[i].rs1);
+          s2l=get_reg(i_regs->regmap,dops[i].rs2);
+          if(dops[i].rs2==0) // rx<r0
           {
-            assert(s1l>=0);
-            if(opcode2[i]==0x2a) // SLT
+            if(dops[i].opcode2==0x2a&&dops[i].rs1!=0) { // SLT
+              assert(s1l>=0);
               emit_shrimm(s1l,31,t);
-            else // SLTU (unsigned can not be less than zero)
+            }
+            else // SLTU (unsigned can not be less than zero, 0<0)
               emit_zeroreg(t);
           }
-          else if(rs1[i]==0) // r0<rx
+          else if(dops[i].rs1==0) // r0<rx
           {
             assert(s2l>=0);
-            if(opcode2[i]==0x2a) // SLT
+            if(dops[i].opcode2==0x2a) // SLT
               emit_set_gz32(s2l,t);
             else // SLTU (set if not zero)
               emit_set_nz32(s2l,t);
           }
           else{
             assert(s1l>=0);assert(s2l>=0);
-            if(opcode2[i]==0x2a) // SLT
+            if(dops[i].opcode2==0x2a) // SLT
               emit_set_if_less32(s1l,s2l,t);
             else // SLTU
               emit_set_if_carry32(s1l,s2l,t);
@@ -2123,153 +2231,61 @@ void alu_assemble(int i,struct regstat *i_regs)
       }
     }
   }
-  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
-    if(rt1[i]) {
-      signed char s1l,s1h,s2l,s2h,th,tl;
-      tl=get_reg(i_regs->regmap,rt1[i]);
-      th=get_reg(i_regs->regmap,rt1[i]|64);
-      if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
-      {
-        assert(tl>=0);
-        if(tl>=0) {
-          s1l=get_reg(i_regs->regmap,rs1[i]);
-          s1h=get_reg(i_regs->regmap,rs1[i]|64);
-          s2l=get_reg(i_regs->regmap,rs2[i]);
-          s2h=get_reg(i_regs->regmap,rs2[i]|64);
-          if(rs1[i]&&rs2[i]) {
-            assert(s1l>=0);assert(s1h>=0);
-            assert(s2l>=0);assert(s2h>=0);
-            if(opcode2[i]==0x24) { // AND
-              emit_and(s1l,s2l,tl);
-              emit_and(s1h,s2h,th);
-            } else
-            if(opcode2[i]==0x25) { // OR
-              emit_or(s1l,s2l,tl);
-              emit_or(s1h,s2h,th);
-            } else
-            if(opcode2[i]==0x26) { // XOR
-              emit_xor(s1l,s2l,tl);
-              emit_xor(s1h,s2h,th);
-            } else
-            if(opcode2[i]==0x27) { // NOR
-              emit_or(s1l,s2l,tl);
-              emit_or(s1h,s2h,th);
-              emit_not(tl,tl);
-              emit_not(th,th);
-            }
-          }
-          else
-          {
-            if(opcode2[i]==0x24) { // AND
-              emit_zeroreg(tl);
-              emit_zeroreg(th);
-            } else
-            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
-              if(rs1[i]){
-                if(s1l>=0) emit_mov(s1l,tl);
-                else emit_loadreg(rs1[i],tl);
-                if(s1h>=0) emit_mov(s1h,th);
-                else emit_loadreg(rs1[i]|64,th);
-              }
-              else
-              if(rs2[i]){
-                if(s2l>=0) emit_mov(s2l,tl);
-                else emit_loadreg(rs2[i],tl);
-                if(s2h>=0) emit_mov(s2h,th);
-                else emit_loadreg(rs2[i]|64,th);
-              }
-              else{
-                emit_zeroreg(tl);
-                emit_zeroreg(th);
-              }
-            } else
-            if(opcode2[i]==0x27) { // NOR
-              if(rs1[i]){
-                if(s1l>=0) emit_not(s1l,tl);
-                else{
-                  emit_loadreg(rs1[i],tl);
-                  emit_not(tl,tl);
-                }
-                if(s1h>=0) emit_not(s1h,th);
-                else{
-                  emit_loadreg(rs1[i]|64,th);
-                  emit_not(th,th);
-                }
-              }
-              else
-              if(rs2[i]){
-                if(s2l>=0) emit_not(s2l,tl);
-                else{
-                  emit_loadreg(rs2[i],tl);
-                  emit_not(tl,tl);
-                }
-                if(s2h>=0) emit_not(s2h,th);
-                else{
-                  emit_loadreg(rs2[i]|64,th);
-                  emit_not(th,th);
-                }
-              }
-              else {
-                emit_movimm(-1,tl);
-                emit_movimm(-1,th);
-              }
-            }
-          }
-        }
-      }
-      else
+  if(dops[i].opcode2>=0x24&&dops[i].opcode2<=0x27) { // AND/OR/XOR/NOR
+    if(dops[i].rt1) {
+      signed char s1l,s2l,tl;
+      tl=get_reg(i_regs->regmap,dops[i].rt1);
       {
-        // 32 bit
         if(tl>=0) {
-          s1l=get_reg(i_regs->regmap,rs1[i]);
-          s2l=get_reg(i_regs->regmap,rs2[i]);
-          if(rs1[i]&&rs2[i]) {
+          s1l=get_reg(i_regs->regmap,dops[i].rs1);
+          s2l=get_reg(i_regs->regmap,dops[i].rs2);
+          if(dops[i].rs1&&dops[i].rs2) {
             assert(s1l>=0);
             assert(s2l>=0);
-            if(opcode2[i]==0x24) { // AND
+            if(dops[i].opcode2==0x24) { // AND
               emit_and(s1l,s2l,tl);
             } else
-            if(opcode2[i]==0x25) { // OR
+            if(dops[i].opcode2==0x25) { // OR
               emit_or(s1l,s2l,tl);
             } else
-            if(opcode2[i]==0x26) { // XOR
+            if(dops[i].opcode2==0x26) { // XOR
               emit_xor(s1l,s2l,tl);
             } else
-            if(opcode2[i]==0x27) { // NOR
+            if(dops[i].opcode2==0x27) { // NOR
               emit_or(s1l,s2l,tl);
               emit_not(tl,tl);
             }
           }
           else
           {
-            if(opcode2[i]==0x24) { // AND
+            if(dops[i].opcode2==0x24) { // AND
               emit_zeroreg(tl);
             } else
-            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
-              if(rs1[i]){
+            if(dops[i].opcode2==0x25||dops[i].opcode2==0x26) { // OR/XOR
+              if(dops[i].rs1){
                 if(s1l>=0) emit_mov(s1l,tl);
-                else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
+                else emit_loadreg(dops[i].rs1,tl); // CHECK: regmap_entry?
               }
               else
-              if(rs2[i]){
+              if(dops[i].rs2){
                 if(s2l>=0) emit_mov(s2l,tl);
-                else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
+                else emit_loadreg(dops[i].rs2,tl); // CHECK: regmap_entry?
               }
               else emit_zeroreg(tl);
             } else
-            if(opcode2[i]==0x27) { // NOR
-              if(rs1[i]){
+            if(dops[i].opcode2==0x27) { // NOR
+              if(dops[i].rs1){
                 if(s1l>=0) emit_not(s1l,tl);
                 else {
-                  emit_loadreg(rs1[i],tl);
+                  emit_loadreg(dops[i].rs1,tl);
                   emit_not(tl,tl);
                 }
               }
               else
-              if(rs2[i]){
+              if(dops[i].rs2){
                 if(s2l>=0) emit_not(s2l,tl);
                 else {
-                  emit_loadreg(rs2[i],tl);
+                  emit_loadreg(dops[i].rs2,tl);
                   emit_not(tl,tl);
                 }
               }
@@ -2282,12 +2298,12 @@ void alu_assemble(int i,struct regstat *i_regs)
   }
 }
 
-void imm16_assemble(int i,struct regstat *i_regs)
+static void imm16_assemble(int i, const struct regstat *i_regs)
 {
-  if (opcode[i]==0x0f) { // LUI
-    if(rt1[i]) {
+  if (dops[i].opcode==0x0f) { // LUI
+    if(dops[i].rt1) {
       signed char t;
-      t=get_reg(i_regs->regmap,rt1[i]);
+      t=get_reg(i_regs->regmap,dops[i].rt1);
       //assert(t>=0);
       if(t>=0) {
         if(!((i_regs->isconst>>t)&1))
@@ -2295,18 +2311,18 @@ void imm16_assemble(int i,struct regstat *i_regs)
       }
     }
   }
-  if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
-    if(rt1[i]) {
+  if(dops[i].opcode==0x08||dops[i].opcode==0x09) { // ADDI/ADDIU
+    if(dops[i].rt1) {
       signed char s,t;
-      t=get_reg(i_regs->regmap,rt1[i]);
-      s=get_reg(i_regs->regmap,rs1[i]);
-      if(rs1[i]) {
+      t=get_reg(i_regs->regmap,dops[i].rt1);
+      s=get_reg(i_regs->regmap,dops[i].rs1);
+      if(dops[i].rs1) {
         //assert(t>=0);
         //assert(s>=0);
         if(t>=0) {
           if(!((i_regs->isconst>>t)&1)) {
             if(s<0) {
-              if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
+              if(i_regs->regmap_entry[t]!=dops[i].rs1) emit_loadreg(dops[i].rs1,t);
               emit_addimm(t,imm[i],t);
             }else{
               if(!((i_regs->wasconst>>s)&1))
@@ -2324,45 +2340,33 @@ void imm16_assemble(int i,struct regstat *i_regs)
       }
     }
   }
-  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
-    if(rt1[i]) {
-      signed char sh,sl,th,tl;
-      th=get_reg(i_regs->regmap,rt1[i]|64);
-      tl=get_reg(i_regs->regmap,rt1[i]);
-      sh=get_reg(i_regs->regmap,rs1[i]|64);
-      sl=get_reg(i_regs->regmap,rs1[i]);
+  if(dops[i].opcode==0x18||dops[i].opcode==0x19) { // DADDI/DADDIU
+    if(dops[i].rt1) {
+      signed char sl,tl;
+      tl=get_reg(i_regs->regmap,dops[i].rt1);
+      sl=get_reg(i_regs->regmap,dops[i].rs1);
       if(tl>=0) {
-        if(rs1[i]) {
-          assert(sh>=0);
+        if(dops[i].rs1) {
           assert(sl>=0);
-          if(th>=0) {
-            emit_addimm64_32(sh,sl,imm[i],th,tl);
-          }
-          else {
-            emit_addimm(sl,imm[i],tl);
-          }
+          emit_addimm(sl,imm[i],tl);
         } else {
           emit_movimm(imm[i],tl);
-          if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
         }
       }
     }
   }
-  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
-    if(rt1[i]) {
-      //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
-      signed char sh,sl,t;
-      t=get_reg(i_regs->regmap,rt1[i]);
-      sh=get_reg(i_regs->regmap,rs1[i]|64);
-      sl=get_reg(i_regs->regmap,rs1[i]);
+  else if(dops[i].opcode==0x0a||dops[i].opcode==0x0b) { // SLTI/SLTIU
+    if(dops[i].rt1) {
+      //assert(dops[i].rs1!=0); // r0 might be valid, but it's probably a bug
+      signed char sl,t;
+      t=get_reg(i_regs->regmap,dops[i].rt1);
+      sl=get_reg(i_regs->regmap,dops[i].rs1);
       //assert(t>=0);
       if(t>=0) {
-        if(rs1[i]>0) {
-          if(sh<0) assert((i_regs->was32>>rs1[i])&1);
-          if(sh<0||((i_regs->was32>>rs1[i])&1)) {
-            if(opcode[i]==0x0a) { // SLTI
+        if(dops[i].rs1>0) {
+            if(dops[i].opcode==0x0a) { // SLTI
               if(sl<0) {
-                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
+                if(i_regs->regmap_entry[t]!=dops[i].rs1) emit_loadreg(dops[i].rs1,t);
                 emit_slti32(t,imm[i],t);
               }else{
                 emit_slti32(sl,imm[i],t);
@@ -2370,23 +2374,16 @@ void imm16_assemble(int i,struct regstat *i_regs)
             }
             else { // SLTIU
               if(sl<0) {
-                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
+                if(i_regs->regmap_entry[t]!=dops[i].rs1) emit_loadreg(dops[i].rs1,t);
                 emit_sltiu32(t,imm[i],t);
               }else{
                 emit_sltiu32(sl,imm[i],t);
               }
             }
-          }else{ // 64-bit
-            assert(sl>=0);
-            if(opcode[i]==0x0a) // SLTI
-              emit_slti64_32(sh,sl,imm[i],t);
-            else // SLTIU
-              emit_sltiu64_32(sh,sl,imm[i],t);
-          }
         }else{
           // SLTI(U) with r0 is just stupid,
           // nonetheless examples can be found
-          if(opcode[i]==0x0a) // SLTI
+          if(dops[i].opcode==0x0a) // SLTI
             if(0<imm[i]) emit_movimm(1,t);
             else emit_zeroreg(t);
           else // SLTIU
@@ -2398,19 +2395,17 @@ void imm16_assemble(int i,struct regstat *i_regs)
       }
     }
   }
-  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
-    if(rt1[i]) {
-      signed char sh,sl,th,tl;
-      th=get_reg(i_regs->regmap,rt1[i]|64);
-      tl=get_reg(i_regs->regmap,rt1[i]);
-      sh=get_reg(i_regs->regmap,rs1[i]|64);
-      sl=get_reg(i_regs->regmap,rs1[i]);
+  else if(dops[i].opcode>=0x0c&&dops[i].opcode<=0x0e) { // ANDI/ORI/XORI
+    if(dops[i].rt1) {
+      signed char sl,tl;
+      tl=get_reg(i_regs->regmap,dops[i].rt1);
+      sl=get_reg(i_regs->regmap,dops[i].rs1);
       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
-        if(opcode[i]==0x0c) //ANDI
+        if(dops[i].opcode==0x0c) //ANDI
         {
-          if(rs1[i]) {
+          if(dops[i].rs1) {
             if(sl<0) {
-              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
+              if(i_regs->regmap_entry[tl]!=dops[i].rs1) emit_loadreg(dops[i].rs1,tl);
               emit_andimm(tl,imm[i],tl);
             }else{
               if(!((i_regs->wasconst>>sl)&1))
@@ -2421,22 +2416,14 @@ void imm16_assemble(int i,struct regstat *i_regs)
           }
           else
             emit_zeroreg(tl);
-          if(th>=0) emit_zeroreg(th);
         }
         else
         {
-          if(rs1[i]) {
+          if(dops[i].rs1) {
             if(sl<0) {
-              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
+              if(i_regs->regmap_entry[tl]!=dops[i].rs1) emit_loadreg(dops[i].rs1,tl);
             }
-            if(th>=0) {
-              if(sh<0) {
-                emit_loadreg(rs1[i]|64,th);
-              }else{
-                emit_mov(sh,th);
-              }
-            }
-            if(opcode[i]==0x0d) { // ORI
+            if(dops[i].opcode==0x0d) { // ORI
               if(sl<0) {
                 emit_orimm(tl,imm[i],tl);
               }else{
@@ -2446,7 +2433,7 @@ void imm16_assemble(int i,struct regstat *i_regs)
                   emit_movimm(constmap[i][sl]|imm[i],tl);
               }
             }
-            if(opcode[i]==0x0e) { // XORI
+            if(dops[i].opcode==0x0e) { // XORI
               if(sl<0) {
                 emit_xorimm(tl,imm[i],tl);
               }else{
@@ -2459,7 +2446,6 @@ void imm16_assemble(int i,struct regstat *i_regs)
           }
           else {
             emit_movimm(imm[i],tl);
-            if(th>=0) emit_zeroreg(th);
           }
         }
       }
@@ -2467,33 +2453,33 @@ void imm16_assemble(int i,struct regstat *i_regs)
   }
 }
 
-void shiftimm_assemble(int i,struct regstat *i_regs)
+static void shiftimm_assemble(int i, const struct regstat *i_regs)
 {
-  if(opcode2[i]<=0x3) // SLL/SRL/SRA
+  if(dops[i].opcode2<=0x3) // SLL/SRL/SRA
   {
-    if(rt1[i]) {
+    if(dops[i].rt1) {
       signed char s,t;
-      t=get_reg(i_regs->regmap,rt1[i]);
-      s=get_reg(i_regs->regmap,rs1[i]);
+      t=get_reg(i_regs->regmap,dops[i].rt1);
+      s=get_reg(i_regs->regmap,dops[i].rs1);
       //assert(t>=0);
       if(t>=0&&!((i_regs->isconst>>t)&1)){
-        if(rs1[i]==0)
+        if(dops[i].rs1==0)
         {
           emit_zeroreg(t);
         }
         else
         {
-          if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
+          if(s<0&&i_regs->regmap_entry[t]!=dops[i].rs1) emit_loadreg(dops[i].rs1,t);
           if(imm[i]) {
-            if(opcode2[i]==0) // SLL
+            if(dops[i].opcode2==0) // SLL
             {
               emit_shlimm(s<0?t:s,imm[i],t);
             }
-            if(opcode2[i]==2) // SRL
+            if(dops[i].opcode2==2) // SRL
             {
               emit_shrimm(s<0?t:s,imm[i],t);
             }
-            if(opcode2[i]==3) // SRA
+            if(dops[i].opcode2==3) // SRA
             {
               emit_sarimm(s<0?t:s,imm[i],t);
             }
@@ -2503,131 +2489,285 @@ void shiftimm_assemble(int i,struct regstat *i_regs)
           }
         }
       }
-      //emit_storereg(rt1[i],t); //DEBUG
+      //emit_storereg(dops[i].rt1,t); //DEBUG
     }
   }
-  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
+  if(dops[i].opcode2>=0x38&&dops[i].opcode2<=0x3b) // DSLL/DSRL/DSRA
   {
-    if(rt1[i]) {
-      signed char sh,sl,th,tl;
-      th=get_reg(i_regs->regmap,rt1[i]|64);
-      tl=get_reg(i_regs->regmap,rt1[i]);
-      sh=get_reg(i_regs->regmap,rs1[i]|64);
-      sl=get_reg(i_regs->regmap,rs1[i]);
-      if(tl>=0) {
-        if(rs1[i]==0)
-        {
-          emit_zeroreg(tl);
-          if(th>=0) emit_zeroreg(th);
-        }
-        else
-        {
-          assert(sl>=0);
-          assert(sh>=0);
-          if(imm[i]) {
-            if(opcode2[i]==0x38) // DSLL
-            {
-              if(th>=0) emit_shldimm(sh,sl,imm[i],th);
-              emit_shlimm(sl,imm[i],tl);
-            }
-            if(opcode2[i]==0x3a) // DSRL
-            {
-              emit_shrdimm(sl,sh,imm[i],tl);
-              if(th>=0) emit_shrimm(sh,imm[i],th);
-            }
-            if(opcode2[i]==0x3b) // DSRA
-            {
-              emit_shrdimm(sl,sh,imm[i],tl);
-              if(th>=0) emit_sarimm(sh,imm[i],th);
-            }
-          }else{
-            // Shift by zero
-            if(sl!=tl) emit_mov(sl,tl);
-            if(th>=0&&sh!=th) emit_mov(sh,th);
-          }
-        }
-      }
-    }
+    assert(0);
   }
-  if(opcode2[i]==0x3c) // DSLL32
+  if(dops[i].opcode2==0x3c) // DSLL32
   {
-    if(rt1[i]) {
-      signed char sl,tl,th;
-      tl=get_reg(i_regs->regmap,rt1[i]);
-      th=get_reg(i_regs->regmap,rt1[i]|64);
-      sl=get_reg(i_regs->regmap,rs1[i]);
-      if(th>=0||tl>=0){
-        assert(tl>=0);
-        assert(th>=0);
-        assert(sl>=0);
-        emit_mov(sl,th);
-        emit_zeroreg(tl);
-        if(imm[i]>32)
-        {
-          emit_shlimm(th,imm[i]&31,th);
-        }
-      }
-    }
+    assert(0);
   }
-  if(opcode2[i]==0x3e) // DSRL32
+  if(dops[i].opcode2==0x3e) // DSRL32
   {
-    if(rt1[i]) {
-      signed char sh,tl,th;
-      tl=get_reg(i_regs->regmap,rt1[i]);
-      th=get_reg(i_regs->regmap,rt1[i]|64);
-      sh=get_reg(i_regs->regmap,rs1[i]|64);
-      if(tl>=0){
-        assert(sh>=0);
-        emit_mov(sh,tl);
-        if(th>=0) emit_zeroreg(th);
-        if(imm[i]>32)
-        {
-          emit_shrimm(tl,imm[i]&31,tl);
-        }
-      }
-    }
+    assert(0);
   }
-  if(opcode2[i]==0x3f) // DSRA32
+  if(dops[i].opcode2==0x3f) // DSRA32
   {
-    if(rt1[i]) {
-      signed char sh,tl;
-      tl=get_reg(i_regs->regmap,rt1[i]);
-      sh=get_reg(i_regs->regmap,rs1[i]|64);
-      if(tl>=0){
-        assert(sh>=0);
-        emit_mov(sh,tl);
-        if(imm[i]>32)
-        {
-          emit_sarimm(tl,imm[i]&31,tl);
-        }
-      }
-    }
+    assert(0);
   }
 }
 
 #ifndef shift_assemble
-void shift_assemble(int i,struct regstat *i_regs)
+static void shift_assemble(int i, const struct regstat *i_regs) // emit host code for the variable shift at index i
 {
-  printf("Need shift_assemble for this architecture.\n");
-  exit(1);
+  signed char s,t,shift; // host regs looked up for rs1 (value), rt1 (destination), rs2 (shift amount)
+  if (dops[i].rt1 == 0) // destination is r0: emit nothing
+    return;
+  assert(dops[i].opcode2<=0x07); // only SLLV/SRLV/SRAV are handled here
+  t = get_reg(i_regs->regmap, dops[i].rt1);
+  s = get_reg(i_regs->regmap, dops[i].rs1);
+  shift = get_reg(i_regs->regmap, dops[i].rs2);
+  if (t < 0) // destination has no host register: emit nothing
+    return;
+
+  if(dops[i].rs1==0) // rs1 is r0: result is emitted as zero
+    emit_zeroreg(t);
+  else if(dops[i].rs2==0) { // rs2 is r0: emitted as a plain move
+    assert(s>=0);
+    if(s!=t) emit_mov(s,t);
+  }
+  else {
+    host_tempreg_acquire();
+    emit_andimm(shift,31,HOST_TEMPREG); // keep only the low 5 bits of the shift amount
+    switch(dops[i].opcode2) {
+    case 4: // SLLV
+      emit_shl(s,HOST_TEMPREG,t);
+      break;
+    case 6: // SRLV
+      emit_shr(s,HOST_TEMPREG,t);
+      break;
+    case 7: // SRAV
+      emit_sar(s,HOST_TEMPREG,t);
+      break;
+    default: // opcode2 0..3 and 5 pass the assert above but are not handled
+      assert(0);
+    }
+    host_tempreg_release();
+  }
 }
+
 #endif
 
-void load_assemble(int i,struct regstat *i_regs)
+enum { // address classes returned by get_ptr_mem_type()
+  MTYPE_8000 = 0, // default class: anything not matched by the ranges below
+  MTYPE_8020, // 0x80200000..0x807fffff (RAM 80200000+ mirror)
+  MTYPE_0000, // below 0x00200000 (RAM 0 mirror)
+  MTYPE_A000, // 0xa0000000..0xa01fffff (RAM A mirror)
+  MTYPE_1F80, // 0x1f800000..0x1f800fff (scratchpad)
+};
+
+static int get_ptr_mem_type(u_int a) // classify address a into one of the MTYPE_* values
+{
+  if(a < 0x00200000) {
+    if(a<0x1000&&((start>>20)==0xbfc||(start>>24)==0xa0)) // a in first 4 KiB and block start is 0xbfcxxxxx or 0xa0xxxxxx
+      // deliberately return the "wrong" type: the memhandler must be used for the BIOS self-test to pass
+      // 007 does something similar from the a00 mirror
+      return MTYPE_8000;
+    return MTYPE_0000;
+  }
+  if(0x1f800000 <= a && a < 0x1f801000)
+    return MTYPE_1F80;
+  if(0x80200000 <= a && a < 0x80800000)
+    return MTYPE_8020;
+  if(0xa0000000 <= a && a < 0xa0200000)
+    return MTYPE_A000;
+  return MTYPE_8000; // everything else, including 0x80000000..0x801fffff
+}
+
+static int get_ro_reg(const struct regstat *i_regs, int host_tempreg_free) // return the host reg holding ROREG
+{
+  int r = get_reg(i_regs->regmap, ROREG); // negative if ROREG is not in the register map
+  if (r < 0 && host_tempreg_free) { // not mapped, but the caller allows using HOST_TEMPREG
+    host_tempreg_acquire(); // not released here; callers that test offset_reg == HOST_TEMPREG release it
+    emit_loadreg(ROREG, r = HOST_TEMPREG);
+  }
+  if (r < 0) // not mapped and no temp allowed: treated as fatal
+    abort();
+  return r;
+}
+
+static void *emit_fastpath_cmp_jump(int i, const struct regstat *i_regs,
+  int addr, int *offset_reg, int *addr_reg_override)
+{ // emits the address-range check for a memory access; returns the location of the emitted branch (or NULL)
+  void *jaddr = NULL; // set to `out` just before a branch is emitted
+  int type = 0;
+  int mr = dops[i].rs1; // guest register rs1 of instruction i (presumably the base address register)
+  *offset_reg = -1; // -1: no offset register selected
+  if(((smrv_strong|smrv_weak)>>mr)&1) { // bit mr set in either mask: classify by smrv[mr]
+    type=get_ptr_mem_type(smrv[mr]);
+    //printf("set %08x @%08x r%d %d\n", smrv[mr], start+i*4, mr, type);
+  }
+  else {
+    // use the mirror we are running on
+    type=get_ptr_mem_type(start);
+    //printf("set nospec   @%08x r%d %d\n", start+i*4, mr, type);
+  }
+
+  if(type==MTYPE_8020) { // RAM 80200000+ mirror
+    host_tempreg_acquire(); // left acquired; callers release when the override equals HOST_TEMPREG
+    emit_andimm(addr,~0x00e00000,HOST_TEMPREG); // clear address bits 21-23 into the temp reg
+    addr=*addr_reg_override=HOST_TEMPREG; // the adjusted address replaces addr for the caller too
+    type=0; // continue with the RAM check below
+  }
+  else if(type==MTYPE_0000) { // RAM 0 mirror
+    host_tempreg_acquire();
+    emit_orimm(addr,0x80000000,HOST_TEMPREG); // set address bit 31 into the temp reg
+    addr=*addr_reg_override=HOST_TEMPREG;
+    type=0;
+  }
+  else if(type==MTYPE_A000) { // RAM A mirror
+    host_tempreg_acquire();
+    emit_andimm(addr,~0x20000000,HOST_TEMPREG); // clear address bit 29 into the temp reg
+    addr=*addr_reg_override=HOST_TEMPREG;
+    type=0;
+  }
+  else if(type==MTYPE_1F80) { // scratchpad
+    if (psxH == (void *)0x1f800000) { // host pointer psxH equals the guest scratchpad address
+      host_tempreg_acquire();
+      emit_xorimm(addr,0x1f800000,HOST_TEMPREG);
+      emit_cmpimm(HOST_TEMPREG,0x1000); // compare (addr ^ 0x1f800000) against the 0x1000 scratchpad size
+      host_tempreg_release(); // temp only needed for the compare; no override is set on this path
+      jaddr=out;
+      emit_jc(0); // target 0 looks like a placeholder; callers pass jaddr to add_stub_r
+    }
+    else {
+      // do the usual RAM check, jump will go to the right handler
+      type=0;
+    }
+  }
+
+  if (type == 0) // need ram check
+  {
+    emit_cmpimm(addr,RAM_SIZE);
+    jaddr = out;
+    #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
+    // Hint to branch predictor that the branch is unlikely to be taken
+    if (dops[i].rs1 >= 28)
+      emit_jno_unlikely(0);
+    else
+    #endif
+      emit_jno(0);
+    if (ram_offset != 0) // nonzero ram_offset: accesses need the ROREG offset register
+      *offset_reg = get_ro_reg(i_regs, 0); // 0: HOST_TEMPREG may not be used, so ROREG must already be mapped
+  }
+
+  return jaddr;
+}
+
+// return memhandler, or store the directly accessible host address in *addr_host and return NULL
+static void *get_direct_memhandler(void *table, u_int addr,
+  enum stub_type type, uintptr_t *addr_host)
+{
+  uintptr_t msb = 1ull << (sizeof(uintptr_t)*8 - 1); // top bit of a table entry, used as a tag
+  uintptr_t l1, l2 = 0;
+  l1 = ((uintptr_t *)table)[addr>>12]; // first-level entry, indexed by 4 KiB page number
+  if (!(l1 & msb)) { // tag clear: entry<<1 is a base that is added to the full address
+    uintptr_t v = l1 << 1;
+    *addr_host = v + addr;
+    return NULL;
+  }
+  else { // tag set: entry<<1 points to a second-level table for this page
+    l1 <<= 1;
+    if (type == LOADB_STUB || type == LOADBU_STUB || type == STOREB_STUB)
+      l2 = ((uintptr_t *)l1)[0x1000/4 + 0x1000/2 + (addr&0xfff)]; // byte entries follow the word and halfword entries
+    else if (type == LOADH_STUB || type == LOADHU_STUB || type == STOREH_STUB)
+      l2 = ((uintptr_t *)l1)[0x1000/4 + (addr&0xfff)/2]; // halfword entries follow the word entries
+    else
+      l2 = ((uintptr_t *)l1)[(addr&0xfff)/4]; // word entries come first
+    if (!(l2 & msb)) { // tag clear: entry<<1 is a base that is added to the in-page offset
+      uintptr_t v = l2 << 1;
+      *addr_host = v + (addr&0xfff);
+      return NULL;
+    }
+    return (void *)(l2 << 1); // tag set: entry<<1 is the handler pointer; *addr_host is left untouched
+  }
+}
+
+static u_int get_host_reglist(const signed char *regmap) // bitmask of host regs that have a mapping in regmap
+{
+  u_int reglist = 0, hr;
+  for (hr = 0; hr < HOST_REGS; hr++) {
+    if (hr != EXCLUDE_REG && regmap[hr] >= 0) // EXCLUDE_REG is never reported; negative entry means unmapped
+      reglist |= 1 << hr; // bit hr stands for host register hr
+  }
+  return reglist;
+}
+
+static u_int reglist_exclude(u_int reglist, int r1, int r2) // return reglist with the bits for r1 and r2 cleared
+{
+  if (r1 >= 0) // a negative register number means "none" and is skipped
+    reglist &= ~(1u << r1);
+  if (r2 >= 0)
+    reglist &= ~(1u << r2);
+  return reglist;
+}
+
+// find a temp caller-saved register not in reglist (so assumed to be free); returns -1 if there is none
+static int reglist_find_free(u_int reglist)
+{
+  u_int free_regs = ~reglist & CALLER_SAVE_REGS; // caller-saved registers whose bit is clear in reglist
+  if (free_regs == 0) // also keeps __builtin_ctz away from a zero argument
+    return -1;
+  return __builtin_ctz(free_regs); // index of the lowest set bit, i.e. the lowest-numbered candidate
+}
+
+static void do_load_word(int a, int rt, int offset_reg) // emit a word load into rt from address reg a
+{
+  if (offset_reg >= 0) // an offset register was supplied: use the two-register (dual-indexed) form
+    emit_ldr_dualindexed(offset_reg, a, rt);
+  else // negative offset_reg: indexed load with displacement 0
+    emit_readword_indexed(0, a, rt);
+}
+
+static void do_store_word(int a, int ofs, int rt, int offset_reg, int preseve_a) // emit a word store of rt at a+ofs
+{
+  if (offset_reg < 0) { // no offset register: ofs is passed straight to the indexed store
+    emit_writeword_indexed(rt, ofs, a);
+    return;
+  }
+  if (ofs != 0) // the dual-indexed store takes no displacement here, so ofs is added into a first
+    emit_addimm(a, ofs, a);
+  emit_str_dualindexed(offset_reg, a, rt);
+  if (ofs != 0 && preseve_a) // preseve_a (sic, "preserve"): undo the adjustment so a keeps its value
+    emit_addimm(a, -ofs, a);
+}
+
+static void do_store_hword(int a, int ofs, int rt, int offset_reg, int preseve_a) // emit a halfword store of rt at a+ofs
+{
+  if (offset_reg < 0) { // no offset register: ofs is passed straight to the indexed store
+    emit_writehword_indexed(rt, ofs, a);
+    return;
+  }
+  if (ofs != 0) // the dual-indexed store takes no displacement here, so ofs is added into a first
+    emit_addimm(a, ofs, a);
+  emit_strh_dualindexed(offset_reg, a, rt);
+  if (ofs != 0 && preseve_a) // preseve_a (sic, "preserve"): undo the adjustment so a keeps its value
+    emit_addimm(a, -ofs, a);
+}
+
+static void do_store_byte(int a, int rt, int offset_reg) // emit a byte store of rt at address reg a
+{
+  if (offset_reg >= 0) // an offset register was supplied: use the two-register (dual-indexed) form
+    emit_strb_dualindexed(offset_reg, a, rt);
+  else // negative offset_reg: indexed store with displacement 0
+    emit_writebyte_indexed(rt, 0, a);
+}
+
+static void load_assemble(int i, const struct regstat *i_regs, int ccadj_)
 {
-  int s,th,tl,addr,map=-1;
+  int s,tl,addr;
   int offset;
-  int jaddr=0;
+  void *jaddr=0;
   int memtarget=0,c=0;
-  int fastload_reg_override=0;
-  u_int hr,reglist=0;
-  th=get_reg(i_regs->regmap,rt1[i]|64);
-  tl=get_reg(i_regs->regmap,rt1[i]);
-  s=get_reg(i_regs->regmap,rs1[i]);
+  int offset_reg = -1;
+  int fastio_reg_override = -1;
+  u_int reglist=get_host_reglist(i_regs->regmap);
+  tl=get_reg(i_regs->regmap,dops[i].rt1);
+  s=get_reg(i_regs->regmap,dops[i].rs1);
   offset=imm[i];
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
-  }
   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
   if(s>=0) {
     c=(i_regs->wasconst>>s)&1;
@@ -2636,10 +2776,10 @@ void load_assemble(int i,struct regstat *i_regs)
     }
   }
   //printf("load_assemble: c=%d\n",c);
-  //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
+  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
   // FIXME: Even if the load is a NOP, we should check for pagefaults...
   if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
-    ||rt1[i]==0) {
+    ||dops[i].rt1==0) {
       // could be FIFO, must perform the read
       // ||dummy read
       assem_debug("(forced read)\n");
@@ -2651,274 +2791,219 @@ void load_assemble(int i,struct regstat *i_regs)
   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
  if(tl>=0) {
   //printf("load_assemble: c=%d\n",c);
-  //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
+  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
   reglist&=~(1<<tl);
-  if(th>=0) reglist&=~(1<<th);
   if(!c) {
-    #ifdef RAM_OFFSET
-    map=get_reg(i_regs->regmap,ROREG);
-    if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
-    #endif
     #ifdef R29_HACK
     // Strmnnrmn's speed hack
-    if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
+    if(dops[i].rs1!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
     #endif
     {
-      jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
+      jaddr = emit_fastpath_cmp_jump(i, i_regs, addr,
+                &offset_reg, &fastio_reg_override);
     }
   }
-  else if(ram_offset&&memtarget) {
-    emit_addimm(addr,ram_offset,HOST_TEMPREG);
-    fastload_reg_override=HOST_TEMPREG;
+  else if (ram_offset && memtarget) {
+    offset_reg = get_ro_reg(i_regs, 0);
   }
-  int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
-  if (opcode[i]==0x20) { // LB
+  int dummy=(dops[i].rt1==0)||(tl!=get_reg(i_regs->regmap,dops[i].rt1)); // ignore loads to r0 and unneeded reg
+  switch (dops[i].opcode) {
+  case 0x20: // LB
     if(!c||memtarget) {
       if(!dummy) {
-        #ifdef HOST_IMM_ADDR32
-        if(c)
-          emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
-        else
-        #endif
-        {
-          //emit_xorimm(addr,3,tl);
-          //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
-          int x=0,a=tl;
-#ifdef BIG_ENDIAN_MIPS
-          if(!c) emit_xorimm(addr,3,tl);
-          else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
-#else
-          if(!c) a=addr;
-#endif
-          if(fastload_reg_override) a=fastload_reg_override;
+        int a = tl;
+        if (!c) a = addr;
+        if (fastio_reg_override >= 0)
+          a = fastio_reg_override;
 
-          emit_movsbl_indexed_tlb(x,a,map,tl);
-        }
+        if (offset_reg >= 0)
+          emit_ldrsb_dualindexed(offset_reg, a, tl);
+        else
+          emit_movsbl_indexed(0, a, tl);
       }
       if(jaddr)
-        add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
+        add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj_,reglist);
     }
     else
-      inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
-  }
-  if (opcode[i]==0x21) { // LH
+      inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj_,reglist);
+    break;
+  case 0x21: // LH
     if(!c||memtarget) {
       if(!dummy) {
-        #ifdef HOST_IMM_ADDR32
-        if(c)
-          emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
+        int a = tl;
+        if (!c) a = addr;
+        if (fastio_reg_override >= 0)
+          a = fastio_reg_override;
+        if (offset_reg >= 0)
+          emit_ldrsh_dualindexed(offset_reg, a, tl);
         else
-        #endif
-        {
-          int x=0,a=tl;
-#ifdef BIG_ENDIAN_MIPS
-          if(!c) emit_xorimm(addr,2,tl);
-          else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
-#else
-          if(!c) a=addr;
-#endif
-          if(fastload_reg_override) a=fastload_reg_override;
-          //#ifdef
-          //emit_movswl_indexed_tlb(x,tl,map,tl);
-          //else
-          if(map>=0) {
-            emit_movswl_indexed(x,a,tl);
-          }else{
-            #if 1 //def RAM_OFFSET
-            emit_movswl_indexed(x,a,tl);
-            #else
-            emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
-            #endif
-          }
-        }
+          emit_movswl_indexed(0, a, tl);
       }
       if(jaddr)
-        add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
+        add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj_,reglist);
     }
     else
-      inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
-  }
-  if (opcode[i]==0x23) { // LW
+      inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj_,reglist);
+    break;
+  case 0x23: // LW
     if(!c||memtarget) {
       if(!dummy) {
-        int a=addr;
-        if(fastload_reg_override) a=fastload_reg_override;
-        //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
-        #ifdef HOST_IMM_ADDR32
-        if(c)
-          emit_readword_tlb(constmap[i][s]+offset,map,tl);
-        else
-        #endif
-        emit_readword_indexed_tlb(0,a,map,tl);
+        int a = addr;
+        if (fastio_reg_override >= 0)
+          a = fastio_reg_override;
+        do_load_word(a, tl, offset_reg);
       }
       if(jaddr)
-        add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
+        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj_,reglist);
     }
     else
-      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
-  }
-  if (opcode[i]==0x24) { // LBU
+      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj_,reglist);
+    break;
+  case 0x24: // LBU
     if(!c||memtarget) {
       if(!dummy) {
-        #ifdef HOST_IMM_ADDR32
-        if(c)
-          emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
-        else
-        #endif
-        {
-          //emit_xorimm(addr,3,tl);
-          //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
-          int x=0,a=tl;
-#ifdef BIG_ENDIAN_MIPS
-          if(!c) emit_xorimm(addr,3,tl);
-          else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
-#else
-          if(!c) a=addr;
-#endif
-          if(fastload_reg_override) a=fastload_reg_override;
+        int a = tl;
+        if (!c) a = addr;
+        if (fastio_reg_override >= 0)
+          a = fastio_reg_override;
 
-          emit_movzbl_indexed_tlb(x,a,map,tl);
-        }
+        if (offset_reg >= 0)
+          emit_ldrb_dualindexed(offset_reg, a, tl);
+        else
+          emit_movzbl_indexed(0, a, tl);
       }
       if(jaddr)
-        add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
+        add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj_,reglist);
     }
     else
-      inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
-  }
-  if (opcode[i]==0x25) { // LHU
+      inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj_,reglist);
+    break;
+  case 0x25: // LHU
     if(!c||memtarget) {
       if(!dummy) {
-        #ifdef HOST_IMM_ADDR32
-        if(c)
-          emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
+        int a = tl;
+        if(!c) a = addr;
+        if (fastio_reg_override >= 0)
+          a = fastio_reg_override;
+        if (offset_reg >= 0)
+          emit_ldrh_dualindexed(offset_reg, a, tl);
         else
-        #endif
-        {
-          int x=0,a=tl;
-#ifdef BIG_ENDIAN_MIPS
-          if(!c) emit_xorimm(addr,2,tl);
-          else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
-#else
-          if(!c) a=addr;
-#endif
-          if(fastload_reg_override) a=fastload_reg_override;
-          //#ifdef
-          //emit_movzwl_indexed_tlb(x,tl,map,tl);
-          //#else
-          if(map>=0) {
-            emit_movzwl_indexed(x,a,tl);
-          }else{
-            #if 1 //def RAM_OFFSET
-            emit_movzwl_indexed(x,a,tl);
-            #else
-            emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
-            #endif
-          }
-        }
+          emit_movzwl_indexed(0, a, tl);
       }
       if(jaddr)
-        add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
+        add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj_,reglist);
     }
     else
-      inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
+      inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj_,reglist);
+    break;
+  case 0x27: // LWU
+  case 0x37: // LD
+  default:
+    assert(0);
   }
-  if (opcode[i]==0x27) { // LWU
-    assert(th>=0);
-    if(!c||memtarget) {
-      if(!dummy) {
-        int a=addr;
-        if(fastload_reg_override) a=fastload_reg_override;
-        //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
-        #ifdef HOST_IMM_ADDR32
-        if(c)
-          emit_readword_tlb(constmap[i][s]+offset,map,tl);
-        else
-        #endif
-        emit_readword_indexed_tlb(0,a,map,tl);
-      }
-      if(jaddr)
-        add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
+ }
+ if (fastio_reg_override == HOST_TEMPREG || offset_reg == HOST_TEMPREG)
+   host_tempreg_release();
+}
+
+#ifndef loadlr_assemble
+static void loadlr_assemble(int i, const struct regstat *i_regs, int ccadj_)
+{
+  int s,tl,temp,temp2,addr;
+  int offset;
+  void *jaddr=0;
+  int memtarget=0,c=0;
+  int offset_reg = -1;
+  int fastio_reg_override = -1;
+  u_int reglist=get_host_reglist(i_regs->regmap);
+  tl=get_reg(i_regs->regmap,dops[i].rt1);
+  s=get_reg(i_regs->regmap,dops[i].rs1);
+  temp=get_reg(i_regs->regmap,-1);
+  temp2=get_reg(i_regs->regmap,FTEMP);
+  addr=get_reg(i_regs->regmap,AGEN1+(i&1));
+  assert(addr<0);
+  offset=imm[i];
+  reglist|=1<<temp;
+  if(offset||s<0||c) addr=temp2;
+  else addr=s;
+  if(s>=0) {
+    c=(i_regs->wasconst>>s)&1;
+    if(c) {
+      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
     }
-    else {
-      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
+  }
+  if(!c) {
+    emit_shlimm(addr,3,temp);
+    if (dops[i].opcode==0x22||dops[i].opcode==0x26) {
+      emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR
+    }else{
+      emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR
+    }
+    jaddr = emit_fastpath_cmp_jump(i, i_regs, temp2,
+              &offset_reg, &fastio_reg_override);
+  }
+  else {
+    if (ram_offset && memtarget) {
+      offset_reg = get_ro_reg(i_regs, 0);
+    }
+    if (dops[i].opcode==0x22||dops[i].opcode==0x26) {
+      emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR
+    }else{
+      emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR
     }
-    emit_zeroreg(th);
   }
-  if (opcode[i]==0x37) { // LD
+  if (dops[i].opcode==0x22||dops[i].opcode==0x26) { // LWL/LWR
     if(!c||memtarget) {
-      if(!dummy) {
-        int a=addr;
-        if(fastload_reg_override) a=fastload_reg_override;
-        //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
-        //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
-        #ifdef HOST_IMM_ADDR32
-        if(c)
-          emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
-        else
-        #endif
-        emit_readdword_indexed_tlb(0,a,map,th,tl);
-      }
-      if(jaddr)
-        add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
+      int a = temp2;
+      if (fastio_reg_override >= 0)
+        a = fastio_reg_override;
+      do_load_word(a, temp2, offset_reg);
+      if (fastio_reg_override == HOST_TEMPREG || offset_reg == HOST_TEMPREG)
+        host_tempreg_release();
+      if(jaddr) add_stub_r(LOADW_STUB,jaddr,out,i,temp2,i_regs,ccadj_,reglist);
     }
     else
-      inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
+      inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj_,reglist);
+    if(dops[i].rt1) {
+      assert(tl>=0);
+      emit_andimm(temp,24,temp);
+      if (dops[i].opcode==0x22) // LWL
+        emit_xorimm(temp,24,temp);
+      host_tempreg_acquire();
+      emit_movimm(-1,HOST_TEMPREG);
+      if (dops[i].opcode==0x26) {
+        emit_shr(temp2,temp,temp2);
+        emit_bic_lsr(tl,HOST_TEMPREG,temp,tl);
+      }else{
+        emit_shl(temp2,temp,temp2);
+        emit_bic_lsl(tl,HOST_TEMPREG,temp,tl);
+      }
+      host_tempreg_release();
+      emit_or(temp2,tl,tl);
+    }
+    //emit_storereg(dops[i].rt1,tl); // DEBUG
+  }
+  if (dops[i].opcode==0x1A||dops[i].opcode==0x1B) { // LDL/LDR
+    assert(0);
   }
- }
-  //emit_storereg(rt1[i],tl); // DEBUG
-  //if(opcode[i]==0x23)
-  //if(opcode[i]==0x24)
-  //if(opcode[i]==0x23||opcode[i]==0x24)
-  /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
-  {
-    //emit_pusha();
-    save_regs(0x100f);
-        emit_readword((int)&last_count,ECX);
-        #ifdef __i386__
-        if(get_reg(i_regs->regmap,CCREG)<0)
-          emit_loadreg(CCREG,HOST_CCREG);
-        emit_add(HOST_CCREG,ECX,HOST_CCREG);
-        emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
-        emit_writeword(HOST_CCREG,(int)&Count);
-        #endif
-        #ifdef __arm__
-        if(get_reg(i_regs->regmap,CCREG)<0)
-          emit_loadreg(CCREG,0);
-        else
-          emit_mov(HOST_CCREG,0);
-        emit_add(0,ECX,0);
-        emit_addimm(0,2*ccadj[i],0);
-        emit_writeword(0,(int)&Count);
-        #endif
-    emit_call((int)memdebug);
-    //emit_popa();
-    restore_regs(0x100f);
-  }*/
-}
-
-#ifndef loadlr_assemble
-void loadlr_assemble(int i,struct regstat *i_regs)
-{
-  printf("Need loadlr_assemble for this architecture.\n");
-  exit(1);
 }
 #endif
 
-void store_assemble(int i,struct regstat *i_regs)
+static void store_assemble(int i, const struct regstat *i_regs, int ccadj_)
 {
-  int s,th,tl,map=-1;
+  int s,tl;
   int addr,temp;
   int offset;
-  int jaddr=0,type;
+  void *jaddr=0;
+  enum stub_type type=0;
   int memtarget=0,c=0;
   int agr=AGEN1+(i&1);
-  int faststore_reg_override=0;
-  u_int hr,reglist=0;
-  th=get_reg(i_regs->regmap,rs2[i]|64);
-  tl=get_reg(i_regs->regmap,rs2[i]);
-  s=get_reg(i_regs->regmap,rs1[i]);
+  int offset_reg = -1;
+  int fastio_reg_override = -1;
+  u_int reglist=get_host_reglist(i_regs->regmap);
+  tl=get_reg(i_regs->regmap,dops[i].rs2);
+  s=get_reg(i_regs->regmap,dops[i].rs1);
   temp=get_reg(i_regs->regmap,agr);
   if(temp<0) temp=get_reg(i_regs->regmap,-1);
   offset=imm[i];
@@ -2930,90 +3015,60 @@ void store_assemble(int i,struct regstat *i_regs)
   }
   assert(tl>=0);
   assert(temp>=0);
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
-  }
   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
   if(offset||s<0||c) addr=temp;
   else addr=s;
-  if(!c) {
-    jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
+  if (!c) {
+    jaddr = emit_fastpath_cmp_jump(i, i_regs, addr,
+              &offset_reg, &fastio_reg_override);
   }
-  else if(ram_offset&&memtarget) {
-    emit_addimm(addr,ram_offset,HOST_TEMPREG);
-    faststore_reg_override=HOST_TEMPREG;
+  else if (ram_offset && memtarget) {
+    offset_reg = get_ro_reg(i_regs, 0);
   }
 
-  if (opcode[i]==0x28) { // SB
-    if(!c||memtarget) {
-      int x=0,a=temp;
-#ifdef BIG_ENDIAN_MIPS
-      if(!c) emit_xorimm(addr,3,temp);
-      else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
-#else
-      if(!c) a=addr;
-#endif
-      if(faststore_reg_override) a=faststore_reg_override;
-      //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
-      emit_writebyte_indexed_tlb(tl,x,a,map,a);
-    }
-    type=STOREB_STUB;
-  }
-  if (opcode[i]==0x29) { // SH
+  switch (dops[i].opcode) {
+  case 0x28: // SB
     if(!c||memtarget) {
-      int x=0,a=temp;
-#ifdef BIG_ENDIAN_MIPS
-      if(!c) emit_xorimm(addr,2,temp);
-      else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
-#else
-      if(!c) a=addr;
-#endif
-      if(faststore_reg_override) a=faststore_reg_override;
-      //#ifdef
-      //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
-      //#else
-      if(map>=0) {
-        emit_writehword_indexed(tl,x,a);
-      }else
-        //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
-        emit_writehword_indexed(tl,x,a);
-    }
-    type=STOREH_STUB;
-  }
-  if (opcode[i]==0x2B) { // SW
+      int a = temp;
+      if (!c) a = addr;
+      if (fastio_reg_override >= 0)
+        a = fastio_reg_override;
+      do_store_byte(a, tl, offset_reg);
+    }
+    type = STOREB_STUB;
+    break;
+  case 0x29: // SH
     if(!c||memtarget) {
-      int a=addr;
-      if(faststore_reg_override) a=faststore_reg_override;
-      //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
-      emit_writeword_indexed_tlb(tl,0,a,map,temp);
-    }
-    type=STOREW_STUB;
-  }
-  if (opcode[i]==0x3F) { // SD
+      int a = temp;
+      if (!c) a = addr;
+      if (fastio_reg_override >= 0)
+        a = fastio_reg_override;
+      do_store_hword(a, 0, tl, offset_reg, 1);
+    }
+    type = STOREH_STUB;
+    break;
+  case 0x2B: // SW
     if(!c||memtarget) {
-      int a=addr;
-      if(faststore_reg_override) a=faststore_reg_override;
-      if(rs2[i]) {
-        assert(th>=0);
-        //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
-        //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
-        emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
-      }else{
-        // Store zero
-        //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
-        //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
-        emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
-      }
-    }
-    type=STORED_STUB;
-  }
+      int a = addr;
+      if (fastio_reg_override >= 0)
+        a = fastio_reg_override;
+      do_store_word(a, 0, tl, offset_reg, 1);
+    }
+    type = STOREW_STUB;
+    break;
+  case 0x3F: // SD
+  default:
+    assert(0);
+  }
+  if (fastio_reg_override == HOST_TEMPREG || offset_reg == HOST_TEMPREG)
+    host_tempreg_release();
   if(jaddr) {
     // PCSX store handlers don't check invcode again
     reglist|=1<<addr;
-    add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
+    add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj_,reglist);
     jaddr=0;
   }
-  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
+  if(!(i_regs->waswritten&(1<<dops[i].rs1)) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
     if(!c||memtarget) {
       #ifdef DESTRUCTIVE_SHIFT
       // The x86 shift operation is 'destructive'; it overwrites the
@@ -3025,90 +3080,55 @@ void store_assemble(int i,struct regstat *i_regs)
       assert(ir>=0);
       emit_cmpmem_indexedsr12_reg(ir,addr,1);
       #else
-      emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
+      emit_cmpmem_indexedsr12_imm(invalid_code,addr,1);
       #endif
       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
       emit_callne(invalidate_addr_reg[addr]);
       #else
-      int jaddr2=(int)out;
+      void *jaddr2 = out;
       emit_jne(0);
-      add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
+      add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
       #endif
     }
   }
   u_int addr_val=constmap[i][s]+offset;
   if(jaddr) {
-    add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
+    add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj_,reglist);
   } else if(c&&!memtarget) {
-    inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
+    inline_writestub(type,i,addr_val,i_regs->regmap,dops[i].rs2,ccadj_,reglist);
   }
   // basic current block modification detection..
   // not looking back as that should be in mips cache already
+  // (see Spyro2 title->attract mode)
   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
     assert(i_regs->regmap==regs[i].regmap); // not delay slot
     if(i_regs->regmap==regs[i].regmap) {
-      load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
-      wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
+      load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
+      wb_dirtys(regs[i].regmap_entry,regs[i].wasdirty);
       emit_movimm(start+i*4+4,0);
-      emit_writeword(0,(int)&pcaddr);
-      emit_jmp((int)do_interrupt);
+      emit_writeword(0,&pcaddr);
+      emit_addimm(HOST_CCREG,2,HOST_CCREG);
+      emit_far_call(get_addr_ht);
+      emit_jmpreg(0);
     }
   }
-  //if(opcode[i]==0x2B || opcode[i]==0x3F)
-  //if(opcode[i]==0x2B || opcode[i]==0x28)
-  //if(opcode[i]==0x2B || opcode[i]==0x29)
-  //if(opcode[i]==0x2B)
-  /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
-  {
-    #ifdef __i386__
-    emit_pusha();
-    #endif
-    #ifdef __arm__
-    save_regs(0x100f);
-    #endif
-        emit_readword((int)&last_count,ECX);
-        #ifdef __i386__
-        if(get_reg(i_regs->regmap,CCREG)<0)
-          emit_loadreg(CCREG,HOST_CCREG);
-        emit_add(HOST_CCREG,ECX,HOST_CCREG);
-        emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
-        emit_writeword(HOST_CCREG,(int)&Count);
-        #endif
-        #ifdef __arm__
-        if(get_reg(i_regs->regmap,CCREG)<0)
-          emit_loadreg(CCREG,0);
-        else
-          emit_mov(HOST_CCREG,0);
-        emit_add(0,ECX,0);
-        emit_addimm(0,2*ccadj[i],0);
-        emit_writeword(0,(int)&Count);
-        #endif
-    emit_call((int)memdebug);
-    #ifdef __i386__
-    emit_popa();
-    #endif
-    #ifdef __arm__
-    restore_regs(0x100f);
-    #endif
-  }*/
 }
 
-void storelr_assemble(int i,struct regstat *i_regs)
+static void storelr_assemble(int i, const struct regstat *i_regs, int ccadj_)
 {
-  int s,th,tl;
+  int s,tl;
   int temp;
-  int temp2=-1;
   int offset;
-  int jaddr=0;
-  int case1,case2,case3;
-  int done0,done1,done2;
+  void *jaddr=0;
+  void *case1, *case23, *case3;
+  void *done0, *done1, *done2;
   int memtarget=0,c=0;
   int agr=AGEN1+(i&1);
-  u_int hr,reglist=0;
-  th=get_reg(i_regs->regmap,rs2[i]|64);
-  tl=get_reg(i_regs->regmap,rs2[i]);
-  s=get_reg(i_regs->regmap,rs1[i]);
+  int offset_reg = -1;
+  u_int reglist=get_host_reglist(i_regs->regmap);
+  tl=get_reg(i_regs->regmap,dops[i].rs2);
+  s=get_reg(i_regs->regmap,dops[i].rs1);
   temp=get_reg(i_regs->regmap,agr);
   if(temp<0) temp=get_reg(i_regs->regmap,-1);
   offset=imm[i];
@@ -3119,240 +3139,574 @@ void storelr_assemble(int i,struct regstat *i_regs)
     }
   }
   assert(tl>=0);
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
-  }
   assert(temp>=0);
   if(!c) {
     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
     if(!offset&&s!=temp) emit_mov(s,temp);
-    jaddr=(int)out;
+    jaddr=out;
     emit_jno(0);
   }
   else
   {
-    if(!memtarget||!rs1[i]) {
-      jaddr=(int)out;
+    if(!memtarget||!dops[i].rs1) {
+      jaddr=out;
       emit_jmp(0);
     }
   }
-  #ifdef RAM_OFFSET
-  int map=get_reg(i_regs->regmap,ROREG);
-  if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
-  #else
-  if((u_int)rdram!=0x80000000)
-    emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
-  #endif
+  if (ram_offset)
+    offset_reg = get_ro_reg(i_regs, 0);
 
-  if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
-    temp2=get_reg(i_regs->regmap,FTEMP);
-    if(!rs2[i]) temp2=th=tl;
+  if (dops[i].opcode==0x2C||dops[i].opcode==0x2D) { // SDL/SDR
+    assert(0);
   }
 
-#ifndef BIG_ENDIAN_MIPS
-    emit_xorimm(temp,3,temp);
-#endif
   emit_testimm(temp,2);
-  case2=(int)out;
+  case23=out;
   emit_jne(0);
   emit_testimm(temp,1);
-  case1=(int)out;
+  case1=out;
   emit_jne(0);
   // 0
-  if (opcode[i]==0x2A) { // SWL
-    emit_writeword_indexed(tl,0,temp);
-  }
-  if (opcode[i]==0x2E) { // SWR
-    emit_writebyte_indexed(tl,3,temp);
-  }
-  if (opcode[i]==0x2C) { // SDL
-    emit_writeword_indexed(th,0,temp);
-    if(rs2[i]) emit_mov(tl,temp2);
+  if (dops[i].opcode == 0x2A) { // SWL
+    // Write msb into least significant byte
+    if (dops[i].rs2) emit_rorimm(tl, 24, tl);
+    do_store_byte(temp, tl, offset_reg);
+    if (dops[i].rs2) emit_rorimm(tl, 8, tl);
   }
-  if (opcode[i]==0x2D) { // SDR
-    emit_writebyte_indexed(tl,3,temp);
-    if(rs2[i]) emit_shldimm(th,tl,24,temp2);
+  else if (dops[i].opcode == 0x2E) { // SWR
+    // Write entire word
+    do_store_word(temp, 0, tl, offset_reg, 1);
   }
-  done0=(int)out;
+  done0 = out;
   emit_jmp(0);
   // 1
-  set_jump_target(case1,(int)out);
-  if (opcode[i]==0x2A) { // SWL
-    // Write 3 msb into three least significant bytes
-    if(rs2[i]) emit_rorimm(tl,8,tl);
-    emit_writehword_indexed(tl,-1,temp);
-    if(rs2[i]) emit_rorimm(tl,16,tl);
-    emit_writebyte_indexed(tl,1,temp);
-    if(rs2[i]) emit_rorimm(tl,8,tl);
-  }
-  if (opcode[i]==0x2E) { // SWR
-    // Write two lsb into two most significant bytes
-    emit_writehword_indexed(tl,1,temp);
+  set_jump_target(case1, out);
+  if (dops[i].opcode == 0x2A) { // SWL
+    // Write two msb into two least significant bytes
+    if (dops[i].rs2) emit_rorimm(tl, 16, tl);
+    do_store_hword(temp, -1, tl, offset_reg, 0);
+    if (dops[i].rs2) emit_rorimm(tl, 16, tl);
   }
-  if (opcode[i]==0x2C) { // SDL
-    if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
-    // Write 3 msb into three least significant bytes
-    if(rs2[i]) emit_rorimm(th,8,th);
-    emit_writehword_indexed(th,-1,temp);
-    if(rs2[i]) emit_rorimm(th,16,th);
-    emit_writebyte_indexed(th,1,temp);
-    if(rs2[i]) emit_rorimm(th,8,th);
-  }
-  if (opcode[i]==0x2D) { // SDR
-    if(rs2[i]) emit_shldimm(th,tl,16,temp2);
-    // Write two lsb into two most significant bytes
-    emit_writehword_indexed(tl,1,temp);
+  else if (dops[i].opcode == 0x2E) { // SWR
+    // Write 3 lsb into three most significant bytes
+    do_store_byte(temp, tl, offset_reg);
+    if (dops[i].rs2) emit_rorimm(tl, 8, tl);
+    do_store_hword(temp, 1, tl, offset_reg, 0);
+    if (dops[i].rs2) emit_rorimm(tl, 24, tl);
   }
-  done1=(int)out;
+  done1=out;
   emit_jmp(0);
-  // 2
-  set_jump_target(case2,(int)out);
+  // 2,3
+  set_jump_target(case23, out);
   emit_testimm(temp,1);
-  case3=(int)out;
+  case3 = out;
   emit_jne(0);
-  if (opcode[i]==0x2A) { // SWL
-    // Write two msb into two least significant bytes
-    if(rs2[i]) emit_rorimm(tl,16,tl);
-    emit_writehword_indexed(tl,-2,temp);
-    if(rs2[i]) emit_rorimm(tl,16,tl);
-  }
-  if (opcode[i]==0x2E) { // SWR
-    // Write 3 lsb into three most significant bytes
-    emit_writebyte_indexed(tl,-1,temp);
-    if(rs2[i]) emit_rorimm(tl,8,tl);
-    emit_writehword_indexed(tl,0,temp);
-    if(rs2[i]) emit_rorimm(tl,24,tl);
-  }
-  if (opcode[i]==0x2C) { // SDL
-    if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
-    // Write two msb into two least significant bytes
-    if(rs2[i]) emit_rorimm(th,16,th);
-    emit_writehword_indexed(th,-2,temp);
-    if(rs2[i]) emit_rorimm(th,16,th);
+  // 2
+  if (dops[i].opcode==0x2A) { // SWL
+    // Write 3 msb into three least significant bytes
+    if (dops[i].rs2) emit_rorimm(tl, 8, tl);
+    do_store_hword(temp, -2, tl, offset_reg, 1);
+    if (dops[i].rs2) emit_rorimm(tl, 16, tl);
+    do_store_byte(temp, tl, offset_reg);
+    if (dops[i].rs2) emit_rorimm(tl, 8, tl);
   }
-  if (opcode[i]==0x2D) { // SDR
-    if(rs2[i]) emit_shldimm(th,tl,8,temp2);
-    // Write 3 lsb into three most significant bytes
-    emit_writebyte_indexed(tl,-1,temp);
-    if(rs2[i]) emit_rorimm(tl,8,tl);
-    emit_writehword_indexed(tl,0,temp);
-    if(rs2[i]) emit_rorimm(tl,24,tl);
+  else if (dops[i].opcode == 0x2E) { // SWR
+    // Write two lsb into two most significant bytes
+    do_store_hword(temp, 0, tl, offset_reg, 1);
   }
-  done2=(int)out;
+  done2 = out;
   emit_jmp(0);
   // 3
-  set_jump_target(case3,(int)out);
-  if (opcode[i]==0x2A) { // SWL
-    // Write msb into least significant byte
-    if(rs2[i]) emit_rorimm(tl,24,tl);
-    emit_writebyte_indexed(tl,-3,temp);
-    if(rs2[i]) emit_rorimm(tl,8,tl);
-  }
-  if (opcode[i]==0x2E) { // SWR
-    // Write entire word
-    emit_writeword_indexed(tl,-3,temp);
-  }
-  if (opcode[i]==0x2C) { // SDL
-    if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
-    // Write msb into least significant byte
-    if(rs2[i]) emit_rorimm(th,24,th);
-    emit_writebyte_indexed(th,-3,temp);
-    if(rs2[i]) emit_rorimm(th,8,th);
-  }
-  if (opcode[i]==0x2D) { // SDR
-    if(rs2[i]) emit_mov(th,temp2);
-    // Write entire word
-    emit_writeword_indexed(tl,-3,temp);
-  }
-  set_jump_target(done0,(int)out);
-  set_jump_target(done1,(int)out);
-  set_jump_target(done2,(int)out);
-  if (opcode[i]==0x2C) { // SDL
-    emit_testimm(temp,4);
-    done0=(int)out;
-    emit_jne(0);
-    emit_andimm(temp,~3,temp);
-    emit_writeword_indexed(temp2,4,temp);
-    set_jump_target(done0,(int)out);
-  }
-  if (opcode[i]==0x2D) { // SDR
-    emit_testimm(temp,4);
-    done0=(int)out;
-    emit_jeq(0);
-    emit_andimm(temp,~3,temp);
-    emit_writeword_indexed(temp2,-4,temp);
-    set_jump_target(done0,(int)out);
-  }
+  set_jump_target(case3, out);
+  if (dops[i].opcode == 0x2A) { // SWL
+    do_store_word(temp, -3, tl, offset_reg, 0);
+  }
+  else if (dops[i].opcode == 0x2E) { // SWR
+    do_store_byte(temp, tl, offset_reg);
+  }
+  set_jump_target(done0, out);
+  set_jump_target(done1, out);
+  set_jump_target(done2, out);
+  if (offset_reg == HOST_TEMPREG)
+    host_tempreg_release();
   if(!c||!memtarget)
-    add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
-  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
-    #ifdef RAM_OFFSET
-    int map=get_reg(i_regs->regmap,ROREG);
-    if(map<0) map=HOST_TEMPREG;
-    gen_orig_addr_w(temp,map);
-    #else
-    emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
-    #endif
+    add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj_,reglist);
+  if(!(i_regs->waswritten&(1<<dops[i].rs1)) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
     #if defined(HOST_IMM8)
     int ir=get_reg(i_regs->regmap,INVCP);
     assert(ir>=0);
     emit_cmpmem_indexedsr12_reg(ir,temp,1);
     #else
-    emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
+    emit_cmpmem_indexedsr12_imm(invalid_code,temp,1);
     #endif
     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
     emit_callne(invalidate_addr_reg[temp]);
     #else
-    int jaddr2=(int)out;
+    void *jaddr2 = out;
     emit_jne(0);
-    add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
+    add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
     #endif
   }
-  /*
-    emit_pusha();
-    //save_regs(0x100f);
-        emit_readword((int)&last_count,ECX);
-        if(get_reg(i_regs->regmap,CCREG)<0)
-          emit_loadreg(CCREG,HOST_CCREG);
-        emit_add(HOST_CCREG,ECX,HOST_CCREG);
-        emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
-        emit_writeword(HOST_CCREG,(int)&Count);
-    emit_call((int)memdebug);
-    emit_popa();
-    //restore_regs(0x100f);
-  */
 }
 
-void c1ls_assemble(int i,struct regstat *i_regs)
+static void cop0_assemble(int i, const struct regstat *i_regs, int ccadj_) // emits host code for MFC0 (opcode2 0), MTC0 (opcode2 4) and opcode2 0x10
+{
+  if(dops[i].opcode2==0) // MFC0
+  {
+    signed char t=get_reg(i_regs->regmap,dops[i].rt1);
+    u_int copr=(source[i]>>11)&0x1f; // COP0 register number, bits 11..15 of the instruction word
+    //assert(t>=0); // Why does this happen?  OOT is weird
+    if(t>=0&&dops[i].rt1!=0) { // nothing is emitted when rt1 has no host register or is register 0
+      emit_readword(&reg_cop0[copr],t);
+    }
+  }
+  else if(dops[i].opcode2==4) // MTC0
+  {
+    signed char s=get_reg(i_regs->regmap,dops[i].rs1);
+    char copr=(source[i]>>11)&0x1f;
+    assert(s>=0);
+    wb_register(dops[i].rs1,i_regs->regmap,i_regs->dirty);
+    if(copr==9||copr==11||copr==12||copr==13) { // for these registers, write cycle counter + last_count + ccadj_ to Count before the call
+      emit_readword(&last_count,HOST_TEMPREG);
+      emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc
+      emit_add(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
+      emit_addimm(HOST_CCREG,ccadj_,HOST_CCREG);
+      emit_writeword(HOST_CCREG,&Count);
+    }
+    // What a mess.  The status register (12) can enable interrupts,
+    // so needs a special case to handle a pending interrupt.
+    // The interrupt must be taken immediately, because a subsequent
+    // instruction might disable interrupts again.
+    if(copr==12||copr==13) {
+      if (is_delayslot) {
+        // burn cycles to cause cc_interrupt, which will
+        // reschedule next_interupt. Relies on CCREG from above.
+        assem_debug("MTC0 DS %d\n", copr);
+        emit_writeword(HOST_CCREG,&last_count);
+        emit_movimm(0,HOST_CCREG);
+        emit_storereg(CCREG,HOST_CCREG);
+        emit_loadreg(dops[i].rs1,1); // host reg 1 = rs1 value (presumably the second call argument)
+        emit_movimm(copr,0); // host reg 0 = COP0 register number (presumably the first call argument)
+        emit_far_call(pcsx_mtc0_ds);
+        emit_loadreg(dops[i].rs1,s);
+        return; // delay-slot case ends here; the pending_exception check below is skipped
+      }
+      emit_movimm(start+i*4+4,HOST_TEMPREG); // pcaddr = address of the instruction after this one
+      emit_writeword(HOST_TEMPREG,&pcaddr);
+      emit_movimm(0,HOST_TEMPREG);
+      emit_writeword(HOST_TEMPREG,&pending_exception); // clear pending_exception before calling pcsx_mtc0
+    }
+    if(s==HOST_CCREG) // when rs1 is mapped to HOST_CCREG, load host reg 1 via emit_loadreg instead of copying from s
+      emit_loadreg(dops[i].rs1,1);
+    else if(s!=1)
+      emit_mov(s,1);
+    emit_movimm(copr,0); // host reg 0 = COP0 register number
+    emit_far_call(pcsx_mtc0);
+    if(copr==9||copr==11||copr==12||copr==13) { // after the call: rebuild the cycle counter from Count, ccadj_ and next_interupt
+      emit_readword(&Count,HOST_CCREG);
+      emit_readword(&next_interupt,HOST_TEMPREG);
+      emit_addimm(HOST_CCREG,-ccadj_,HOST_CCREG);
+      emit_sub(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
+      emit_writeword(HOST_TEMPREG,&last_count); // last_count = next_interupt
+      emit_storereg(CCREG,HOST_CCREG);
+    }
+    if(copr==12||copr==13) {
+      assert(!is_delayslot);
+      emit_readword(&pending_exception,14);
+      emit_test(14,14);
+      void *jaddr = out;
+      emit_jeq(0); // skip the dispatch below when pending_exception is zero
+      emit_readword(&pcaddr, 0);
+      emit_addimm(HOST_CCREG,2,HOST_CCREG);
+      emit_far_call(get_addr_ht);
+      emit_jmpreg(0); // jump through host reg 0 (presumably the address returned by get_addr_ht)
+      set_jump_target(jaddr, out);
+    }
+    emit_loadreg(dops[i].rs1,s); // reload rs1 into its host register s
+  }
+  else
+  {
+    assert(dops[i].opcode2==0x10);
+    //if((source[i]&0x3f)==0x10) // RFE
+    {
+      emit_readword(&Status,0);
+      emit_andimm(0,0x3c,1); // host reg 1 = Status & 0x3c
+      emit_andimm(0,~0xf,0); // host reg 0 = Status & ~0xf
+      emit_orrshr_imm(1,2,0); // presumably host reg 0 |= host reg 1 >> 2 -- TODO confirm operand order
+      emit_writeword(0,&Status);
+    }
+  }
+}
+
+static void cop1_unusable(int i, const struct regstat *i_regs) // emits an unconditional jump to an FP_STUB stub registered for instruction i
+{
+  // XXX: should just do the exception instead
+  //if(!cop1_usable)
+  {
+    void *jaddr=out;
+    emit_jmp(0); // target 0 is a placeholder; jaddr is recorded in the stub so it can be patched later
+    add_stub_r(FP_STUB,jaddr,out,i,0,i_regs,is_delayslot,0); // is_delayslot is stored with the stub
+  }
+}
+
+static void cop1_assemble(int i, const struct regstat *i_regs) // COP1 ops are not assembled; they are all routed to cop1_unusable
 {
   cop1_unusable(i, i_regs);
 }
 
-void c2ls_assemble(int i,struct regstat *i_regs)
+static void c1ls_assemble(int i, const struct regstat *i_regs) // COP1 load/store ops are likewise routed to cop1_unusable
+{
+  cop1_unusable(i, i_regs);
+}
+
+// FP_STUB
+static void do_cop1stub(int n) // assembles stub number n: write back state, then jump to fp_exception or fp_exception_ds
+{
+  literal_pool(256);
+  assem_debug("do_cop1stub %x\n",start+stubs[n].a*4);
+  set_jump_target(stubs[n].addr, out); // point the jump recorded in the stub at the code emitted here
+  int i=stubs[n].a; // instruction index within the block
+//  int rs=stubs[n].b;
+  struct regstat *i_regs=(struct regstat *)stubs[n].c;
+  int ds=stubs[n].d; // nonzero when the stub was created in a delay slot (cop1_unusable passes is_delayslot)
+  if(!ds) {
+    load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
+    //if(i_regs!=&regs[i]) printf("oops: regs[i]=%x i_regs=%x",(int)&regs[i],(int)i_regs);
+  }
+  //else {printf("fp exception in delay slot\n");}
+  wb_dirtys(i_regs->regmap_entry,i_regs->wasdirty);
+  if(regs[i].regmap_entry[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); // load the cycle counter if it is not already in HOST_CCREG
+  emit_movimm(start+(i-ds)*4,EAX); // Get PC
+  emit_addimm(HOST_CCREG,ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
+  emit_far_jump(ds?fp_exception_ds:fp_exception); // delay-slot variant of the handler when ds is nonzero
+}
+
+static int cop2_is_stalling_op(int i, int *cycles) // returns 1 and sets *cycles when op i is treated as a stalling GTE op; otherwise returns 0 and leaves *cycles untouched
+{
+  if (dops[i].opcode == 0x3a) { // SWC2
+    *cycles = 0;
+    return 1;
+  }
+  if (dops[i].itype == COP2 && (dops[i].opcode2 == 0 || dops[i].opcode2 == 2)) { // MFC2/CFC2
+    *cycles = 0;
+    return 1;
+  }
+  if (dops[i].itype == C2OP) {
+    *cycles = gte_cycletab[source[i] & 0x3f]; // table lookup keyed by the low 6 bits of the instruction word
+    return 1;
+  }
+  // ... what about MTC2/CTC2/LWC2?
+  return 0;
+}
+
+#if 0 // disabled debug helpers; the only references visible here are commented-out emit_log_gte_stall calls
+static void log_gte_stall(int stall, u_int cycle)
+{
+  if ((u_int)stall <= 44) // unsigned compare, so negative stall values are not printed
+    printf("x    stall %2d %u\n", stall, cycle + last_count);
+}
+
+static void emit_log_gte_stall(int i, int stall, u_int reglist)
+{
+  save_regs(reglist);
+  if (stall > 0)
+    emit_movimm(stall, 0); // stall known when assembling: pass it as an immediate
+  else
+    emit_mov(HOST_TEMPREG, 0); // otherwise pass the value currently held in HOST_TEMPREG
+  emit_addimm(HOST_CCREG, ccadj[i], 1);
+  emit_far_call(log_gte_stall);
+  restore_regs(reglist);
+}
+#endif
+
+static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u_int reglist) // emits GTE stall handling for op i and, when needed, a store to psxRegs.gteBusyCycle
+{
+  int j = i, other_gte_op_cycles = -1, stall = -MAXBLOCK, cycles_passed; // stall == -MAXBLOCK marks "not known while assembling"
+  int rtmp = reglist_find_free(reglist);
+
+  if (HACK_ENABLED(NDHACK_NO_STALLS))
+    return;
+  if (get_reg(i_regs->regmap, CCREG) != HOST_CCREG) {
+    // happens occasionally... cc evicted? Don't bother then
+    //printf("no cc %08x\n", start + i*4);
+    return;
+  }
+  if (!dops[i].bt) { // scan backwards only when op i is not a branch target
+    for (j = i - 1; j >= 0; j--) {
+      //if (dops[j].is_ds) break;
+      if (cop2_is_stalling_op(j, &other_gte_op_cycles) || dops[j].bt)
+        break;
+      if (j > 0 && ccadj[j - 1] > ccadj[j]) // stop where the cycle adjustment decreases from j-1 to j
+        break;
+    }
+    j = max(j, 0); // the loop can end with j == -1
+  }
+  cycles_passed = ccadj[i] - ccadj[j];
+  if (other_gte_op_cycles >= 0)
+    stall = other_gte_op_cycles - cycles_passed; // previous stalling op found: remaining stall is known now
+  else if (cycles_passed >= 44)
+    stall = 0; // can't stall
+  if (stall == -MAXBLOCK && rtmp >= 0) {
+    // unknown stall, do the expensive runtime check
+    assem_debug("; cop2_do_stall_check\n");
+#if 0 // too slow
+    save_regs(reglist);
+    emit_movimm(gte_cycletab[op], 0);
+    emit_addimm(HOST_CCREG, ccadj[i], 1);
+    emit_far_call(call_gteStall);
+    restore_regs(reglist);
+#else
+    host_tempreg_acquire();
+    emit_readword(&psxRegs.gteBusyCycle, rtmp);
+    emit_addimm(rtmp, -ccadj[i], rtmp);
+    emit_sub(rtmp, HOST_CCREG, HOST_TEMPREG);
+    emit_cmpimm(HOST_TEMPREG, 44);
+    emit_cmovb_reg(rtmp, HOST_CCREG); // presumably: if the difference is below 44 (unsigned), move the cycle counter up to rtmp -- TODO confirm
+    //emit_log_gte_stall(i, 0, reglist);
+    host_tempreg_release();
+#endif
+  }
+  else if (stall > 0) {
+    //emit_log_gte_stall(i, stall, reglist);
+    emit_addimm(HOST_CCREG, stall, HOST_CCREG); // known positive stall: add it to the cycle counter directly
+  }
+
+  // save gteBusyCycle, if needed
+  if (gte_cycletab[op] == 0) // zero table entry for op: nothing to record
+    return;
+  other_gte_op_cycles = -1;
+  for (j = i + 1; j < slen; j++) { // scan forward for the next stalling op
+    if (cop2_is_stalling_op(j, &other_gte_op_cycles))
+      break;
+    if (dops[j].is_jump) {
+      // check ds
+      if (j + 1 < slen && cop2_is_stalling_op(j + 1, &other_gte_op_cycles))
+        j++;
+      break;
+    }
+  }
+  if (other_gte_op_cycles >= 0)
+    // will handle stall when assembling that op
+    return;
+  cycles_passed = ccadj[min(j, slen -1)] - ccadj[i]; // j may equal slen when the loop ran to the end
+  if (cycles_passed >= 44)
+    return;
+  assem_debug("; save gteBusyCycle\n");
+  host_tempreg_acquire();
+#if 0
+  emit_readword(&last_count, HOST_TEMPREG);
+  emit_add(HOST_TEMPREG, HOST_CCREG, HOST_TEMPREG);
+  emit_addimm(HOST_TEMPREG, ccadj[i], HOST_TEMPREG);
+  emit_addimm(HOST_TEMPREG, gte_cycletab[op]), HOST_TEMPREG); // NOTE(review): unbalanced ')' -- this disabled branch would not compile if enabled
+  emit_writeword(HOST_TEMPREG, &psxRegs.gteBusyCycle);
+#else
+  emit_addimm(HOST_CCREG, ccadj[i] + gte_cycletab[op], HOST_TEMPREG);
+  emit_writeword(HOST_TEMPREG, &psxRegs.gteBusyCycle);
+#endif
+  host_tempreg_release();
+}
+
+static int is_mflohi(int i) // nonzero when op i is a MOV whose source register is HIREG or LOREG
+{
+  return (dops[i].itype == MOV && (dops[i].rs1 == HIREG || dops[i].rs1 == LOREG));
+}
+
+static int check_multdiv(int i, int *cycles) // returns 1 and sets *cycles when op i is a MULTDIV op; returns 0 (leaving *cycles untouched) otherwise
+{
+  if (dops[i].itype != MULTDIV)
+    return 0;
+  if (dops[i].opcode2 == 0x18 || dops[i].opcode2 == 0x19) // MULT(U)
+    *cycles = 11; // approx from 7 11 14
+  else
+    *cycles = 37; // every other MULTDIV opcode2
+  return 1;
+}
+
+static void multdiv_prepare_stall(int i, const struct regstat *i_regs, int ccadj_) // emits a store to psxRegs.muldivBusyCycle unless a following MFHI/MFLO is found by the scan below
+{
+  int j, found = 0, c = 0;
+  if (HACK_ENABLED(NDHACK_NO_STALLS))
+    return;
+  if (get_reg(i_regs->regmap, CCREG) != HOST_CCREG) {
+    // happens occasionally... cc evicted? Don't bother then
+    return;
+  }
+  for (j = i + 1; j < slen; j++) { // scan forward, stopping at a branch target, an MFHI/MFLO or a jump
+    if (dops[j].bt)
+      break;
+    if ((found = is_mflohi(j)))
+      break;
+    if (dops[j].is_jump) {
+      // check ds
+      if (j + 1 < slen && (found = is_mflohi(j + 1)))
+        j++;
+      break;
+    }
+  }
+  if (found)
+    // handle all in multdiv_do_stall()
+    return;
+  check_multdiv(i, &c); // c = cycle estimate for this op
+  assert(c > 0);
+  assem_debug("; muldiv prepare stall %d\n", c);
+  host_tempreg_acquire();
+  emit_addimm(HOST_CCREG, ccadj_ + c, HOST_TEMPREG); // stored value = cycle counter + ccadj_ + c
+  emit_writeword(HOST_TEMPREG, &psxRegs.muldivBusyCycle);
+  host_tempreg_release();
+}
+
+static void multdiv_do_stall(int i, const struct regstat *i_regs) // emits the stall adjustment for an MFHI/MFLO at op i (called from mov_assemble)
+{
+  int j, known_cycles = 0;
+  u_int reglist = get_host_reglist(i_regs->regmap);
+  int rtmp = get_reg(i_regs->regmap, -1);
+  if (rtmp < 0)
+    rtmp = reglist_find_free(reglist); // fall back to a register that reglist reports as free
+  if (HACK_ENABLED(NDHACK_NO_STALLS))
+    return;
+  if (get_reg(i_regs->regmap, CCREG) != HOST_CCREG || rtmp < 0) {
+    // happens occasionally... cc evicted? Don't bother then
+    //printf("no cc/rtmp %08x\n", start + i*4);
+    return;
+  }
+  if (!dops[i].bt) { // scan backwards only when op i is not a branch target
+    for (j = i - 1; j >= 0; j--) {
+      if (dops[j].is_ds) break;
+      if (check_multdiv(j, &known_cycles)) // sets known_cycles when op j is a MULTDIV op
+        break;
+      if (is_mflohi(j))
+        // already handled by this op
+        return;
+      if (dops[j].bt || (j > 0 && ccadj[j - 1] > ccadj[j]))
+        break;
+    }
+    j = max(j, 0); // the loop can end with j == -1
+  }
+  if (known_cycles > 0) { // only reachable after the scan above found a MULTDIV op, so j is set
+    known_cycles -= ccadj[i] - ccadj[j]; // subtract the cycle adjustment accumulated between op j and op i
+    assem_debug("; muldiv stall resolved %d\n", known_cycles);
+    if (known_cycles > 0)
+      emit_addimm(HOST_CCREG, known_cycles, HOST_CCREG);
+    return;
+  }
+  assem_debug("; muldiv stall unresolved\n");
+  host_tempreg_acquire();
+  emit_readword(&psxRegs.muldivBusyCycle, rtmp);
+  emit_addimm(rtmp, -ccadj[i], rtmp);
+  emit_sub(rtmp, HOST_CCREG, HOST_TEMPREG);
+  emit_cmpimm(HOST_TEMPREG, 37);
+  emit_cmovb_reg(rtmp, HOST_CCREG); // presumably: if the difference is below 37 (unsigned), move the cycle counter up to rtmp -- TODO confirm
+  //emit_log_gte_stall(i, 0, reglist);
+  host_tempreg_release();
+}
+
+static void cop2_get_dreg(u_int copr,signed char tl,signed char temp) // emits a read of reg_cop2d[copr] into host reg tl, with per-register special cases
+{
+  switch (copr) {
+    case 1:
+    case 3:
+    case 5:
+    case 8:
+    case 9:
+    case 10:
+    case 11:
+      emit_readword(&reg_cop2d[copr],tl);
+      emit_signextend16(tl,tl); // these registers are sign-extended from 16 bits and the result is stored back
+      emit_writeword(tl,&reg_cop2d[copr]); // hmh
+      break;
+    case 7:
+    case 16:
+    case 17:
+    case 18:
+    case 19:
+      emit_readword(&reg_cop2d[copr],tl);
+      emit_andimm(tl,0xffff,tl); // these registers are masked to the low 16 bits and the result is stored back
+      emit_writeword(tl,&reg_cop2d[copr]);
+      break;
+    case 15:
+      emit_readword(&reg_cop2d[14],tl); // SXY2
+      emit_writeword(tl,&reg_cop2d[copr]); // register 15 is read as a copy of register 14, which is also stored into slot 15
+      break;
+    case 28:
+    case 29:
+      c2op_mfc2_29_assemble(tl,temp); // both registers are delegated to the same helper; the only place temp is used
+      break;
+    default:
+      emit_readword(&reg_cop2d[copr],tl);
+      break;
+  }
+}
+
+static void cop2_put_dreg(u_int copr,signed char sl,signed char temp) // emits a write of host reg sl to reg_cop2d[copr], with per-register special cases; temp is a scratch host reg
+{
+  switch (copr) {
+    case 15: // shifts slot 13 into 12 and slot 14 into 13, then stores sl into both 14 and 15
+      emit_readword(&reg_cop2d[13],temp);  // SXY1
+      emit_writeword(sl,&reg_cop2d[copr]);
+      emit_writeword(temp,&reg_cop2d[12]); // SXY0
+      emit_readword(&reg_cop2d[14],temp);  // SXY2
+      emit_writeword(sl,&reg_cop2d[14]);
+      emit_writeword(temp,&reg_cop2d[13]); // SXY1
+      break;
+    case 28: // splits three 5-bit fields of sl into registers 9, 10 and 11, then stores sl itself into 28
+      emit_andimm(sl,0x001f,temp);
+      emit_shlimm(temp,7,temp);
+      emit_writeword(temp,&reg_cop2d[9]);
+      emit_andimm(sl,0x03e0,temp);
+      emit_shlimm(temp,2,temp);
+      emit_writeword(temp,&reg_cop2d[10]);
+      emit_andimm(sl,0x7c00,temp);
+      emit_shrimm(temp,3,temp);
+      emit_writeword(temp,&reg_cop2d[11]);
+      emit_writeword(sl,&reg_cop2d[28]);
+      break;
+    case 30: // stores sl into register 30 and a value derived from it into register 31
+      emit_xorsar_imm(sl,sl,31,temp);
+#if defined(HAVE_ARMV5) || defined(__aarch64__)
+      emit_clz(temp,temp); // presumably a count-leading-zeros instruction -- TODO confirm
+#else
+      emit_movs(temp,HOST_TEMPREG);
+      emit_movimm(0,temp);
+      emit_jeq((int)out+4*4);
+      emit_addpl_imm(temp,1,temp);
+      emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG);
+      emit_jns((int)out-2*4);
+#endif
+      emit_writeword(sl,&reg_cop2d[30]);
+      emit_writeword(temp,&reg_cop2d[31]);
+      break;
+    case 31: // writes to register 31 emit nothing
+      break;
+    default:
+      emit_writeword(sl,&reg_cop2d[copr]);
+      break;
+  }
+}
+
+static void c2ls_assemble(int i, const struct regstat *i_regs, int ccadj_)
 {
   int s,tl;
   int ar;
   int offset;
   int memtarget=0,c=0;
-  int jaddr2=0,type;
+  void *jaddr2=NULL;
+  enum stub_type type;
   int agr=AGEN1+(i&1);
-  int fastio_reg_override=0;
-  u_int hr,reglist=0;
+  int offset_reg = -1;
+  int fastio_reg_override = -1;
+  u_int reglist=get_host_reglist(i_regs->regmap);
   u_int copr=(source[i]>>16)&0x1f;
-  s=get_reg(i_regs->regmap,rs1[i]);
+  s=get_reg(i_regs->regmap,dops[i].rs1);
   tl=get_reg(i_regs->regmap,FTEMP);
   offset=imm[i];
-  assert(rs1[i]>0);
+  assert(dops[i].rs1>0);
   assert(tl>=0);
 
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
-  }
   if(i_regs->regmap[HOST_CCREG]==CCREG)
     reglist&=~(1<<HOST_CCREG);
 
   // get the address
-  if (opcode[i]==0x3a) { // SWC2
+  if (dops[i].opcode==0x3a) { // SWC2
     ar=get_reg(i_regs->regmap,agr);
     if(ar<0) ar=get_reg(i_regs->regmap,-1);
     reglist|=1<<ar;
@@ -3364,195 +3718,426 @@ void c2ls_assemble(int i,struct regstat *i_regs)
   if (!offset&&!c&&s>=0) ar=s;
   assert(ar>=0);
 
-  if (opcode[i]==0x3a) { // SWC2
-    cop2_get_dreg(copr,tl,HOST_TEMPREG);
+  cop2_do_stall_check(0, i, i_regs, reglist);
+
+  if (dops[i].opcode==0x3a) { // SWC2
+    cop2_get_dreg(copr,tl,-1);
     type=STOREW_STUB;
   }
   else
     type=LOADW_STUB;
 
   if(c&&!memtarget) {
-    jaddr2=(int)out;
+    jaddr2=out;
     emit_jmp(0); // inline_readstub/inline_writestub?
   }
   else {
     if(!c) {
-      jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
-    }
-    else if(ram_offset&&memtarget) {
-      emit_addimm(ar,ram_offset,HOST_TEMPREG);
-      fastio_reg_override=HOST_TEMPREG;
-    }
-    if (opcode[i]==0x32) { // LWC2
-      #ifdef HOST_IMM_ADDR32
-      if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
-      else
-      #endif
-      int a=ar;
-      if(fastio_reg_override) a=fastio_reg_override;
-      emit_readword_indexed(0,a,tl);
+      jaddr2 = emit_fastpath_cmp_jump(i, i_regs, ar,
+                &offset_reg, &fastio_reg_override);
+    }
+    else if (ram_offset && memtarget) {
+      offset_reg = get_ro_reg(i_regs, 0);
+    }
+    switch (dops[i].opcode) {
+    case 0x32: { // LWC2
+      int a = ar;
+      if (fastio_reg_override >= 0)
+        a = fastio_reg_override;
+      do_load_word(a, tl, offset_reg);
+      break;
     }
-    if (opcode[i]==0x3a) { // SWC2
+    case 0x3a: { // SWC2
       #ifdef DESTRUCTIVE_SHIFT
       if(!offset&&!c&&s>=0) emit_mov(s,ar);
       #endif
-      int a=ar;
-      if(fastio_reg_override) a=fastio_reg_override;
-      emit_writeword_indexed(tl,0,a);
+      int a = ar;
+      if (fastio_reg_override >= 0)
+        a = fastio_reg_override;
+      do_store_word(a, 0, tl, offset_reg, 1);
+      break;
+    }
+    default:
+      assert(0);
     }
   }
+  if (fastio_reg_override == HOST_TEMPREG || offset_reg == HOST_TEMPREG)
+    host_tempreg_release();
   if(jaddr2)
-    add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
-  if(opcode[i]==0x3a) // SWC2
-  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
+    add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj_,reglist);
+  if(dops[i].opcode==0x3a) // SWC2
+  if(!(i_regs->waswritten&(1<<dops[i].rs1)) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
 #if defined(HOST_IMM8)
     int ir=get_reg(i_regs->regmap,INVCP);
     assert(ir>=0);
     emit_cmpmem_indexedsr12_reg(ir,ar,1);
 #else
-    emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
+    emit_cmpmem_indexedsr12_imm(invalid_code,ar,1);
 #endif
     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
     emit_callne(invalidate_addr_reg[ar]);
     #else
-    int jaddr3=(int)out;
+    void *jaddr3 = out;
     emit_jne(0);
-    add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
+    add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
     #endif
   }
-  if (opcode[i]==0x32) { // LWC2
-    cop2_put_dreg(copr,tl,HOST_TEMPREG);
+  if (dops[i].opcode==0x32) { // LWC2
+    host_tempreg_acquire();
+    cop2_put_dreg(copr,tl,HOST_TEMPREG);
+    host_tempreg_release();
+  }
+}
+
+static void cop2_assemble(int i, const struct regstat *i_regs) // emits host code for MFC2 (opcode2 0), CFC2 (2), MTC2 (4) and CTC2 (6)
+{
+  u_int copr = (source[i]>>11) & 0x1f; // GTE register number, bits 11..15 of the instruction word
+  signed char temp = get_reg(i_regs->regmap, -1);
+
+  if (!HACK_ENABLED(NDHACK_NO_STALLS)) {
+    u_int reglist = reglist_exclude(get_host_reglist(i_regs->regmap), temp, -1);
+    if (dops[i].opcode2 == 0 || dops[i].opcode2 == 2) { // MFC2/CFC2
+      signed char tl = get_reg(i_regs->regmap, dops[i].rt1);
+      reglist = reglist_exclude(reglist, tl, -1);
+    }
+    cop2_do_stall_check(0, i, i_regs, reglist); // op argument is 0 here
+  }
+  if (dops[i].opcode2==0) { // MFC2
+    signed char tl=get_reg(i_regs->regmap,dops[i].rt1);
+    if(tl>=0&&dops[i].rt1!=0) // nothing is emitted when rt1 has no host register or is register 0
+      cop2_get_dreg(copr,tl,temp);
+  }
+  else if (dops[i].opcode2==4) { // MTC2
+    signed char sl=get_reg(i_regs->regmap,dops[i].rs1);
+    cop2_put_dreg(copr,sl,temp);
+  }
+  else if (dops[i].opcode2==2) // CFC2
+  {
+    signed char tl=get_reg(i_regs->regmap,dops[i].rt1);
+    if(tl>=0&&dops[i].rt1!=0)
+      emit_readword(&reg_cop2c[copr],tl);
+  }
+  else if (dops[i].opcode2==6) // CTC2
+  {
+    signed char sl=get_reg(i_regs->regmap,dops[i].rs1);
+    switch(copr) {
+      case 4:
+      case 12:
+      case 20:
+      case 26:
+      case 27:
+      case 29:
+      case 30:
+        emit_signextend16(sl,temp); // these control registers are stored sign-extended from 16 bits
+        break;
+      case 31:
+        c2op_ctc2_31_assemble(sl,temp);
+        break;
+      default:
+        temp=sl; // no conversion: store the source register as-is
+        break;
+    }
+    emit_writeword(temp,&reg_cop2c[copr]);
+    assert(sl>=0); // NOTE(review): this check runs after sl has already been used above
+  }
+}
+
+static void do_unalignedwritestub(int n) // assembles stub number n for SWL/SWR: calls jump_handle_swl or jump_handle_swr and returns to the stub's retaddr
+{
+  assem_debug("do_unalignedwritestub %x\n",start+stubs[n].a*4);
+  literal_pool(256);
+  set_jump_target(stubs[n].addr, out); // point the jump recorded in the stub at the code emitted here
+
+  int i=stubs[n].a; // instruction index within the block
+  struct regstat *i_regs=(struct regstat *)stubs[n].c;
+  int addr=stubs[n].b; // host register number (it is used as a bit index into reglist below)
+  u_int reglist=stubs[n].e;
+  signed char *i_regmap=i_regs->regmap;
+  int temp2=get_reg(i_regmap,FTEMP);
+  int rt;
+  rt=get_reg(i_regmap,dops[i].rs2);
+  assert(rt>=0);
+  assert(addr>=0);
+  assert(dops[i].opcode==0x2a||dops[i].opcode==0x2e); // SWL/SWR only implemented
+  reglist|=(1<<addr);
+  reglist&=~(1<<temp2); // NOTE(review): temp2 comes from get_reg and is not checked for a negative value before this shift
+
+  // don't bother with it and call write handler
+  save_regs(reglist);
+  pass_args(addr,rt);
+  int cc=get_reg(i_regmap,CCREG);
+  if(cc<0)
+    emit_loadreg(CCREG,2); // cycle counter is not in a host register: load it into host reg 2
+  emit_addimm(cc<0?2:cc,(int)stubs[n].d+1,2); // host reg 2 = cycle counter + stub field d + 1
+  emit_far_call((dops[i].opcode==0x2a?jump_handle_swl:jump_handle_swr));
+  emit_addimm(0,-((int)stubs[n].d+1),cc<0?2:cc); // subtract the same amount from host reg 0 (presumably the cycle count returned by the handler)
+  if(cc<0)
+    emit_storereg(CCREG,2);
+  restore_regs(reglist);
+  emit_jmp(stubs[n].retaddr); // return address
 }
 
 #ifndef multdiv_assemble
 void multdiv_assemble(int i,struct regstat *i_regs)
 {
   printf("Need multdiv_assemble for this architecture.\n");
-  exit(1);
+  abort(); // fallback used only when multdiv_assemble is not defined as a macro: print the message above and abort
 }
 #endif
 
-void mov_assemble(int i,struct regstat *i_regs)
+static void mov_assemble(int i, const struct regstat *i_regs) // emits a register-to-register move from rs1 to rt1, then the mult/div stall check for MFHI/MFLO
 {
-  //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
-  //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
-  if(rt1[i]) {
-    signed char sh,sl,th,tl;
-    th=get_reg(i_regs->regmap,rt1[i]|64);
-    tl=get_reg(i_regs->regmap,rt1[i]);
+  //if(dops[i].opcode2==0x10||dops[i].opcode2==0x12) { // MFHI/MFLO
+  //if(dops[i].opcode2==0x11||dops[i].opcode2==0x13) { // MTHI/MTLO
+  if(dops[i].rt1) { // nothing to move when the destination is register 0
+    signed char sl,tl;
+    tl=get_reg(i_regs->regmap,dops[i].rt1);
     //assert(tl>=0);
     if(tl>=0) {
-      sh=get_reg(i_regs->regmap,rs1[i]|64);
-      sl=get_reg(i_regs->regmap,rs1[i]);
+      sl=get_reg(i_regs->regmap,dops[i].rs1);
       if(sl>=0) emit_mov(sl,tl);
-      else emit_loadreg(rs1[i],tl);
-      if(th>=0) {
-        if(sh>=0) emit_mov(sh,th);
-        else emit_loadreg(rs1[i]|64,th);
-      }
+      else emit_loadreg(dops[i].rs1,tl); // source has no host register: load it via emit_loadreg
     }
   }
+  if (dops[i].rs1 == HIREG || dops[i].rs1 == LOREG) // MFHI/MFLO
+    multdiv_do_stall(i, i_regs);
 }
 
-#ifndef fconv_assemble
-void fconv_assemble(int i,struct regstat *i_regs)
+// call interpreter, exception handler, things that change pc/regs/cycles ...
+static void call_c_cpu_handler(int i, const struct regstat *i_regs, int ccadj_, u_int pc, void *func) // stores pc and the cycle count to psxRegs, calls func, then jumps to jump_to_new_pc
 {
-  printf("Need fconv_assemble for this architecture.\n");
-  exit(1);
+  signed char ccreg=get_reg(i_regs->regmap,CCREG);
+  assert(ccreg==HOST_CCREG);
+  assert(!is_delayslot);
+  (void)ccreg; // keeps ccreg referenced when assert() compiles to nothing
+
+  emit_movimm(pc,3); // Get PC
+  emit_readword(&last_count,2);
+  emit_writeword(3,&psxRegs.pc);
+  emit_addimm(HOST_CCREG,ccadj_,HOST_CCREG);
+  emit_add(2,HOST_CCREG,2); // host reg 2 = last_count + adjusted cycle counter
+  emit_writeword(2,&psxRegs.cycle);
+  emit_far_call(func);
+  emit_far_jump(jump_to_new_pc);
 }
-#endif
 
-#if 0
-void float_assemble(int i,struct regstat *i_regs)
+static void syscall_assemble(int i, const struct regstat *i_regs, int ccadj_) // emits a call to psxException with host regs 0 and 1 preloaded, pc = address of this instruction
 {
-  printf("Need float_assemble for this architecture.\n");
-  exit(1);
+  emit_movimm(0x20,0); // cause code
+  emit_movimm(0,1);    // not in delay slot
+  call_c_cpu_handler(i, i_regs, ccadj_, start+i*4, psxException);
 }
-#endif
 
-void syscall_assemble(int i,struct regstat *i_regs)
+static void hlecall_assemble(int i, const struct regstat *i_regs, int ccadj_) // emits a call to the psxHLEt handler selected by the instruction word
 {
-  signed char ccreg=get_reg(i_regs->regmap,CCREG);
-  assert(ccreg==HOST_CCREG);
-  assert(!is_delayslot);
-  (void)ccreg;
-  emit_movimm(start+i*4,EAX); // Get PC
-  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
-  emit_jmp((int)jump_syscall_hle); // XXX
+  void *hlefunc = psxNULL; // used when the code is outside the psxHLEt table
+  uint32_t hleCode = source[i] & 0x03ffffff; // low 26 bits of the instruction word
+  if (hleCode < ARRAY_SIZE(psxHLEt))
+    hlefunc = psxHLEt[hleCode];
+
+  call_c_cpu_handler(i, i_regs, ccadj_, start + i*4+4, hlefunc); // pc passed is the address of the following instruction
 }
 
-void hlecall_assemble(int i,struct regstat *i_regs)
+static void intcall_assemble(int i, const struct regstat *i_regs, int ccadj_) // emits a call to execI with pc = address of this instruction
 {
-  extern void psxNULL();
-  signed char ccreg=get_reg(i_regs->regmap,CCREG);
-  assert(ccreg==HOST_CCREG);
-  assert(!is_delayslot);
-  (void)ccreg;
-  emit_movimm(start+i*4+4,0); // Get PC
-  uint32_t hleCode = source[i] & 0x03ffffff;
-  if (hleCode >= (sizeof(psxHLEt) / sizeof(psxHLEt[0])))
-    emit_movimm((int)psxNULL,1);
-  else
-    emit_movimm((int)psxHLEt[hleCode],1);
-  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
-  emit_jmp((int)jump_hlecall);
+  call_c_cpu_handler(i, i_regs, ccadj_, start + i*4, execI); // pc, cycle bookkeeping and the final jump are all done by call_c_cpu_handler
 }
 
-void intcall_assemble(int i,struct regstat *i_regs)
+static void speculate_mov(int rs,int rt) // copy the speculated value of rs to rt and flag rt in smrv_strong_next
 {
-  signed char ccreg=get_reg(i_regs->regmap,CCREG);
-  assert(ccreg==HOST_CCREG);
-  assert(!is_delayslot);
-  (void)ccreg;
-  emit_movimm(start+i*4,0); // Get PC
-  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
-  emit_jmp((int)jump_intcall);
+  if(rt!=0) { // register 0 is never a speculation target
+    smrv_strong_next|=1<<rt; // takes effect when the next instruction is processed
+    smrv[rt]=smrv[rs];
+  }
 }
 
-void ds_assemble(int i,struct regstat *i_regs)
+static void speculate_mov_weak(int rs,int rt) // like speculate_mov, but flags rt in smrv_weak_next instead of smrv_strong_next
 {
-  speculate_register_values(i);
-  is_delayslot=1;
-  switch(itype[i]) {
+  if(rt!=0) { // register 0 is never a speculation target
+    smrv_weak_next|=1<<rt; // takes effect when the next instruction is processed
+    smrv[rt]=smrv[rs];
+  }
+}
+
+static void speculate_register_values(int i) // update the speculated register values (smrv) and strong/weak masks for instruction i
+{
+  if(i==0) { // first instruction of the block: seed the speculation state
+    memcpy(smrv,psxRegs.GPR.r,sizeof(smrv)); // start from the current register contents
+    // r28,r29,r30 (gp,sp,fp in the MIPS ABI) are likely to stay the same throughout the block
+    smrv_strong_next=(1<<28)|(1<<29)|(1<<30);
+    smrv_weak_next=~smrv_strong_next; // every other register starts out weak
+    //printf(" llr %08x\n", smrv[4]);
+  }
+  smrv_strong=smrv_strong_next; // masks prepared so far become the current ones
+  smrv_weak=smrv_weak_next;
+  switch(dops[i].itype) {
+    case ALU: // result inherits from the first source found strong, then weak; otherwise rt1 becomes unknown
+      if     ((smrv_strong>>dops[i].rs1)&1) speculate_mov(dops[i].rs1,dops[i].rt1);
+      else if((smrv_strong>>dops[i].rs2)&1) speculate_mov(dops[i].rs2,dops[i].rt1);
+      else if((smrv_weak>>dops[i].rs1)&1) speculate_mov_weak(dops[i].rs1,dops[i].rt1);
+      else if((smrv_weak>>dops[i].rs2)&1) speculate_mov_weak(dops[i].rs2,dops[i].rt1);
+      else {
+        smrv_strong_next&=~(1<<dops[i].rt1);
+        smrv_weak_next&=~(1<<dops[i].rt1);
+      }
+      break;
+    case SHIFTIMM: // clear rt1 first, then share the IMM16 handling below
+      smrv_strong_next&=~(1<<dops[i].rt1);
+      smrv_weak_next&=~(1<<dops[i].rt1);
+      // fallthrough
+    case IMM16:
+      if(dops[i].rt1&&is_const(&regs[i],dops[i].rt1)) { // result is a known constant
+        int value,hr=get_reg(regs[i].regmap,dops[i].rt1);
+        if(hr>=0) { // only when rt1 is mapped to a host register
+          if(get_final_value(hr,i,&value))
+               smrv[dops[i].rt1]=value;
+          else smrv[dops[i].rt1]=constmap[i][hr];
+          smrv_strong_next|=1<<dops[i].rt1;
+        }
+      }
+      else { // not constant: inherit from rs1 if it is tracked
+        if     ((smrv_strong>>dops[i].rs1)&1) speculate_mov(dops[i].rs1,dops[i].rt1);
+        else if((smrv_weak>>dops[i].rs1)&1) speculate_mov_weak(dops[i].rs1,dops[i].rt1);
+      }
+      break;
+    case LOAD:
+      if(start<0x2000&&(dops[i].rt1==26||(smrv[dops[i].rt1]>>24)==0xa0)) {
+        // special case for BIOS
+        smrv[dops[i].rt1]=0xa0000000;
+        smrv_strong_next|=1<<dops[i].rt1;
+        break;
+      }
+      // fallthrough
+    case SHIFT:
+    case LOADLR:
+    case MOV: // result not tracked: drop rt1 from both masks
+      smrv_strong_next&=~(1<<dops[i].rt1);
+      smrv_weak_next&=~(1<<dops[i].rt1);
+      break;
+    case COP0:
+    case COP2:
+      if(dops[i].opcode2==0||dops[i].opcode2==2) { // MFC/CFC
+        smrv_strong_next&=~(1<<dops[i].rt1);
+        smrv_weak_next&=~(1<<dops[i].rt1);
+      }
+      break;
+    case C2LS:
+      if (dops[i].opcode==0x32) { // LWC2
+        smrv_strong_next&=~(1<<dops[i].rt1);
+        smrv_weak_next&=~(1<<dops[i].rt1);
+      }
+      break;
+  }
+#if 0
+  int r=4;
+  printf("x %08x %08x %d %d c %08x %08x\n",smrv[r],start+i*4,
+    ((smrv_strong>>r)&1),(smrv_weak>>r)&1,regs[i].isconst,regs[i].wasconst);
+#endif
+}
+
+static void ujump_assemble(int i, const struct regstat *i_regs); // forward declarations: assemble() below calls these five
+static void rjump_assemble(int i, const struct regstat *i_regs);
+static void cjump_assemble(int i, const struct regstat *i_regs);
+static void sjump_assemble(int i, const struct regstat *i_regs);
+static void pagespan_assemble(int i, const struct regstat *i_regs);
+
+static int assemble(int i, const struct regstat *i_regs, int ccadj_) // emit code for instruction i by dispatching on dops[i].itype; returns ds
+{
+  int ds = 0; // set to 1 only by the four jump cases; presumably means the delay slot was handled too - TODO confirm
+  switch (dops[i].itype) {
     case ALU:
-      alu_assemble(i,i_regs);break;
+      alu_assemble(i, i_regs);
+      break;
     case IMM16:
-      imm16_assemble(i,i_regs);break;
+      imm16_assemble(i, i_regs);
+      break;
     case SHIFT:
-      shift_assemble(i,i_regs);break;
+      shift_assemble(i, i_regs);
+      break;
     case SHIFTIMM:
-      shiftimm_assemble(i,i_regs);break;
+      shiftimm_assemble(i, i_regs);
+      break;
     case LOAD:
-      load_assemble(i,i_regs);break;
+      load_assemble(i, i_regs, ccadj_); // memory-access assemblers also receive the cycle adjustment
+      break;
     case LOADLR:
-      loadlr_assemble(i,i_regs);break;
+      loadlr_assemble(i, i_regs, ccadj_);
+      break;
     case STORE:
-      store_assemble(i,i_regs);break;
+      store_assemble(i, i_regs, ccadj_);
+      break;
     case STORELR:
-      storelr_assemble(i,i_regs);break;
+      storelr_assemble(i, i_regs, ccadj_);
+      break;
     case COP0:
-      cop0_assemble(i,i_regs);break;
+      cop0_assemble(i, i_regs, ccadj_);
+      break;
     case COP1:
-      cop1_assemble(i,i_regs);break;
+      cop1_assemble(i, i_regs);
+      break;
     case C1LS:
-      c1ls_assemble(i,i_regs);break;
+      c1ls_assemble(i, i_regs);
+      break;
     case COP2:
-      cop2_assemble(i,i_regs);break;
+      cop2_assemble(i, i_regs);
+      break;
     case C2LS:
-      c2ls_assemble(i,i_regs);break;
+      c2ls_assemble(i, i_regs, ccadj_);
+      break;
     case C2OP:
-      c2op_assemble(i,i_regs);break;
-    case FCONV:
-      fconv_assemble(i,i_regs);break;
-    case FLOAT:
-      float_assemble(i,i_regs);break;
-    case FCOMP:
-      fcomp_assemble(i,i_regs);break;
+      c2op_assemble(i, i_regs);
+      break;
     case MULTDIV:
-      multdiv_assemble(i,i_regs);break;
+      multdiv_assemble(i, i_regs);
+      multdiv_prepare_stall(i, i_regs, ccadj_); // extra step run only for MULTDIV, after the instruction itself
+      break;
     case MOV:
-      mov_assemble(i,i_regs);break;
+      mov_assemble(i, i_regs);
+      break;
+    case SYSCALL:
+      syscall_assemble(i, i_regs, ccadj_);
+      break;
+    case HLECALL:
+      hlecall_assemble(i, i_regs, ccadj_);
+      break;
+    case INTCALL:
+      intcall_assemble(i, i_regs, ccadj_);
+      break;
+    case UJUMP:
+      ujump_assemble(i, i_regs);
+      ds = 1;
+      break;
+    case RJUMP:
+      rjump_assemble(i, i_regs);
+      ds = 1;
+      break;
+    case CJUMP:
+      cjump_assemble(i, i_regs);
+      ds = 1;
+      break;
+    case SJUMP:
+      sjump_assemble(i, i_regs);
+      ds = 1;
+      break;
+    case SPAN:
+      pagespan_assemble(i, i_regs); // note: unlike the jump cases, ds is left at 0 here
+      break;
+    case NOP:
+    case OTHER:
+    case NI:
+      // not handled, just skip
+      break;
+    default:
+      assert(0); // any itype not listed above is treated as a bug
+  }
+  return ds;
+}
+
+static void ds_assemble(int i, const struct regstat *i_regs)
+{
+  speculate_register_values(i);
+  is_delayslot = 1;
+  switch (dops[i].itype) {
     case SYSCALL:
     case HLECALL:
     case INTCALL:
@@ -3561,36 +4146,26 @@ void ds_assemble(int i,struct regstat *i_regs)
     case RJUMP:
     case CJUMP:
     case SJUMP:
-    case FJUMP:
       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
+      break;
+    default:
+      assemble(i, i_regs, ccadj[i]);
   }
-  is_delayslot=0;
+  is_delayslot = 0;
 }
 
 // Is the branch target a valid internal jump?
-int internal_branch(uint64_t i_is32,int addr)
+static int internal_branch(int addr) // returns 1 when addr has bit 0 clear and lies in [start, start+slen*4-4), else 0
 {
   if(addr&1) return 0; // Indirect (register) jump
   if(addr>=start && addr<start+slen*4-4)
   {
-    //int t=(addr-start)>>2;
-    // Delay slots are not valid branch targets
-    //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
-    // 64 -> 32 bit transition requires a recompile
-    /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
-    {
-      if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
-      else printf("optimizable: yes\n");
-    }*/
-    //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
     return 1;
   }
   return 0;
 }
 
-#ifndef wb_invalidate
-void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
-  uint64_t u,uint64_t uu)
+static void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t u)
 {
   int hr;
   for(hr=0;hr<HOST_REGS;hr++) {
@@ -3599,19 +4174,9 @@ void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t
         if(pre[hr]>=0) {
           if((dirty>>hr)&1) {
             if(get_reg(entry,pre[hr])<0) {
-              if(pre[hr]<64) {
-                if(!((u>>pre[hr])&1)) {
-                  emit_storereg(pre[hr],hr);
-                  if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
-                    emit_sarimm(hr,31,hr);
-                    emit_storereg(pre[hr]|64,hr);
-                  }
-                }
-              }else{
-                if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
-                  emit_storereg(pre[hr],hr);
-                }
-              }
+              assert(pre[hr]<64);
+              if(!((u>>pre[hr])&1))
+                emit_storereg(pre[hr],hr);
             }
           }
         }
@@ -3632,12 +4197,11 @@ void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t
     }
   }
 }
-#endif
 
 // Load the specified registers
 // This only loads the registers given as arguments because
 // we don't want to load things that will be overwritten
-void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
+static void load_regs(signed char entry[],signed char regmap[],int rs1,int rs2)
 {
   int hr;
   // Load 32-bit regs
@@ -3657,28 +4221,6 @@ void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2
       }
     }
   }
-  //Load 64-bit regs
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
-      if(entry[hr]!=regmap[hr]) {
-        if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
-        {
-          assert(regmap[hr]!=64);
-          if((is32>>(regmap[hr]&63))&1) {
-            int lr=get_reg(regmap,regmap[hr]-64);
-            if(lr>=0)
-              emit_sarimm(lr,31,hr);
-            else
-              emit_loadreg(regmap[hr],hr);
-          }
-          else
-          {
-            emit_loadreg(regmap[hr],hr);
-          }
-        }
-      }
-    }
-  }
 }
 
 // Load registers prior to the start of a loop
@@ -3713,41 +4255,41 @@ static void loop_preload(signed char pre[],signed char entry[])
 
 // Generate address for load/store instruction
 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
-void address_generation(int i,struct regstat *i_regs,signed char entry[])
+void address_generation(int i, const struct regstat *i_regs, signed char entry[])
 {
-  if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
+  if (dops[i].is_load || dops[i].is_store) {
     int ra=-1;
     int agr=AGEN1+(i&1);
-    if(itype[i]==LOAD) {
-      ra=get_reg(i_regs->regmap,rt1[i]);
+    if(dops[i].itype==LOAD) {
+      ra=get_reg(i_regs->regmap,dops[i].rt1);
       if(ra<0) ra=get_reg(i_regs->regmap,-1);
       assert(ra>=0);
     }
-    if(itype[i]==LOADLR) {
+    if(dops[i].itype==LOADLR) {
       ra=get_reg(i_regs->regmap,FTEMP);
     }
-    if(itype[i]==STORE||itype[i]==STORELR) {
+    if(dops[i].itype==STORE||dops[i].itype==STORELR) {
       ra=get_reg(i_regs->regmap,agr);
       if(ra<0) ra=get_reg(i_regs->regmap,-1);
     }
-    if(itype[i]==C1LS||itype[i]==C2LS) {
-      if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
+    if(dops[i].itype==C2LS) {
+      if ((dops[i].opcode&0x3b)==0x31||(dops[i].opcode&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
         ra=get_reg(i_regs->regmap,FTEMP);
       else { // SWC1/SDC1/SWC2/SDC2
         ra=get_reg(i_regs->regmap,agr);
         if(ra<0) ra=get_reg(i_regs->regmap,-1);
       }
     }
-    int rs=get_reg(i_regs->regmap,rs1[i]);
+    int rs=get_reg(i_regs->regmap,dops[i].rs1);
     if(ra>=0) {
       int offset=imm[i];
       int c=(i_regs->wasconst>>rs)&1;
-      if(rs1[i]==0) {
+      if(dops[i].rs1==0) {
         // Using r0 as a base address
         if(!entry||entry[ra]!=agr) {
-          if (opcode[i]==0x22||opcode[i]==0x26) {
+          if (dops[i].opcode==0x22||dops[i].opcode==0x26) {
             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
-          }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
+          }else if (dops[i].opcode==0x1a||dops[i].opcode==0x1b) {
             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
           }else{
             emit_movimm(offset,ra);
@@ -3755,29 +4297,26 @@ void address_generation(int i,struct regstat *i_regs,signed char entry[])
         } // else did it in the previous cycle
       }
       else if(rs<0) {
-        if(!entry||entry[ra]!=rs1[i])
-          emit_loadreg(rs1[i],ra);
-        //if(!entry||entry[ra]!=rs1[i])
+        if(!entry||entry[ra]!=dops[i].rs1)
+          emit_loadreg(dops[i].rs1,ra);
+        //if(!entry||entry[ra]!=dops[i].rs1)
         //  printf("poor load scheduling!\n");
       }
       else if(c) {
-        if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
+        if(dops[i].rs1!=dops[i].rt1||dops[i].itype!=LOAD) {
           if(!entry||entry[ra]!=agr) {
-            if (opcode[i]==0x22||opcode[i]==0x26) {
+            if (dops[i].opcode==0x22||dops[i].opcode==0x26) {
               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
-            }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
+            }else if (dops[i].opcode==0x1a||dops[i].opcode==0x1b) {
               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
             }else{
-              #ifdef HOST_IMM_ADDR32
-              if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
-              #endif
               emit_movimm(constmap[i][rs]+offset,ra);
               regs[i].loadedconst|=1<<ra;
             }
           } // else did it in the previous cycle
         } // else load_consts already did it
       }
-      if(offset&&!c&&rs1[i]) {
+      if(offset&&!c&&dops[i].rs1) {
         if(rs>=0) {
           emit_addimm(rs,offset,ra);
         }else{
@@ -3787,33 +4326,30 @@ void address_generation(int i,struct regstat *i_regs,signed char entry[])
     }
   }
   // Preload constants for next instruction
-  if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
+  if (dops[i+1].is_load || dops[i+1].is_store) {
     int agr,ra;
     // Actual address
     agr=AGEN1+((i+1)&1);
     ra=get_reg(i_regs->regmap,agr);
     if(ra>=0) {
-      int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
+      int rs=get_reg(regs[i+1].regmap,dops[i+1].rs1);
       int offset=imm[i+1];
       int c=(regs[i+1].wasconst>>rs)&1;
-      if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
-        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
+      if(c&&(dops[i+1].rs1!=dops[i+1].rt1||dops[i+1].itype!=LOAD)) {
+        if (dops[i+1].opcode==0x22||dops[i+1].opcode==0x26) {
           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
-        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
+        }else if (dops[i+1].opcode==0x1a||dops[i+1].opcode==0x1b) {
           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
         }else{
-          #ifdef HOST_IMM_ADDR32
-          if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
-          #endif
           emit_movimm(constmap[i+1][rs]+offset,ra);
           regs[i+1].loadedconst|=1<<ra;
         }
       }
-      else if(rs1[i+1]==0) {
+      else if(dops[i+1].rs1==0) {
         // Using r0 as a base address
-        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
+        if (dops[i+1].opcode==0x22||dops[i+1].opcode==0x26) {
           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
-        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
+        }else if (dops[i+1].opcode==0x1a||dops[i+1].opcode==0x1b) {
           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
         }else{
           emit_movimm(offset,ra);
@@ -3829,49 +4365,46 @@ static int get_final_value(int hr, int i, int *value)
   while(i<slen-1) {
     if(regs[i+1].regmap[hr]!=reg) break;
     if(!((regs[i+1].isconst>>hr)&1)) break;
-    if(bt[i+1]) break;
+    if(dops[i+1].bt) break;
     i++;
   }
   if(i<slen-1) {
-    if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
+    if (dops[i].is_jump) {
       *value=constmap[i][hr];
       return 1;
     }
-    if(!bt[i+1]) {
-      if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
+    if(!dops[i+1].bt) {
+      if (dops[i+1].is_jump) {
         // Load in delay slot, out-of-order execution
-        if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
+        if(dops[i+2].itype==LOAD&&dops[i+2].rs1==reg&&dops[i+2].rt1==reg&&((regs[i+1].wasconst>>hr)&1))
         {
           // Precompute load address
           *value=constmap[i][hr]+imm[i+2];
           return 1;
         }
       }
-      if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
+      if(dops[i+1].itype==LOAD&&dops[i+1].rs1==reg&&dops[i+1].rt1==reg)
       {
         // Precompute load address
         *value=constmap[i][hr]+imm[i+1];
-        //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
+        //printf("c=%lx imm=%x\n",(long)constmap[i][hr],imm[i+1]);
         return 1;
       }
     }
   }
   *value=constmap[i][hr];
-  //printf("c=%x\n",(int)constmap[i][hr]);
+  //printf("c=%lx\n",(long)constmap[i][hr]);
   if(i==slen-1) return 1;
-  if(reg<64) {
-    return !((unneeded_reg[i+1]>>reg)&1);
-  }else{
-    return !((unneeded_reg_upper[i+1]>>reg)&1);
-  }
+  assert(reg < 64);
+  return !((unneeded_reg[i+1]>>reg)&1);
 }
 
 // Load registers with known constants
-void load_consts(signed char pre[],signed char regmap[],int is32,int i)
+static void load_consts(signed char pre[],signed char regmap[],int i)
 {
   int hr,hr2;
   // propagate loaded constant flags
-  if(i==0||bt[i])
+  if(i==0||dops[i].bt)
     regs[i].loadedconst=0;
   else {
     for(hr=0;hr<HOST_REGS;hr++) {
@@ -3887,7 +4420,8 @@ void load_consts(signed char pre[],signed char regmap[],int is32,int i)
     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
       //if(entry[hr]!=regmap[hr]) {
       if(!((regs[i].loadedconst>>hr)&1)) {
-        if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
+        assert(regmap[hr]<64);
+        if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
           int value,similar=0;
           if(get_final_value(hr,i,&value)) {
             // see if some other register has similar value
@@ -3918,41 +4452,16 @@ void load_consts(signed char pre[],signed char regmap[],int is32,int i)
       }
     }
   }
-  // Load 64-bit regs
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
-      //if(entry[hr]!=regmap[hr]) {
-      if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
-        if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
-          if((is32>>(regmap[hr]&63))&1) {
-            int lr=get_reg(regmap,regmap[hr]-64);
-            assert(lr>=0);
-            emit_sarimm(lr,31,hr);
-          }
-          else
-          {
-            int value;
-            if(get_final_value(hr,i,&value)) {
-              if(value==0) {
-                emit_zeroreg(hr);
-              }
-              else {
-                emit_movimm(value,hr);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
 }
-void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
+
+static void load_all_consts(const signed char regmap[], u_int dirty, int i)
 {
   int hr;
   // Load 32-bit regs
   for(hr=0;hr<HOST_REGS;hr++) {
     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
-      if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
+      assert(regmap[hr] < 64);
+      if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
         int value=constmap[i][hr];
         if(value==0) {
           emit_zeroreg(hr);
@@ -3963,32 +4472,10 @@ void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
       }
     }
   }
-  // Load 64-bit regs
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
-      if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
-        if((is32>>(regmap[hr]&63))&1) {
-          int lr=get_reg(regmap,regmap[hr]-64);
-          assert(lr>=0);
-          emit_sarimm(lr,31,hr);
-        }
-        else
-        {
-          int value=constmap[i][hr];
-          if(value==0) {
-            emit_zeroreg(hr);
-          }
-          else {
-            emit_movimm(value,hr);
-          }
-        }
-      }
-    }
-  }
 }
 
 // Write out all dirty registers (except cycle count)
-void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
+static void wb_dirtys(const signed char i_regmap[], uint64_t i_dirty)
 {
   int hr;
   for(hr=0;hr<HOST_REGS;hr++) {
@@ -3996,22 +4483,18 @@ void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
       if(i_regmap[hr]>0) {
         if(i_regmap[hr]!=CCREG) {
           if((i_dirty>>hr)&1) {
-            if(i_regmap[hr]<64) {
-              emit_storereg(i_regmap[hr],hr);
-            }else{
-              if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
-                emit_storereg(i_regmap[hr],hr);
-              }
-            }
+            assert(i_regmap[hr]<64);
+            emit_storereg(i_regmap[hr],hr);
           }
         }
       }
     }
   }
 }
+
 // Write out dirty registers that we need to reload (pair with load_needed_regs)
 // This writes the registers not written by store_regs_bt
-void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
+static void wb_needed_dirtys(const signed char i_regmap[], uint64_t i_dirty, int addr)
 {
   int hr;
   int t=(addr-start)>>2;
@@ -4019,15 +4502,10 @@ void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,in
     if(hr!=EXCLUDE_REG) {
       if(i_regmap[hr]>0) {
         if(i_regmap[hr]!=CCREG) {
-          if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
+          if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1)) {
             if((i_dirty>>hr)&1) {
-              if(i_regmap[hr]<64) {
-                emit_storereg(i_regmap[hr],hr);
-              }else{
-                if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
-                  emit_storereg(i_regmap[hr],hr);
-                }
-              }
+              assert(i_regmap[hr]<64);
+              emit_storereg(i_regmap[hr],hr);
             }
           }
         }
@@ -4037,7 +4515,7 @@ void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,in
 }
 
 // Load all registers (except cycle count)
-void load_all_regs(signed char i_regmap[])
+static void load_all_regs(const signed char i_regmap[])
 {
   int hr;
   for(hr=0;hr<HOST_REGS;hr++) {
@@ -4055,7 +4533,7 @@ void load_all_regs(signed char i_regmap[])
 }
 
 // Load all current registers also needed by next instruction
-void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
+static void load_needed_regs(const signed char i_regmap[], const signed char next_regmap[])
 {
   int hr;
   for(hr=0;hr<HOST_REGS;hr++) {
@@ -4075,11 +4553,11 @@ void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
 }
 
 // Load all regs, storing cycle count if necessary
-void load_regs_entry(int t)
+static void load_regs_entry(int t)
 {
   int hr;
-  if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
-  else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
+  if(dops[t].is_ds) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
+  else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t],HOST_CCREG);
   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
     emit_storereg(CCREG,HOST_CCREG);
   }
@@ -4095,58 +4573,23 @@ void load_regs_entry(int t)
       }
     }
   }
-  // Load 64-bit regs
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
-      assert(regs[t].regmap_entry[hr]!=64);
-      if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
-        int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
-        if(lr<0) {
-          emit_loadreg(regs[t].regmap_entry[hr],hr);
-        }
-        else
-        {
-          emit_sarimm(lr,31,hr);
-        }
-      }
-      else
-      {
-        emit_loadreg(regs[t].regmap_entry[hr],hr);
-      }
-    }
-  }
 }
 
 // Store dirty registers prior to branch
-void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
+void store_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
 {
-  if(internal_branch(i_is32,addr))
+  if(internal_branch(addr))
   {
     int t=(addr-start)>>2;
     int hr;
     for(hr=0;hr<HOST_REGS;hr++) {
       if(hr!=EXCLUDE_REG) {
         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
-          if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
+          if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1)) {
             if((i_dirty>>hr)&1) {
-              if(i_regmap[hr]<64) {
-                if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
-                  emit_storereg(i_regmap[hr],hr);
-                  if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
-                    #ifdef DESTRUCTIVE_WRITEBACK
-                    emit_sarimm(hr,31,hr);
-                    emit_storereg(i_regmap[hr]|64,hr);
-                    #else
-                    emit_sarimm(hr,31,HOST_TEMPREG);
-                    emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
-                    #endif
-                  }
-                }
-              }else{
-                if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
-                  emit_storereg(i_regmap[hr],hr);
-                }
-              }
+              assert(i_regmap[hr]<64);
+              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
+                emit_storereg(i_regmap[hr],hr);
             }
           }
         }
@@ -4156,15 +4599,15 @@ void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int a
   else
   {
     // Branch out of this block, write out all dirty regs
-    wb_dirtys(i_regmap,i_is32,i_dirty);
+    wb_dirtys(i_regmap,i_dirty);
   }
 }
 
 // Load all needed registers for branch target
-void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
+static void load_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
 {
   //if(addr>=start && addr<(start+slen*4))
-  if(internal_branch(i_is32,addr))
+  if(internal_branch(addr))
   {
     int t=(addr-start)>>2;
     int hr;
@@ -4178,11 +4621,7 @@ void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int ad
     // Load 32-bit regs
     for(hr=0;hr<HOST_REGS;hr++) {
       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
-        #ifdef DESTRUCTIVE_WRITEBACK
-        if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
-        #else
-        if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
-        #endif
+        if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
           if(regs[t].regmap_entry[hr]==0) {
             emit_zeroreg(hr);
           }
@@ -4193,37 +4632,10 @@ void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int ad
         }
       }
     }
-    //Load 64-bit regs
-    for(hr=0;hr<HOST_REGS;hr++) {
-      if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
-        if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
-          assert(regs[t].regmap_entry[hr]!=64);
-          if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
-            int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
-            if(lr<0) {
-              emit_loadreg(regs[t].regmap_entry[hr],hr);
-            }
-            else
-            {
-              emit_sarimm(lr,31,hr);
-            }
-          }
-          else
-          {
-            emit_loadreg(regs[t].regmap_entry[hr],hr);
-          }
-        }
-        else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
-          int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
-          assert(lr>=0);
-          emit_sarimm(lr,31,hr);
-        }
-      }
-    }
   }
 }
 
-int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
+static int match_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
 {
   if(addr>=start && addr<start+slen*4-4)
   {
@@ -4250,8 +4662,7 @@ int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
             }
             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
             {
-              if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
-                return 0;
+              assert(0);
             }
           }
         }
@@ -4269,19 +4680,13 @@ int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
               }
             }
           }
-          if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
-          {
-            //printf("%x: is32 no match\n",addr);
-            return 0;
-          }
         }
       }
     }
-    //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
     // Delay slots are not valid branch targets
-    //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
+    //if(t>0&&dops[t-1].is_jump) return 0;
     // Delay slots require additional processing, so do not match
-    if(is_ds[t]) return 0;
+    if(dops[t].is_ds) return 0;
   }
   else
   {
@@ -4306,60 +4711,68 @@ int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
   return 1;
 }
 
-// Used when a branch jumps into the delay slot of another branch
-void ds_assemble_entry(int i)
+#ifdef DRC_DBG
+static void drc_dbg_emit_do_cmp(int i, int ccadj_) // DRC_DBG only: emits a call to do_insn_cmp ahead of instruction i
 {
-  int t=(ba[i]-start)>>2;
-  if(!instr_addr[t]) instr_addr[t]=(u_int)out;
-  assem_debug("Assemble delay slot at %x\n",ba[i]);
-  assem_debug("<->\n");
-  if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
-    wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
-  load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
-  address_generation(t,&regs[t],regs[t].regmap_entry);
-  if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
-    load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
-  cop1_usable=0;
-  is_delayslot=0;
-  switch(itype[t]) {
-    case ALU:
-      alu_assemble(t,&regs[t]);break;
-    case IMM16:
-      imm16_assemble(t,&regs[t]);break;
-    case SHIFT:
-      shift_assemble(t,&regs[t]);break;
-    case SHIFTIMM:
-      shiftimm_assemble(t,&regs[t]);break;
-    case LOAD:
-      load_assemble(t,&regs[t]);break;
-    case LOADLR:
-      loadlr_assemble(t,&regs[t]);break;
-    case STORE:
-      store_assemble(t,&regs[t]);break;
-    case STORELR:
-      storelr_assemble(t,&regs[t]);break;
-    case COP0:
-      cop0_assemble(t,&regs[t]);break;
-    case COP1:
-      cop1_assemble(t,&regs[t]);break;
-    case C1LS:
-      c1ls_assemble(t,&regs[t]);break;
-    case COP2:
-      cop2_assemble(t,&regs[t]);break;
-    case C2LS:
-      c2ls_assemble(t,&regs[t]);break;
-    case C2OP:
-      c2op_assemble(t,&regs[t]);break;
-    case FCONV:
-      fconv_assemble(t,&regs[t]);break;
-    case FLOAT:
-      float_assemble(t,&regs[t]);break;
-    case FCOMP:
-      fcomp_assemble(t,&regs[t]);break;
-    case MULTDIV:
-      multdiv_assemble(t,&regs[t]);break;
-    case MOV:
-      mov_assemble(t,&regs[t]);break;
+  extern void do_insn_cmp();
+  //extern int cycle;
+  u_int hr, reglist = get_host_reglist(regs[i].regmap);
+
+  assem_debug("//do_insn_cmp %08x\n", start+i*4);
+  save_regs(reglist); // paired with restore_regs(reglist) after the call below
+  // write out changed consts to match the interpreter
+  if (i > 0 && !dops[i].bt) { // needs a previous instruction; skipped when dops[i].bt is set (presumably "is a branch target" - confirm)
+    for (hr = 0; hr < HOST_REGS; hr++) {
+      int reg = regs[i].regmap_entry[hr]; // regs[i-1].regmap[hr];
+      if (hr == EXCLUDE_REG || reg < 0)
+        continue;
+      if (!((regs[i-1].isconst >> hr) & 1)) // bit hr of regs[i-1].isconst is clear: nothing to write out
+        continue;
+      if (i > 1 && reg == regs[i-2].regmap[hr] && constmap[i-1][hr] == constmap[i-2][hr]) // same mapping and same constant as at i-2
+        continue;
+      emit_movimm(constmap[i-1][hr],0); // host reg 0 serves as scratch for the constant
+      emit_storereg(reg, 0);
+    }
+  }
+  emit_movimm(start+i*4,0); // address of instruction i ...
+  emit_writeword(0,&pcaddr); // ... goes to pcaddr via host reg 0
+  int cc = get_reg(regs[i].regmap_entry, CCREG);
+  if (cc < 0)
+    emit_loadreg(CCREG, cc = 0); // CCREG has no host reg at entry: load it into host reg 0 and use that
+  emit_addimm(cc, ccadj_, 0); // presumably host reg 0 = cc + ccadj_, which is then stored to psxRegs.cycle
+  emit_writeword(0, &psxRegs.cycle);
+  emit_far_call(do_insn_cmp);
+  //emit_readword(&cycle,0);
+  //emit_addimm(0,2,0);
+  //emit_writeword(0,&cycle);
+  (void)get_reg2; // names get_reg2 without calling it; presumably silences an unused-function warning
+  restore_regs(reglist);
+  assem_debug("\\\\do_insn_cmp\n");
+}
+#else
+#define drc_dbg_emit_do_cmp(x,y)
+#endif
+
+// Used when a branch jumps into the delay slot of another branch
+static void ds_assemble_entry(int i)
+{
+  int t = (ba[i] - start) >> 2;
+  int ccadj_ = -CLOCK_ADJUST(1);
+  if (!instr_addr[t])
+    instr_addr[t] = out;
+  assem_debug("Assemble delay slot at %x\n",ba[i]);
+  assem_debug("<->\n");
+  drc_dbg_emit_do_cmp(t, ccadj_);
+  if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
+    wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty);
+  load_regs(regs[t].regmap_entry,regs[t].regmap,dops[t].rs1,dops[t].rs2);
+  address_generation(t,&regs[t],regs[t].regmap_entry);
+  if (ram_offset && (dops[t].is_load || dops[t].is_store))
+    load_regs(regs[t].regmap_entry,regs[t].regmap,ROREG,ROREG);
+  if (dops[t].is_store)
+    load_regs(regs[t].regmap_entry,regs[t].regmap,INVCP,INVCP);
+  is_delayslot=0;
+  switch (dops[t].itype) {
     case SYSCALL:
     case HLECALL:
     case INTCALL:
@@ -4368,137 +4781,155 @@ void ds_assemble_entry(int i)
     case RJUMP:
     case CJUMP:
     case SJUMP:
-    case FJUMP:
       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
+      break;
+    default:
+      assemble(t, &regs[t], ccadj_);
   }
-  store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
-  load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
-  if(internal_branch(regs[t].is32,ba[i]+4))
+  store_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
+  load_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
+  if(internal_branch(ba[i]+4))
     assem_debug("branch: internal\n");
   else
     assem_debug("branch: external\n");
-  assert(internal_branch(regs[t].is32,ba[i]+4));
-  add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
+  assert(internal_branch(ba[i]+4));
+  add_to_linker(out,ba[i]+4,internal_branch(ba[i]+4));
   emit_jmp(0);
 }
 
-void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
+static void emit_extjump(void *addr, u_int target) // forwards to emit_extjump2 with dyna_linker as the third argument
+{
+  emit_extjump2(addr, target, dyna_linker);
+}
+
+static void emit_extjump_ds(void *addr, u_int target) // same as emit_extjump, but passes dyna_linker_ds instead of dyna_linker
+{
+  emit_extjump2(addr, target, dyna_linker_ds);
+}
+
+// Load 2 immediates optimizing for small code size
+static void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2)
 {
-  int count;
-  int jaddr;
-  int idle=0;
+  emit_movimm(imm1,rt1); // imm1 goes into rt1 first
+  emit_movimm_from(imm1,rt1,imm2,rt2); // presumably builds imm2 in rt2 starting from imm1 already held in rt1 - confirm against emit_movimm_from
+}
+
+static void do_cc(int i, const signed char i_regmap[], int *adj, // emits the cycle-count check for branch i and registers its CC_STUB; *adj is written as an output
+  int addr, int taken, int invert)
+{
+  int count, count_plus2;
+  void *jaddr;
+  void *idle=NULL; // stays NULL unless the idle-loop path below is taken
   int t=0;
-  if(itype[i]==RJUMP)
+  if(dops[i].itype==RJUMP)
   {
     *adj=0;
   }
   //if(ba[i]>=start && ba[i]<(start+slen*4))
-  if(internal_branch(branch_regs[i].is32,ba[i]))
+  if(internal_branch(ba[i]))
   {
     t=(ba[i]-start)>>2;
-    if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
+    if(dops[t].is_ds) *adj=-CLOCK_ADJUST(1); // Branch into delay slot adds an extra cycle
     else *adj=ccadj[t];
   }
   else
   {
     *adj=0;
   }
-  count=ccadj[i];
+  count = ccadj[i];
+  count_plus2 = count + CLOCK_ADJUST(2); // CLOCK_ADJUST is applied to the 2 only; ccadj[i] is added as-is
   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
     // Idle loop
     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
-    idle=(int)out;
+    idle=out; // passed to add_stub below in place of the current out
     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
     emit_andimm(HOST_CCREG,3,HOST_CCREG);
-    jaddr=(int)out;
+    jaddr=out;
     emit_jmp(0);
   }
   else if(*adj==0||invert) {
-    int cycles=CLOCK_ADJUST(count+2);
+    int cycles = count_plus2;
     // faster loop HACK
+#if 0 // the cycle tweak below is compiled out
     if (t&&*adj) {
       int rel=t-i;
       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
-        cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
+        cycles=*adj+count+2-*adj;
     }
-    emit_addimm_and_set_flags(cycles,HOST_CCREG);
-    jaddr=(int)out;
+#endif
+    emit_addimm_and_set_flags(cycles, HOST_CCREG);
+    jaddr = out; // location of the jns emitted next; handed to add_stub
     emit_jns(0);
   }
   else
   {
-    emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
-    jaddr=(int)out;
+    emit_cmpimm(HOST_CCREG, -count_plus2); // compare only; HOST_CCREG is not advanced on this path
+    jaddr = out;
     emit_jns(0);
   }
-  add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
+  add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:count_plus2,i,addr,taken,0); // 4th argument is nonzero only for the cmpimm path above
 }
 
-void do_ccstub(int n)
+static void do_ccstub(int n)
 {
   literal_pool(256);
-  assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
-  set_jump_target(stubs[n][1],(int)out);
-  int i=stubs[n][4];
-  if(stubs[n][6]==NULLDS) {
+  assem_debug("do_ccstub %x\n",start+(u_int)stubs[n].b*4);
+  set_jump_target(stubs[n].addr, out);
+  int i=stubs[n].b;
+  if(stubs[n].d==NULLDS) {
     // Delay slot instruction is nullified ("likely" branch)
-    wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
+    wb_dirtys(regs[i].regmap,regs[i].dirty);
   }
-  else if(stubs[n][6]!=TAKEN) {
-    wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
+  else if(stubs[n].d!=TAKEN) {
+    wb_dirtys(branch_regs[i].regmap,branch_regs[i].dirty);
   }
   else {
-    if(internal_branch(branch_regs[i].is32,ba[i]))
-      wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
+    if(internal_branch(ba[i]))
+      wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
   }
-  if(stubs[n][5]!=-1)
+  if(stubs[n].c!=-1)
   {
     // Save PC as return address
-    emit_movimm(stubs[n][5],EAX);
-    emit_writeword(EAX,(int)&pcaddr);
+    emit_movimm(stubs[n].c,EAX);
+    emit_writeword(EAX,&pcaddr);
   }
   else
   {
     // Return address depends on which way the branch goes
-    if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
+    if(dops[i].itype==CJUMP||dops[i].itype==SJUMP)
     {
-      int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
-      int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
-      int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
-      int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
-      if(rs1[i]==0)
+      int s1l=get_reg(branch_regs[i].regmap,dops[i].rs1);
+      int s2l=get_reg(branch_regs[i].regmap,dops[i].rs2);
+      if(dops[i].rs1==0)
       {
-        s1l=s2l;s1h=s2h;
-        s2l=s2h=-1;
+        s1l=s2l;
+        s2l=-1;
       }
-      else if(rs2[i]==0)
+      else if(dops[i].rs2==0)
       {
-        s2l=s2h=-1;
-      }
-      if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
-        s1h=s2h=-1;
+        s2l=-1;
       }
       assert(s1l>=0);
       #ifdef DESTRUCTIVE_WRITEBACK
-      if(rs1[i]) {
-        if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
-          emit_loadreg(rs1[i],s1l);
+      if(dops[i].rs1) {
+        if((branch_regs[i].dirty>>s1l)&1) // bit test on host reg s1l only, matching the rs2 checks that follow
+          emit_loadreg(dops[i].rs1,s1l);
       }
       else {
-        if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
-          emit_loadreg(rs2[i],s1l);
+        if((branch_regs[i].dirty>>s1l)&1)
+          emit_loadreg(dops[i].rs2,s1l);
       }
       if(s2l>=0)
-        if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
-          emit_loadreg(rs2[i],s2l);
+        if((branch_regs[i].dirty>>s2l)&1)
+          emit_loadreg(dops[i].rs2,s2l);
       #endif
       int hr=0;
       int addr=-1,alt=-1,ntaddr=-1;
       while(hr<HOST_REGS)
       {
         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
-           (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
-           (branch_regs[i].regmap[hr]&63)!=rs2[i] )
+           (branch_regs[i].regmap[hr]&63)!=dops[i].rs1 &&
+           (branch_regs[i].regmap[hr]&63)!=dops[i].rs2 )
         {
           addr=hr++;break;
         }
@@ -4507,20 +4938,20 @@ void do_ccstub(int n)
       while(hr<HOST_REGS)
       {
         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
-           (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
-           (branch_regs[i].regmap[hr]&63)!=rs2[i] )
+           (branch_regs[i].regmap[hr]&63)!=dops[i].rs1 &&
+           (branch_regs[i].regmap[hr]&63)!=dops[i].rs2 )
         {
           alt=hr++;break;
         }
         hr++;
       }
-      if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
+      if((dops[i].opcode&0x2E)==6) // BLEZ/BGTZ needs another register
       {
         while(hr<HOST_REGS)
         {
           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
-             (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
-             (branch_regs[i].regmap[hr]&63)!=rs2[i] )
+             (branch_regs[i].regmap[hr]&63)!=dops[i].rs1 &&
+             (branch_regs[i].regmap[hr]&63)!=dops[i].rs2 )
           {
             ntaddr=hr;break;
           }
@@ -4528,97 +4959,65 @@ void do_ccstub(int n)
         }
         assert(hr<HOST_REGS);
       }
-      if((opcode[i]&0x2f)==4) // BEQ
+      if((dops[i].opcode&0x2f)==4) // BEQ
       {
         #ifdef HAVE_CMOV_IMM
-        if(s1h<0) {
-          if(s2l>=0) emit_cmp(s1l,s2l);
-          else emit_test(s1l,s1l);
-          emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
-        }
-        else
+        if(s2l>=0) emit_cmp(s1l,s2l);
+        else emit_test(s1l,s1l);
+        emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
+        #else
+        emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
+        if(s2l>=0) emit_cmp(s1l,s2l);
+        else emit_test(s1l,s1l);
+        emit_cmovne_reg(alt,addr);
         #endif
-        {
-          emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
-          if(s1h>=0) {
-            if(s2h>=0) emit_cmp(s1h,s2h);
-            else emit_test(s1h,s1h);
-            emit_cmovne_reg(alt,addr);
-          }
-          if(s2l>=0) emit_cmp(s1l,s2l);
-          else emit_test(s1l,s1l);
-          emit_cmovne_reg(alt,addr);
-        }
       }
-      if((opcode[i]&0x2f)==5) // BNE
+      if((dops[i].opcode&0x2f)==5) // BNE
       {
         #ifdef HAVE_CMOV_IMM
-        if(s1h<0) {
-          if(s2l>=0) emit_cmp(s1l,s2l);
-          else emit_test(s1l,s1l);
-          emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
-        }
-        else
+        if(s2l>=0) emit_cmp(s1l,s2l);
+        else emit_test(s1l,s1l);
+        emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
+        #else
+        emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
+        if(s2l>=0) emit_cmp(s1l,s2l);
+        else emit_test(s1l,s1l);
+        emit_cmovne_reg(alt,addr);
         #endif
-        {
-          emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
-          if(s1h>=0) {
-            if(s2h>=0) emit_cmp(s1h,s2h);
-            else emit_test(s1h,s1h);
-            emit_cmovne_reg(alt,addr);
-          }
-          if(s2l>=0) emit_cmp(s1l,s2l);
-          else emit_test(s1l,s1l);
-          emit_cmovne_reg(alt,addr);
-        }
       }
-      if((opcode[i]&0x2f)==6) // BLEZ
+      if((dops[i].opcode&0x2f)==6) // BLEZ
       {
         //emit_movimm(ba[i],alt);
         //emit_movimm(start+i*4+8,addr);
         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
         emit_cmpimm(s1l,1);
-        if(s1h>=0) emit_mov(addr,ntaddr);
         emit_cmovl_reg(alt,addr);
-        if(s1h>=0) {
-          emit_test(s1h,s1h);
-          emit_cmovne_reg(ntaddr,addr);
-          emit_cmovs_reg(alt,addr);
-        }
       }
-      if((opcode[i]&0x2f)==7) // BGTZ
+      if((dops[i].opcode&0x2f)==7) // BGTZ
       {
         //emit_movimm(ba[i],addr);
         //emit_movimm(start+i*4+8,ntaddr);
         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
         emit_cmpimm(s1l,1);
-        if(s1h>=0) emit_mov(addr,alt);
         emit_cmovl_reg(ntaddr,addr);
-        if(s1h>=0) {
-          emit_test(s1h,s1h);
-          emit_cmovne_reg(alt,addr);
-          emit_cmovs_reg(ntaddr,addr);
-        }
       }
-      if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
+      if((dops[i].opcode==1)&&(dops[i].opcode2&0x2D)==0) // BLTZ
       {
         //emit_movimm(ba[i],alt);
         //emit_movimm(start+i*4+8,addr);
         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
-        if(s1h>=0) emit_test(s1h,s1h);
-        else emit_test(s1l,s1l);
+        emit_test(s1l,s1l);
         emit_cmovs_reg(alt,addr);
       }
-      if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
+      if((dops[i].opcode==1)&&(dops[i].opcode2&0x2D)==1) // BGEZ
       {
         //emit_movimm(ba[i],addr);
         //emit_movimm(start+i*4+8,alt);
         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
-        if(s1h>=0) emit_test(s1h,s1h);
-        else emit_test(s1l,s1l);
+        emit_test(s1l,s1l);
         emit_cmovs_reg(alt,addr);
       }
-      if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
+      if(dops[i].opcode==0x11 && dops[i].opcode2==0x08 ) {
         if(source[i]&0x10000) // BC1T
         {
           //emit_movimm(ba[i],alt);
@@ -4636,79 +5035,55 @@ void do_ccstub(int n)
           emit_cmovne_reg(alt,addr);
         }
       }
-      emit_writeword(addr,(int)&pcaddr);
+      emit_writeword(addr,&pcaddr);
     }
     else
-    if(itype[i]==RJUMP)
+    if(dops[i].itype==RJUMP)
     {
-      int r=get_reg(branch_regs[i].regmap,rs1[i]);
-      if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
+      int r=get_reg(branch_regs[i].regmap,dops[i].rs1);
+      if (ds_writes_rjump_rs(i)) {
         r=get_reg(branch_regs[i].regmap,RTEMP);
       }
-      emit_writeword(r,(int)&pcaddr);
+      emit_writeword(r,&pcaddr);
     }
-    else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
+    else {SysPrintf("Unknown branch type in do_ccstub\n");abort();}
   }
   // Update cycle count
   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
-  if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
-  emit_call((int)cc_interrupt);
-  if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
-  if(stubs[n][6]==TAKEN) {
-    if(internal_branch(branch_regs[i].is32,ba[i]))
+  if(stubs[n].a) emit_addimm(HOST_CCREG,(int)stubs[n].a,HOST_CCREG);
+  emit_far_call(cc_interrupt);
+  if(stubs[n].a) emit_addimm(HOST_CCREG,-(int)stubs[n].a,HOST_CCREG);
+  if(stubs[n].d==TAKEN) {
+    if(internal_branch(ba[i]))
       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
-    else if(itype[i]==RJUMP) {
+    else if(dops[i].itype==RJUMP) {
       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
-        emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
+        emit_readword(&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
       else
-        emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
+        emit_loadreg(dops[i].rs1,get_reg(branch_regs[i].regmap,dops[i].rs1));
     }
-  }else if(stubs[n][6]==NOTTAKEN) {
+  }else if(stubs[n].d==NOTTAKEN) {
     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
     else load_all_regs(branch_regs[i].regmap);
-  }else if(stubs[n][6]==NULLDS) {
+  }else if(stubs[n].d==NULLDS) {
     // Delay slot instruction is nullified ("likely" branch)
     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
     else load_all_regs(regs[i].regmap);
   }else{
     load_all_regs(branch_regs[i].regmap);
   }
-  emit_jmp(stubs[n][2]); // return address
-
-  /* This works but uses a lot of memory...
-  emit_readword((int)&last_count,ECX);
-  emit_add(HOST_CCREG,ECX,EAX);
-  emit_writeword(EAX,(int)&Count);
-  emit_call((int)gen_interupt);
-  emit_readword((int)&Count,HOST_CCREG);
-  emit_readword((int)&next_interupt,EAX);
-  emit_readword((int)&pending_exception,EBX);
-  emit_writeword(EAX,(int)&last_count);
-  emit_sub(HOST_CCREG,EAX,HOST_CCREG);
-  emit_test(EBX,EBX);
-  int jne_instr=(int)out;
-  emit_jne(0);
-  if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
-  load_all_regs(branch_regs[i].regmap);
-  emit_jmp(stubs[n][2]); // return address
-  set_jump_target(jne_instr,(int)out);
-  emit_readword((int)&pcaddr,EAX);
-  // Call get_addr_ht instead of doing the hash table here.
-  // This code is executed infrequently and takes up a lot of space
-  // so smaller is better.
-  emit_storereg(CCREG,HOST_CCREG);
-  emit_pushreg(EAX);
-  emit_call((int)get_addr_ht);
-  emit_loadreg(CCREG,HOST_CCREG);
-  emit_addimm(ESP,4,ESP);
-  emit_jmpreg(EAX);*/
-}
-
-static void add_to_linker(int addr,int target,int ext)
-{
-  link_addr[linkcount][0]=addr;
-  link_addr[linkcount][1]=target;
-  link_addr[linkcount][2]=ext;
+  if (stubs[n].retaddr)
+    emit_jmp(stubs[n].retaddr);
+  else
+    do_jump_vaddr(stubs[n].e);
+}
+
+static void add_to_linker(void *addr, u_int target, int ext) // appends one (addr, target, ext) record to link_addr[] and bumps linkcount
+{
+  assert(linkcount < ARRAY_SIZE(link_addr)); // there must still be a free slot in link_addr
+  link_addr[linkcount].addr = addr;
+  link_addr[linkcount].target = target;
+  link_addr[linkcount].ext = ext;
   linkcount++;
 }
 
@@ -4722,7 +5097,7 @@ static void ujump_assemble_write_ra(int i)
   return_address=start+i*4+8;
   if(rt>=0) {
     #ifdef USE_MINI_HT
-    if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
+    if(internal_branch(return_address)&&dops[i+1].rt1!=31) {
       int temp=-1; // note: must be ds-safe
       #ifdef HOST_TEMPREG
       temp=HOST_TEMPREG;
@@ -4736,65 +5111,62 @@ static void ujump_assemble_write_ra(int i)
       #ifdef REG_PREFETCH
       if(temp>=0)
       {
-        if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
+        if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
       }
       #endif
       emit_movimm(return_address,rt); // PC into link register
       #ifdef IMM_PREFETCH
-      emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
+      emit_prefetch(hash_table_get(return_address));
       #endif
     }
   }
 }
 
-void ujump_assemble(int i,struct regstat *i_regs)
+static void ujump_assemble(int i, const struct regstat *i_regs) // assembles the jump at i together with its delay slot at i+1
 {
   int ra_done=0;
   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
   address_generation(i+1,i_regs,regs[i].regmap_entry);
   #ifdef REG_PREFETCH
   int temp=get_reg(branch_regs[i].regmap,PTEMP);
-  if(rt1[i]==31&&temp>=0)
+  if(dops[i].rt1==31&&temp>=0)
   {
     signed char *i_regmap=i_regs->regmap;
     int return_address=start+i*4+8;
     if(get_reg(branch_regs[i].regmap,31)>0)
-    if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
+    if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
   }
   #endif
-  if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
+  if(dops[i].rt1==31&&(dops[i].rt1==dops[i+1].rs1||dops[i].rt1==dops[i+1].rs2)) { // r31 is this jump's destination and also a source of the delay slot
     ujump_assemble_write_ra(i); // writeback ra for DS
     ra_done=1;
   }
   ds_assemble(i+1,i_regs);
   uint64_t bc_unneeded=branch_regs[i].u;
-  uint64_t bc_unneeded_upper=branch_regs[i].uu;
-  bc_unneeded|=1|(1LL<<rt1[i]);
-  bc_unneeded_upper|=1|(1LL<<rt1[i]);
-  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
-                bc_unneeded,bc_unneeded_upper);
-  load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
-  if(!ra_done&&rt1[i]==31)
+  bc_unneeded|=1|(1LL<<dops[i].rt1); // adds bit 0 and the bit for this jump's rt1 to the mask taken from branch_regs[i].u
+  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
+  load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
+  if(!ra_done&&dops[i].rt1==31) // return address not written before the delay slot: write it now
     ujump_assemble_write_ra(i);
   int cc,adj;
   cc=get_reg(branch_regs[i].regmap,CCREG);
   assert(cc==HOST_CCREG);
-  store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
+  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
   #ifdef REG_PREFETCH
-  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
+  if(dops[i].rt1==31&&temp>=0) emit_prefetchreg(temp);
   #endif
   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
-  if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
-  load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
-  if(internal_branch(branch_regs[i].is32,ba[i]))
+  if(adj) emit_addimm(cc, ccadj[i] + CLOCK_ADJUST(2) - adj, cc); // only when do_cc reported a nonzero adj
+  load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
+  if(internal_branch(ba[i]))
     assem_debug("branch: internal\n");
   else
     assem_debug("branch: external\n");
-  if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
+  if (internal_branch(ba[i]) && dops[(ba[i]-start)>>2].is_ds) { // internal target flagged is_ds: handled by ds_assemble_entry
     ds_assemble_entry(i);
   }
   else {
-    add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
+    add_to_linker(out,ba[i],internal_branch(ba[i])); // records the current out with target ba[i]; the jmp below is emitted with a 0 argument
     emit_jmp(0);
   }
 }
@@ -4802,32 +5174,32 @@ void ujump_assemble(int i,struct regstat *i_regs)
 static void rjump_assemble_write_ra(int i)
 {
   int rt,return_address;
-  assert(rt1[i+1]!=rt1[i]);
-  assert(rt2[i+1]!=rt1[i]);
-  rt=get_reg(branch_regs[i].regmap,rt1[i]);
+  assert(dops[i+1].rt1!=dops[i].rt1); // instruction i+1 must not write the register that receives the return address
+  assert(dops[i+1].rt2!=dops[i].rt1);
+  rt=get_reg(branch_regs[i].regmap,dops[i].rt1); // host register mapped to the jump's rt1; asserted >= 0 below
   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
   assert(rt>=0);
   return_address=start+i*4+8;
   #ifdef REG_PREFETCH
   if(temp>=0)
   {
-    if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
+    if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp); // NOTE(review): temp and i_regmap are not declared in this function - this REG_PREFETCH path looks uncompilable unless they are file-scope; confirm
   }
   #endif
   emit_movimm(return_address,rt); // PC into link register
   #ifdef IMM_PREFETCH
-  emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
+  emit_prefetch(hash_table_get(return_address));
   #endif
 }
 
-void rjump_assemble(int i,struct regstat *i_regs)
+static void rjump_assemble(int i, const struct regstat *i_regs)
 {
   int temp;
   int rs,cc;
   int ra_done=0;
-  rs=get_reg(branch_regs[i].regmap,rs1[i]);
+  rs=get_reg(branch_regs[i].regmap,dops[i].rs1);
   assert(rs>=0);
-  if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
+  if (ds_writes_rjump_rs(i)) {
     // Delay slot abuse, make a copy of the branch address register
     temp=get_reg(branch_regs[i].regmap,RTEMP);
     assert(temp>=0);
@@ -4837,35 +5209,32 @@ void rjump_assemble(int i,struct regstat *i_regs)
   }
   address_generation(i+1,i_regs,regs[i].regmap_entry);
   #ifdef REG_PREFETCH
-  if(rt1[i]==31)
+  if(dops[i].rt1==31)
   {
     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
       signed char *i_regmap=i_regs->regmap;
       int return_address=start+i*4+8;
-      if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
+      if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
     }
   }
   #endif
   #ifdef USE_MINI_HT
-  if(rs1[i]==31) {
+  if(dops[i].rs1==31) {
     int rh=get_reg(regs[i].regmap,RHASH);
     if(rh>=0) do_preload_rhash(rh);
   }
   #endif
-  if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
+  if(dops[i].rt1!=0&&(dops[i].rt1==dops[i+1].rs1||dops[i].rt1==dops[i+1].rs2)) {
     rjump_assemble_write_ra(i);
     ra_done=1;
   }
   ds_assemble(i+1,i_regs);
   uint64_t bc_unneeded=branch_regs[i].u;
-  uint64_t bc_unneeded_upper=branch_regs[i].uu;
-  bc_unneeded|=1|(1LL<<rt1[i]);
-  bc_unneeded_upper|=1|(1LL<<rt1[i]);
-  bc_unneeded&=~(1LL<<rs1[i]);
-  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
-                bc_unneeded,bc_unneeded_upper);
-  load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
-  if(!ra_done&&rt1[i]!=0)
+  bc_unneeded|=1|(1LL<<dops[i].rt1);
+  bc_unneeded&=~(1LL<<dops[i].rs1);
+  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
+  load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i].rs1,CCREG);
+  if(!ra_done&&dops[i].rt1!=0)
     rjump_assemble_write_ra(i);
   cc=get_reg(branch_regs[i].regmap,CCREG);
   assert(cc==HOST_CCREG);
@@ -4873,177 +5242,132 @@ void rjump_assemble(int i,struct regstat *i_regs)
   #ifdef USE_MINI_HT
   int rh=get_reg(branch_regs[i].regmap,RHASH);
   int ht=get_reg(branch_regs[i].regmap,RHTBL);
-  if(rs1[i]==31) {
+  if(dops[i].rs1==31) {
     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
     do_preload_rhtbl(ht);
     do_rhash(rs,rh);
   }
   #endif
-  store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
+  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
   #ifdef DESTRUCTIVE_WRITEBACK
-  if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
-    if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
-      emit_loadreg(rs1[i],rs);
+  if((branch_regs[i].dirty>>rs)&1) {
+    if(dops[i].rs1!=dops[i+1].rt1&&dops[i].rs1!=dops[i+1].rt2) {
+      emit_loadreg(dops[i].rs1,rs);
     }
   }
   #endif
   #ifdef REG_PREFETCH
-  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
+  if(dops[i].rt1==31&&temp>=0) emit_prefetchreg(temp);
   #endif
   #ifdef USE_MINI_HT
-  if(rs1[i]==31) {
+  if(dops[i].rs1==31) {
     do_miniht_load(ht,rh);
   }
   #endif
   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
   //assert(adj==0);
-  emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
-  add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
-  if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
+  emit_addimm_and_set_flags(ccadj[i] + CLOCK_ADJUST(2), HOST_CCREG);
+  add_stub(CC_STUB,out,NULL,0,i,-1,TAKEN,rs);
+  if(dops[i+1].itype==COP0&&(source[i+1]&0x3f)==0x10)
     // special case for RFE
     emit_jmp(0);
   else
     emit_jns(0);
-  //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
+  //load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
   #ifdef USE_MINI_HT
-  if(rs1[i]==31) {
+  if(dops[i].rs1==31) {
     do_miniht_jump(rs,rh,ht);
   }
   else
   #endif
   {
-    //if(rs!=EAX) emit_mov(rs,EAX);
-    //emit_jmp((int)jump_vaddr_eax);
-    emit_jmp(jump_vaddr_reg[rs]);
-  }
-  /* Check hash table
-  temp=!rs;
-  emit_mov(rs,temp);
-  emit_shrimm(rs,16,rs);
-  emit_xor(temp,rs,rs);
-  emit_movzwl_reg(rs,rs);
-  emit_shlimm(rs,4,rs);
-  emit_cmpmem_indexed((int)hash_table,rs,temp);
-  emit_jne((int)out+14);
-  emit_readword_indexed((int)hash_table+4,rs,rs);
-  emit_jmpreg(rs);
-  emit_cmpmem_indexed((int)hash_table+8,rs,temp);
-  emit_addimm_no_flags(8,rs);
-  emit_jeq((int)out-17);
-  // No hit on hash table, call compiler
-  emit_pushreg(temp);
-//DEBUG >
-#ifdef DEBUG_CYCLE_COUNT
-  emit_readword((int)&last_count,ECX);
-  emit_add(HOST_CCREG,ECX,HOST_CCREG);
-  emit_readword((int)&next_interupt,ECX);
-  emit_writeword(HOST_CCREG,(int)&Count);
-  emit_sub(HOST_CCREG,ECX,HOST_CCREG);
-  emit_writeword(ECX,(int)&last_count);
-#endif
-//DEBUG <
-  emit_storereg(CCREG,HOST_CCREG);
-  emit_call((int)get_addr);
-  emit_loadreg(CCREG,HOST_CCREG);
-  emit_addimm(ESP,4,ESP);
-  emit_jmpreg(EAX);*/
+    do_jump_vaddr(rs);
+  }
   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
-  if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
+  if(dops[i].rt1!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
   #endif
 }
 
-void cjump_assemble(int i,struct regstat *i_regs)
+static void cjump_assemble(int i, const struct regstat *i_regs)
 {
-  signed char *i_regmap=i_regs->regmap;
+  const signed char *i_regmap = i_regs->regmap;
   int cc;
   int match;
-  match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
+  match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
   assem_debug("match=%d\n",match);
-  int s1h,s1l,s2h,s2l;
-  int prev_cop1_usable=cop1_usable;
+  int s1l,s2l;
   int unconditional=0,nop=0;
-  int only32=0;
   int invert=0;
-  int internal=internal_branch(branch_regs[i].is32,ba[i]);
+  int internal=internal_branch(ba[i]);
   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
   if(!match) invert=1;
   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
   if(i>(ba[i]-start)>>2) invert=1;
   #endif
+  #ifdef __aarch64__
+  invert=1; // because of near cond. branches
+  #endif
 
-  if(ooo[i]) {
-    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
-    s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
-    s2l=get_reg(branch_regs[i].regmap,rs2[i]);
-    s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
+  if(dops[i].ooo) {
+    s1l=get_reg(branch_regs[i].regmap,dops[i].rs1);
+    s2l=get_reg(branch_regs[i].regmap,dops[i].rs2);
   }
   else {
-    s1l=get_reg(i_regmap,rs1[i]);
-    s1h=get_reg(i_regmap,rs1[i]|64);
-    s2l=get_reg(i_regmap,rs2[i]);
-    s2h=get_reg(i_regmap,rs2[i]|64);
+    s1l=get_reg(i_regmap,dops[i].rs1);
+    s2l=get_reg(i_regmap,dops[i].rs2);
   }
-  if(rs1[i]==0&&rs2[i]==0)
+  if(dops[i].rs1==0&&dops[i].rs2==0)
   {
-    if(opcode[i]&1) nop=1;
+    if(dops[i].opcode&1) nop=1;
     else unconditional=1;
-    //assert(opcode[i]!=5);
-    //assert(opcode[i]!=7);
-    //assert(opcode[i]!=0x15);
-    //assert(opcode[i]!=0x17);
+    //assert(dops[i].opcode!=5);
+    //assert(dops[i].opcode!=7);
+    //assert(dops[i].opcode!=0x15);
+    //assert(dops[i].opcode!=0x17);
   }
-  else if(rs1[i]==0)
+  else if(dops[i].rs1==0)
   {
-    s1l=s2l;s1h=s2h;
-    s2l=s2h=-1;
-    only32=(regs[i].was32>>rs2[i])&1;
+    s1l=s2l;
+    s2l=-1;
   }
-  else if(rs2[i]==0)
+  else if(dops[i].rs2==0)
   {
-    s2l=s2h=-1;
-    only32=(regs[i].was32>>rs1[i])&1;
-  }
-  else {
-    only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
+    s2l=-1;
   }
 
-  if(ooo[i]) {
+  if(dops[i].ooo) {
     // Out of order execution (delay slot first)
     //printf("OOOE\n");
     address_generation(i+1,i_regs,regs[i].regmap_entry);
     ds_assemble(i+1,i_regs);
     int adj;
     uint64_t bc_unneeded=branch_regs[i].u;
-    uint64_t bc_unneeded_upper=branch_regs[i].uu;
-    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
-    bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
+    bc_unneeded&=~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
     bc_unneeded|=1;
-    bc_unneeded_upper|=1;
-    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
-                  bc_unneeded,bc_unneeded_upper);
-    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
-    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
+    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
+    load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i].rs1,dops[i].rs2);
+    load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
     cc=get_reg(branch_regs[i].regmap,CCREG);
     assert(cc==HOST_CCREG);
     if(unconditional)
-      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
+      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
     //assem_debug("cycle count (adj)\n");
     if(unconditional) {
       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
-        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
-        load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
+        if(adj) emit_addimm(cc, ccadj[i] + CLOCK_ADJUST(2) - adj, cc);
+        load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
         if(internal)
           assem_debug("branch: internal\n");
         else
           assem_debug("branch: external\n");
-        if(internal&&is_ds[(ba[i]-start)>>2]) {
+        if (internal && dops[(ba[i]-start)>>2].is_ds) {
           ds_assemble_entry(i);
         }
         else {
-          add_to_linker((int)out,ba[i],internal);
+          add_to_linker(out,ba[i],internal);
           emit_jmp(0);
         }
         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
@@ -5052,230 +5376,152 @@ void cjump_assemble(int i,struct regstat *i_regs)
       }
     }
     else if(nop) {
-      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
-      int jaddr=(int)out;
+      emit_addimm_and_set_flags(ccadj[i] + CLOCK_ADJUST(2), cc);
+      void *jaddr=out;
       emit_jns(0);
-      add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
+      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
     }
     else {
-      int taken=0,nottaken=0,nottaken1=0;
+      void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
-      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
-      if(!only32)
-      {
-        assert(s1h>=0);
-        if(opcode[i]==4) // BEQ
-        {
-          if(s2h>=0) emit_cmp(s1h,s2h);
-          else emit_test(s1h,s1h);
-          nottaken1=(int)out;
-          emit_jne(1);
-        }
-        if(opcode[i]==5) // BNE
-        {
-          if(s2h>=0) emit_cmp(s1h,s2h);
-          else emit_test(s1h,s1h);
-          if(invert) taken=(int)out;
-          else add_to_linker((int)out,ba[i],internal);
-          emit_jne(0);
-        }
-        if(opcode[i]==6) // BLEZ
-        {
-          emit_test(s1h,s1h);
-          if(invert) taken=(int)out;
-          else add_to_linker((int)out,ba[i],internal);
-          emit_js(0);
-          nottaken1=(int)out;
-          emit_jne(1);
-        }
-        if(opcode[i]==7) // BGTZ
-        {
-          emit_test(s1h,s1h);
-          nottaken1=(int)out;
-          emit_js(1);
-          if(invert) taken=(int)out;
-          else add_to_linker((int)out,ba[i],internal);
-          emit_jne(0);
-        }
-      } // if(!only32)
+      if(adj&&!invert) emit_addimm(cc, ccadj[i] + CLOCK_ADJUST(2) - adj, cc);
 
       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
       assert(s1l>=0);
-      if(opcode[i]==4) // BEQ
+      if(dops[i].opcode==4) // BEQ
       {
         if(s2l>=0) emit_cmp(s1l,s2l);
         else emit_test(s1l,s1l);
         if(invert){
-          nottaken=(int)out;
-          emit_jne(1);
+          nottaken=out;
+          emit_jne(DJT_1);
         }else{
-          add_to_linker((int)out,ba[i],internal);
+          add_to_linker(out,ba[i],internal);
           emit_jeq(0);
         }
       }
-      if(opcode[i]==5) // BNE
+      if(dops[i].opcode==5) // BNE
       {
         if(s2l>=0) emit_cmp(s1l,s2l);
         else emit_test(s1l,s1l);
         if(invert){
-          nottaken=(int)out;
-          emit_jeq(1);
+          nottaken=out;
+          emit_jeq(DJT_1);
         }else{
-          add_to_linker((int)out,ba[i],internal);
+          add_to_linker(out,ba[i],internal);
           emit_jne(0);
         }
       }
-      if(opcode[i]==6) // BLEZ
+      if(dops[i].opcode==6) // BLEZ
       {
         emit_cmpimm(s1l,1);
         if(invert){
-          nottaken=(int)out;
-          emit_jge(1);
+          nottaken=out;
+          emit_jge(DJT_1);
         }else{
-          add_to_linker((int)out,ba[i],internal);
+          add_to_linker(out,ba[i],internal);
           emit_jl(0);
         }
       }
-      if(opcode[i]==7) // BGTZ
+      if(dops[i].opcode==7) // BGTZ
       {
         emit_cmpimm(s1l,1);
         if(invert){
-          nottaken=(int)out;
-          emit_jl(1);
+          nottaken=out;
+          emit_jl(DJT_1);
         }else{
-          add_to_linker((int)out,ba[i],internal);
+          add_to_linker(out,ba[i],internal);
           emit_jge(0);
         }
       }
       if(invert) {
-        if(taken) set_jump_target(taken,(int)out);
+        if(taken) set_jump_target(taken, out);
         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
-        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
+        if (match && (!internal || !dops[(ba[i]-start)>>2].is_ds)) {
           if(adj) {
-            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
-            add_to_linker((int)out,ba[i],internal);
+            emit_addimm(cc,-adj,cc);
+            add_to_linker(out,ba[i],internal);
           }else{
             emit_addnop(13);
-            add_to_linker((int)out,ba[i],internal*2);
+            add_to_linker(out,ba[i],internal*2);
           }
           emit_jmp(0);
         }else
         #endif
         {
-          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
-          store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
-          load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
+          if(adj) emit_addimm(cc,-adj,cc);
+          store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
+          load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
           if(internal)
             assem_debug("branch: internal\n");
           else
             assem_debug("branch: external\n");
-          if(internal&&is_ds[(ba[i]-start)>>2]) {
+          if (internal && dops[(ba[i] - start) >> 2].is_ds) {
             ds_assemble_entry(i);
           }
           else {
-            add_to_linker((int)out,ba[i],internal);
+            add_to_linker(out,ba[i],internal);
             emit_jmp(0);
           }
         }
-        set_jump_target(nottaken,(int)out);
+        set_jump_target(nottaken, out);
       }
 
-      if(nottaken1) set_jump_target(nottaken1,(int)out);
+      if(nottaken1) set_jump_target(nottaken1, out);
       if(adj) {
-        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
+        if(!invert) emit_addimm(cc,adj,cc);
       }
     } // (!unconditional)
   } // if(ooo)
   else
   {
     // In-order execution (branch first)
-    //if(likely[i]) printf("IOL\n");
-    //else
-    //printf("IOE\n");
-    int taken=0,nottaken=0,nottaken1=0;
+    void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
     if(!unconditional&&!nop) {
-      if(!only32)
-      {
-        assert(s1h>=0);
-        if((opcode[i]&0x2f)==4) // BEQ
-        {
-          if(s2h>=0) emit_cmp(s1h,s2h);
-          else emit_test(s1h,s1h);
-          nottaken1=(int)out;
-          emit_jne(2);
-        }
-        if((opcode[i]&0x2f)==5) // BNE
-        {
-          if(s2h>=0) emit_cmp(s1h,s2h);
-          else emit_test(s1h,s1h);
-          taken=(int)out;
-          emit_jne(1);
-        }
-        if((opcode[i]&0x2f)==6) // BLEZ
-        {
-          emit_test(s1h,s1h);
-          taken=(int)out;
-          emit_js(1);
-          nottaken1=(int)out;
-          emit_jne(2);
-        }
-        if((opcode[i]&0x2f)==7) // BGTZ
-        {
-          emit_test(s1h,s1h);
-          nottaken1=(int)out;
-          emit_js(2);
-          taken=(int)out;
-          emit_jne(1);
-        }
-      } // if(!only32)
-
       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
       assert(s1l>=0);
-      if((opcode[i]&0x2f)==4) // BEQ
+      if((dops[i].opcode&0x2f)==4) // BEQ
       {
         if(s2l>=0) emit_cmp(s1l,s2l);
         else emit_test(s1l,s1l);
-        nottaken=(int)out;
-        emit_jne(2);
+        nottaken=out;
+        emit_jne(DJT_2);
       }
-      if((opcode[i]&0x2f)==5) // BNE
+      if((dops[i].opcode&0x2f)==5) // BNE
       {
         if(s2l>=0) emit_cmp(s1l,s2l);
         else emit_test(s1l,s1l);
-        nottaken=(int)out;
-        emit_jeq(2);
+        nottaken=out;
+        emit_jeq(DJT_2);
       }
-      if((opcode[i]&0x2f)==6) // BLEZ
+      if((dops[i].opcode&0x2f)==6) // BLEZ
       {
         emit_cmpimm(s1l,1);
-        nottaken=(int)out;
-        emit_jge(2);
+        nottaken=out;
+        emit_jge(DJT_2);
       }
-      if((opcode[i]&0x2f)==7) // BGTZ
+      if((dops[i].opcode&0x2f)==7) // BGTZ
       {
         emit_cmpimm(s1l,1);
-        nottaken=(int)out;
-        emit_jl(2);
+        nottaken=out;
+        emit_jl(DJT_2);
       }
     } // if(!unconditional)
     int adj;
     uint64_t ds_unneeded=branch_regs[i].u;
-    uint64_t ds_unneeded_upper=branch_regs[i].uu;
-    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
-    ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
-    if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
+    ds_unneeded&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
     ds_unneeded|=1;
-    ds_unneeded_upper|=1;
     // branch taken
     if(!nop) {
-      if(taken) set_jump_target(taken,(int)out);
+      if(taken) set_jump_target(taken, out);
       assem_debug("1:\n");
-      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
-                    ds_unneeded,ds_unneeded_upper);
+      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
       // load regs
-      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
+      load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2);
       address_generation(i+1,&branch_regs[i],0);
-      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
+      if (ram_offset)
+        load_regs(regs[i].regmap,branch_regs[i].regmap,ROREG,ROREG);
+      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
       ds_assemble(i+1,&branch_regs[i]);
       cc=get_reg(branch_regs[i].regmap,CCREG);
       if(cc==-1) {
@@ -5283,120 +5529,111 @@ void cjump_assemble(int i,struct regstat *i_regs)
         // CHECK: Is the following instruction (fall thru) allocated ok?
       }
       assert(cc==HOST_CCREG);
-      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
+      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
       assem_debug("cycle count (adj)\n");
-      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
-      load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
+      if(adj) emit_addimm(cc, ccadj[i] + CLOCK_ADJUST(2) - adj, cc);
+      load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
       if(internal)
         assem_debug("branch: internal\n");
       else
         assem_debug("branch: external\n");
-      if(internal&&is_ds[(ba[i]-start)>>2]) {
+      if (internal && dops[(ba[i] - start) >> 2].is_ds) {
         ds_assemble_entry(i);
       }
       else {
-        add_to_linker((int)out,ba[i],internal);
+        add_to_linker(out,ba[i],internal);
         emit_jmp(0);
       }
     }
     // branch not taken
-    cop1_usable=prev_cop1_usable;
     if(!unconditional) {
-      if(nottaken1) set_jump_target(nottaken1,(int)out);
-      set_jump_target(nottaken,(int)out);
+      if(nottaken1) set_jump_target(nottaken1, out);
+      set_jump_target(nottaken, out);
       assem_debug("2:\n");
-      if(!likely[i]) {
-        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
-                      ds_unneeded,ds_unneeded_upper);
-        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
-        address_generation(i+1,&branch_regs[i],0);
-        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
-        ds_assemble(i+1,&branch_regs[i]);
-      }
+      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
+      // load regs
+      load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2);
+      address_generation(i+1,&branch_regs[i],0);
+      if (ram_offset)
+        load_regs(regs[i].regmap,branch_regs[i].regmap,ROREG,ROREG);
+      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
+      ds_assemble(i+1,&branch_regs[i]);
       cc=get_reg(branch_regs[i].regmap,CCREG);
-      if(cc==-1&&!likely[i]) {
+      if (cc == -1) {
         // Cycle count isn't in a register, temporarily load it then write it out
         emit_loadreg(CCREG,HOST_CCREG);
-        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
-        int jaddr=(int)out;
+        emit_addimm_and_set_flags(ccadj[i] + CLOCK_ADJUST(2), HOST_CCREG);
+        void *jaddr=out;
         emit_jns(0);
-        add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
+        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
         emit_storereg(CCREG,HOST_CCREG);
       }
       else{
         cc=get_reg(i_regmap,CCREG);
         assert(cc==HOST_CCREG);
-        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
-        int jaddr=(int)out;
+        emit_addimm_and_set_flags(ccadj[i] + CLOCK_ADJUST(2), cc);
+        void *jaddr=out;
         emit_jns(0);
-        add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
+        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
       }
     }
   }
 }
 
-void sjump_assemble(int i,struct regstat *i_regs)
+static void sjump_assemble(int i, const struct regstat *i_regs)
 {
-  signed char *i_regmap=i_regs->regmap;
+  const signed char *i_regmap = i_regs->regmap;
   int cc;
   int match;
-  match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
+  match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
   assem_debug("smatch=%d\n",match);
-  int s1h,s1l;
-  int prev_cop1_usable=cop1_usable;
+  int s1l;
   int unconditional=0,nevertaken=0;
-  int only32=0;
   int invert=0;
-  int internal=internal_branch(branch_regs[i].is32,ba[i]);
+  int internal=internal_branch(ba[i]);
   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
   if(!match) invert=1;
   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
   if(i>(ba[i]-start)>>2) invert=1;
   #endif
+  #ifdef __aarch64__
+  invert=1; // because of near cond. branches
+  #endif
 
-  //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
-  //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
+  //if(dops[i].opcode2>=0x10) return; // FIXME (BxxZAL)
+  //assert(dops[i].opcode2<0x10||dops[i].rs1==0); // FIXME (BxxZAL)
 
-  if(ooo[i]) {
-    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
-    s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
+  if(dops[i].ooo) {
+    s1l=get_reg(branch_regs[i].regmap,dops[i].rs1);
   }
   else {
-    s1l=get_reg(i_regmap,rs1[i]);
-    s1h=get_reg(i_regmap,rs1[i]|64);
+    s1l=get_reg(i_regmap,dops[i].rs1);
   }
-  if(rs1[i]==0)
+  if(dops[i].rs1==0)
   {
-    if(opcode2[i]&1) unconditional=1;
+    if(dops[i].opcode2&1) unconditional=1;
     else nevertaken=1;
     // These are never taken (r0 is never less than zero)
-    //assert(opcode2[i]!=0);
-    //assert(opcode2[i]!=2);
-    //assert(opcode2[i]!=0x10);
-    //assert(opcode2[i]!=0x12);
-  }
-  else {
-    only32=(regs[i].was32>>rs1[i])&1;
+    //assert(dops[i].opcode2!=0);
+    //assert(dops[i].opcode2!=2);
+    //assert(dops[i].opcode2!=0x10);
+    //assert(dops[i].opcode2!=0x12);
   }
 
-  if(ooo[i]) {
+  if(dops[i].ooo) {
     // Out of order execution (delay slot first)
     //printf("OOOE\n");
     address_generation(i+1,i_regs,regs[i].regmap_entry);
     ds_assemble(i+1,i_regs);
     int adj;
     uint64_t bc_unneeded=branch_regs[i].u;
-    uint64_t bc_unneeded_upper=branch_regs[i].uu;
-    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
-    bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
+    bc_unneeded&=~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
     bc_unneeded|=1;
-    bc_unneeded_upper|=1;
-    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
-                  bc_unneeded,bc_unneeded_upper);
-    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
-    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
-    if(rt1[i]==31) {
+    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
+    load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i].rs1,dops[i].rs1);
+    load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
+    if(dops[i].rt1==31) {
       int rt,return_address;
       rt=get_reg(branch_regs[i].regmap,31);
       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
@@ -5405,30 +5642,30 @@ void sjump_assemble(int i,struct regstat *i_regs)
         return_address=start+i*4+8;
         emit_movimm(return_address,rt); // PC into link register
         #ifdef IMM_PREFETCH
-        if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
+        if(!nevertaken) emit_prefetch(hash_table_get(return_address));
         #endif
       }
     }
     cc=get_reg(branch_regs[i].regmap,CCREG);
     assert(cc==HOST_CCREG);
     if(unconditional)
-      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
+      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
     assem_debug("cycle count (adj)\n");
     if(unconditional) {
       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
-        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
-        load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
+        if(adj) emit_addimm(cc, ccadj[i] + CLOCK_ADJUST(2) - adj, cc);
+        load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
         if(internal)
           assem_debug("branch: internal\n");
         else
           assem_debug("branch: external\n");
-        if(internal&&is_ds[(ba[i]-start)>>2]) {
+        if (internal && dops[(ba[i] - start) >> 2].is_ds) {
           ds_assemble_entry(i);
         }
         else {
-          add_to_linker((int)out,ba[i],internal);
+          add_to_linker(out,ba[i],internal);
           emit_jmp(0);
         }
         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
@@ -5437,339 +5674,75 @@ void sjump_assemble(int i,struct regstat *i_regs)
       }
     }
     else if(nevertaken) {
-      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
-      int jaddr=(int)out;
+      emit_addimm_and_set_flags(ccadj[i] + CLOCK_ADJUST(2), cc);
+      void *jaddr=out;
       emit_jns(0);
-      add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
+      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
     }
     else {
-      int nottaken=0;
+      void *nottaken = NULL;
       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
-      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
-      if(!only32)
-      {
-        assert(s1h>=0);
-        if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
-        {
-          emit_test(s1h,s1h);
-          if(invert){
-            nottaken=(int)out;
-            emit_jns(1);
-          }else{
-            add_to_linker((int)out,ba[i],internal);
-            emit_js(0);
-          }
-        }
-        if((opcode2[i]&0xf)==1) // BGEZ/BLTZAL
-        {
-          emit_test(s1h,s1h);
-          if(invert){
-            nottaken=(int)out;
-            emit_js(1);
-          }else{
-            add_to_linker((int)out,ba[i],internal);
-            emit_jns(0);
-          }
-        }
-      } // if(!only32)
-      else
+      if(adj&&!invert) emit_addimm(cc, ccadj[i] + CLOCK_ADJUST(2) - adj, cc);
       {
         assert(s1l>=0);
-        if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
+        if((dops[i].opcode2&0xf)==0) // BLTZ/BLTZAL
         {
           emit_test(s1l,s1l);
           if(invert){
-            nottaken=(int)out;
-            emit_jns(1);
+            nottaken=out;
+            emit_jns(DJT_1);
           }else{
-            add_to_linker((int)out,ba[i],internal);
+            add_to_linker(out,ba[i],internal);
             emit_js(0);
           }
         }
-        if((opcode2[i]&0xf)==1) // BGEZ/BLTZAL
+        if((dops[i].opcode2&0xf)==1) // BGEZ/BLTZAL
         {
           emit_test(s1l,s1l);
           if(invert){
-            nottaken=(int)out;
-            emit_js(1);
+            nottaken=out;
+            emit_js(DJT_1);
           }else{
-            add_to_linker((int)out,ba[i],internal);
+            add_to_linker(out,ba[i],internal);
             emit_jns(0);
           }
         }
-      } // if(!only32)
-
-      if(invert) {
-        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
-        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
-          if(adj) {
-            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
-            add_to_linker((int)out,ba[i],internal);
-          }else{
-            emit_addnop(13);
-            add_to_linker((int)out,ba[i],internal*2);
-          }
-          emit_jmp(0);
-        }else
-        #endif
-        {
-          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
-          store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
-          load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
-          if(internal)
-            assem_debug("branch: internal\n");
-          else
-            assem_debug("branch: external\n");
-          if(internal&&is_ds[(ba[i]-start)>>2]) {
-            ds_assemble_entry(i);
-          }
-          else {
-            add_to_linker((int)out,ba[i],internal);
-            emit_jmp(0);
-          }
-        }
-        set_jump_target(nottaken,(int)out);
-      }
-
-      if(adj) {
-        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
-      }
-    } // (!unconditional)
-  } // if(ooo)
-  else
-  {
-    // In-order execution (branch first)
-    //printf("IOE\n");
-    int nottaken=0;
-    if(rt1[i]==31) {
-      int rt,return_address;
-      rt=get_reg(branch_regs[i].regmap,31);
-      if(rt>=0) {
-        // Save the PC even if the branch is not taken
-        return_address=start+i*4+8;
-        emit_movimm(return_address,rt); // PC into link register
-        #ifdef IMM_PREFETCH
-        emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
-        #endif
-      }
-    }
-    if(!unconditional) {
-      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
-      if(!only32)
-      {
-        assert(s1h>=0);
-        if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
-        {
-          emit_test(s1h,s1h);
-          nottaken=(int)out;
-          emit_jns(1);
-        }
-        if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
-        {
-          emit_test(s1h,s1h);
-          nottaken=(int)out;
-          emit_js(1);
-        }
-      } // if(!only32)
-      else
-      {
-        assert(s1l>=0);
-        if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
-        {
-          emit_test(s1l,s1l);
-          nottaken=(int)out;
-          emit_jns(1);
-        }
-        if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
-        {
-          emit_test(s1l,s1l);
-          nottaken=(int)out;
-          emit_js(1);
-        }
-      }
-    } // if(!unconditional)
-    int adj;
-    uint64_t ds_unneeded=branch_regs[i].u;
-    uint64_t ds_unneeded_upper=branch_regs[i].uu;
-    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
-    ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
-    if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
-    ds_unneeded|=1;
-    ds_unneeded_upper|=1;
-    // branch taken
-    if(!nevertaken) {
-      //assem_debug("1:\n");
-      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
-                    ds_unneeded,ds_unneeded_upper);
-      // load regs
-      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
-      address_generation(i+1,&branch_regs[i],0);
-      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
-      ds_assemble(i+1,&branch_regs[i]);
-      cc=get_reg(branch_regs[i].regmap,CCREG);
-      if(cc==-1) {
-        emit_loadreg(CCREG,cc=HOST_CCREG);
-        // CHECK: Is the following instruction (fall thru) allocated ok?
-      }
-      assert(cc==HOST_CCREG);
-      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
-      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
-      assem_debug("cycle count (adj)\n");
-      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
-      load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
-      if(internal)
-        assem_debug("branch: internal\n");
-      else
-        assem_debug("branch: external\n");
-      if(internal&&is_ds[(ba[i]-start)>>2]) {
-        ds_assemble_entry(i);
-      }
-      else {
-        add_to_linker((int)out,ba[i],internal);
-        emit_jmp(0);
-      }
-    }
-    // branch not taken
-    cop1_usable=prev_cop1_usable;
-    if(!unconditional) {
-      set_jump_target(nottaken,(int)out);
-      assem_debug("1:\n");
-      if(!likely[i]) {
-        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
-                      ds_unneeded,ds_unneeded_upper);
-        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
-        address_generation(i+1,&branch_regs[i],0);
-        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
-        ds_assemble(i+1,&branch_regs[i]);
-      }
-      cc=get_reg(branch_regs[i].regmap,CCREG);
-      if(cc==-1&&!likely[i]) {
-        // Cycle count isn't in a register, temporarily load it then write it out
-        emit_loadreg(CCREG,HOST_CCREG);
-        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
-        int jaddr=(int)out;
-        emit_jns(0);
-        add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
-        emit_storereg(CCREG,HOST_CCREG);
-      }
-      else{
-        cc=get_reg(i_regmap,CCREG);
-        assert(cc==HOST_CCREG);
-        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
-        int jaddr=(int)out;
-        emit_jns(0);
-        add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
       }
-    }
-  }
-}
-
-void fjump_assemble(int i,struct regstat *i_regs)
-{
-  signed char *i_regmap=i_regs->regmap;
-  int cc;
-  int match;
-  match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
-  assem_debug("fmatch=%d\n",match);
-  int fs,cs;
-  int eaddr;
-  int invert=0;
-  int internal=internal_branch(branch_regs[i].is32,ba[i]);
-  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
-  if(!match) invert=1;
-  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
-  if(i>(ba[i]-start)>>2) invert=1;
-  #endif
-
-  if(ooo[i]) {
-    fs=get_reg(branch_regs[i].regmap,FSREG);
-    address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
-  }
-  else {
-    fs=get_reg(i_regmap,FSREG);
-  }
-
-  // Check cop1 unusable
-  if(!cop1_usable) {
-    cs=get_reg(i_regmap,CSREG);
-    assert(cs>=0);
-    emit_testimm(cs,0x20000000);
-    eaddr=(int)out;
-    emit_jeq(0);
-    add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
-    cop1_usable=1;
-  }
-
-  if(ooo[i]) {
-    // Out of order execution (delay slot first)
-    //printf("OOOE\n");
-    ds_assemble(i+1,i_regs);
-    int adj;
-    uint64_t bc_unneeded=branch_regs[i].u;
-    uint64_t bc_unneeded_upper=branch_regs[i].uu;
-    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
-    bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
-    bc_unneeded|=1;
-    bc_unneeded_upper|=1;
-    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
-                  bc_unneeded,bc_unneeded_upper);
-    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
-    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
-    cc=get_reg(branch_regs[i].regmap,CCREG);
-    assert(cc==HOST_CCREG);
-    do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
-    assem_debug("cycle count (adj)\n");
-    if(1) {
-      int nottaken=0;
-      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
-      if(1) {
-        assert(fs>=0);
-        emit_testimm(fs,0x800000);
-        if(source[i]&0x10000) // BC1T
-        {
-          if(invert){
-            nottaken=(int)out;
-            emit_jeq(1);
-          }else{
-            add_to_linker((int)out,ba[i],internal);
-            emit_jne(0);
-          }
-        }
-        else // BC1F
-          if(invert){
-            nottaken=(int)out;
-            emit_jne(1);
-          }else{
-            add_to_linker((int)out,ba[i],internal);
-            emit_jeq(0);
-          }
-        {
-        }
-      } // if(!only32)
 
       if(invert) {
-        if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
-        else if(match) emit_addnop(13);
-        #endif
-        store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
-        load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
-        if(internal)
-          assem_debug("branch: internal\n");
-        else
-          assem_debug("branch: external\n");
-        if(internal&&is_ds[(ba[i]-start)>>2]) {
-          ds_assemble_entry(i);
-        }
-        else {
-          add_to_linker((int)out,ba[i],internal);
+        if (match && (!internal || !dops[(ba[i] - start) >> 2].is_ds)) {
+          if(adj) {
+            emit_addimm(cc,-adj,cc);
+            add_to_linker(out,ba[i],internal);
+          }else{
+            emit_addnop(13);
+            add_to_linker(out,ba[i],internal*2);
+          }
           emit_jmp(0);
+        }else
+        #endif
+        {
+          if(adj) emit_addimm(cc,-adj,cc);
+          store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
+          load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
+          if(internal)
+            assem_debug("branch: internal\n");
+          else
+            assem_debug("branch: external\n");
+          if (internal && dops[(ba[i] - start) >> 2].is_ds) {
+            ds_assemble_entry(i);
+          }
+          else {
+            add_to_linker(out,ba[i],internal);
+            emit_jmp(0);
+          }
         }
-        set_jump_target(nottaken,(int)out);
+        set_jump_target(nottaken, out);
       }
 
       if(adj) {
-        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
+        if(!invert) emit_addimm(cc,adj,cc);
       }
     } // (!unconditional)
   } // if(ooo)
@@ -5777,118 +5750,119 @@ void fjump_assemble(int i,struct regstat *i_regs)
   {
     // In-order execution (branch first)
     //printf("IOE\n");
-    int nottaken=0;
-    if(1) {
+    void *nottaken = NULL;
+    if(dops[i].rt1==31) {
+      int rt,return_address;
+      rt=get_reg(branch_regs[i].regmap,31);
+      if(rt>=0) {
+        // Save the PC even if the branch is not taken
+        return_address=start+i*4+8;
+        emit_movimm(return_address,rt); // PC into link register
+        #ifdef IMM_PREFETCH
+        emit_prefetch(hash_table_get(return_address));
+        #endif
+      }
+    }
+    if(!unconditional) {
       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
-      if(1) {
-        assert(fs>=0);
-        emit_testimm(fs,0x800000);
-        if(source[i]&0x10000) // BC1T
+        assert(s1l>=0);
+        if((dops[i].opcode2&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
         {
-          nottaken=(int)out;
-          emit_jeq(1);
+          emit_test(s1l,s1l);
+          nottaken=out;
+          emit_jns(DJT_1);
         }
-        else // BC1F
+        if((dops[i].opcode2&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
         {
-          nottaken=(int)out;
-          emit_jne(1);
+          emit_test(s1l,s1l);
+          nottaken=out;
+          emit_js(DJT_1);
         }
-      }
     } // if(!unconditional)
     int adj;
     uint64_t ds_unneeded=branch_regs[i].u;
-    uint64_t ds_unneeded_upper=branch_regs[i].uu;
-    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
-    ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
-    if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
+    ds_unneeded&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
     ds_unneeded|=1;
-    ds_unneeded_upper|=1;
     // branch taken
-    //assem_debug("1:\n");
-    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
-                  ds_unneeded,ds_unneeded_upper);
-    // load regs
-    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
-    address_generation(i+1,&branch_regs[i],0);
-    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
-    ds_assemble(i+1,&branch_regs[i]);
-    cc=get_reg(branch_regs[i].regmap,CCREG);
-    if(cc==-1) {
-      emit_loadreg(CCREG,cc=HOST_CCREG);
-      // CHECK: Is the following instruction (fall thru) allocated ok?
-    }
-    assert(cc==HOST_CCREG);
-    store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
-    do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
-    assem_debug("cycle count (adj)\n");
-    if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
-    load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
-    if(internal)
-      assem_debug("branch: internal\n");
-    else
-      assem_debug("branch: external\n");
-    if(internal&&is_ds[(ba[i]-start)>>2]) {
-      ds_assemble_entry(i);
-    }
-    else {
-      add_to_linker((int)out,ba[i],internal);
-      emit_jmp(0);
+    if(!nevertaken) {
+      //assem_debug("1:\n");
+      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
+      // load regs
+      load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2);
+      address_generation(i+1,&branch_regs[i],0);
+      if (ram_offset)
+        load_regs(regs[i].regmap,branch_regs[i].regmap,ROREG,ROREG);
+      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
+      ds_assemble(i+1,&branch_regs[i]);
+      cc=get_reg(branch_regs[i].regmap,CCREG);
+      if(cc==-1) {
+        emit_loadreg(CCREG,cc=HOST_CCREG);
+        // CHECK: Is the following instruction (fall thru) allocated ok?
+      }
+      assert(cc==HOST_CCREG);
+      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
+      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
+      assem_debug("cycle count (adj)\n");
+      if(adj) emit_addimm(cc, ccadj[i] + CLOCK_ADJUST(2) - adj, cc);
+      load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
+      if(internal)
+        assem_debug("branch: internal\n");
+      else
+        assem_debug("branch: external\n");
+      if (internal && dops[(ba[i] - start) >> 2].is_ds) {
+        ds_assemble_entry(i);
+      }
+      else {
+        add_to_linker(out,ba[i],internal);
+        emit_jmp(0);
+      }
     }
-
     // branch not taken
-    if(1) { // <- FIXME (don't need this)
-      set_jump_target(nottaken,(int)out);
+    if(!unconditional) {
+      set_jump_target(nottaken, out);
       assem_debug("1:\n");
-      if(!likely[i]) {
-        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
-                      ds_unneeded,ds_unneeded_upper);
-        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
-        address_generation(i+1,&branch_regs[i],0);
-        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
-        ds_assemble(i+1,&branch_regs[i]);
-      }
+      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
+      load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2);
+      address_generation(i+1,&branch_regs[i],0);
+      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
+      ds_assemble(i+1,&branch_regs[i]);
       cc=get_reg(branch_regs[i].regmap,CCREG);
-      if(cc==-1&&!likely[i]) {
+      if (cc == -1) {
         // Cycle count isn't in a register, temporarily load it then write it out
         emit_loadreg(CCREG,HOST_CCREG);
-        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
-        int jaddr=(int)out;
+        emit_addimm_and_set_flags(ccadj[i] + CLOCK_ADJUST(2), HOST_CCREG);
+        void *jaddr=out;
         emit_jns(0);
-        add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
+        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
         emit_storereg(CCREG,HOST_CCREG);
       }
       else{
         cc=get_reg(i_regmap,CCREG);
         assert(cc==HOST_CCREG);
-        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
-        int jaddr=(int)out;
+        emit_addimm_and_set_flags(ccadj[i] + CLOCK_ADJUST(2), cc);
+        void *jaddr=out;
         emit_jns(0);
-        add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
+        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
       }
     }
   }
 }
 
-static void pagespan_assemble(int i,struct regstat *i_regs)
+static void pagespan_assemble(int i, const struct regstat *i_regs)
 {
-  int s1l=get_reg(i_regs->regmap,rs1[i]);
-  int s1h=get_reg(i_regs->regmap,rs1[i]|64);
-  int s2l=get_reg(i_regs->regmap,rs2[i]);
-  int s2h=get_reg(i_regs->regmap,rs2[i]|64);
-  int taken=0;
-  int nottaken=0;
+  int s1l=get_reg(i_regs->regmap,dops[i].rs1);
+  int s2l=get_reg(i_regs->regmap,dops[i].rs2);
+  void *taken = NULL;
+  void *nottaken = NULL;
   int unconditional=0;
-  if(rs1[i]==0)
+  if(dops[i].rs1==0)
   {
-    s1l=s2l;s1h=s2h;
-    s2l=s2h=-1;
+    s1l=s2l;
+    s2l=-1;
   }
-  else if(rs2[i]==0)
+  else if(dops[i].rs2==0)
   {
-    s2l=s2h=-1;
-  }
-  if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
-    s1h=s2h=-1;
+    s2l=-1;
   }
   int hr=0;
   int addr=-1,alt=-1,ntaddr=-1;
@@ -5897,8 +5871,8 @@ static void pagespan_assemble(int i,struct regstat *i_regs)
     while(hr<HOST_REGS)
     {
       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
-         (i_regs->regmap[hr]&63)!=rs1[i] &&
-         (i_regs->regmap[hr]&63)!=rs2[i] )
+         (i_regs->regmap[hr]&63)!=dops[i].rs1 &&
+         (i_regs->regmap[hr]&63)!=dops[i].rs2 )
       {
         addr=hr++;break;
       }
@@ -5908,20 +5882,20 @@ static void pagespan_assemble(int i,struct regstat *i_regs)
   while(hr<HOST_REGS)
   {
     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
-       (i_regs->regmap[hr]&63)!=rs1[i] &&
-       (i_regs->regmap[hr]&63)!=rs2[i] )
+       (i_regs->regmap[hr]&63)!=dops[i].rs1 &&
+       (i_regs->regmap[hr]&63)!=dops[i].rs2 )
     {
       alt=hr++;break;
     }
     hr++;
   }
-  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
+  if((dops[i].opcode&0x2E)==6) // BLEZ/BGTZ needs another register
   {
     while(hr<HOST_REGS)
     {
       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
-         (i_regs->regmap[hr]&63)!=rs1[i] &&
-         (i_regs->regmap[hr]&63)!=rs2[i] )
+         (i_regs->regmap[hr]&63)!=dops[i].rs1 &&
+         (i_regs->regmap[hr]&63)!=dops[i].rs2 )
       {
         ntaddr=hr;break;
       }
@@ -5929,39 +5903,39 @@ static void pagespan_assemble(int i,struct regstat *i_regs)
     }
   }
   assert(hr<HOST_REGS);
-  if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
-    load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
+  if((dops[i].opcode&0x2e)==4||dops[i].opcode==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
+    load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
   }
-  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
-  if(opcode[i]==2) // J
+  emit_addimm(HOST_CCREG, ccadj[i] + CLOCK_ADJUST(2), HOST_CCREG);
+  if(dops[i].opcode==2) // J
   {
     unconditional=1;
   }
-  if(opcode[i]==3) // JAL
+  if(dops[i].opcode==3) // JAL
   {
     // TODO: mini_ht
     int rt=get_reg(i_regs->regmap,31);
     emit_movimm(start+i*4+8,rt);
     unconditional=1;
   }
-  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
+  if(dops[i].opcode==0&&(dops[i].opcode2&0x3E)==8) // JR/JALR
   {
     emit_mov(s1l,addr);
-    if(opcode2[i]==9) // JALR
+    if(dops[i].opcode2==9) // JALR
     {
-      int rt=get_reg(i_regs->regmap,rt1[i]);
+      int rt=get_reg(i_regs->regmap,dops[i].rt1);
       emit_movimm(start+i*4+8,rt);
     }
   }
-  if((opcode[i]&0x3f)==4) // BEQ
+  if((dops[i].opcode&0x3f)==4) // BEQ
   {
-    if(rs1[i]==rs2[i])
+    if(dops[i].rs1==dops[i].rs2)
     {
       unconditional=1;
     }
     else
     #ifdef HAVE_CMOV_IMM
-    if(s1h<0) {
+    if(1) {
       if(s2l>=0) emit_cmp(s1l,s2l);
       else emit_test(s1l,s1l);
       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
@@ -5971,103 +5945,65 @@ static void pagespan_assemble(int i,struct regstat *i_regs)
     {
       assert(s1l>=0);
       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
-      if(s1h>=0) {
-        if(s2h>=0) emit_cmp(s1h,s2h);
-        else emit_test(s1h,s1h);
-        emit_cmovne_reg(alt,addr);
-      }
       if(s2l>=0) emit_cmp(s1l,s2l);
       else emit_test(s1l,s1l);
       emit_cmovne_reg(alt,addr);
     }
   }
-  if((opcode[i]&0x3f)==5) // BNE
+  if((dops[i].opcode&0x3f)==5) // BNE
   {
     #ifdef HAVE_CMOV_IMM
-    if(s1h<0) {
-      if(s2l>=0) emit_cmp(s1l,s2l);
-      else emit_test(s1l,s1l);
-      emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
-    }
-    else
+    if(s2l>=0) emit_cmp(s1l,s2l);
+    else emit_test(s1l,s1l);
+    emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
+    #else
+    assert(s1l>=0);
+    emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
+    if(s2l>=0) emit_cmp(s1l,s2l);
+    else emit_test(s1l,s1l);
+    emit_cmovne_reg(alt,addr);
     #endif
-    {
-      assert(s1l>=0);
-      emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
-      if(s1h>=0) {
-        if(s2h>=0) emit_cmp(s1h,s2h);
-        else emit_test(s1h,s1h);
-        emit_cmovne_reg(alt,addr);
-      }
-      if(s2l>=0) emit_cmp(s1l,s2l);
-      else emit_test(s1l,s1l);
-      emit_cmovne_reg(alt,addr);
-    }
   }
-  if((opcode[i]&0x3f)==0x14) // BEQL
+  if((dops[i].opcode&0x3f)==0x14) // BEQL
   {
-    if(s1h>=0) {
-      if(s2h>=0) emit_cmp(s1h,s2h);
-      else emit_test(s1h,s1h);
-      nottaken=(int)out;
-      emit_jne(0);
-    }
     if(s2l>=0) emit_cmp(s1l,s2l);
     else emit_test(s1l,s1l);
-    if(nottaken) set_jump_target(nottaken,(int)out);
-    nottaken=(int)out;
+    if(nottaken) set_jump_target(nottaken, out);
+    nottaken=out;
     emit_jne(0);
   }
-  if((opcode[i]&0x3f)==0x15) // BNEL
+  if((dops[i].opcode&0x3f)==0x15) // BNEL
   {
-    if(s1h>=0) {
-      if(s2h>=0) emit_cmp(s1h,s2h);
-      else emit_test(s1h,s1h);
-      taken=(int)out;
-      emit_jne(0);
-    }
     if(s2l>=0) emit_cmp(s1l,s2l);
     else emit_test(s1l,s1l);
-    nottaken=(int)out;
+    nottaken=out;
     emit_jeq(0);
-    if(taken) set_jump_target(taken,(int)out);
+    if(taken) set_jump_target(taken, out);
   }
-  if((opcode[i]&0x3f)==6) // BLEZ
+  if((dops[i].opcode&0x3f)==6) // BLEZ
   {
     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
     emit_cmpimm(s1l,1);
-    if(s1h>=0) emit_mov(addr,ntaddr);
     emit_cmovl_reg(alt,addr);
-    if(s1h>=0) {
-      emit_test(s1h,s1h);
-      emit_cmovne_reg(ntaddr,addr);
-      emit_cmovs_reg(alt,addr);
-    }
   }
-  if((opcode[i]&0x3f)==7) // BGTZ
+  if((dops[i].opcode&0x3f)==7) // BGTZ
   {
     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
     emit_cmpimm(s1l,1);
-    if(s1h>=0) emit_mov(addr,alt);
     emit_cmovl_reg(ntaddr,addr);
-    if(s1h>=0) {
-      emit_test(s1h,s1h);
-      emit_cmovne_reg(alt,addr);
-      emit_cmovs_reg(ntaddr,addr);
-    }
   }
-  if((opcode[i]&0x3f)==0x16) // BLEZL
+  if((dops[i].opcode&0x3f)==0x16) // BLEZL
   {
-    assert((opcode[i]&0x3f)!=0x16);
+    assert((dops[i].opcode&0x3f)!=0x16);
   }
-  if((opcode[i]&0x3f)==0x17) // BGTZL
+  if((dops[i].opcode&0x3f)==0x17) // BGTZL
   {
-    assert((opcode[i]&0x3f)!=0x17);
+    assert((dops[i].opcode&0x3f)!=0x17);
   }
-  assert(opcode[i]!=1); // BLTZ/BGEZ
+  assert(dops[i].opcode!=1); // BLTZ/BGEZ
 
   //FIXME: Check CSREG
-  if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
+  if(dops[i].opcode==0x11 && dops[i].opcode2==0x08 ) {
     if((source[i]&0x30000)==0) // BC1F
     {
       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
@@ -6083,20 +6019,20 @@ static void pagespan_assemble(int i,struct regstat *i_regs)
     if((source[i]&0x30000)==0x20000) // BC1FL
     {
       emit_testimm(s1l,0x800000);
-      nottaken=(int)out;
+      nottaken=out;
       emit_jne(0);
     }
     if((source[i]&0x30000)==0x30000) // BC1TL
     {
       emit_testimm(s1l,0x800000);
-      nottaken=(int)out;
+      nottaken=out;
       emit_jeq(0);
     }
   }
 
   assert(i_regs->regmap[HOST_CCREG]==CCREG);
-  wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
-  if(likely[i]||unconditional)
+  wb_dirtys(regs[i].regmap,regs[i].dirty);
+  if(unconditional)
   {
     emit_movimm(ba[i],HOST_BTREG);
   }
@@ -6109,28 +6045,12 @@ static void pagespan_assemble(int i,struct regstat *i_regs)
   int target_addr=start+i*4+5;
   void *stub=out;
   void *compiled_target_addr=check_addr(target_addr);
-  emit_extjump_ds((int)branch_addr,target_addr);
+  emit_extjump_ds(branch_addr, target_addr);
   if(compiled_target_addr) {
-    set_jump_target((int)branch_addr,(int)compiled_target_addr);
-    add_link(target_addr,stub);
-  }
-  else set_jump_target((int)branch_addr,(int)stub);
-  if(likely[i]) {
-    // Not-taken path
-    set_jump_target((int)nottaken,(int)out);
-    wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
-    void *branch_addr=out;
-    emit_jmp(0);
-    int target_addr=start+i*4+8;
-    void *stub=out;
-    void *compiled_target_addr=check_addr(target_addr);
-    emit_extjump_ds((int)branch_addr,target_addr);
-    if(compiled_target_addr) {
-      set_jump_target((int)branch_addr,(int)compiled_target_addr);
-      add_link(target_addr,stub);
-    }
-    else set_jump_target((int)branch_addr,(int)stub);
+    set_jump_target(branch_addr, compiled_target_addr);
+    add_jump_out(target_addr,stub);
   }
+  else set_jump_target(branch_addr, stub);
 }
 
 // Assemble the delay slot for the above
@@ -6141,58 +6061,21 @@ static void pagespan_ds()
   u_int page=get_page(vaddr);
   u_int vpage=get_vpage(vaddr);
   ll_add(jump_dirty+vpage,vaddr,(void *)out);
-  do_dirty_stub_ds();
+  do_dirty_stub_ds(slen*4);
   ll_add(jump_in+page,vaddr,(void *)out);
   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
   if(regs[0].regmap[HOST_CCREG]!=CCREG)
-    wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
+    wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty);
   if(regs[0].regmap[HOST_BTREG]!=BTREG)
-    emit_writeword(HOST_BTREG,(int)&branch_target);
-  load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
+    emit_writeword(HOST_BTREG,&branch_target);
+  load_regs(regs[0].regmap_entry,regs[0].regmap,dops[0].rs1,dops[0].rs2);
   address_generation(0,&regs[0],regs[0].regmap_entry);
-  if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
-    load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
-  cop1_usable=0;
+  if (ram_offset && (dops[0].is_load || dops[0].is_store))
+    load_regs(regs[0].regmap_entry,regs[0].regmap,ROREG,ROREG);
+  if (dops[0].is_store)
+    load_regs(regs[0].regmap_entry,regs[0].regmap,INVCP,INVCP);
   is_delayslot=0;
-  switch(itype[0]) {
-    case ALU:
-      alu_assemble(0,&regs[0]);break;
-    case IMM16:
-      imm16_assemble(0,&regs[0]);break;
-    case SHIFT:
-      shift_assemble(0,&regs[0]);break;
-    case SHIFTIMM:
-      shiftimm_assemble(0,&regs[0]);break;
-    case LOAD:
-      load_assemble(0,&regs[0]);break;
-    case LOADLR:
-      loadlr_assemble(0,&regs[0]);break;
-    case STORE:
-      store_assemble(0,&regs[0]);break;
-    case STORELR:
-      storelr_assemble(0,&regs[0]);break;
-    case COP0:
-      cop0_assemble(0,&regs[0]);break;
-    case COP1:
-      cop1_assemble(0,&regs[0]);break;
-    case C1LS:
-      c1ls_assemble(0,&regs[0]);break;
-    case COP2:
-      cop2_assemble(0,&regs[0]);break;
-    case C2LS:
-      c2ls_assemble(0,&regs[0]);break;
-    case C2OP:
-      c2op_assemble(0,&regs[0]);break;
-    case FCONV:
-      fconv_assemble(0,&regs[0]);break;
-    case FLOAT:
-      float_assemble(0,&regs[0]);break;
-    case FCOMP:
-      fcomp_assemble(0,&regs[0]);break;
-    case MULTDIV:
-      multdiv_assemble(0,&regs[0]);break;
-    case MOV:
-      mov_assemble(0,&regs[0]);break;
+  switch (dops[0].itype) {
     case SYSCALL:
     case HLECALL:
     case INTCALL:
@@ -6201,167 +6084,102 @@ static void pagespan_ds()
     case RJUMP:
     case CJUMP:
     case SJUMP:
-    case FJUMP:
       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
+      break;
+    default:
+      assemble(0, &regs[0], 0);
   }
   int btaddr=get_reg(regs[0].regmap,BTREG);
   if(btaddr<0) {
     btaddr=get_reg(regs[0].regmap,-1);
-    emit_readword((int)&branch_target,btaddr);
+    emit_readword(&branch_target,btaddr);
   }
   assert(btaddr!=HOST_CCREG);
   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
 #ifdef HOST_IMM8
+  host_tempreg_acquire();
   emit_movimm(start+4,HOST_TEMPREG);
   emit_cmp(btaddr,HOST_TEMPREG);
+  host_tempreg_release();
 #else
   emit_cmpimm(btaddr,start+4);
 #endif
-  int branch=(int)out;
+  void *branch = out;
   emit_jeq(0);
-  store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
-  emit_jmp(jump_vaddr_reg[btaddr]);
-  set_jump_target(branch,(int)out);
-  store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
-  load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
+  store_regs_bt(regs[0].regmap,regs[0].dirty,-1);
+  do_jump_vaddr(btaddr);
+  set_jump_target(branch, out);
+  store_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
+  load_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
 }
 
 // Basic liveness analysis for MIPS registers
 void unneeded_registers(int istart,int iend,int r)
 {
   int i;
-  uint64_t u,uu,gte_u,b,bu,gte_bu;
-  uint64_t temp_u,temp_uu,temp_gte_u=0;
-  uint64_t tdep;
+  uint64_t u,gte_u,b,gte_b;
+  uint64_t temp_u,temp_gte_u=0;
   uint64_t gte_u_unknown=0;
-  if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
+  if (HACK_ENABLED(NDHACK_GTE_UNNEEDED))
     gte_u_unknown=~0ll;
   if(iend==slen-1) {
-    u=1;uu=1;
+    u=1;
     gte_u=gte_u_unknown;
   }else{
-    u=unneeded_reg[iend+1];
-    uu=unneeded_reg_upper[iend+1];
-    u=1;uu=1;
+    //u=unneeded_reg[iend+1];
+    u=1;
     gte_u=gte_unneeded[iend+1];
   }
 
   for (i=iend;i>=istart;i--)
   {
     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
-    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
+    if(dops[i].is_jump)
     {
       // If subroutine call, flag return address as a possible branch target
-      if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
+      if(dops[i].rt1==31 && i<slen-2) dops[i+2].bt=1;
 
       if(ba[i]<start || ba[i]>=(start+slen*4))
       {
         // Branch out of this block, flush all regs
         u=1;
-        uu=1;
         gte_u=gte_u_unknown;
-        /* Hexagon hack
-        if(itype[i]==UJUMP&&rt1[i]==31)
-        {
-          uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
-        }
-        if(itype[i]==RJUMP&&rs1[i]==31)
-        {
-          uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
-        }
-        if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
-          if(itype[i]==UJUMP&&rt1[i]==31)
-          {
-            //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
-            uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
-          }
-          if(itype[i]==RJUMP&&rs1[i]==31)
-          {
-            //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
-            uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
-          }
-        }*/
         branch_unneeded_reg[i]=u;
-        branch_unneeded_reg_upper[i]=uu;
         // Merge in delay slot
-        tdep=(~uu>>rt1[i+1])&1;
-        u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
-        uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
-        u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
-        uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
-        uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
-        u|=1;uu|=1;
+        u|=(1LL<<dops[i+1].rt1)|(1LL<<dops[i+1].rt2);
+        u&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
+        u|=1;
         gte_u|=gte_rt[i+1];
         gte_u&=~gte_rs[i+1];
-        // If branch is "likely" (and conditional)
-        // then we skip the delay slot on the fall-thru path
-        if(likely[i]) {
-          if(i<slen-1) {
-            u&=unneeded_reg[i+2];
-            uu&=unneeded_reg_upper[i+2];
-            gte_u&=gte_unneeded[i+2];
-          }
-          else
-          {
-            u=1;
-            uu=1;
-            gte_u=gte_u_unknown;
-          }
-        }
       }
       else
       {
         // Internal branch, flag target
-        bt[(ba[i]-start)>>2]=1;
+        dops[(ba[i]-start)>>2].bt=1;
         if(ba[i]<=start+i*4) {
           // Backward branch
-          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
+          if(dops[i].is_ujump)
           {
             // Unconditional branch
-            temp_u=1;temp_uu=1;
+            temp_u=1;
             temp_gte_u=0;
           } else {
             // Conditional branch (not taken case)
             temp_u=unneeded_reg[i+2];
-            temp_uu=unneeded_reg_upper[i+2];
             temp_gte_u&=gte_unneeded[i+2];
           }
           // Merge in delay slot
-          tdep=(~temp_uu>>rt1[i+1])&1;
-          temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
-          temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
-          temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
-          temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
-          temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
-          temp_u|=1;temp_uu|=1;
+          temp_u|=(1LL<<dops[i+1].rt1)|(1LL<<dops[i+1].rt2);
+          temp_u&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
+          temp_u|=1;
           temp_gte_u|=gte_rt[i+1];
           temp_gte_u&=~gte_rs[i+1];
-          // If branch is "likely" (and conditional)
-          // then we skip the delay slot on the fall-thru path
-          if(likely[i]) {
-            if(i<slen-1) {
-              temp_u&=unneeded_reg[i+2];
-              temp_uu&=unneeded_reg_upper[i+2];
-              temp_gte_u&=gte_unneeded[i+2];
-            }
-            else
-            {
-              temp_u=1;
-              temp_uu=1;
-              temp_gte_u=gte_u_unknown;
-            }
-          }
-          tdep=(~temp_uu>>rt1[i])&1;
-          temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
-          temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
-          temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
-          temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
-          temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
-          temp_u|=1;temp_uu|=1;
+          temp_u|=(1LL<<dops[i].rt1)|(1LL<<dops[i].rt2);
+          temp_u&=~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
+          temp_u|=1;
           temp_gte_u|=gte_rt[i];
           temp_gte_u&=~gte_rs[i];
           unneeded_reg[i]=temp_u;
-          unneeded_reg_upper[i]=temp_uu;
           gte_unneeded[i]=temp_gte_u;
           // Only go three levels deep.  This recursion can take an
           // excessive amount of time if there are a lot of nested loops.
@@ -6369,122 +6187,69 @@ void unneeded_registers(int istart,int iend,int r)
             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
           }else{
             unneeded_reg[(ba[i]-start)>>2]=1;
-            unneeded_reg_upper[(ba[i]-start)>>2]=1;
             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
           }
         } /*else*/ if(1) {
-          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
+          if (dops[i].is_ujump)
           {
             // Unconditional branch
             u=unneeded_reg[(ba[i]-start)>>2];
-            uu=unneeded_reg_upper[(ba[i]-start)>>2];
             gte_u=gte_unneeded[(ba[i]-start)>>2];
             branch_unneeded_reg[i]=u;
-            branch_unneeded_reg_upper[i]=uu;
-        //u=1;
-        //uu=1;
-        //branch_unneeded_reg[i]=u;
-        //branch_unneeded_reg_upper[i]=uu;
             // Merge in delay slot
-            tdep=(~uu>>rt1[i+1])&1;
-            u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
-            uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
-            u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
-            uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
-            uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
-            u|=1;uu|=1;
+            u|=(1LL<<dops[i+1].rt1)|(1LL<<dops[i+1].rt2);
+            u&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
+            u|=1;
             gte_u|=gte_rt[i+1];
             gte_u&=~gte_rs[i+1];
           } else {
             // Conditional branch
             b=unneeded_reg[(ba[i]-start)>>2];
-            bu=unneeded_reg_upper[(ba[i]-start)>>2];
-            gte_bu=gte_unneeded[(ba[i]-start)>>2];
+            gte_b=gte_unneeded[(ba[i]-start)>>2];
             branch_unneeded_reg[i]=b;
-            branch_unneeded_reg_upper[i]=bu;
-        //b=1;
-        //bu=1;
-        //branch_unneeded_reg[i]=b;
-        //branch_unneeded_reg_upper[i]=bu;
             // Branch delay slot
-            tdep=(~uu>>rt1[i+1])&1;
-            b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
-            bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
-            b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
-            bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
-            bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
-            b|=1;bu|=1;
-            gte_bu|=gte_rt[i+1];
-            gte_bu&=~gte_rs[i+1];
-            // If branch is "likely" then we skip the
-            // delay slot on the fall-thru path
-            if(likely[i]) {
-              u=b;
-              uu=bu;
-              gte_u=gte_bu;
-              if(i<slen-1) {
-                u&=unneeded_reg[i+2];
-                uu&=unneeded_reg_upper[i+2];
-                gte_u&=gte_unneeded[i+2];
-        //u=1;
-        //uu=1;
-              }
-            } else {
-              u&=b;
-              uu&=bu;
-              gte_u&=gte_bu;
-        //u=1;
-        //uu=1;
-            }
+            b|=(1LL<<dops[i+1].rt1)|(1LL<<dops[i+1].rt2);
+            b&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
+            b|=1;
+            gte_b|=gte_rt[i+1];
+            gte_b&=~gte_rs[i+1];
+            u&=b;
+            gte_u&=gte_b;
             if(i<slen-1) {
               branch_unneeded_reg[i]&=unneeded_reg[i+2];
-              branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
-        //branch_unneeded_reg[i]=1;
-        //branch_unneeded_reg_upper[i]=1;
             } else {
               branch_unneeded_reg[i]=1;
-              branch_unneeded_reg_upper[i]=1;
             }
           }
         }
       }
     }
-    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
+    else if(dops[i].itype==SYSCALL||dops[i].itype==HLECALL||dops[i].itype==INTCALL)
     {
       // SYSCALL instruction (software interrupt)
       u=1;
-      uu=1;
     }
-    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
+    else if(dops[i].itype==COP0 && (source[i]&0x3f)==0x18)
     {
       // ERET instruction (return from interrupt)
       u=1;
-      uu=1;
     }
-    //u=uu=1; // DEBUG
-    tdep=(~uu>>rt1[i])&1;
+    //u=1; // DEBUG
     // Written registers are unneeded
-    u|=1LL<<rt1[i];
-    u|=1LL<<rt2[i];
-    uu|=1LL<<rt1[i];
-    uu|=1LL<<rt2[i];
+    u|=1LL<<dops[i].rt1;
+    u|=1LL<<dops[i].rt2;
     gte_u|=gte_rt[i];
     // Accessed registers are needed
-    u&=~(1LL<<rs1[i]);
-    u&=~(1LL<<rs2[i]);
-    uu&=~(1LL<<us1[i]);
-    uu&=~(1LL<<us2[i]);
+    u&=~(1LL<<dops[i].rs1);
+    u&=~(1LL<<dops[i].rs2);
     gte_u&=~gte_rs[i];
-    if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
+    if(gte_rs[i]&&dops[i].rt1&&(unneeded_reg[i+1]&(1ll<<dops[i].rt1)))
       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
     // Source-target dependencies
-    uu&=~(tdep<<dep1[i]);
-    uu&=~(tdep<<dep2[i]);
     // R0 is always unneeded
-    u|=1;uu|=1;
+    u|=1;
     // Save it
     unneeded_reg[i]=u;
-    unneeded_reg_upper[i]=uu;
     gte_unneeded[i]=gte_u;
     /*
     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
@@ -6497,19 +6262,8 @@ void unneeded_registers(int istart,int iend,int r)
         else printf(" r%d",r);
       }
     }
-    printf(" UU:");
-    for(r=1;r<=CCREG;r++) {
-      if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
-        if(r==HIREG) printf(" HI");
-        else if(r==LOREG) printf(" LO");
-        else printf(" r%d",r);
-      }
-    }
-    printf("\n");*/
-  }
-  for (i=iend;i>=istart;i--)
-  {
-    unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
+    printf("\n");
+    */
   }
 }
 
@@ -6530,12 +6284,12 @@ void clean_registers(int istart,int iend,int wr)
   }
   for (i=iend;i>=istart;i--)
   {
-    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
+    if(dops[i].is_jump)
     {
       if(ba[i]<start || ba[i]>=(start+slen*4))
       {
         // Branch out of this block, flush all regs
-        if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
+        if (dops[i].is_ujump)
         {
           // Unconditional branch
           will_dirty_i=0;
@@ -6543,17 +6297,17 @@ void clean_registers(int istart,int iend,int wr)
           // Merge in delay slot (will dirty)
           for(r=0;r<HOST_REGS;r++) {
             if(r!=EXCLUDE_REG) {
-              if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
-              if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
-              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
-              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
+              if((branch_regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
+              if((branch_regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
+              if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
+              if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
-              if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
-              if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
-              if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
-              if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
+              if((regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
+              if((regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
+              if((regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
+              if((regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
@@ -6568,19 +6322,19 @@ void clean_registers(int istart,int iend,int wr)
           // Merge in delay slot (will dirty)
           for(r=0;r<HOST_REGS;r++) {
             if(r!=EXCLUDE_REG) {
-              if(!likely[i]) {
+              if (1) { // !dops[i].likely) {
                 // Might not dirty if likely branch is not taken
-                if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
-                if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
-                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
-                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
+                if((branch_regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
+                if((branch_regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
+                if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
+                if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
-                //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
-                //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
-                if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
-                if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
+                //if((regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
+                //if((regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
+                if((regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
+                if((regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
@@ -6591,15 +6345,15 @@ void clean_registers(int istart,int iend,int wr)
         // Merge in delay slot (wont dirty)
         for(r=0;r<HOST_REGS;r++) {
           if(r!=EXCLUDE_REG) {
-            if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
-            if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
-            if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
-            if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
+            if((regs[i].regmap[r]&63)==dops[i].rt1) wont_dirty_i|=1<<r;
+            if((regs[i].regmap[r]&63)==dops[i].rt2) wont_dirty_i|=1<<r;
+            if((regs[i].regmap[r]&63)==dops[i+1].rt1) wont_dirty_i|=1<<r;
+            if((regs[i].regmap[r]&63)==dops[i+1].rt2) wont_dirty_i|=1<<r;
             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
-            if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
-            if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
-            if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
-            if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
+            if((branch_regs[i].regmap[r]&63)==dops[i].rt1) wont_dirty_i|=1<<r;
+            if((branch_regs[i].regmap[r]&63)==dops[i].rt2) wont_dirty_i|=1<<r;
+            if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) wont_dirty_i|=1<<r;
+            if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) wont_dirty_i|=1<<r;
             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
           }
         }
@@ -6615,7 +6369,7 @@ void clean_registers(int istart,int iend,int wr)
         // Internal branch
         if(ba[i]<=start+i*4) {
           // Backward branch
-          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
+          if (dops[i].is_ujump)
           {
             // Unconditional branch
             temp_will_dirty=0;
@@ -6623,17 +6377,17 @@ void clean_registers(int istart,int iend,int wr)
             // Merge in delay slot (will dirty)
             for(r=0;r<HOST_REGS;r++) {
               if(r!=EXCLUDE_REG) {
-                if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
-                if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
-                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
-                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
+                if((branch_regs[i].regmap[r]&63)==dops[i].rt1) temp_will_dirty|=1<<r;
+                if((branch_regs[i].regmap[r]&63)==dops[i].rt2) temp_will_dirty|=1<<r;
+                if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) temp_will_dirty|=1<<r;
+                if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) temp_will_dirty|=1<<r;
                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
-                if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
-                if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
-                if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
-                if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
+                if((regs[i].regmap[r]&63)==dops[i].rt1) temp_will_dirty|=1<<r;
+                if((regs[i].regmap[r]&63)==dops[i].rt2) temp_will_dirty|=1<<r;
+                if((regs[i].regmap[r]&63)==dops[i+1].rt1) temp_will_dirty|=1<<r;
+                if((regs[i].regmap[r]&63)==dops[i+1].rt2) temp_will_dirty|=1<<r;
                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
@@ -6646,19 +6400,19 @@ void clean_registers(int istart,int iend,int wr)
             // Merge in delay slot (will dirty)
             for(r=0;r<HOST_REGS;r++) {
               if(r!=EXCLUDE_REG) {
-                if(!likely[i]) {
+                if (1) { // !dops[i].likely) {
                   // Will not dirty if likely branch is not taken
-                  if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
-                  if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
-                  if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
-                  if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
+                  if((branch_regs[i].regmap[r]&63)==dops[i].rt1) temp_will_dirty|=1<<r;
+                  if((branch_regs[i].regmap[r]&63)==dops[i].rt2) temp_will_dirty|=1<<r;
+                  if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) temp_will_dirty|=1<<r;
+                  if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) temp_will_dirty|=1<<r;
                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
-                  //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
-                  //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
-                  if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
-                  if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
+                  //if((regs[i].regmap[r]&63)==dops[i].rt1) temp_will_dirty|=1<<r;
+                  //if((regs[i].regmap[r]&63)==dops[i].rt2) temp_will_dirty|=1<<r;
+                  if((regs[i].regmap[r]&63)==dops[i+1].rt1) temp_will_dirty|=1<<r;
+                  if((regs[i].regmap[r]&63)==dops[i+1].rt2) temp_will_dirty|=1<<r;
                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
@@ -6669,15 +6423,15 @@ void clean_registers(int istart,int iend,int wr)
           // Merge in delay slot (wont dirty)
           for(r=0;r<HOST_REGS;r++) {
             if(r!=EXCLUDE_REG) {
-              if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
-              if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
-              if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
-              if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
+              if((regs[i].regmap[r]&63)==dops[i].rt1) temp_wont_dirty|=1<<r;
+              if((regs[i].regmap[r]&63)==dops[i].rt2) temp_wont_dirty|=1<<r;
+              if((regs[i].regmap[r]&63)==dops[i+1].rt1) temp_wont_dirty|=1<<r;
+              if((regs[i].regmap[r]&63)==dops[i+1].rt2) temp_wont_dirty|=1<<r;
               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
-              if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
-              if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
-              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
-              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
+              if((branch_regs[i].regmap[r]&63)==dops[i].rt1) temp_wont_dirty|=1<<r;
+              if((branch_regs[i].regmap[r]&63)==dops[i].rt2) temp_wont_dirty|=1<<r;
+              if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) temp_wont_dirty|=1<<r;
+              if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) temp_wont_dirty|=1<<r;
               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
             }
           }
@@ -6712,7 +6466,7 @@ void clean_registers(int istart,int iend,int wr)
         }
         /*else*/ if(1)
         {
-          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
+          if (dops[i].is_ujump)
           {
             // Unconditional branch
             will_dirty_i=0;
@@ -6734,17 +6488,17 @@ void clean_registers(int istart,int iend,int wr)
             // Merge in delay slot
             for(r=0;r<HOST_REGS;r++) {
               if(r!=EXCLUDE_REG) {
-                if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
-                if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
-                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
-                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
+                if((branch_regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
+                if((branch_regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
+                if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
+                if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
-                if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
-                if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
-                if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
-                if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
+                if((regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
+                if((regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
+                if((regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
+                if((regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
@@ -6766,34 +6520,25 @@ void clean_registers(int istart,int iend,int wr)
                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                 }
-                // Treat delay slot as part of branch too
-                /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
-                  will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
-                  wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
-                }
-                else
-                {
-                  will_dirty[i+1]&=~(1<<r);
-                }*/
               }
             }
           //}
             // Merge in delay slot
             for(r=0;r<HOST_REGS;r++) {
               if(r!=EXCLUDE_REG) {
-                if(!likely[i]) {
+                if (1) { // !dops[i].likely) {
                   // Might not dirty if likely branch is not taken
-                  if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
-                  if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
-                  if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
-                  if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
+                  if((branch_regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
+                  if((branch_regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
+                  if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
+                  if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
-                  //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
-                  //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
-                  if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
-                  if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
+                  //if((regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
+                  //if((regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
+                  if((regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
+                  if((regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
@@ -6804,15 +6549,15 @@ void clean_registers(int istart,int iend,int wr)
           // Merge in delay slot (won't dirty)
           for(r=0;r<HOST_REGS;r++) {
             if(r!=EXCLUDE_REG) {
-              if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
-              if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
-              if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
-              if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
+              if((regs[i].regmap[r]&63)==dops[i].rt1) wont_dirty_i|=1<<r;
+              if((regs[i].regmap[r]&63)==dops[i].rt2) wont_dirty_i|=1<<r;
+              if((regs[i].regmap[r]&63)==dops[i+1].rt1) wont_dirty_i|=1<<r;
+              if((regs[i].regmap[r]&63)==dops[i+1].rt2) wont_dirty_i|=1<<r;
               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
-              if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
-              if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
-              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
-              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
+              if((branch_regs[i].regmap[r]&63)==dops[i].rt1) wont_dirty_i|=1<<r;
+              if((branch_regs[i].regmap[r]&63)==dops[i].rt2) wont_dirty_i|=1<<r;
+              if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) wont_dirty_i|=1<<r;
+              if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) wont_dirty_i|=1<<r;
               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
             }
           }
@@ -6825,13 +6570,13 @@ void clean_registers(int istart,int iend,int wr)
         }
       }
     }
-    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
+    else if(dops[i].itype==SYSCALL||dops[i].itype==HLECALL||dops[i].itype==INTCALL)
     {
       // SYSCALL instruction (software interrupt)
       will_dirty_i=0;
       wont_dirty_i=0;
     }
-    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
+    else if(dops[i].itype==COP0 && (source[i]&0x3f)==0x18)
     {
       // ERET instruction (return from interrupt)
       will_dirty_i=0;
@@ -6841,21 +6586,21 @@ void clean_registers(int istart,int iend,int wr)
     wont_dirty_next=wont_dirty_i;
     for(r=0;r<HOST_REGS;r++) {
       if(r!=EXCLUDE_REG) {
-        if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
-        if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
+        if((regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
+        if((regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
-        if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
-        if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
+        if((regs[i].regmap[r]&63)==dops[i].rt1) wont_dirty_i|=1<<r;
+        if((regs[i].regmap[r]&63)==dops[i].rt2) wont_dirty_i|=1<<r;
         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
         if(i>istart) {
-          if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
+          if (!dops[i].is_jump)
           {
             // Don't store a register immediately after writing it,
             // may prevent dual-issue.
-            if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
-            if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
+            if((regs[i].regmap[r]&63)==dops[i-1].rt1) wont_dirty_i|=1<<r;
+            if((regs[i].regmap[r]&63)==dops[i-1].rt2) wont_dirty_i|=1<<r;
           }
         }
       }
@@ -6865,21 +6610,12 @@ void clean_registers(int istart,int iend,int wr)
     wont_dirty[i]=wont_dirty_i;
     // Mark registers that won't be dirtied as not dirty
     if(wr) {
-      /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
-      for(r=0;r<HOST_REGS;r++) {
-        if((will_dirty_i>>r)&1) {
-          printf(" r%d",r);
-        }
-      }
-      printf("\n");*/
-
-      //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
         regs[i].dirty|=will_dirty_i;
         #ifndef DESTRUCTIVE_WRITEBACK
         regs[i].dirty&=wont_dirty_i;
-        if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
+        if(dops[i].is_jump)
         {
-          if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
+          if (i < iend-1 && !dops[i].is_ujump) {
             for(r=0;r<HOST_REGS;r++) {
               if(r!=EXCLUDE_REG) {
                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
@@ -6951,82 +6687,80 @@ void clean_registers(int istart,int iend,int wr)
   /* disassembly */
 void disassemble_inst(int i)
 {
-    if (bt[i]) printf("*"); else printf(" ");
-    switch(itype[i]) {
+    if (dops[i].bt) printf("*"); else printf(" ");
+    switch(dops[i].itype) {
       case UJUMP:
         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
       case CJUMP:
-        printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
+        printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],dops[i].rs1,dops[i].rs2,i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
       case SJUMP:
-        printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
-      case FJUMP:
-        printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
+        printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],dops[i].rs1,start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
       case RJUMP:
-        if (opcode[i]==0x9&&rt1[i]!=31)
-          printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
+        if (dops[i].opcode==0x9&&dops[i].rt1!=31)
+          printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1);
         else
-          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
+          printf (" %x: %s r%d\n",start+i*4,insn[i],dops[i].rs1);
         break;
       case SPAN:
-        printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
+        printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],dops[i].rs1,dops[i].rs2,ba[i]);break;
       case IMM16:
-        if(opcode[i]==0xf) //LUI
-          printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
+        if(dops[i].opcode==0xf) //LUI
+          printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],dops[i].rt1,imm[i]&0xffff);
         else
-          printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
+          printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1,imm[i]);
         break;
       case LOAD:
       case LOADLR:
-        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
+        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1,imm[i]);
         break;
       case STORE:
       case STORELR:
-        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
+        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],dops[i].rs2,dops[i].rs1,imm[i]);
         break;
       case ALU:
       case SHIFT:
-        printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
+        printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1,dops[i].rs2);
         break;
       case MULTDIV:
-        printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
+        printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],dops[i].rs1,dops[i].rs2);
         break;
       case SHIFTIMM:
-        printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
+        printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1,imm[i]);
         break;
       case MOV:
-        if((opcode2[i]&0x1d)==0x10)
-          printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
-        else if((opcode2[i]&0x1d)==0x11)
-          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
+        if((dops[i].opcode2&0x1d)==0x10)
+          printf (" %x: %s r%d\n",start+i*4,insn[i],dops[i].rt1);
+        else if((dops[i].opcode2&0x1d)==0x11)
+          printf (" %x: %s r%d\n",start+i*4,insn[i],dops[i].rs1);
         else
           printf (" %x: %s\n",start+i*4,insn[i]);
         break;
       case COP0:
-        if(opcode2[i]==0)
-          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
-        else if(opcode2[i]==4)
-          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
+        if(dops[i].opcode2==0)
+          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],dops[i].rt1,(source[i]>>11)&0x1f); // MFC0
+        else if(dops[i].opcode2==4)
+          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],dops[i].rs1,(source[i]>>11)&0x1f); // MTC0
         else printf (" %x: %s\n",start+i*4,insn[i]);
         break;
       case COP1:
-        if(opcode2[i]<3)
-          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
-        else if(opcode2[i]>3)
-          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
+        if(dops[i].opcode2<3)
+          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],dops[i].rt1,(source[i]>>11)&0x1f); // MFC1
+        else if(dops[i].opcode2>3)
+          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],dops[i].rs1,(source[i]>>11)&0x1f); // MTC1
         else printf (" %x: %s\n",start+i*4,insn[i]);
         break;
       case COP2:
-        if(opcode2[i]<3)
-          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
-        else if(opcode2[i]>3)
-          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
+        if(dops[i].opcode2<3)
+          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],dops[i].rt1,(source[i]>>11)&0x1f); // MFC2
+        else if(dops[i].opcode2>3)
+          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],dops[i].rs1,(source[i]>>11)&0x1f); // MTC2
         else printf (" %x: %s\n",start+i*4,insn[i]);
         break;
       case C1LS:
-        printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
+        printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,dops[i].rs1,imm[i]);
         break;
       case C2LS:
-        printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
+        printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,dops[i].rs1,imm[i]);
         break;
       case INTCALL:
         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
@@ -7042,25 +6776,38 @@ static void disassemble_inst(int i) {}
 
 #define DRC_TEST_VAL 0x74657374
 
-static int new_dynarec_test(void)
+static void new_dynarec_test(void)
 {
-  int (*testfunc)(void) = (void *)out;
+  int (*testfunc)(void);
   void *beginning;
-  int ret;
+  int ret[2];
+  size_t i;
 
-  beginning = start_block();
-  emit_movimm(DRC_TEST_VAL,0); // test
-  emit_jmpreg(14);
-  literal_pool(0);
-  end_block(beginning);
-  SysPrintf("testing if we can run recompiled code..\n");
-  ret = testfunc();
-  if (ret == DRC_TEST_VAL)
+  // check structure linkage
+  if ((u_char *)rcnts - (u_char *)&psxRegs != sizeof(psxRegs))
+  {
+    SysPrintf("linkage_arm* miscompilation/breakage detected.\n");
+  }
+
+  SysPrintf("testing if we can run recompiled code...\n");
+  ((volatile u_int *)out)[0]++; // make cache dirty
+
+  for (i = 0; i < ARRAY_SIZE(ret); i++) {
+    out = ndrc->translation_cache;
+    beginning = start_block();
+    emit_movimm(DRC_TEST_VAL + i, 0); // test
+    emit_ret();
+    literal_pool(0);
+    end_block(beginning);
+    testfunc = beginning;
+    ret[i] = testfunc();
+  }
+
+  if (ret[0] == DRC_TEST_VAL && ret[1] == DRC_TEST_VAL + 1)
     SysPrintf("test passed.\n");
   else
-    SysPrintf("test failed: %08x\n", ret);
-  out=(u_char *)BASE_ADDR;
-  return ret == DRC_TEST_VAL;
+    SysPrintf("test failed, will likely crash soon (r=%08x %08x)\n", ret[0], ret[1]);
+  out = ndrc->translation_cache;
 }
 
 // clear the state completely, instead of just marking
@@ -7068,7 +6815,7 @@ static int new_dynarec_test(void)
 void new_dynarec_clear_full(void)
 {
   int n;
-  out=(u_char *)BASE_ADDR;
+  out = ndrc->translation_cache;
   memset(invalid_code,1,sizeof(invalid_code));
   memset(hash_table,0xff,sizeof(hash_table));
   memset(mini_ht,-1,sizeof(mini_ht));
@@ -7080,64 +6827,51 @@ void new_dynarec_clear_full(void)
   literalcount=0;
   stop_after_jal=0;
   inv_code_start=inv_code_end=~0;
+  f1_hack=0;
   // TLB
   for(n=0;n<4096;n++) ll_clear(jump_in+n);
   for(n=0;n<4096;n++) ll_clear(jump_out+n);
   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
+
+  cycle_multiplier_old = cycle_multiplier;
+  new_dynarec_hacks_old = new_dynarec_hacks;
 }
 
 void new_dynarec_init(void)
 {
   SysPrintf("Init new dynarec\n");
 
-#ifdef _3DS
-  check_rosalina();
-#endif
-
-  // allocate/prepare a buffer for translation cache
-  // see assem_arm.h for some explanation
-#if   defined(BASE_ADDR_FIXED)
-  if (mmap (translation_cache, 1 << TARGET_SIZE_2,
-        PROT_READ | PROT_WRITE | PROT_EXEC,
-        MAP_PRIVATE | MAP_ANONYMOUS,
-        -1, 0) != translation_cache)
-  {
-    SysPrintf("mmap() failed: %s\n", strerror(errno));
-    SysPrintf("disable BASE_ADDR_FIXED and recompile\n");
-    abort();
-  }
-#elif defined(BASE_ADDR_DYNAMIC)
-#ifdef VITA
-  sceBlock = getVMBlock();//sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
+#ifdef BASE_ADDR_DYNAMIC
+  #ifdef VITA
+  sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
   if (sceBlock < 0)
     SysPrintf("sceKernelAllocMemBlockForVM failed\n");
-  int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache);
+  int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&ndrc);
   if (ret < 0)
     SysPrintf("sceKernelGetMemBlockBase failed\n");
-    
-  sceKernelOpenVMDomain();
-  sceClibPrintf("translation_cache = 0x%08X \n ", translation_cache);
-#elif defined(_MSC_VER)
-  base_addr = VirtualAlloc(NULL, 1<<TARGET_SIZE_2, MEM_COMMIT | MEM_RESERVE,
-      PAGE_EXECUTE_READWRITE);
-#else
-  translation_cache = mmap (NULL, 1 << TARGET_SIZE_2,
-      PROT_READ | PROT_WRITE | PROT_EXEC,
-      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-  if (translation_cache == MAP_FAILED) {
+  #else
+  uintptr_t desired_addr = 0;
+  #ifdef __ELF__
+  extern char _end;
+  desired_addr = ((uintptr_t)&_end + 0xffffff) & ~0xffffffl;
+  #endif
+  ndrc = mmap((void *)desired_addr, sizeof(*ndrc),
+            PROT_READ | PROT_WRITE | PROT_EXEC,
+            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (ndrc == MAP_FAILED) {
     SysPrintf("mmap() failed: %s\n", strerror(errno));
     abort();
   }
-#endif
+  #endif
 #else
-#ifndef NO_WRITE_EXEC
+  #ifndef NO_WRITE_EXEC
   // not all systems allow execute in data segment by default
-  if (mprotect((void *)BASE_ADDR, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
+  if (mprotect(ndrc, sizeof(ndrc->translation_cache) + sizeof(ndrc->tramp.ops),
+               PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
     SysPrintf("mprotect() failed: %s\n", strerror(errno));
+  #endif
 #endif
-#endif
-
-  out=(u_char *)BASE_ADDR;
+  out = ndrc->translation_cache;
   cycle_multiplier=200;
   new_dynarec_clear_full();
 #ifdef HOST_IMM8
@@ -7146,9 +6880,7 @@ void new_dynarec_init(void)
 #endif
   arch_init();
   new_dynarec_test();
-#ifndef RAM_FIXED
-  ram_offset=(u_int)rdram-0x80000000;
-#endif
+  ram_offset=(uintptr_t)rdram-0x80000000;
   if (ram_offset!=0)
     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
 }
@@ -7156,45 +6888,47 @@ void new_dynarec_init(void)
 void new_dynarec_cleanup(void)
 {
   int n;
-#if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC)
-#ifndef VITA
-#if defined(_MSC_VER)
-  VirtualFree(base_addr, 0, MEM_RELEASE);
-#else
-  if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0)
+#ifdef BASE_ADDR_DYNAMIC // only this configuration allocates ndrc in new_dynarec_init
+  #ifdef VITA
+  sceKernelFreeMemBlock(sceBlock); // block obtained via sceKernelAllocMemBlockForVM() in new_dynarec_init
+  sceBlock = -1;
+  #else
+  if (munmap(ndrc, sizeof(*ndrc)) < 0) // same length that new_dynarec_init passed to mmap()
     SysPrintf("munmap() failed\n");
+  #endif // VITA
 #endif
-#endif
-#endif
-  for(n=0;n<4096;n++)
-    ll_clear(jump_in+n);
-  for(n=0;n<4096;n++)
-    ll_clear(jump_out+n);
-  for(n=0;n<4096;n++)
-    ll_clear(jump_dirty+n);
-#ifdef ROM_COPY
+  for(n=0;n<4096;n++) ll_clear(jump_in+n); // NOTE(review): presumably empties each of the 4096 list heads; ll_clear() is defined outside this hunk
+  for(n=0;n<4096;n++) ll_clear(jump_out+n);
+  for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
+  #ifdef ROM_COPY
   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
-#endif
+  #endif // ROM_COPY
 }
 
 static u_int *get_source_start(u_int addr, u_int *limit)
 {
   if (addr < 0x00200000 ||
-    (0xa0000000 <= addr && addr < 0xa0200000)) {
+    (0xa0000000 <= addr && addr < 0xa0200000))
+  {
     // used for BIOS calls mostly?
     *limit = (addr&0xa0000000)|0x00200000;
-    return (u_int *)((u_int)rdram + (addr&0x1fffff));
+    return (u_int *)(rdram + (addr&0x1fffff)); // NOTE(review): old code added a byte offset via a u_int cast; this form scales by rdram's pointee size - confirm rdram is a byte pointer
   }
   else if (!Config.HLE && (
     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
-    (0xbfc00000 <= addr && addr < 0xbfc80000))) {
-    // BIOS
+    (0xbfc00000 <= addr && addr < 0xbfc80000)))
+  {
+    // BIOS. The multiplier should be much higher as it's uncached 8bit mem,
+    // but timings in PCSX are too tied to the interpreter's BIAS
+    if (!HACK_ENABLED(NDHACK_OVERRIDE_CYCLE_M))
+      cycle_multiplier_active = 200; // side effect: replaces the value new_recompile_block chose just before calling this function
+
     *limit = (addr & 0xfff00000) | 0x80000;
-    return (u_int *)((u_int)psxR + (addr&0x7ffff));
+    return (u_int *)((u_char *)psxR + (addr&0x7ffff)); // explicit u_char cast keeps this a byte offset; mask matches the 0x80000-byte range tested above
   }
   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
     *limit = (addr & 0x80600000) + 0x00200000;
-    return (u_int *)((u_int)rdram + (addr&0x1fffff));
+    return (u_int *)(rdram + (addr&0x1fffff)); // 0x1fffff mask folds the address into a 0x200000-byte window of rdram
   }
   return NULL;
 }
@@ -7238,7 +6972,7 @@ int new_dynarec_save_blocks(void *save, int size)
   u_int addr;
 
   o = 0;
-  for (p = 0; p < sizeof(jump_in) / sizeof(jump_in[0]); p++) {
+  for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
     bcnt = 0;
     for (head = jump_in[p]; head != NULL; head = head->next) {
       tmp_blocks[bcnt].addr = head->vaddr;
@@ -7299,22 +7033,52 @@ void new_dynarec_load_blocks(const void *save, int size)
   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
 }
 
-int new_recompile_block(int addr)
+static void apply_hacks(void) // run after pass 1: matches two fixed instruction patterns in source[]/dops[] of the block being compiled
+{
+  int i;
+  if (HACK_ENABLED(NDHACK_NO_COMPAT_HACKS)) // compat hacks switched off: leave the block untouched
+    return;
+  /* special hack(s) */
+  for (i = 0; i < slen - 4; i++)
+  {
+    // lui a0, 0xf200; jal <rcnt_read>; ori a0, a0, 2; slti <rt>, v0, 28224
+    if (source[i] == 0x3c04f200 && dops[i+1].itype == UJUMP
+        && source[i+2] == 0x34840002 && dops[i+3].opcode == 0x0a
+        && imm[i+3] == 0x6e40 && dops[i+3].rs1 == 2)
+    {
+      SysPrintf("PE2 hack @%08x\n", start + (i+3)*4);
+      dops[i + 3].itype = NOP; // the matched slti is retyped as NOP
+    }
+  }
+  i = slen; // the second pattern is anchored at the end of the block
+  if (i > 10 && source[i-1] == 0 && source[i-2] == 0x03e00008
+      && source[i-4] == 0x8fbf0018 && source[i-6] == 0x00c0f809
+      && dops[i-7].itype == STORE)
+  {
+    i = i-8;
+    if (dops[i].itype == IMM16) // step over one optional immediate op in front of the store pair
+      i--;
+    // swl r2, 15(r6); swr r2, 12(r6); sw r6, *; jalr r6
+    if (dops[i].itype == STORELR && dops[i].rs1 == 6
+      && dops[i-1].itype == STORELR && dops[i-1].rs1 == 6)
+    {
+      SysPrintf("F1 hack from %08x\n", start);
+      if (f1_hack == 0)
+        f1_hack = ~0u; // ~0u arms the hack: new_recompile_block emits its F1 stub for the next block it compiles and stores that address in f1_hack
+    }
+  }
+}
+
+int new_recompile_block(u_int addr)
 {
   u_int pagelimit = 0;
   u_int state_rflags = 0;
   int i;
 
-  assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
-  //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
+  assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out);
   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
   //if(debug)
-  //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
-  /*if(Count>=312978186) {
-    rlist();
-  }*/
-  //rlist();
 
   // this is just for speculation
   for (i = 1; i < 32; i++) {
@@ -7323,7 +7087,7 @@ int new_recompile_block(int addr)
   }
 
   start = (u_int)addr&~3;
-  //assert(((u_int)addr&1)==0);
+  //assert(((u_int)addr&1)==0); // start-in-delay-slot flag
   new_dynarec_did_compile=1;
   if (Config.HLE && start == 0x80001000) // hlecall
   {
@@ -7333,18 +7097,42 @@ int new_recompile_block(int addr)
 
     invalid_code[start>>12]=0;
     emit_movimm(start,0);
-    emit_writeword(0,(int)&pcaddr);
-    emit_jmp((int)new_dyna_leave);
+    emit_writeword(0,&pcaddr);
+    emit_far_jump(new_dyna_leave);
     literal_pool(0);
     end_block(beginning);
     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
     return 0;
   }
+  else if (f1_hack == ~0u || (f1_hack != 0 && start == f1_hack)) {
+    void *beginning = start_block();
+    u_int page = get_page(start);
+    emit_readword(&psxRegs.GPR.n.sp, 0);
+    emit_readptr(&mem_rtab, 1);
+    emit_shrimm(0, 12, 2);
+    emit_readptr_dualindexedx_ptrlen(1, 2, 1);
+    emit_addimm(0, 0x18, 0);
+    emit_adds_ptr(1, 1, 1);
+    emit_ldr_dualindexed(1, 0, 0);
+    emit_writeword(0, &psxRegs.GPR.r[26]); // lw k0, 0x18(sp)
+    emit_far_call(get_addr_ht);
+    emit_jmpreg(0); // jr k0
+    literal_pool(0);
+    end_block(beginning);
+
+    ll_add_flags(jump_in + page, start, state_rflags, beginning);
+    SysPrintf("F1 hack to   %08x\n", start);
+    f1_hack = start;
+    return 0;
+  }
+
+  cycle_multiplier_active = cycle_multiplier_override && cycle_multiplier == CYCLE_MULT_DEFAULT
+    ? cycle_multiplier_override : cycle_multiplier;
 
   source = get_source_start(start, &pagelimit);
   if (source == NULL) {
     SysPrintf("Compile at bogus memory address: %08x\n", addr);
-    exit(1);
+    abort();
   }
 
   /* Pass 1: disassemble */
@@ -7367,9 +7155,11 @@ int new_recompile_block(int addr)
   /* Pass 1 disassembly */
 
   for(i=0;!done;i++) {
-    bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
+    dops[i].bt=0;
+    dops[i].ooo=0;
+    op2=0;
     minimum_free_regs[i]=0;
-    opcode[i]=op=source[i]>>26;
+    dops[i].opcode=op=source[i]>>26;
     switch(op)
     {
       case 0x00: strcpy(insn[i],"special"); type=NI;
@@ -7438,18 +7228,18 @@ int new_recompile_block(int addr)
         {
           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
-          case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
-          case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
-          case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
-          case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
-          case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
-          case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
-          case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
-          case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
+          //case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
+          //case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
+          //case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
+          //case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
+          //case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
+          //case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
+          //case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
+          //case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
-          case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
-          case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
+          //case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
+          //case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
         }
         break;
       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
@@ -7471,133 +7261,14 @@ int new_recompile_block(int addr)
         switch(op2)
         {
           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
+          case 0x02: strcpy(insn[i],"CFC0"); type=COP0; break;
           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
-          case 0x10: strcpy(insn[i],"tlb"); type=NI;
-          switch(source[i]&0x3f)
-          {
-            case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
-            case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
-            case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
-            case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
-            case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
-            //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
-          }
+          case 0x06: strcpy(insn[i],"CTC0"); type=COP0; break;
+          case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
         }
         break;
-      case 0x11: strcpy(insn[i],"cop1"); type=NI;
+      case 0x11: strcpy(insn[i],"cop1"); type=COP1;
         op2=(source[i]>>21)&0x1f;
-        switch(op2)
-        {
-          case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
-          case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
-          case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
-          case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
-          case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
-          case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
-          case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
-          switch((source[i]>>16)&0x3)
-          {
-            case 0x00: strcpy(insn[i],"BC1F"); break;
-            case 0x01: strcpy(insn[i],"BC1T"); break;
-            case 0x02: strcpy(insn[i],"BC1FL"); break;
-            case 0x03: strcpy(insn[i],"BC1TL"); break;
-          }
-          break;
-          case 0x10: strcpy(insn[i],"C1.S"); type=NI;
-          switch(source[i]&0x3f)
-          {
-            case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
-            case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
-            case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
-            case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
-            case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
-            case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
-            case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
-            case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
-            case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
-            case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
-            case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
-            case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
-            case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
-            case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
-            case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
-            case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
-            case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
-            case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
-            case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
-            case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
-            case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
-            case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
-            case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
-            case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
-            case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
-            case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
-            case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
-            case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
-            case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
-            case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
-            case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
-            case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
-            case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
-            case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
-            case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
-          }
-          break;
-          case 0x11: strcpy(insn[i],"C1.D"); type=NI;
-          switch(source[i]&0x3f)
-          {
-            case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
-            case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
-            case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
-            case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
-            case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
-            case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
-            case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
-            case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
-            case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
-            case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
-            case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
-            case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
-            case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
-            case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
-            case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
-            case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
-            case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
-            case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
-            case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
-            case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
-            case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
-            case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
-            case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
-            case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
-            case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
-            case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
-            case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
-            case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
-            case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
-            case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
-            case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
-            case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
-            case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
-            case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
-            case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
-          }
-          break;
-          case 0x14: strcpy(insn[i],"C1.W"); type=NI;
-          switch(source[i]&0x3f)
-          {
-            case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
-            case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
-          }
-          break;
-          case 0x15: strcpy(insn[i],"C1.L"); type=NI;
-          switch(source[i]&0x3f)
-          {
-            case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
-            case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
-          }
-          break;
-        }
         break;
 #if 0
       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
@@ -7645,7 +7316,7 @@ int new_recompile_block(int addr)
 #endif
       case 0x12: strcpy(insn[i],"COP2"); type=NI;
         op2=(source[i]>>21)&0x1f;
-        //if (op2 & 0x10) {
+        //if (op2 & 0x10)
         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
           if (gte_handlers[source[i]&0x3f]!=NULL) {
             if (gte_regnames[source[i]&0x3f]!=NULL)
@@ -7670,195 +7341,155 @@ int new_recompile_block(int addr)
         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
         break;
     }
-    itype[i]=type;
-    opcode2[i]=op2;
+    dops[i].itype=type;
+    dops[i].opcode2=op2;
     /* Get registers/immediates */
-    lt1[i]=0;
-    us1[i]=0;
-    us2[i]=0;
-    dep1[i]=0;
-    dep2[i]=0;
+    dops[i].lt1=0;
     gte_rs[i]=gte_rt[i]=0;
     switch(type) {
       case LOAD:
-        rs1[i]=(source[i]>>21)&0x1f;
-        rs2[i]=0;
-        rt1[i]=(source[i]>>16)&0x1f;
-        rt2[i]=0;
+        dops[i].rs1=(source[i]>>21)&0x1f;
+        dops[i].rs2=0;
+        dops[i].rt1=(source[i]>>16)&0x1f;
+        dops[i].rt2=0;
         imm[i]=(short)source[i];
         break;
       case STORE:
       case STORELR:
-        rs1[i]=(source[i]>>21)&0x1f;
-        rs2[i]=(source[i]>>16)&0x1f;
-        rt1[i]=0;
-        rt2[i]=0;
+        dops[i].rs1=(source[i]>>21)&0x1f;
+        dops[i].rs2=(source[i]>>16)&0x1f;
+        dops[i].rt1=0;
+        dops[i].rt2=0;
         imm[i]=(short)source[i];
-        if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
         break;
       case LOADLR:
         // LWL/LWR only load part of the register,
         // therefore the target register must be treated as a source too
-        rs1[i]=(source[i]>>21)&0x1f;
-        rs2[i]=(source[i]>>16)&0x1f;
-        rt1[i]=(source[i]>>16)&0x1f;
-        rt2[i]=0;
+        dops[i].rs1=(source[i]>>21)&0x1f;
+        dops[i].rs2=(source[i]>>16)&0x1f;
+        dops[i].rt1=(source[i]>>16)&0x1f;
+        dops[i].rt2=0;
         imm[i]=(short)source[i];
-        if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
-        if(op==0x26) dep1[i]=rt1[i]; // LWR
         break;
       case IMM16:
-        if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
-        else rs1[i]=(source[i]>>21)&0x1f;
-        rs2[i]=0;
-        rt1[i]=(source[i]>>16)&0x1f;
-        rt2[i]=0;
+        if (op==0x0f) dops[i].rs1=0; // LUI instruction has no source register
+        else dops[i].rs1=(source[i]>>21)&0x1f;
+        dops[i].rs2=0;
+        dops[i].rt1=(source[i]>>16)&0x1f;
+        dops[i].rt2=0;
         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
           imm[i]=(unsigned short)source[i];
         }else{
           imm[i]=(short)source[i];
         }
-        if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
-        if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
-        if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
         break;
       case UJUMP:
-        rs1[i]=0;
-        rs2[i]=0;
-        rt1[i]=0;
-        rt2[i]=0;
+        dops[i].rs1=0;
+        dops[i].rs2=0;
+        dops[i].rt1=0;
+        dops[i].rt2=0;
         // The JAL instruction writes to r31.
         if (op&1) {
-          rt1[i]=31;
+          dops[i].rt1=31;
         }
-        rs2[i]=CCREG;
+        dops[i].rs2=CCREG;
         break;
       case RJUMP:
-        rs1[i]=(source[i]>>21)&0x1f;
-        rs2[i]=0;
-        rt1[i]=0;
-        rt2[i]=0;
+        dops[i].rs1=(source[i]>>21)&0x1f;
+        dops[i].rs2=0;
+        dops[i].rt1=0;
+        dops[i].rt2=0;
         // The JALR instruction writes to rd.
         if (op2&1) {
-          rt1[i]=(source[i]>>11)&0x1f;
+          dops[i].rt1=(source[i]>>11)&0x1f;
         }
-        rs2[i]=CCREG;
+        dops[i].rs2=CCREG;
         break;
       case CJUMP:
-        rs1[i]=(source[i]>>21)&0x1f;
-        rs2[i]=(source[i]>>16)&0x1f;
-        rt1[i]=0;
-        rt2[i]=0;
+        dops[i].rs1=(source[i]>>21)&0x1f;
+        dops[i].rs2=(source[i]>>16)&0x1f;
+        dops[i].rt1=0;
+        dops[i].rt2=0;
         if(op&2) { // BGTZ/BLEZ
-          rs2[i]=0;
+          dops[i].rs2=0;
         }
-        us1[i]=rs1[i];
-        us2[i]=rs2[i];
-        likely[i]=op>>4;
         break;
       case SJUMP:
-        rs1[i]=(source[i]>>21)&0x1f;
-        rs2[i]=CCREG;
-        rt1[i]=0;
-        rt2[i]=0;
-        us1[i]=rs1[i];
+        dops[i].rs1=(source[i]>>21)&0x1f;
+        dops[i].rs2=CCREG;
+        dops[i].rt1=0;
+        dops[i].rt2=0;
         if(op2&0x10) { // BxxAL
-          rt1[i]=31;
+          dops[i].rt1=31;
           // NOTE: If the branch is not taken, r31 is still overwritten
         }
-        likely[i]=(op2&2)>>1;
-        break;
-      case FJUMP:
-        rs1[i]=FSREG;
-        rs2[i]=CSREG;
-        rt1[i]=0;
-        rt2[i]=0;
-        likely[i]=((source[i])>>17)&1;
         break;
       case ALU:
-        rs1[i]=(source[i]>>21)&0x1f; // source
-        rs2[i]=(source[i]>>16)&0x1f; // subtract amount
-        rt1[i]=(source[i]>>11)&0x1f; // destination
-        rt2[i]=0;
-        if(op2==0x2a||op2==0x2b) { // SLT/SLTU
-          us1[i]=rs1[i];us2[i]=rs2[i];
-        }
-        else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
-          dep1[i]=rs1[i];dep2[i]=rs2[i];
-        }
-        else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
-          dep1[i]=rs1[i];dep2[i]=rs2[i];
-        }
+        dops[i].rs1=(source[i]>>21)&0x1f; // source
+        dops[i].rs2=(source[i]>>16)&0x1f; // subtract amount
+        dops[i].rt1=(source[i]>>11)&0x1f; // destination
+        dops[i].rt2=0;
         break;
       case MULTDIV:
-        rs1[i]=(source[i]>>21)&0x1f; // source
-        rs2[i]=(source[i]>>16)&0x1f; // divisor
-        rt1[i]=HIREG;
-        rt2[i]=LOREG;
-        if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
-          us1[i]=rs1[i];us2[i]=rs2[i];
-        }
+        dops[i].rs1=(source[i]>>21)&0x1f; // source
+        dops[i].rs2=(source[i]>>16)&0x1f; // divisor
+        dops[i].rt1=HIREG;
+        dops[i].rt2=LOREG;
         break;
       case MOV:
-        rs1[i]=0;
-        rs2[i]=0;
-        rt1[i]=0;
-        rt2[i]=0;
-        if(op2==0x10) rs1[i]=HIREG; // MFHI
-        if(op2==0x11) rt1[i]=HIREG; // MTHI
-        if(op2==0x12) rs1[i]=LOREG; // MFLO
-        if(op2==0x13) rt1[i]=LOREG; // MTLO
-        if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
-        if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
-        dep1[i]=rs1[i];
+        dops[i].rs1=0;
+        dops[i].rs2=0;
+        dops[i].rt1=0;
+        dops[i].rt2=0;
+        if(op2==0x10) dops[i].rs1=HIREG; // MFHI
+        if(op2==0x11) dops[i].rt1=HIREG; // MTHI
+        if(op2==0x12) dops[i].rs1=LOREG; // MFLO
+        if(op2==0x13) dops[i].rt1=LOREG; // MTLO
+        if((op2&0x1d)==0x10) dops[i].rt1=(source[i]>>11)&0x1f; // MFxx
+        if((op2&0x1d)==0x11) dops[i].rs1=(source[i]>>21)&0x1f; // MTxx
         break;
       case SHIFT:
-        rs1[i]=(source[i]>>16)&0x1f; // target of shift
-        rs2[i]=(source[i]>>21)&0x1f; // shift amount
-        rt1[i]=(source[i]>>11)&0x1f; // destination
-        rt2[i]=0;
-        // DSLLV/DSRLV/DSRAV are 64-bit
-        if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
+        dops[i].rs1=(source[i]>>16)&0x1f; // target of shift
+        dops[i].rs2=(source[i]>>21)&0x1f; // shift amount
+        dops[i].rt1=(source[i]>>11)&0x1f; // destination
+        dops[i].rt2=0;
         break;
       case SHIFTIMM:
-        rs1[i]=(source[i]>>16)&0x1f;
-        rs2[i]=0;
-        rt1[i]=(source[i]>>11)&0x1f;
-        rt2[i]=0;
+        dops[i].rs1=(source[i]>>16)&0x1f;
+        dops[i].rs2=0;
+        dops[i].rt1=(source[i]>>11)&0x1f;
+        dops[i].rt2=0;
         imm[i]=(source[i]>>6)&0x1f;
         // DSxx32 instructions
         if(op2>=0x3c) imm[i]|=0x20;
-        // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
-        if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
         break;
       case COP0:
-        rs1[i]=0;
-        rs2[i]=0;
-        rt1[i]=0;
-        rt2[i]=0;
-        if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
-        if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
-        if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
-        if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
+        dops[i].rs1=0;
+        dops[i].rs2=0;
+        dops[i].rt1=0;
+        dops[i].rt2=0;
+        if(op2==0||op2==2) dops[i].rt1=(source[i]>>16)&0x1F; // MFC0/CFC0
+        if(op2==4||op2==6) dops[i].rs1=(source[i]>>16)&0x1F; // MTC0/CTC0
+        if(op2==4&&((source[i]>>11)&0x1f)==12) dops[i].rt2=CSREG; // Status
+        if(op2==16) if((source[i]&0x3f)==0x18) dops[i].rs2=CCREG; // ERET
         break;
       case COP1:
-        rs1[i]=0;
-        rs2[i]=0;
-        rt1[i]=0;
-        rt2[i]=0;
-        if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
-        if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
-        if(op2==5) us1[i]=rs1[i]; // DMTC1
-        rs2[i]=CSREG;
+        dops[i].rs1=0;
+        dops[i].rs2=0;
+        dops[i].rt1=0;
+        dops[i].rt2=0;
+        if(op2<3) dops[i].rt1=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
+        if(op2>3) dops[i].rs1=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
+        dops[i].rs2=CSREG;
         break;
       case COP2:
-        rs1[i]=0;
-        rs2[i]=0;
-        rt1[i]=0;
-        rt2[i]=0;
-        if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
-        if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
-        rs2[i]=CSREG;
+        dops[i].rs1=0;
+        dops[i].rs2=0;
+        dops[i].rt1=0;
+        dops[i].rt2=0;
+        if(op2<3) dops[i].rt1=(source[i]>>16)&0x1F; // MFC2/CFC2
+        if(op2>3) dops[i].rs1=(source[i]>>16)&0x1F; // MTC2/CTC2
+        dops[i].rs2=CSREG;
         int gr=(source[i]>>11)&0x1F;
         switch(op2)
         {
@@ -7869,26 +7500,26 @@ int new_recompile_block(int addr)
         }
         break;
       case C1LS:
-        rs1[i]=(source[i]>>21)&0x1F;
-        rs2[i]=CSREG;
-        rt1[i]=0;
-        rt2[i]=0;
+        dops[i].rs1=(source[i]>>21)&0x1F;
+        dops[i].rs2=CSREG;
+        dops[i].rt1=0;
+        dops[i].rt2=0;
         imm[i]=(short)source[i];
         break;
       case C2LS:
-        rs1[i]=(source[i]>>21)&0x1F;
-        rs2[i]=0;
-        rt1[i]=0;
-        rt2[i]=0;
+        dops[i].rs1=(source[i]>>21)&0x1F;
+        dops[i].rs2=0;
+        dops[i].rt1=0;
+        dops[i].rt2=0;
         imm[i]=(short)source[i];
         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
         break;
       case C2OP:
-        rs1[i]=0;
-        rs2[i]=0;
-        rt1[i]=0;
-        rt2[i]=0;
+        dops[i].rs1=0;
+        dops[i].rs2=0;
+        dops[i].rt1=0;
+        dops[i].rt2=0;
         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
         gte_rt[i]|=1ll<<63; // every op changes flags
@@ -7899,79 +7530,85 @@ int new_recompile_block(int addr)
           else gte_rs[i]|=3ll<<(v*2);
         }
         break;
-      case FLOAT:
-      case FCONV:
-        rs1[i]=0;
-        rs2[i]=CSREG;
-        rt1[i]=0;
-        rt2[i]=0;
-        break;
-      case FCOMP:
-        rs1[i]=FSREG;
-        rs2[i]=CSREG;
-        rt1[i]=FSREG;
-        rt2[i]=0;
-        break;
       case SYSCALL:
       case HLECALL:
       case INTCALL:
-        rs1[i]=CCREG;
-        rs2[i]=0;
-        rt1[i]=0;
-        rt2[i]=0;
+        dops[i].rs1=CCREG;
+        dops[i].rs2=0;
+        dops[i].rt1=0;
+        dops[i].rt2=0;
         break;
       default:
-        rs1[i]=0;
-        rs2[i]=0;
-        rt1[i]=0;
-        rt2[i]=0;
+        dops[i].rs1=0;
+        dops[i].rs2=0;
+        dops[i].rt1=0;
+        dops[i].rt2=0;
     }
     /* Calculate branch target addresses */
     if(type==UJUMP)
       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
-    else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
+    else if(type==CJUMP&&dops[i].rs1==dops[i].rs2&&(op&1))
       ba[i]=start+i*4+8; // Ignore never taken branch
-    else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
+    else if(type==SJUMP&&dops[i].rs1==0&&!(op2&1))
       ba[i]=start+i*4+8; // Ignore never taken branch
-    else if(type==CJUMP||type==SJUMP||type==FJUMP)
+    else if(type==CJUMP||type==SJUMP)
       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
     else ba[i]=-1;
-    if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
+
+    /* simplify always (not)taken branches */
+    if (type == CJUMP && dops[i].rs1 == dops[i].rs2) {
+      dops[i].rs1 = dops[i].rs2 = 0;
+      if (!(op & 1)) {
+        dops[i].itype = type = UJUMP;
+        dops[i].rs2 = CCREG;
+      }
+    }
+    else if (type == SJUMP && dops[i].rs1 == 0 && (op2 & 1))
+      dops[i].itype = type = UJUMP;
+
+    dops[i].is_jump = (dops[i].itype == RJUMP || dops[i].itype == UJUMP || dops[i].itype == CJUMP || dops[i].itype == SJUMP);
+    dops[i].is_ujump = (dops[i].itype == RJUMP || dops[i].itype == UJUMP); // || (source[i] >> 16) == 0x1000 // beq r0,r0
+    dops[i].is_load = (dops[i].itype == LOAD || dops[i].itype == LOADLR || op == 0x32); // LWC2
+    dops[i].is_store = (dops[i].itype == STORE || dops[i].itype == STORELR || op == 0x3a); // SWC2
+
+    /* messy cases to just pass over to the interpreter */
+    if (i > 0 && dops[i-1].is_jump) {
       int do_in_intrp=0;
       // branch in delay slot?
-      if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
+      if (dops[i].is_jump) {
         // don't handle first branch and call interpreter if it's hit
         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
         do_in_intrp=1;
       }
       // basic load delay detection
-      else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
+      else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&dops[i].rt1!=0) {
         int t=(ba[i-1]-start)/4;
-        if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
+        if(0 <= t && t < i &&(dops[i].rt1==dops[t].rs1||dops[i].rt1==dops[t].rs2)&&dops[t].itype!=CJUMP&&dops[t].itype!=SJUMP) {
           // jump target wants DS result - potential load delay effect
           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
           do_in_intrp=1;
-          bt[t+1]=1; // expected return from interpreter
+          dops[t+1].bt=1; // expected return from interpreter
         }
-        else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
-              !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
+        else if(i>=2&&dops[i-2].rt1==2&&dops[i].rt1==2&&dops[i].rs1!=2&&dops[i].rs2!=2&&dops[i-1].rs1!=2&&dops[i-1].rs2!=2&&
+              !(i>=3&&dops[i-3].is_jump)) {
           // v0 overwrite like this is a sign of trouble, bail out
           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
           do_in_intrp=1;
         }
       }
       if(do_in_intrp) {
-        rs1[i-1]=CCREG;
-        rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
+        dops[i-1].rs1=CCREG;
+        dops[i-1].rs2=dops[i-1].rt1=dops[i-1].rt2=0;
         ba[i-1]=-1;
-        itype[i-1]=INTCALL;
+        dops[i-1].itype=INTCALL;
         done=2;
         i--; // don't compile the DS
       }
     }
+
     /* Is this the end of the block? */
-    if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
-      if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
+    if (i > 0 && dops[i-1].is_ujump) {
+      if(dops[i-1].rt1==0) { // Continue past subroutine call (JAL)
         done=2;
       }
       else {
@@ -7984,8 +7621,8 @@ int new_recompile_block(int addr)
       // Don't get too close to the limit
       if(i>MAXBLOCK/2) done=1;
     }
-    if(itype[i]==SYSCALL&&stop_after_jal) done=1;
-    if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
+    if(dops[i].itype==SYSCALL&&stop_after_jal) done=1;
+    if(dops[i].itype==HLECALL||dops[i].itype==INTCALL) done=2;
     if(done==2) {
       // Does the block continue due to a branch?
       for(j=i-1;j>=0;j--)
@@ -8000,19 +7637,21 @@ int new_recompile_block(int addr)
     assert(start+i*4<pagelimit);
     if (i==MAXBLOCK-1) done=1;
     // Stop if we're compiling junk
-    if(itype[i]==NI&&opcode[i]==0x11) {
+    if(dops[i].itype==NI&&dops[i].opcode==0x11) {
       done=stop_after_jal=1;
       SysPrintf("Disabled speculative precompilation\n");
     }
   }
   slen=i;
-  if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
+  if (dops[i-1].is_jump) {
     if(start+i*4==pagelimit) {
-      itype[i-1]=SPAN;
+      dops[i-1].itype=SPAN;
     }
   }
   assert(slen>0);
 
+  apply_hacks();
+
   /* Pass 2 - Register dependencies and branch targets */
 
   unneeded_registers(0,slen-1,0);
@@ -8020,10 +7659,8 @@ int new_recompile_block(int addr)
   /* Pass 3 - Register allocation */
 
   struct regstat current; // Current register allocations/status
-  current.is32=1;
   current.dirty=0;
   current.u=unneeded_reg[0];
-  current.uu=unneeded_reg_upper[0];
   clear_all_regs(current.regmap);
   alloc_reg(&current,0,CCREG);
   dirty_reg(&current,CCREG);
@@ -8037,16 +7674,15 @@ int new_recompile_block(int addr)
   if((u_int)addr&1) {
     // First instruction is delay slot
     cc=-1;
-    bt[1]=1;
+    dops[1].bt=1;
     ds=1;
     unneeded_reg[0]=1;
-    unneeded_reg_upper[0]=1;
     current.regmap[HOST_BTREG]=BTREG;
   }
 
   for(i=0;i<slen;i++)
   {
-    if(bt[i])
+    if(dops[i].bt)
     {
       int hr;
       for(hr=0;hr<HOST_REGS;hr++)
@@ -8057,81 +7693,44 @@ int new_recompile_block(int addr)
       current.isconst=0;
       current.waswritten=0;
     }
-    if(i>1)
-    {
-      if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
-      {
-        if(rs1[i-2]==0||rs2[i-2]==0)
-        {
-          if(rs1[i-2]) {
-            current.is32|=1LL<<rs1[i-2];
-            int hr=get_reg(current.regmap,rs1[i-2]|64);
-            if(hr>=0) current.regmap[hr]=-1;
-          }
-          if(rs2[i-2]) {
-            current.is32|=1LL<<rs2[i-2];
-            int hr=get_reg(current.regmap,rs2[i-2]|64);
-            if(hr>=0) current.regmap[hr]=-1;
-          }
-        }
-      }
-    }
-    current.is32=-1LL;
 
     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
     regs[i].wasconst=current.isconst;
-    regs[i].was32=current.is32;
     regs[i].wasdirty=current.dirty;
     regs[i].loadedconst=0;
-    if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
+    if (!dops[i].is_jump) {
       if(i+1<slen) {
-        current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
-        current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
-        if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
+        current.u=unneeded_reg[i+1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
         current.u|=1;
-        current.uu|=1;
       } else {
         current.u=1;
-        current.uu=1;
       }
     } else {
       if(i+1<slen) {
-        current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
-        current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
-        if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
-        current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
-        current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
+        current.u=branch_unneeded_reg[i]&~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
+        current.u&=~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
         current.u|=1;
-        current.uu|=1;
-      } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
+      } else { SysPrintf("oops, branch at end of block with no delay slot\n");abort(); }
     }
-    is_ds[i]=ds;
+    dops[i].is_ds=ds;
     if(ds) {
       ds=0; // Skip delay slot, already allocated as part of branch
       // ...but we need to alloc it in case something jumps here
       if(i+1<slen) {
         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
-        current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
       }else{
         current.u=branch_unneeded_reg[i-1];
-        current.uu=branch_unneeded_reg_upper[i-1];
       }
-      current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
-      current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
-      if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
+      current.u&=~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
       current.u|=1;
-      current.uu|=1;
       struct regstat temp;
       memcpy(&temp,&current,sizeof(current));
       temp.wasdirty=temp.dirty;
-      temp.was32=temp.is32;
       // TODO: Take into account unconditional branches, as below
       delayslot_alloc(&temp,i);
       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
       regs[i].wasdirty=temp.wasdirty;
-      regs[i].was32=temp.was32;
       regs[i].dirty=temp.dirty;
-      regs[i].is32=temp.is32;
       regs[i].isconst=0;
       regs[i].wasconst=0;
       current.isconst=0;
@@ -8145,7 +7744,7 @@ int new_recompile_block(int addr)
           }
           else
           {
-            if(r<64){
+              assert(r < 64);
               if((current.u>>r)&1) {
                 regs[i].regmap_entry[hr]=-1;
                 regs[i].regmap[hr]=-1;
@@ -8153,16 +7752,6 @@ int new_recompile_block(int addr)
                 //current.regmap[hr]=-1;
               }else
                 regs[i].regmap_entry[hr]=r;
-            }
-            else {
-              if((current.uu>>(r&63))&1) {
-                regs[i].regmap_entry[hr]=-1;
-                regs[i].regmap[hr]=-1;
-                //Don't clear regs in the delay slot as the branch might need them
-                //current.regmap[hr]=-1;
-              }else
-                regs[i].regmap_entry[hr]=r;
-            }
           }
         } else {
           // First instruction expects CCREG to be allocated
@@ -8174,25 +7763,24 @@ int new_recompile_block(int addr)
       }
     }
     else { // Not delay slot
-      switch(itype[i]) {
+      switch(dops[i].itype) {
         case UJUMP:
           //current.isconst=0; // DEBUG
           //current.wasconst=0; // DEBUG
           //regs[i].wasconst=0; // DEBUG
-          clear_const(&current,rt1[i]);
+          clear_const(&current,dops[i].rt1);
           alloc_cc(&current,i);
           dirty_reg(&current,CCREG);
-          if (rt1[i]==31) {
+          if (dops[i].rt1==31) {
             alloc_reg(&current,i,31);
             dirty_reg(&current,31);
-            //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
-            //assert(rt1[i+1]!=rt1[i]);
+            //assert(dops[i+1].rs1!=31&&dops[i+1].rs2!=31);
+            //assert(dops[i+1].rt1!=dops[i].rt1);
             #ifdef REG_PREFETCH
             alloc_reg(&current,i,PTEMP);
             #endif
-            //current.is32|=1LL<<rt1[i];
           }
-          ooo[i]=1;
+          dops[i].ooo=1;
           delayslot_alloc(&current,i+1);
           //current.isconst=0; // DEBUG
           ds=1;
@@ -8202,27 +7790,25 @@ int new_recompile_block(int addr)
           //current.isconst=0;
           //current.wasconst=0;
           //regs[i].wasconst=0;
-          clear_const(&current,rs1[i]);
-          clear_const(&current,rt1[i]);
+          clear_const(&current,dops[i].rs1);
+          clear_const(&current,dops[i].rt1);
           alloc_cc(&current,i);
           dirty_reg(&current,CCREG);
-          if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
-            alloc_reg(&current,i,rs1[i]);
-            if (rt1[i]!=0) {
-              alloc_reg(&current,i,rt1[i]);
-              dirty_reg(&current,rt1[i]);
-              assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
-              assert(rt1[i+1]!=rt1[i]);
+          if (!ds_writes_rjump_rs(i)) {
+            alloc_reg(&current,i,dops[i].rs1);
+            if (dops[i].rt1!=0) {
+              alloc_reg(&current,i,dops[i].rt1);
+              dirty_reg(&current,dops[i].rt1);
+              assert(dops[i+1].rs1!=dops[i].rt1&&dops[i+1].rs2!=dops[i].rt1);
+              assert(dops[i+1].rt1!=dops[i].rt1);
               #ifdef REG_PREFETCH
               alloc_reg(&current,i,PTEMP);
               #endif
             }
             #ifdef USE_MINI_HT
-            if(rs1[i]==31) { // JALR
+            if(dops[i].rs1==31) { // JALR
               alloc_reg(&current,i,RHASH);
-              #ifndef HOST_IMM_ADDR32
               alloc_reg(&current,i,RHTBL);
-              #endif
             }
             #endif
             delayslot_alloc(&current,i+1);
@@ -8237,105 +7823,78 @@ int new_recompile_block(int addr)
             alloc_reg(&current,i,RTEMP);
           }
           //current.isconst=0; // DEBUG
-          ooo[i]=1;
+          dops[i].ooo=1;
           ds=1;
           break;
         case CJUMP:
           //current.isconst=0;
           //current.wasconst=0;
           //regs[i].wasconst=0;
-          clear_const(&current,rs1[i]);
-          clear_const(&current,rs2[i]);
-          if((opcode[i]&0x3E)==4) // BEQ/BNE
+          clear_const(&current,dops[i].rs1);
+          clear_const(&current,dops[i].rs2);
+          if((dops[i].opcode&0x3E)==4) // BEQ/BNE
           {
             alloc_cc(&current,i);
             dirty_reg(&current,CCREG);
-            if(rs1[i]) alloc_reg(&current,i,rs1[i]);
-            if(rs2[i]) alloc_reg(&current,i,rs2[i]);
-            if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
-            {
-              if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
-              if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
-            }
-            if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
-               (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
+            if(dops[i].rs1) alloc_reg(&current,i,dops[i].rs1);
+            if(dops[i].rs2) alloc_reg(&current,i,dops[i].rs2);
+            if((dops[i].rs1&&(dops[i].rs1==dops[i+1].rt1||dops[i].rs1==dops[i+1].rt2))||
+               (dops[i].rs2&&(dops[i].rs2==dops[i+1].rt1||dops[i].rs2==dops[i+1].rt2))) {
               // The delay slot overwrites one of our conditions.
               // Allocate the branch condition registers instead.
               current.isconst=0;
               current.wasconst=0;
               regs[i].wasconst=0;
-              if(rs1[i]) alloc_reg(&current,i,rs1[i]);
-              if(rs2[i]) alloc_reg(&current,i,rs2[i]);
-              if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
-              {
-                if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
-                if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
-              }
+              if(dops[i].rs1) alloc_reg(&current,i,dops[i].rs1);
+              if(dops[i].rs2) alloc_reg(&current,i,dops[i].rs2);
             }
             else
             {
-              ooo[i]=1;
+              dops[i].ooo=1;
               delayslot_alloc(&current,i+1);
             }
           }
           else
-          if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
+          if((dops[i].opcode&0x3E)==6) // BLEZ/BGTZ
           {
             alloc_cc(&current,i);
             dirty_reg(&current,CCREG);
-            alloc_reg(&current,i,rs1[i]);
-            if(!(current.is32>>rs1[i]&1))
-            {
-              alloc_reg64(&current,i,rs1[i]);
-            }
-            if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
+            alloc_reg(&current,i,dops[i].rs1);
+            if(dops[i].rs1&&(dops[i].rs1==dops[i+1].rt1||dops[i].rs1==dops[i+1].rt2)) {
               // The delay slot overwrites one of our conditions.
               // Allocate the branch condition registers instead.
               current.isconst=0;
               current.wasconst=0;
               regs[i].wasconst=0;
-              if(rs1[i]) alloc_reg(&current,i,rs1[i]);
-              if(!((current.is32>>rs1[i])&1))
-              {
-                if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
-              }
+              if(dops[i].rs1) alloc_reg(&current,i,dops[i].rs1);
             }
             else
             {
-              ooo[i]=1;
+              dops[i].ooo=1;
               delayslot_alloc(&current,i+1);
             }
           }
           else
           // Don't alloc the delay slot yet because we might not execute it
-          if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
+          if((dops[i].opcode&0x3E)==0x14) // BEQL/BNEL
           {
             current.isconst=0;
             current.wasconst=0;
             regs[i].wasconst=0;
             alloc_cc(&current,i);
             dirty_reg(&current,CCREG);
-            alloc_reg(&current,i,rs1[i]);
-            alloc_reg(&current,i,rs2[i]);
-            if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
-            {
-              alloc_reg64(&current,i,rs1[i]);
-              alloc_reg64(&current,i,rs2[i]);
-            }
+            alloc_reg(&current,i,dops[i].rs1);
+            alloc_reg(&current,i,dops[i].rs2);
           }
           else
-          if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
+          if((dops[i].opcode&0x3E)==0x16) // BLEZL/BGTZL
           {
             current.isconst=0;
             current.wasconst=0;
             regs[i].wasconst=0;
             alloc_cc(&current,i);
             dirty_reg(&current,CCREG);
-            alloc_reg(&current,i,rs1[i]);
-            if(!(current.is32>>rs1[i]&1))
-            {
-              alloc_reg64(&current,i,rs1[i]);
-            }
+            alloc_reg(&current,i,dops[i].rs1);
           }
           ds=1;
           //current.isconst=0;
@@ -8344,103 +7903,49 @@ int new_recompile_block(int addr)
           //current.isconst=0;
           //current.wasconst=0;
           //regs[i].wasconst=0;
-          clear_const(&current,rs1[i]);
-          clear_const(&current,rt1[i]);
-          //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
-          if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
+          clear_const(&current,dops[i].rs1);
+          clear_const(&current,dops[i].rt1);
+          //if((dops[i].opcode2&0x1E)==0x0) // BLTZ/BGEZ
+          if((dops[i].opcode2&0x0E)==0x0) // BLTZ/BGEZ
           {
             alloc_cc(&current,i);
             dirty_reg(&current,CCREG);
-            alloc_reg(&current,i,rs1[i]);
-            if(!(current.is32>>rs1[i]&1))
-            {
-              alloc_reg64(&current,i,rs1[i]);
-            }
-            if (rt1[i]==31) { // BLTZAL/BGEZAL
+            alloc_reg(&current,i,dops[i].rs1);
+            if (dops[i].rt1==31) { // BLTZAL/BGEZAL
               alloc_reg(&current,i,31);
               dirty_reg(&current,31);
               //#ifdef REG_PREFETCH
               //alloc_reg(&current,i,PTEMP);
               //#endif
-              //current.is32|=1LL<<rt1[i];
             }
-            if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
-               ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
+            if((dops[i].rs1&&(dops[i].rs1==dops[i+1].rt1||dops[i].rs1==dops[i+1].rt2)) // The delay slot overwrites the branch condition.
+               ||(dops[i].rt1==31&&(dops[i+1].rs1==31||dops[i+1].rs2==31||dops[i+1].rt1==31||dops[i+1].rt2==31))) { // DS touches $ra
               // Allocate the branch condition registers instead.
               current.isconst=0;
               current.wasconst=0;
               regs[i].wasconst=0;
-              if(rs1[i]) alloc_reg(&current,i,rs1[i]);
-              if(!((current.is32>>rs1[i])&1))
-              {
-                if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
-              }
+              if(dops[i].rs1) alloc_reg(&current,i,dops[i].rs1);
             }
             else
             {
-              ooo[i]=1;
+              dops[i].ooo=1;
               delayslot_alloc(&current,i+1);
             }
           }
           else
           // Don't alloc the delay slot yet because we might not execute it
-          if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
+          if((dops[i].opcode2&0x1E)==0x2) // BLTZL/BGEZL
           {
             current.isconst=0;
             current.wasconst=0;
             regs[i].wasconst=0;
             alloc_cc(&current,i);
             dirty_reg(&current,CCREG);
-            alloc_reg(&current,i,rs1[i]);
-            if(!(current.is32>>rs1[i]&1))
-            {
-              alloc_reg64(&current,i,rs1[i]);
-            }
+            alloc_reg(&current,i,dops[i].rs1);
           }
           ds=1;
           //current.isconst=0;
           break;
-        case FJUMP:
-          current.isconst=0;
-          current.wasconst=0;
-          regs[i].wasconst=0;
-          if(likely[i]==0) // BC1F/BC1T
-          {
-            // TODO: Theoretically we can run out of registers here on x86.
-            // The delay slot can allocate up to six, and we need to check
-            // CSREG before executing the delay slot.  Possibly we can drop
-            // the cycle count and then reload it after checking that the
-            // FPU is in a usable state, or don't do out-of-order execution.
-            alloc_cc(&current,i);
-            dirty_reg(&current,CCREG);
-            alloc_reg(&current,i,FSREG);
-            alloc_reg(&current,i,CSREG);
-            if(itype[i+1]==FCOMP) {
-              // The delay slot overwrites the branch condition.
-              // Allocate the branch condition registers instead.
-              alloc_cc(&current,i);
-              dirty_reg(&current,CCREG);
-              alloc_reg(&current,i,CSREG);
-              alloc_reg(&current,i,FSREG);
-            }
-            else {
-              ooo[i]=1;
-              delayslot_alloc(&current,i+1);
-              alloc_reg(&current,i+1,CSREG);
-            }
-          }
-          else
-          // Don't alloc the delay slot yet because we might not execute it
-          if(likely[i]) // BC1FL/BC1TL
-          {
-            alloc_cc(&current,i);
-            dirty_reg(&current,CCREG);
-            alloc_reg(&current,i,CSREG);
-            alloc_reg(&current,i,FSREG);
-          }
-          ds=1;
-          current.isconst=0;
-          break;
         case IMM16:
           imm16_alloc(&current,i);
           break;
@@ -8471,8 +7976,9 @@ int new_recompile_block(int addr)
           cop0_alloc(&current,i);
           break;
         case COP1:
+          break;
         case COP2:
-          cop1_alloc(&current,i);
+          cop2_alloc(&current,i);
           break;
         case C1LS:
           c1ls_alloc(&current,i);
@@ -8483,15 +7989,6 @@ int new_recompile_block(int addr)
         case C2OP:
           c2op_alloc(&current,i);
           break;
-        case FCONV:
-          fconv_alloc(&current,i);
-          break;
-        case FLOAT:
-          float_alloc(&current,i);
-          break;
-        case FCOMP:
-          fcomp_alloc(&current,i);
-          break;
         case SYSCALL:
         case HLECALL:
         case INTCALL:
@@ -8502,20 +7999,6 @@ int new_recompile_block(int addr)
           break;
       }
 
-      // Drop the upper half of registers that have become 32-bit
-      current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
-      if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
-        current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
-        if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
-        current.uu|=1;
-      } else {
-        current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
-        current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
-        if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
-        current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
-        current.uu|=1;
-      }
-
       // Create entry (branch target) regmap
       for(hr=0;hr<HOST_REGS;hr++)
       {
@@ -8543,7 +8026,8 @@ int new_recompile_block(int addr)
               regs[i].regmap_entry[hr]=0;
             }
             else
-            if(r<64){
+            {
+              assert(r<64);
               if((current.u>>r)&1) {
                 regs[i].regmap_entry[hr]=-1;
                 //regs[i].regmap[hr]=-1;
@@ -8551,14 +8035,6 @@ int new_recompile_block(int addr)
               }else
                 regs[i].regmap_entry[hr]=r;
             }
-            else {
-              if((current.uu>>(r&63))&1) {
-                regs[i].regmap_entry[hr]=-1;
-                //regs[i].regmap[hr]=-1;
-                current.regmap[hr]=-1;
-              }else
-                regs[i].regmap_entry[hr]=r;
-            }
           }
         } else {
           // Branches expect CCREG to be allocated at the target
@@ -8571,137 +8047,111 @@ int new_recompile_block(int addr)
       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
     }
 
-    if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
-      current.waswritten|=1<<rs1[i-1];
-    current.waswritten&=~(1<<rt1[i]);
-    current.waswritten&=~(1<<rt2[i]);
-    if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
-      current.waswritten&=~(1<<rs1[i]);
+    if(i>0&&(dops[i-1].itype==STORE||dops[i-1].itype==STORELR||(dops[i-1].itype==C2LS&&dops[i-1].opcode==0x3a))&&(u_int)imm[i-1]<0x800)
+      current.waswritten|=1<<dops[i-1].rs1;
+    current.waswritten&=~(1<<dops[i].rt1);
+    current.waswritten&=~(1<<dops[i].rt2);
+    if((dops[i].itype==STORE||dops[i].itype==STORELR||(dops[i].itype==C2LS&&dops[i].opcode==0x3a))&&(u_int)imm[i]>=0x800)
+      current.waswritten&=~(1<<dops[i].rs1);
 
     /* Branch post-alloc */
     if(i>0)
     {
-      current.was32=current.is32;
       current.wasdirty=current.dirty;
-      switch(itype[i-1]) {
+      switch(dops[i-1].itype) {
         case UJUMP:
           memcpy(&branch_regs[i-1],&current,sizeof(current));
           branch_regs[i-1].isconst=0;
           branch_regs[i-1].wasconst=0;
-          branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
-          branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
+          branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<dops[i-1].rs1)|(1LL<<dops[i-1].rs2));
           alloc_cc(&branch_regs[i-1],i-1);
           dirty_reg(&branch_regs[i-1],CCREG);
-          if(rt1[i-1]==31) { // JAL
+          if(dops[i-1].rt1==31) { // JAL
             alloc_reg(&branch_regs[i-1],i-1,31);
             dirty_reg(&branch_regs[i-1],31);
-            branch_regs[i-1].is32|=1LL<<31;
           }
           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
-          memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
+          memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
           break;
         case RJUMP:
           memcpy(&branch_regs[i-1],&current,sizeof(current));
           branch_regs[i-1].isconst=0;
           branch_regs[i-1].wasconst=0;
-          branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
-          branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
+          branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<dops[i-1].rs1)|(1LL<<dops[i-1].rs2));
           alloc_cc(&branch_regs[i-1],i-1);
           dirty_reg(&branch_regs[i-1],CCREG);
-          alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
-          if(rt1[i-1]!=0) { // JALR
-            alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
-            dirty_reg(&branch_regs[i-1],rt1[i-1]);
-            branch_regs[i-1].is32|=1LL<<rt1[i-1];
+          alloc_reg(&branch_regs[i-1],i-1,dops[i-1].rs1);
+          if(dops[i-1].rt1!=0) { // JALR
+            alloc_reg(&branch_regs[i-1],i-1,dops[i-1].rt1);
+            dirty_reg(&branch_regs[i-1],dops[i-1].rt1);
           }
           #ifdef USE_MINI_HT
-          if(rs1[i-1]==31) { // JALR
+          if(dops[i-1].rs1==31) { // JALR
             alloc_reg(&branch_regs[i-1],i-1,RHASH);
-            #ifndef HOST_IMM_ADDR32
             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
-            #endif
           }
           #endif
           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
-          memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
+          memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
           break;
         case CJUMP:
-          if((opcode[i-1]&0x3E)==4) // BEQ/BNE
+          if((dops[i-1].opcode&0x3E)==4) // BEQ/BNE
           {
             alloc_cc(&current,i-1);
             dirty_reg(&current,CCREG);
-            if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
-               (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
+            if((dops[i-1].rs1&&(dops[i-1].rs1==dops[i].rt1||dops[i-1].rs1==dops[i].rt2))||
+               (dops[i-1].rs2&&(dops[i-1].rs2==dops[i].rt1||dops[i-1].rs2==dops[i].rt2))) {
               // The delay slot overwrote one of our conditions
               // Delay slot goes after the test (in order)
-              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
-              current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
-              if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
+              current.u=branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
               current.u|=1;
-              current.uu|=1;
               delayslot_alloc(&current,i);
               current.isconst=0;
             }
             else
             {
-              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
-              current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
+              current.u=branch_unneeded_reg[i-1]&~((1LL<<dops[i-1].rs1)|(1LL<<dops[i-1].rs2));
               // Alloc the branch condition registers
-              if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
-              if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
-              if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
-              {
-                if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
-                if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
-              }
+              if(dops[i-1].rs1) alloc_reg(&current,i-1,dops[i-1].rs1);
+              if(dops[i-1].rs2) alloc_reg(&current,i-1,dops[i-1].rs2);
             }
             memcpy(&branch_regs[i-1],&current,sizeof(current));
             branch_regs[i-1].isconst=0;
             branch_regs[i-1].wasconst=0;
             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
-            memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
+            memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
           }
           else
-          if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
+          if((dops[i-1].opcode&0x3E)==6) // BLEZ/BGTZ
           {
             alloc_cc(&current,i-1);
             dirty_reg(&current,CCREG);
-            if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
+            if(dops[i-1].rs1==dops[i].rt1||dops[i-1].rs1==dops[i].rt2) {
               // The delay slot overwrote the branch condition
               // Delay slot goes after the test (in order)
-              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
-              current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
-              if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
+              current.u=branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
               current.u|=1;
-              current.uu|=1;
               delayslot_alloc(&current,i);
               current.isconst=0;
             }
             else
             {
-              current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
-              current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
+              current.u=branch_unneeded_reg[i-1]&~(1LL<<dops[i-1].rs1);
               // Alloc the branch condition register
-              alloc_reg(&current,i-1,rs1[i-1]);
-              if(!(current.is32>>rs1[i-1]&1))
-              {
-                alloc_reg64(&current,i-1,rs1[i-1]);
-              }
+              alloc_reg(&current,i-1,dops[i-1].rs1);
             }
             memcpy(&branch_regs[i-1],&current,sizeof(current));
             branch_regs[i-1].isconst=0;
             branch_regs[i-1].wasconst=0;
             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
-            memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
+            memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
           }
           else
           // Alloc the delay slot in case the branch is taken
-          if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
+          if((dops[i-1].opcode&0x3E)==0x14) // BEQL/BNEL
           {
             memcpy(&branch_regs[i-1],&current,sizeof(current));
-            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
-            branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
-            if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
+            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2)|(1LL<<dops[i].rt1)|(1LL<<dops[i].rt2)))|1;
             alloc_cc(&branch_regs[i-1],i);
             dirty_reg(&branch_regs[i-1],CCREG);
             delayslot_alloc(&branch_regs[i-1],i);
@@ -8711,12 +8161,10 @@ int new_recompile_block(int addr)
             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
           }
           else
-          if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
+          if((dops[i-1].opcode&0x3E)==0x16) // BLEZL/BGTZL
           {
             memcpy(&branch_regs[i-1],&current,sizeof(current));
-            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
-            branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
-            if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
+            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2)|(1LL<<dops[i].rt1)|(1LL<<dops[i].rt2)))|1;
             alloc_cc(&branch_regs[i-1],i);
             dirty_reg(&branch_regs[i-1],CCREG);
             delayslot_alloc(&branch_regs[i-1],i);
@@ -8727,47 +8175,37 @@ int new_recompile_block(int addr)
           }
           break;
         case SJUMP:
-          //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
-          if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
+          //if((dops[i-1].opcode2&0x1E)==0) // BLTZ/BGEZ
+          if((dops[i-1].opcode2&0x0E)==0) // BLTZ/BGEZ
           {
             alloc_cc(&current,i-1);
             dirty_reg(&current,CCREG);
-            if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
+            if(dops[i-1].rs1==dops[i].rt1||dops[i-1].rs1==dops[i].rt2) {
               // The delay slot overwrote the branch condition
               // Delay slot goes after the test (in order)
-              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
-              current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
-              if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
+              current.u=branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
               current.u|=1;
-              current.uu|=1;
               delayslot_alloc(&current,i);
               current.isconst=0;
             }
             else
             {
-              current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
-              current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
+              current.u=branch_unneeded_reg[i-1]&~(1LL<<dops[i-1].rs1);
               // Alloc the branch condition register
-              alloc_reg(&current,i-1,rs1[i-1]);
-              if(!(current.is32>>rs1[i-1]&1))
-              {
-                alloc_reg64(&current,i-1,rs1[i-1]);
-              }
+              alloc_reg(&current,i-1,dops[i-1].rs1);
             }
             memcpy(&branch_regs[i-1],&current,sizeof(current));
             branch_regs[i-1].isconst=0;
             branch_regs[i-1].wasconst=0;
             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
-            memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
+            memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
           }
           else
           // Alloc the delay slot in case the branch is taken
-          if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
+          if((dops[i-1].opcode2&0x1E)==2) // BLTZL/BGEZL
           {
             memcpy(&branch_regs[i-1],&current,sizeof(current));
-            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
-            branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
-            if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
+            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2)|(1LL<<dops[i].rt1)|(1LL<<dops[i].rt2)))|1;
             alloc_cc(&branch_regs[i-1],i);
             dirty_reg(&branch_regs[i-1],CCREG);
             delayslot_alloc(&branch_regs[i-1],i);
@@ -8777,57 +8215,18 @@ int new_recompile_block(int addr)
             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
           }
           // FIXME: BLTZAL/BGEZAL
-          if(opcode2[i-1]&0x10) { // BxxZAL
+          if(dops[i-1].opcode2&0x10) { // BxxZAL
             alloc_reg(&branch_regs[i-1],i-1,31);
             dirty_reg(&branch_regs[i-1],31);
-            branch_regs[i-1].is32|=1LL<<31;
-          }
-          break;
-        case FJUMP:
-          if(likely[i-1]==0) // BC1F/BC1T
-          {
-            alloc_cc(&current,i-1);
-            dirty_reg(&current,CCREG);
-            if(itype[i]==FCOMP) {
-              // The delay slot overwrote the branch condition
-              // Delay slot goes after the test (in order)
-              delayslot_alloc(&current,i);
-              current.isconst=0;
-            }
-            else
-            {
-              current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
-              current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
-              // Alloc the branch condition register
-              alloc_reg(&current,i-1,FSREG);
-            }
-            memcpy(&branch_regs[i-1],&current,sizeof(current));
-            memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
-          }
-          else // BC1FL/BC1TL
-          {
-            // Alloc the delay slot in case the branch is taken
-            memcpy(&branch_regs[i-1],&current,sizeof(current));
-            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
-            branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
-            if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
-            alloc_cc(&branch_regs[i-1],i);
-            dirty_reg(&branch_regs[i-1],CCREG);
-            delayslot_alloc(&branch_regs[i-1],i);
-            branch_regs[i-1].isconst=0;
-            alloc_reg(&current,i,CCREG); // Not taken path
-            dirty_reg(&current,CCREG);
-            memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
           }
           break;
       }
 
-      if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
+      if (dops[i-1].is_ujump)
       {
-        if(rt1[i-1]==31) // JAL/JALR
+        if(dops[i-1].rt1==31) // JAL/JALR
         {
           // Subroutine call will return here, don't alloc any registers
-          current.is32=1;
           current.dirty=0;
           clear_all_regs(current.regmap);
           alloc_reg(&current,i,CCREG);
@@ -8836,7 +8235,6 @@ int new_recompile_block(int addr)
         else if(i+1<slen)
         {
           // Internal branch will jump here, match registers to caller
-          current.is32=0x3FFFFFFFFLL;
           current.dirty=0;
           clear_all_regs(current.regmap);
           alloc_reg(&current,i,CCREG);
@@ -8845,7 +8243,6 @@ int new_recompile_block(int addr)
           {
             if(ba[j]==start+i*4+4) {
               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
-              current.is32=branch_regs[j].is32;
               current.dirty=branch_regs[j].dirty;
               break;
             }
@@ -8856,7 +8253,6 @@ int new_recompile_block(int addr)
                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
                   current.regmap[hr]=-1;
                 }
-                current.is32&=branch_regs[j].is32;
                 current.dirty&=branch_regs[j].dirty;
               }
             }
@@ -8867,28 +8263,27 @@ int new_recompile_block(int addr)
     }
 
     // Count cycles in between branches
-    ccadj[i]=cc;
-    if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
+    ccadj[i] = CLOCK_ADJUST(cc);
+    if (i > 0 && (dops[i-1].is_jump || dops[i].itype == SYSCALL || dops[i].itype == HLECALL))
     {
       cc=0;
     }
 #if !defined(DRC_DBG)
-    else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
-    {
-      // GTE runs in parallel until accessed, divide by 2 for a rough guess
-      cc+=gte_cycletab[source[i]&0x3f]/2;
-    }
-    else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues
+    else if(dops[i].itype==C2OP&&gte_cycletab[source[i]&0x3f]>2)
     {
-      cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
+      // this should really be removed since the real stalls have been implemented,
+      // but doing so causes sizeable perf regression against the older version
+      u_int gtec = gte_cycletab[source[i] & 0x3f];
+      cc += HACK_ENABLED(NDHACK_NO_STALLS) ? gtec/2 : 2;
     }
-    else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
+    else if(i>1&&dops[i].itype==STORE&&dops[i-1].itype==STORE&&dops[i-2].itype==STORE&&!dops[i].bt)
     {
       cc+=4;
     }
-    else if(itype[i]==C2LS)
+    else if(dops[i].itype==C2LS)
     {
-      cc+=4;
+      // same as with C2OP
+      cc += HACK_ENABLED(NDHACK_NO_STALLS) ? 4 : 2;
     }
 #endif
     else
@@ -8896,12 +8291,10 @@ int new_recompile_block(int addr)
       cc++;
     }
 
-    flush_dirty_uppers(&current);
-    if(!is_ds[i]) {
-      regs[i].is32=current.is32;
+    if(!dops[i].is_ds) {
       regs[i].dirty=current.dirty;
       regs[i].isconst=current.isconst;
-      memcpy(constmap[i],current_constmap,sizeof(current_constmap));
+      memcpy(constmap[i],current_constmap,sizeof(constmap[i]));
     }
     for(hr=0;hr<HOST_REGS;hr++) {
       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
@@ -8921,7 +8314,7 @@ int new_recompile_block(int addr)
   for (i=slen-1;i>=0;i--)
   {
     int hr;
-    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
+    if(dops[i].is_jump)
     {
       if(ba[i]<start || ba[i]>=(start+slen*4))
       {
@@ -8942,7 +8335,7 @@ int new_recompile_block(int addr)
         }
       }
       // Conditional branch may need registers for following instructions
-      if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
+      if (!dops[i].is_ujump)
       {
         if(i<slen-2) {
           nr|=needed_reg[i+2];
@@ -8959,40 +8352,28 @@ int new_recompile_block(int addr)
       // Merge in delay slot
       for(hr=0;hr<HOST_REGS;hr++)
       {
-        if(!likely[i]) {
-          // These are overwritten unless the branch is "likely"
-          // and the delay slot is nullified if not taken
-          if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
-          if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
-        }
-        if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
-        if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
-        if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
-        if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
-        if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
-        if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
-        if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
-        if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
-        if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
-          if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
-          if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
-        }
-        if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
-          if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
-          if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
-        }
-        if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
+        if(dops[i+1].rt1&&dops[i+1].rt1==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
+        if(dops[i+1].rt2&&dops[i+1].rt2==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
+        if(dops[i+1].rs1==regmap_pre[i][hr]) nr|=1<<hr;
+        if(dops[i+1].rs2==regmap_pre[i][hr]) nr|=1<<hr;
+        if(dops[i+1].rs1==regs[i].regmap_entry[hr]) nr|=1<<hr;
+        if(dops[i+1].rs2==regs[i].regmap_entry[hr]) nr|=1<<hr;
+        if(ram_offset && (dops[i+1].is_load || dops[i+1].is_store)) {
+          if(regmap_pre[i][hr]==ROREG) nr|=1<<hr;
+          if(regs[i].regmap_entry[hr]==ROREG) nr|=1<<hr;
+        }
+        if(dops[i+1].is_store) {
           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
         }
       }
     }
-    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
+    else if(dops[i].itype==SYSCALL||dops[i].itype==HLECALL||dops[i].itype==INTCALL)
     {
       // SYSCALL instruction (software interrupt)
       nr=0;
     }
-    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
+    else if(dops[i].itype==COP0 && (source[i]&0x3f)==0x18)
     {
       // ERET instruction (return from interrupt)
       nr=0;
@@ -9011,27 +8392,19 @@ int new_recompile_block(int addr)
     for(hr=0;hr<HOST_REGS;hr++)
     {
       // Overwritten registers are not needed
-      if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
-      if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
+      if(dops[i].rt1&&dops[i].rt1==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
+      if(dops[i].rt2&&dops[i].rt2==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
       // Source registers are needed
-      if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
-      if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
-      if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
-      if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
-      if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
-      if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
-      if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
-      if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
-      if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
-        if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
-        if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
-      }
-      if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
-        if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
-        if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
-      }
-      if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
+      if(dops[i].rs1==regmap_pre[i][hr]) nr|=1<<hr;
+      if(dops[i].rs2==regmap_pre[i][hr]) nr|=1<<hr;
+      if(dops[i].rs1==regs[i].regmap_entry[hr]) nr|=1<<hr;
+      if(dops[i].rs2==regs[i].regmap_entry[hr]) nr|=1<<hr;
+      if(ram_offset && (dops[i].is_load || dops[i].is_store)) {
+        if(regmap_pre[i][hr]==ROREG) nr|=1<<hr;
+        if(regs[i].regmap_entry[hr]==ROREG) nr|=1<<hr;
+      }
+      if(dops[i].is_store) {
         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
       }
@@ -9039,21 +8412,19 @@ int new_recompile_block(int addr)
       // may prevent dual-issue.
       // But do so if this is a branch target, otherwise we
       // might have to load the register before the branch.
-      if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
-        if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
-           (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
-          if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
-          if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
+      if(i>0&&!dops[i].bt&&((regs[i].wasdirty>>hr)&1)) {
+        if((regmap_pre[i][hr]>0&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1))) {
+          if(dops[i-1].rt1==(regmap_pre[i][hr]&63)) nr|=1<<hr;
+          if(dops[i-1].rt2==(regmap_pre[i][hr]&63)) nr|=1<<hr;
         }
-        if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
-           (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
-          if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
-          if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
+        if((regs[i].regmap_entry[hr]>0&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) {
+          if(dops[i-1].rt1==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
+          if(dops[i-1].rt2==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
         }
       }
     }
     // Cycle count is needed at branches.  Assume it is needed at the target too.
-    if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
+    if(i==0||dops[i].bt||dops[i].itype==CJUMP||dops[i].itype==SPAN) {
       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
     }
@@ -9065,66 +8436,40 @@ int new_recompile_block(int addr)
     {
       if(!((nr>>hr)&1)) {
         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
-        if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
-           (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
-           (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
-        {
-          if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
-          {
-            if(likely[i]) {
-              regs[i].regmap[hr]=-1;
-              regs[i].isconst&=~(1<<hr);
-              if(i<slen-2) {
-                regmap_pre[i+2][hr]=-1;
-                regs[i+2].wasconst&=~(1<<hr);
-              }
-            }
-          }
-        }
-        if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
+        if(dops[i].is_jump)
         {
-          int d1=0,d2=0,map=0,temp=0;
-          if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
-          {
-            d1=dep1[i+1];
-            d2=dep2[i+1];
-          }
-          if(itype[i+1]==STORE || itype[i+1]==STORELR ||
-             (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
-            map=INVCP;
-          }
-          if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
-             itype[i+1]==C1LS || itype[i+1]==C2LS)
-            temp=FTEMP;
-          if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
-             (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
-             (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
-             (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
-             (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
-             regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
+          int map1 = 0, map2 = 0, temp = 0; // or -1 ??
+          if (dops[i+1].is_load || dops[i+1].is_store)
+            map1 = ROREG;
+          if (dops[i+1].is_store)
+            map2 = INVCP;
+          if(dops[i+1].itype==LOADLR || dops[i+1].itype==STORELR || dops[i+1].itype==C2LS)
+            temp = FTEMP;
+          if((regs[i].regmap[hr]&63)!=dops[i].rs1 && (regs[i].regmap[hr]&63)!=dops[i].rs2 &&
+             (regs[i].regmap[hr]&63)!=dops[i].rt1 && (regs[i].regmap[hr]&63)!=dops[i].rt2 &&
+             (regs[i].regmap[hr]&63)!=dops[i+1].rt1 && (regs[i].regmap[hr]&63)!=dops[i+1].rt2 &&
+             regs[i].regmap[hr]!=dops[i+1].rs1 && regs[i].regmap[hr]!=dops[i+1].rs2 &&
              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
-             regs[i].regmap[hr]!=map )
+             regs[i].regmap[hr]!=map1 && regs[i].regmap[hr]!=map2)
           {
             regs[i].regmap[hr]=-1;
             regs[i].isconst&=~(1<<hr);
-            if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
-               (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
-               (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
-               (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
-               (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
-               branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
+            if((branch_regs[i].regmap[hr]&63)!=dops[i].rs1 && (branch_regs[i].regmap[hr]&63)!=dops[i].rs2 &&
+               (branch_regs[i].regmap[hr]&63)!=dops[i].rt1 && (branch_regs[i].regmap[hr]&63)!=dops[i].rt2 &&
+               (branch_regs[i].regmap[hr]&63)!=dops[i+1].rt1 && (branch_regs[i].regmap[hr]&63)!=dops[i+1].rt2 &&
+               branch_regs[i].regmap[hr]!=dops[i+1].rs1 && branch_regs[i].regmap[hr]!=dops[i+1].rs2 &&
                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
-               branch_regs[i].regmap[hr]!=map)
+               branch_regs[i].regmap[hr]!=map1 && branch_regs[i].regmap[hr]!=map2)
             {
               branch_regs[i].regmap[hr]=-1;
               branch_regs[i].regmap_entry[hr]=-1;
-              if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
+              if (!dops[i].is_ujump)
               {
-                if(!likely[i]&&i<slen-2) {
+                if (i < slen-2) {
                   regmap_pre[i+2][hr]=-1;
                   regs[i+2].wasconst&=~(1<<hr);
                 }
@@ -9137,30 +8482,23 @@ int new_recompile_block(int addr)
           // Non-branch
           if(i>0)
           {
-            int d1=0,d2=0,map=-1,temp=-1;
-            if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
+            int map1 = -1, map2 = -1, temp=-1;
+            if (dops[i].is_load || dops[i].is_store)
+              map1 = ROREG;
+            if (dops[i].is_store)
+              map2 = INVCP;
+            if (dops[i].itype==LOADLR || dops[i].itype==STORELR || dops[i].itype==C2LS)
+              temp = FTEMP;
+            if((regs[i].regmap[hr]&63)!=dops[i].rt1 && (regs[i].regmap[hr]&63)!=dops[i].rt2 &&
+               regs[i].regmap[hr]!=dops[i].rs1 && regs[i].regmap[hr]!=dops[i].rs2 &&
+               (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map1 && regs[i].regmap[hr]!=map2 &&
+               //(dops[i].itype!=SPAN||regs[i].regmap[hr]!=CCREG)
+               regs[i].regmap[hr] != CCREG)
             {
-              d1=dep1[i];
-              d2=dep2[i];
-            }
-            if(itype[i]==STORE || itype[i]==STORELR ||
-                      (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
-              map=INVCP;
-            }
-            if(itype[i]==LOADLR || itype[i]==STORELR ||
-               itype[i]==C1LS || itype[i]==C2LS)
-              temp=FTEMP;
-            if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
-               (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
-               (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
-               regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
-               (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
-               (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
-            {
-              if(i<slen-1&&!is_ds[i]) {
-                if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
+              if(i<slen-1&&!dops[i].is_ds) {
+                assert(regs[i].regmap[hr]<64);
+                if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]>0)
                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
-                if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
                 {
                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
@@ -9174,8 +8512,8 @@ int new_recompile_block(int addr)
             }
           }
         }
-      }
-    }
+      } // if needed
+    } // for hr
   }
 
   /* Pass 5 - Pre-allocate registers */
@@ -9188,27 +8526,21 @@ int new_recompile_block(int addr)
   clear_all_regs(f_regmap);
   for(i=0;i<slen-1;i++)
   {
-    if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
+    if(dops[i].itype==UJUMP||dops[i].itype==CJUMP||dops[i].itype==SJUMP)
     {
       if(ba[i]>=start && ba[i]<(start+i*4))
-      if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
-      ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
-      ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
-      ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
-      ||itype[i+1]==FCOMP||itype[i+1]==FCONV
-      ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
+      if(dops[i+1].itype==NOP||dops[i+1].itype==MOV||dops[i+1].itype==ALU
+      ||dops[i+1].itype==SHIFTIMM||dops[i+1].itype==IMM16||dops[i+1].itype==LOAD
+      ||dops[i+1].itype==STORE||dops[i+1].itype==STORELR||dops[i+1].itype==C1LS
+      ||dops[i+1].itype==SHIFT||dops[i+1].itype==COP1
+      ||dops[i+1].itype==COP2||dops[i+1].itype==C2LS||dops[i+1].itype==C2OP)
       {
         int t=(ba[i]-start)>>2;
-        if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
-        if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
+        if(t > 0 && !dops[t-1].is_jump) // loop_preload can't handle jumps into delay slots
+        if(t<2||(dops[t-2].itype!=UJUMP&&dops[t-2].itype!=RJUMP)||dops[t-2].rt1!=31) // call/ret assumes no registers allocated
         for(hr=0;hr<HOST_REGS;hr++)
         {
-          if(regs[i].regmap[hr]>64) {
-            if(!((regs[i].dirty>>hr)&1))
-              f_regmap[hr]=regs[i].regmap[hr];
-            else f_regmap[hr]=-1;
-          }
-          else if(regs[i].regmap[hr]>=0) {
+          if(regs[i].regmap[hr]>=0) {
             if(f_regmap[hr]!=regs[i].regmap[hr]) {
               // dealloc old register
               int n;
@@ -9220,12 +8552,7 @@ int new_recompile_block(int addr)
               f_regmap[hr]=regs[i].regmap[hr];
             }
           }
-          if(branch_regs[i].regmap[hr]>64) {
-            if(!((branch_regs[i].dirty>>hr)&1))
-              f_regmap[hr]=branch_regs[i].regmap[hr];
-            else f_regmap[hr]=-1;
-          }
-          else if(branch_regs[i].regmap[hr]>=0) {
+          if(branch_regs[i].regmap[hr]>=0) {
             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
               // dealloc old register
               int n;
@@ -9237,7 +8564,7 @@ int new_recompile_block(int addr)
               f_regmap[hr]=branch_regs[i].regmap[hr];
             }
           }
-          if(ooo[i]) {
+          if(dops[i].ooo) {
             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
               f_regmap[hr]=branch_regs[i].regmap[hr];
           }else{
@@ -9262,15 +8589,7 @@ int new_recompile_block(int addr)
               {
                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
-                if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
-                if(r>63) {
-                  // NB This can exclude the case where the upper-half
-                  // register is lower numbered than the lower-half
-                  // register.  Not sure if it's worth fixing...
-                  if(get_reg(regs[j].regmap,r&63)<0) break;
-                  if(get_reg(regs[j].regmap_entry,r&63)<0) break;
-                  if(regs[j].is32&(1LL<<(r&63))) break;
-                }
+                assert(r < 64);
                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
                   int k;
@@ -9290,30 +8609,17 @@ int new_recompile_block(int addr)
                         //printf("no-match due to different register\n");
                         break;
                       }
-                      if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
+                      if (dops[k-2].is_jump) {
                         //printf("no-match due to branch\n");
                         break;
                       }
                       // call/ret fast path assumes no registers allocated
-                      if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
+                      if(k>2&&(dops[k-3].itype==UJUMP||dops[k-3].itype==RJUMP)&&dops[k-3].rt1==31) {
                         break;
                       }
-                      if(r>63) {
-                        // NB This can exclude the case where the upper-half
-                        // register is lower numbered than the lower-half
-                        // register.  Not sure if it's worth fixing...
-                        if(get_reg(regs[k-1].regmap,r&63)<0) break;
-                        if(regs[k-1].is32&(1LL<<(r&63))) break;
-                      }
+                      assert(r < 64);
                       k--;
                     }
-                    if(i<slen-1) {
-                      if((regs[k].is32&(1LL<<f_regmap[hr]))!=
-                        (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
-                        //printf("bad match after branch\n");
-                        break;
-                      }
-                    }
                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
                       while(k<i) {
@@ -9352,12 +8658,10 @@ int new_recompile_block(int addr)
                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
                       branch_regs[i].wasconst&=~(1<<hr);
                       branch_regs[i].isconst&=~(1<<hr);
-                      if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
+                      if (!dops[i].is_ujump) {
                         regmap_pre[i+2][hr]=f_regmap[hr];
                         regs[i+2].wasdirty&=~(1<<hr);
                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
-                        assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
-                          (regs[i+2].was32&(1LL<<f_regmap[hr])));
                       }
                     }
                   }
@@ -9369,17 +8673,15 @@ int new_recompile_block(int addr)
                     regs[k].dirty&=~(1<<hr);
                     regs[k].wasconst&=~(1<<hr);
                     regs[k].isconst&=~(1<<hr);
-                    if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
+                    if (dops[k].is_jump) {
                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
                       branch_regs[k].regmap[hr]=f_regmap[hr];
                       branch_regs[k].dirty&=~(1<<hr);
                       branch_regs[k].wasconst&=~(1<<hr);
                       branch_regs[k].isconst&=~(1<<hr);
-                      if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
+                      if (!dops[k].is_ujump) {
                         regmap_pre[k+2][hr]=f_regmap[hr];
                         regs[k+2].wasdirty&=~(1<<hr);
-                        assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
-                          (regs[k+2].was32&(1LL<<f_regmap[hr])));
                       }
                     }
                     else
@@ -9399,18 +8701,14 @@ int new_recompile_block(int addr)
                   //printf("no-match due to different register\n");
                   break;
                 }
-                if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
-                  //printf("32/64 mismatch %x %d\n",start+j*4,hr);
-                  break;
-                }
-                if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
+                if (dops[j].is_ujump)
                 {
                   // Stop on unconditional branch
                   break;
                 }
-                if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
+                if(dops[j].itype==CJUMP||dops[j].itype==SJUMP)
                 {
-                  if(ooo[j]) {
+                  if(dops[j].ooo) {
                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
                       break;
                   }else{
@@ -9426,17 +8724,7 @@ int new_recompile_block(int addr)
                   //printf("No free regs for store %x\n",start+j*4);
                   break;
                 }
-                if(f_regmap[hr]>=64) {
-                  if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
-                    break;
-                  }
-                  else
-                  {
-                    if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
-                      break;
-                    }
-                  }
-                }
+                assert(f_regmap[hr]<64);
               }
             }
           }
@@ -9447,11 +8735,7 @@ int new_recompile_block(int addr)
       for(hr=0;hr<HOST_REGS;hr++)
       {
         if(hr!=EXCLUDE_REG) {
-          if(regs[i].regmap[hr]>64) {
-            if(!((regs[i].dirty>>hr)&1))
-              f_regmap[hr]=regs[i].regmap[hr];
-          }
-          else if(regs[i].regmap[hr]>=0) {
+          if(regs[i].regmap[hr]>=0) {
             if(f_regmap[hr]!=regs[i].regmap[hr]) {
               // dealloc old register
               int n;
@@ -9466,7 +8750,7 @@ int new_recompile_block(int addr)
         }
       }
       // Try to restore cycle count at branch targets
-      if(bt[i]) {
+      if(dops[i].bt) {
         for(j=i;j<slen-1;j++) {
           if(regs[j].regmap[HOST_CCREG]!=-1) break;
           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
@@ -9520,209 +8804,28 @@ int new_recompile_block(int addr)
           }
         }
       }
-      if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
-         itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
-         itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
-         itype[i]!=FCONV&&itype[i]!=FCOMP)
+      if(dops[i].itype!=STORE&&dops[i].itype!=STORELR&&dops[i].itype!=C1LS&&dops[i].itype!=SHIFT&&
+         dops[i].itype!=NOP&&dops[i].itype!=MOV&&dops[i].itype!=ALU&&dops[i].itype!=SHIFTIMM&&
+         dops[i].itype!=IMM16&&dops[i].itype!=LOAD&&dops[i].itype!=COP1)
       {
         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
       }
     }
   }
 
-  // Cache memory offset or tlb map pointer if a register is available
-  #ifndef HOST_IMM_ADDR32
-  #ifndef RAM_OFFSET
-  if(0)
-  #endif
-  {
-    int earliest_available[HOST_REGS];
-    int loop_start[HOST_REGS];
-    int score[HOST_REGS];
-    int end[HOST_REGS];
-    int reg=ROREG;
-
-    // Init
-    for(hr=0;hr<HOST_REGS;hr++) {
-      score[hr]=0;earliest_available[hr]=0;
-      loop_start[hr]=MAXBLOCK;
-    }
-    for(i=0;i<slen-1;i++)
-    {
-      // Can't do anything if no registers are available
-      if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
-        for(hr=0;hr<HOST_REGS;hr++) {
-          score[hr]=0;earliest_available[hr]=i+1;
-          loop_start[hr]=MAXBLOCK;
-        }
-      }
-      if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
-        if(!ooo[i]) {
-          if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
-            for(hr=0;hr<HOST_REGS;hr++) {
-              score[hr]=0;earliest_available[hr]=i+1;
-              loop_start[hr]=MAXBLOCK;
-            }
-          }
-        }else{
-          if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
-            for(hr=0;hr<HOST_REGS;hr++) {
-              score[hr]=0;earliest_available[hr]=i+1;
-              loop_start[hr]=MAXBLOCK;
-            }
-          }
-        }
-      }
-      // Mark unavailable registers
-      for(hr=0;hr<HOST_REGS;hr++) {
-        if(regs[i].regmap[hr]>=0) {
-          score[hr]=0;earliest_available[hr]=i+1;
-          loop_start[hr]=MAXBLOCK;
-        }
-        if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
-          if(branch_regs[i].regmap[hr]>=0) {
-            score[hr]=0;earliest_available[hr]=i+2;
-            loop_start[hr]=MAXBLOCK;
-          }
-        }
-      }
-      // No register allocations after unconditional jumps
-      if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
-      {
-        for(hr=0;hr<HOST_REGS;hr++) {
-          score[hr]=0;earliest_available[hr]=i+2;
-          loop_start[hr]=MAXBLOCK;
-        }
-        i++; // Skip delay slot too
-        //printf("skip delay slot: %x\n",start+i*4);
-      }
-      else
-      // Possible match
-      if(itype[i]==LOAD||itype[i]==LOADLR||
-         itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
-        for(hr=0;hr<HOST_REGS;hr++) {
-          if(hr!=EXCLUDE_REG) {
-            end[hr]=i-1;
-            for(j=i;j<slen-1;j++) {
-              if(regs[j].regmap[hr]>=0) break;
-              if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
-                if(branch_regs[j].regmap[hr]>=0) break;
-                if(ooo[j]) {
-                  if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
-                }else{
-                  if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
-                }
-              }
-              else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
-              if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
-                int t=(ba[j]-start)>>2;
-                if(t<j&&t>=earliest_available[hr]) {
-                  if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
-                    // Score a point for hoisting loop invariant
-                    if(t<loop_start[hr]) loop_start[hr]=t;
-                    //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
-                    score[hr]++;
-                    end[hr]=j;
-                  }
-                }
-                else if(t<j) {
-                  if(regs[t].regmap[hr]==reg) {
-                    // Score a point if the branch target matches this register
-                    score[hr]++;
-                    end[hr]=j;
-                  }
-                }
-                if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
-                   itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
-                  score[hr]++;
-                  end[hr]=j;
-                }
-              }
-              if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
-              {
-                // Stop on unconditional branch
-                break;
-              }
-              else
-              if(itype[j]==LOAD||itype[j]==LOADLR||
-                 itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
-                score[hr]++;
-                end[hr]=j;
-              }
-            }
-          }
-        }
-        // Find highest score and allocate that register
-        int maxscore=0;
-        for(hr=0;hr<HOST_REGS;hr++) {
-          if(hr!=EXCLUDE_REG) {
-            if(score[hr]>score[maxscore]) {
-              maxscore=hr;
-              //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
-            }
-          }
-        }
-        if(score[maxscore]>1)
-        {
-          if(i<loop_start[maxscore]) loop_start[maxscore]=i;
-          for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
-            //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
-            assert(regs[j].regmap[maxscore]<0);
-            if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
-            regs[j].regmap[maxscore]=reg;
-            regs[j].dirty&=~(1<<maxscore);
-            regs[j].wasconst&=~(1<<maxscore);
-            regs[j].isconst&=~(1<<maxscore);
-            if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
-              branch_regs[j].regmap[maxscore]=reg;
-              branch_regs[j].wasdirty&=~(1<<maxscore);
-              branch_regs[j].dirty&=~(1<<maxscore);
-              branch_regs[j].wasconst&=~(1<<maxscore);
-              branch_regs[j].isconst&=~(1<<maxscore);
-              if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
-                regmap_pre[j+2][maxscore]=reg;
-                regs[j+2].wasdirty&=~(1<<maxscore);
-              }
-              // loop optimization (loop_preload)
-              int t=(ba[j]-start)>>2;
-              if(t==loop_start[maxscore]) {
-                if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
-                  regs[t].regmap_entry[maxscore]=reg;
-              }
-            }
-            else
-            {
-              if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
-                regmap_pre[j+1][maxscore]=reg;
-                regs[j+1].wasdirty&=~(1<<maxscore);
-              }
-            }
-          }
-          i=j-1;
-          if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
-          for(hr=0;hr<HOST_REGS;hr++) {
-            score[hr]=0;earliest_available[hr]=i+i;
-            loop_start[hr]=MAXBLOCK;
-          }
-        }
-      }
-    }
-  }
-  #endif
-
   // This allocates registers (if possible) one instruction prior
   // to use, which can avoid a load-use penalty on certain CPUs.
   for(i=0;i<slen-1;i++)
   {
-    if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
+    if (!i || !dops[i-1].is_jump)
     {
-      if(!bt[i+1])
+      if(!dops[i+1].bt)
       {
-        if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
-           ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
+        if(dops[i].itype==ALU||dops[i].itype==MOV||dops[i].itype==LOAD||dops[i].itype==SHIFTIMM||dops[i].itype==IMM16
+           ||((dops[i].itype==COP1||dops[i].itype==COP2)&&dops[i].opcode2<3))
         {
-          if(rs1[i+1]) {
-            if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
+          if(dops[i+1].rs1) {
+            if((hr=get_reg(regs[i+1].regmap,dops[i+1].rs1))>=0)
             {
               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
               {
@@ -9737,8 +8840,8 @@ int new_recompile_block(int addr)
               }
             }
           }
-          if(rs2[i+1]) {
-            if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
+          if(dops[i+1].rs2) {
+            if((hr=get_reg(regs[i+1].regmap,dops[i+1].rs2))>=0)
             {
               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
               {
@@ -9754,14 +8857,14 @@ int new_recompile_block(int addr)
             }
           }
           // Preload target address for load instruction (non-constant)
-          if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
-            if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
+          if(dops[i+1].itype==LOAD&&dops[i+1].rs1&&get_reg(regs[i+1].regmap,dops[i+1].rs1)<0) {
+            if((hr=get_reg(regs[i+1].regmap,dops[i+1].rt1))>=0)
             {
               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
               {
-                regs[i].regmap[hr]=rs1[i+1];
-                regmap_pre[i+1][hr]=rs1[i+1];
-                regs[i+1].regmap_entry[hr]=rs1[i+1];
+                regs[i].regmap[hr]=dops[i+1].rs1;
+                regmap_pre[i+1][hr]=dops[i+1].rs1;
+                regs[i+1].regmap_entry[hr]=dops[i+1].rs1;
                 regs[i].isconst&=~(1<<hr);
                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                 constmap[i][hr]=constmap[i+1][hr];
@@ -9771,14 +8874,14 @@ int new_recompile_block(int addr)
             }
           }
           // Load source into target register
-          if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
-            if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
+          if(dops[i+1].lt1&&get_reg(regs[i+1].regmap,dops[i+1].rs1)<0) {
+            if((hr=get_reg(regs[i+1].regmap,dops[i+1].rt1))>=0)
             {
               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
               {
-                regs[i].regmap[hr]=rs1[i+1];
-                regmap_pre[i+1][hr]=rs1[i+1];
-                regs[i+1].regmap_entry[hr]=rs1[i+1];
+                regs[i].regmap[hr]=dops[i+1].rs1;
+                regmap_pre[i+1][hr]=dops[i+1].rs1;
+                regs[i+1].regmap_entry[hr]=dops[i+1].rs1;
                 regs[i].isconst&=~(1<<hr);
                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                 constmap[i][hr]=constmap[i+1][hr];
@@ -9788,18 +8891,18 @@ int new_recompile_block(int addr)
             }
           }
           // Address for store instruction (non-constant)
-          if(itype[i+1]==STORE||itype[i+1]==STORELR
-             ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
-            if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
+          if(dops[i+1].itype==STORE||dops[i+1].itype==STORELR
+             ||(dops[i+1].opcode&0x3b)==0x39||(dops[i+1].opcode&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
+            if(get_reg(regs[i+1].regmap,dops[i+1].rs1)<0) {
               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
               assert(hr>=0);
               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
               {
-                regs[i].regmap[hr]=rs1[i+1];
-                regmap_pre[i+1][hr]=rs1[i+1];
-                regs[i+1].regmap_entry[hr]=rs1[i+1];
+                regs[i].regmap[hr]=dops[i+1].rs1;
+                regmap_pre[i+1][hr]=dops[i+1].rs1;
+                regs[i+1].regmap_entry[hr]=dops[i+1].rs1;
                 regs[i].isconst&=~(1<<hr);
                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                 constmap[i][hr]=constmap[i+1][hr];
@@ -9808,16 +8911,16 @@ int new_recompile_block(int addr)
               }
             }
           }
-          if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
-            if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
+          if(dops[i+1].itype==LOADLR||(dops[i+1].opcode&0x3b)==0x31||(dops[i+1].opcode&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
+            if(get_reg(regs[i+1].regmap,dops[i+1].rs1)<0) {
               int nr;
               hr=get_reg(regs[i+1].regmap,FTEMP);
               assert(hr>=0);
               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
               {
-                regs[i].regmap[hr]=rs1[i+1];
-                regmap_pre[i+1][hr]=rs1[i+1];
-                regs[i+1].regmap_entry[hr]=rs1[i+1];
+                regs[i].regmap[hr]=dops[i+1].rs1;
+                regmap_pre[i+1][hr]=dops[i+1].rs1;
+                regs[i+1].regmap_entry[hr]=dops[i+1].rs1;
                 regs[i].isconst&=~(1<<hr);
                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                 constmap[i][hr]=constmap[i+1][hr];
@@ -9831,9 +8934,9 @@ int new_recompile_block(int addr)
                 regmap_pre[i+2][hr]=-1;
                 regs[i+1].regmap[nr]=FTEMP;
                 regmap_pre[i+2][nr]=FTEMP;
-                regs[i].regmap[nr]=rs1[i+1];
-                regmap_pre[i+1][nr]=rs1[i+1];
-                regs[i+1].regmap_entry[nr]=rs1[i+1];
+                regs[i].regmap[nr]=dops[i+1].rs1;
+                regmap_pre[i+1][nr]=dops[i+1].rs1;
+                regs[i+1].regmap_entry[nr]=dops[i+1].rs1;
                 regs[i].isconst&=~(1<<nr);
                 regs[i+1].isconst&=~(1<<nr);
                 regs[i].dirty&=~(1<<nr);
@@ -9843,17 +8946,17 @@ int new_recompile_block(int addr)
               }
             }
           }
-          if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
-            if(itype[i+1]==LOAD)
-              hr=get_reg(regs[i+1].regmap,rt1[i+1]);
-            if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
+          if(dops[i+1].itype==LOAD||dops[i+1].itype==LOADLR||dops[i+1].itype==STORE||dops[i+1].itype==STORELR/*||dops[i+1].itype==C1LS||||dops[i+1].itype==C2LS*/) {
+            if(dops[i+1].itype==LOAD)
+              hr=get_reg(regs[i+1].regmap,dops[i+1].rt1);
+            if(dops[i+1].itype==LOADLR||(dops[i+1].opcode&0x3b)==0x31||(dops[i+1].opcode&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
               hr=get_reg(regs[i+1].regmap,FTEMP);
-            if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
+            if(dops[i+1].itype==STORE||dops[i+1].itype==STORELR||(dops[i+1].opcode&0x3b)==0x39||(dops[i+1].opcode&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
             }
             if(hr>=0&&regs[i].regmap[hr]<0) {
-              int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
+              int rs=get_reg(regs[i+1].regmap,dops[i+1].rs1);
               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
@@ -9875,19 +8978,19 @@ int new_recompile_block(int addr)
   /* Pass 7 - Identify 32-bit registers */
   for (i=slen-1;i>=0;i--)
   {
-    if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
+    if(dops[i].itype==CJUMP||dops[i].itype==SJUMP)
     {
       // Conditional branch
       if((source[i]>>16)!=0x1000&&i<slen-2) {
         // Mark this address as a branch target since it may be called
         // upon return from interrupt
-        bt[i+2]=1;
+        dops[i+2].bt=1;
       }
     }
   }
 
-  if(itype[slen-1]==SPAN) {
-    bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
+  if(dops[slen-1].itype==SPAN) {
+    dops[slen-1].bt=1; // Mark as a branch target so instruction can restart after exception
   }
 
 #ifdef DISASM
@@ -9910,6 +9013,7 @@ int new_recompile_block(int addr)
     #ifdef __arm__
     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
     #endif
+    #if defined(__i386__) || defined(__x86_64__)
     printf("needs: ");
     if(needed_reg[i]&1) printf("eax ");
     if((needed_reg[i]>>1)&1) printf("ecx ");
@@ -9919,7 +9023,6 @@ int new_recompile_block(int addr)
     if((needed_reg[i]>>6)&1) printf("esi ");
     if((needed_reg[i]>>7)&1) printf("edi ");
     printf("\n");
-    #if defined(__i386__) || defined(__x86_64__)
     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
     printf("dirty: ");
     if(regs[i].wasdirty&1) printf("eax ");
@@ -9978,31 +9081,23 @@ int new_recompile_block(int addr)
     if(regs[i].isconst) {
       printf("constants: ");
       #if defined(__i386__) || defined(__x86_64__)
-      if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
-      if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
-      if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
-      if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
-      if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
-      if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
-      if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
+      if(regs[i].isconst&1) printf("eax=%x ",(u_int)constmap[i][0]);
+      if((regs[i].isconst>>1)&1) printf("ecx=%x ",(u_int)constmap[i][1]);
+      if((regs[i].isconst>>2)&1) printf("edx=%x ",(u_int)constmap[i][2]);
+      if((regs[i].isconst>>3)&1) printf("ebx=%x ",(u_int)constmap[i][3]);
+      if((regs[i].isconst>>5)&1) printf("ebp=%x ",(u_int)constmap[i][5]);
+      if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]);
+      if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]);
       #endif
-      #ifdef __arm__
-      if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
-      if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
-      if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
-      if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
-      if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
-      if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
-      if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
-      if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
-      if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
-      if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
-      if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
-      if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
+      #if defined(__arm__) || defined(__aarch64__)
+      int r;
+      for (r = 0; r < ARRAY_SIZE(constmap[i]); r++)
+        if ((regs[i].isconst >> r) & 1)
+          printf(" r%d=%x", r, (u_int)constmap[i][r]);
       #endif
       printf("\n");
     }
-    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
+    if(dops[i].is_jump) {
       #if defined(__i386__) || defined(__x86_64__)
       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
       if(branch_regs[i].dirty&1) printf("eax ");
@@ -10035,28 +9130,31 @@ int new_recompile_block(int addr)
   /* Pass 8 - Assembly */
   linkcount=0;stubcount=0;
   ds=0;is_delayslot=0;
-  cop1_usable=0;
-  uint64_t is32_pre=0;
   u_int dirty_pre=0;
   void *beginning=start_block();
   if((u_int)addr&1) {
     ds=1;
     pagespan_ds();
   }
-  u_int instr_addr0_override=0;
+  void *instr_addr0_override = NULL;
 
   if (start == 0x80030000) {
-    // nasty hack for fastbios thing
+    // nasty hack for the fastbios thing
     // override block entry to this code
-    instr_addr0_override=(u_int)out;
+    instr_addr0_override = out;
     emit_movimm(start,0);
     // abuse io address var as a flag that we
     // have already returned here once
-    emit_readword((int)&address,1);
-    emit_writeword(0,(int)&pcaddr);
-    emit_writeword(0,(int)&address);
+    emit_readword(&address,1);
+    emit_writeword(0,&pcaddr);
+    emit_writeword(0,&address);
     emit_cmp(0,1);
-    emit_jne((int)new_dyna_leave);
+    #ifdef __aarch64__
+    emit_jeq(out + 4*2);
+    emit_far_jump(new_dyna_leave);
+    #else
+    emit_jne(new_dyna_leave);
+    #endif
   }
   for(i=0;i<slen;i++)
   {
@@ -10064,167 +9162,111 @@ int new_recompile_block(int addr)
     disassemble_inst(i);
     if(ds) {
       ds=0; // Skip delay slot
-      if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
-      instr_addr[i]=0;
+      if(dops[i].bt) assem_debug("OOPS - branch into delay slot\n");
+      instr_addr[i] = NULL;
     } else {
       speculate_register_values(i);
       #ifndef DESTRUCTIVE_WRITEBACK
-      if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
+      if (i < 2 || !dops[i-2].is_ujump)
       {
-        wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
-              unneeded_reg[i],unneeded_reg_upper[i]);
+        wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,unneeded_reg[i]);
       }
-      if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
-        is32_pre=branch_regs[i].is32;
+      if((dops[i].itype==CJUMP||dops[i].itype==SJUMP)) {
         dirty_pre=branch_regs[i].dirty;
       }else{
-        is32_pre=regs[i].is32;
         dirty_pre=regs[i].dirty;
       }
       #endif
       // write back
-      if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
+      if (i < 2 || !dops[i-2].is_ujump)
       {
-        wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
-                      unneeded_reg[i],unneeded_reg_upper[i]);
+        wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,unneeded_reg[i]);
         loop_preload(regmap_pre[i],regs[i].regmap_entry);
       }
       // branch target entry point
-      instr_addr[i]=(u_int)out;
+      instr_addr[i] = out;
       assem_debug("<->\n");
+      drc_dbg_emit_do_cmp(i, ccadj[i]);
+
       // load regs
       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
-        wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
-      load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
+        wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty);
+      load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i].rs1,dops[i].rs2);
       address_generation(i,&regs[i],regs[i].regmap_entry);
-      load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
-      if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
+      load_consts(regmap_pre[i],regs[i].regmap,i);
+      if(dops[i].is_jump)
       {
         // Load the delay slot registers if necessary
-        if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
-          load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
-        if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
-          load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
-        if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
-          load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
+        if(dops[i+1].rs1!=dops[i].rs1&&dops[i+1].rs1!=dops[i].rs2&&(dops[i+1].rs1!=dops[i].rt1||dops[i].rt1==0))
+          load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i+1].rs1,dops[i+1].rs1);
+        if(dops[i+1].rs2!=dops[i+1].rs1&&dops[i+1].rs2!=dops[i].rs1&&dops[i+1].rs2!=dops[i].rs2&&(dops[i+1].rs2!=dops[i].rt1||dops[i].rt1==0))
+          load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i+1].rs2,dops[i+1].rs2);
+        if (ram_offset && (dops[i+1].is_load || dops[i+1].is_store))
+          load_regs(regs[i].regmap_entry,regs[i].regmap,ROREG,ROREG);
+        if (dops[i+1].is_store)
+          load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
       }
       else if(i+1<slen)
       {
         // Preload registers for following instruction
-        if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
-          if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
-            load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
-        if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
-          if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
-            load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
+        if(dops[i+1].rs1!=dops[i].rs1&&dops[i+1].rs1!=dops[i].rs2)
+          if(dops[i+1].rs1!=dops[i].rt1&&dops[i+1].rs1!=dops[i].rt2)
+            load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i+1].rs1,dops[i+1].rs1);
+        if(dops[i+1].rs2!=dops[i+1].rs1&&dops[i+1].rs2!=dops[i].rs1&&dops[i+1].rs2!=dops[i].rs2)
+          if(dops[i+1].rs2!=dops[i].rt1&&dops[i+1].rs2!=dops[i].rt2)
+            load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i+1].rs2,dops[i+1].rs2);
       }
       // TODO: if(is_ooo(i)) address_generation(i+1);
-      if(itype[i]==CJUMP||itype[i]==FJUMP)
-        load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
-      if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
-        load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
-      if(bt[i]) cop1_usable=0;
-      // assemble
-      switch(itype[i]) {
-        case ALU:
-          alu_assemble(i,&regs[i]);break;
-        case IMM16:
-          imm16_assemble(i,&regs[i]);break;
-        case SHIFT:
-          shift_assemble(i,&regs[i]);break;
-        case SHIFTIMM:
-          shiftimm_assemble(i,&regs[i]);break;
-        case LOAD:
-          load_assemble(i,&regs[i]);break;
-        case LOADLR:
-          loadlr_assemble(i,&regs[i]);break;
-        case STORE:
-          store_assemble(i,&regs[i]);break;
-        case STORELR:
-          storelr_assemble(i,&regs[i]);break;
-        case COP0:
-          cop0_assemble(i,&regs[i]);break;
-        case COP1:
-          cop1_assemble(i,&regs[i]);break;
-        case C1LS:
-          c1ls_assemble(i,&regs[i]);break;
-        case COP2:
-          cop2_assemble(i,&regs[i]);break;
-        case C2LS:
-          c2ls_assemble(i,&regs[i]);break;
-        case C2OP:
-          c2op_assemble(i,&regs[i]);break;
-        case FCONV:
-          fconv_assemble(i,&regs[i]);break;
-        case FLOAT:
-          float_assemble(i,&regs[i]);break;
-        case FCOMP:
-          fcomp_assemble(i,&regs[i]);break;
-        case MULTDIV:
-          multdiv_assemble(i,&regs[i]);break;
-        case MOV:
-          mov_assemble(i,&regs[i]);break;
-        case SYSCALL:
-          syscall_assemble(i,&regs[i]);break;
-        case HLECALL:
-          hlecall_assemble(i,&regs[i]);break;
-        case INTCALL:
-          intcall_assemble(i,&regs[i]);break;
-        case UJUMP:
-          ujump_assemble(i,&regs[i]);ds=1;break;
-        case RJUMP:
-          rjump_assemble(i,&regs[i]);ds=1;break;
-        case CJUMP:
-          cjump_assemble(i,&regs[i]);ds=1;break;
-        case SJUMP:
-          sjump_assemble(i,&regs[i]);ds=1;break;
-        case FJUMP:
-          fjump_assemble(i,&regs[i]);ds=1;break;
-        case SPAN:
-          pagespan_assemble(i,&regs[i]);break;
-      }
-      if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
+      if (!dops[i].is_jump || dops[i].itype == CJUMP)
+        load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
+      if (ram_offset && (dops[i].is_load || dops[i].is_store))
+        load_regs(regs[i].regmap_entry,regs[i].regmap,ROREG,ROREG);
+      if (dops[i].is_store)
+        load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
+
+      ds = assemble(i, &regs[i], ccadj[i]);
+
+      if (dops[i].is_ujump)
         literal_pool(1024);
       else
         literal_pool_jumpover(256);
     }
   }
-  //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
+
+  assert(slen > 0);
+  if (slen > 0 && dops[slen-1].itype == INTCALL) {
+    // no ending needed for this block since INTCALL never returns
+  }
   // If the block did not end with an unconditional branch,
   // add a jump to the next instruction.
-  if(i>1) {
-    if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
-      assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
+  else if (i > 1) {
+    if (!dops[i-2].is_ujump && dops[i-1].itype != SPAN) {
+      assert(!dops[i-1].is_jump);
       assert(i==slen);
-      if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
-        store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
+      if(dops[i-2].itype!=CJUMP&&dops[i-2].itype!=SJUMP) {
+        store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
           emit_loadreg(CCREG,HOST_CCREG);
-        emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
-      }
-      else if(!likely[i-2])
-      {
-        store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
-        assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
+        emit_addimm(HOST_CCREG, ccadj[i-1] + CLOCK_ADJUST(1), HOST_CCREG);
       }
       else
       {
-        store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
-        assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
+        store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].dirty,start+i*4);
+        assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
       }
-      add_to_linker((int)out,start+i*4,0);
+      add_to_linker(out,start+i*4,0);
       emit_jmp(0);
     }
   }
   else
   {
     assert(i>0);
-    assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
-    store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
+    assert(!dops[i-1].is_jump);
+    store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
       emit_loadreg(CCREG,HOST_CCREG);
-    emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
-    add_to_linker((int)out,start+i*4,0);
+    emit_addimm(HOST_CCREG, ccadj[i-1] + CLOCK_ADJUST(1), HOST_CCREG);
+    add_to_linker(out,start+i*4,0);
     emit_jmp(0);
   }
 
@@ -10232,7 +9274,7 @@ int new_recompile_block(int addr)
   // Stubs
   for(i=0;i<stubcount;i++)
   {
-    switch(stubs[i][0])
+    switch(stubs[i].type)
     {
       case LOADB_STUB:
       case LOADH_STUB:
@@ -10263,37 +9305,47 @@ int new_recompile_block(int addr)
   /* Pass 9 - Linker */
   for(i=0;i<linkcount;i++)
   {
-    assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
+    assem_debug("%p -> %8x\n",link_addr[i].addr,link_addr[i].target);
     literal_pool(64);
-    if(!link_addr[i][2])
+    if (!link_addr[i].ext)
     {
-      void *stub=out;
-      void *addr=check_addr(link_addr[i][1]);
-      emit_extjump(link_addr[i][0],link_addr[i][1]);
-      if(addr) {
-        set_jump_target(link_addr[i][0],(int)addr);
-        add_link(link_addr[i][1],stub);
+      void *stub = out;
+      void *addr = check_addr(link_addr[i].target);
+      emit_extjump(link_addr[i].addr, link_addr[i].target);
+      if (addr) {
+        set_jump_target(link_addr[i].addr, addr);
+        add_jump_out(link_addr[i].target,stub);
       }
-      else set_jump_target(link_addr[i][0],(int)stub);
+      else
+        set_jump_target(link_addr[i].addr, stub);
     }
     else
     {
       // Internal branch
-      int target=(link_addr[i][1]-start)>>2;
+      int target=(link_addr[i].target-start)>>2;
       assert(target>=0&&target<slen);
       assert(instr_addr[target]);
       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
-      //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
+      //set_jump_target_fillslot(link_addr[i].addr,instr_addr[target],link_addr[i].ext>>1);
       //#else
-      set_jump_target(link_addr[i][0],instr_addr[target]);
+      set_jump_target(link_addr[i].addr, instr_addr[target]);
       //#endif
     }
   }
+
+  u_int source_len = slen*4;
+  if (dops[slen-1].itype == INTCALL && source_len > 4)
+    // No need to treat the last instruction as compiled,
+    // since the interpreter fully handles it.
+    source_len -= 4;
+
+  if ((u_char *)copy + source_len > (u_char *)shadow + sizeof(shadow))
+    copy = shadow;
+
   // External Branch Targets (jump_in)
-  if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
   for(i=0;i<slen;i++)
   {
-    if(bt[i]||i==0)
+    if(dops[i].bt||i==0)
     {
       if(instr_addr[i]) // TODO - delay slots (=null)
       {
@@ -10302,22 +9354,20 @@ int new_recompile_block(int addr)
         u_int vpage=get_vpage(vaddr);
         literal_pool(256);
         {
-          assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
+          assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
           assem_debug("jump_in: %x\n",start+i*4);
-          ll_add(jump_dirty+vpage,vaddr,(void *)out);
-          int entry_point=do_dirty_stub(i);
-          ll_add_flags(jump_in+page,vaddr,state_rflags,(void *)entry_point);
+          ll_add(jump_dirty+vpage,vaddr,out);
+          void *entry_point = do_dirty_stub(i, source_len);
+          ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
           // If there was an existing entry in the hash table,
           // replace it with the new address.
           // Don't add new entries.  We'll insert the
           // ones that actually get used in check_addr().
-          u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
-          if(ht_bin[0]==vaddr) {
-            ht_bin[1]=entry_point;
-          }
-          if(ht_bin[2]==vaddr) {
-            ht_bin[3]=entry_point;
-          }
+          struct ht_entry *ht_bin = hash_table_get(vaddr);
+          if (ht_bin->vaddr[0] == vaddr)
+            ht_bin->tcaddr[0] = entry_point;
+          if (ht_bin->vaddr[1] == vaddr)
+            ht_bin->tcaddr[1] = entry_point;
         }
       }
     }
@@ -10328,16 +9378,17 @@ int new_recompile_block(int addr)
   // Align code
   if(((u_int)out)&7) emit_addnop(13);
   #endif
-  assert((u_int)out-(u_int)beginning<MAX_OUTPUT_BLOCK_SIZE);
-  //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
-  memcpy(copy,source,slen*4);
-  copy+=slen*4;
+  assert(out - (u_char *)beginning < MAX_OUTPUT_BLOCK_SIZE);
+  //printf("shadow buffer: %p-%p\n",copy,(u_char *)copy+slen*4);
+  memcpy(copy, source, source_len);
+  copy += source_len;
 
   end_block(beginning);
 
   // If we're within 256K of the end of the buffer,
   // start over from the beginning. (Is 256K enough?)
-  if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
+  if (out > ndrc->translation_cache + sizeof(ndrc->translation_cache) - MAX_OUTPUT_BLOCK_SIZE)
+    out = ndrc->translation_cache;
 
   // Trap writes to any of the pages we compiled
   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
@@ -10354,56 +9405,62 @@ int new_recompile_block(int addr)
 
   /* Pass 10 - Free memory by expiring oldest blocks */
 
-  int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
+  int end=(((out-ndrc->translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535;
   while(expirep!=end)
   {
     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
-    int base=(int)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
+    uintptr_t base_offs = ((uintptr_t)(expirep >> 13) << shift); // Base offset of this block
+    uintptr_t base_offs_s = base_offs >> shift;
     inv_debug("EXP: Phase %d\n",expirep);
     switch((expirep>>11)&3)
     {
       case 0:
         // Clear jump_in and jump_dirty
-        ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
-        ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
-        ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
-        ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
+        ll_remove_matching_addrs(jump_in+(expirep&2047),base_offs_s,shift);
+        ll_remove_matching_addrs(jump_dirty+(expirep&2047),base_offs_s,shift);
+        ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base_offs_s,shift);
+        ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base_offs_s,shift);
         break;
       case 1:
         // Clear pointers
-        ll_kill_pointers(jump_out[expirep&2047],base,shift);
-        ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
+        ll_kill_pointers(jump_out[expirep&2047],base_offs_s,shift);
+        ll_kill_pointers(jump_out[(expirep&2047)+2048],base_offs_s,shift);
         break;
       case 2:
         // Clear hash table
         for(i=0;i<32;i++) {
-          u_int *ht_bin=hash_table[((expirep&2047)<<5)+i];
-          if((ht_bin[3]>>shift)==(base>>shift) ||
-             ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
-            inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
-            ht_bin[2]=ht_bin[3]=-1;
-          }
-          if((ht_bin[1]>>shift)==(base>>shift) ||
-             ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
-            inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
-            ht_bin[0]=ht_bin[2];
-            ht_bin[1]=ht_bin[3];
-            ht_bin[2]=ht_bin[3]=-1;
+          struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
+          uintptr_t o1 = (u_char *)ht_bin->tcaddr[1] - ndrc->translation_cache;
+          uintptr_t o2 = o1 - MAX_OUTPUT_BLOCK_SIZE;
+          if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s) {
+            inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
+            ht_bin->vaddr[1] = -1;
+            ht_bin->tcaddr[1] = NULL;
+          }
+          o1 = (u_char *)ht_bin->tcaddr[0] - ndrc->translation_cache;
+          o2 = o1 - MAX_OUTPUT_BLOCK_SIZE;
+          if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s) {
+            inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
+            ht_bin->vaddr[0] = ht_bin->vaddr[1];
+            ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
+            ht_bin->vaddr[1] = -1;
+            ht_bin->tcaddr[1] = NULL;
           }
         }
         break;
       case 3:
         // Clear jump_out
-        #ifdef __arm__
         if((expirep&2047)==0)
           do_clear_cache();
-        #endif
-        ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
-        ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
+        ll_remove_matching_addrs(jump_out+(expirep&2047),base_offs_s,shift);
+        ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base_offs_s,shift);
         break;
     }
     expirep=(expirep+1)&65535;
   }
+#ifdef ASSEM_PRINT
+  fflush(stdout);
+#endif
   return 0;
 }
 
index 8c89051..8991fac 100644 (file)
@@ -1,27 +1,31 @@
-#ifndef __NEW_DYNAREC_H__
-#define __NEW_DYNAREC_H__
-
-/* #define NEW_DYNAREC 1 */
+#define NEW_DYNAREC 1
 
 extern int pcaddr;
 extern int pending_exception;
 extern int stop;
 extern int new_dynarec_did_compile;
+
+#define CYCLE_MULT_DEFAULT 175
 extern int cycle_multiplier; // 100 for 1.0
+extern int cycle_multiplier_override;
+extern int cycle_multiplier_old;
 
 #define NDHACK_NO_SMC_CHECK    (1<<0)
 #define NDHACK_GTE_UNNEEDED    (1<<1)
 #define NDHACK_GTE_NO_FLAGS    (1<<2)
+#define NDHACK_OVERRIDE_CYCLE_M        (1<<3)
+#define NDHACK_NO_STALLS       (1<<4)
+#define NDHACK_NO_COMPAT_HACKS (1<<5)
 extern int new_dynarec_hacks;
+extern int new_dynarec_hacks_pergame;
+extern int new_dynarec_hacks_old;
 
 void new_dynarec_init(void);
 void new_dynarec_cleanup(void);
 void new_dynarec_clear_full(void);
-void new_dyna_start(void);
+void new_dyna_start(void *context);
 int  new_dynarec_save_blocks(void *save, int size);
 void new_dynarec_load_blocks(const void *save, int size);
 
 void invalidate_all_pages(void);
 void invalidate_block(unsigned int block);
-
-#endif /* __NEW_DYNAREC_H__ */
index 3b00780..321bfbf 100644 (file)
@@ -1,15 +1,14 @@
-#ifndef __NEW_DYNAREC_CONFIG_H__
-#define __NEW_DYNAREC_CONFIG_H__
 
+#ifdef __arm__
 #define CORTEX_A8_BRANCH_PREDICTION_HACK 1
+#endif
+
 #define USE_MINI_HT 1
 //#define REG_PREFETCH 1
 
-#if defined(__MACH__)
+#if defined(__MACH__) || defined(VITA)
 #define NO_WRITE_EXEC 1
 #endif
 #ifdef VITA
 #define BASE_ADDR_DYNAMIC 1
 #endif
-
-#endif /* __NEW_DYNAREC_CONFIG_H__ */
diff --git a/libpcsxcore/new_dynarec/patches/trace_drc_chk b/libpcsxcore/new_dynarec/patches/trace_drc_chk
new file mode 100644 (file)
index 0000000..e98a48e
--- /dev/null
@@ -0,0 +1,133 @@
+diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c
+index f1005db..ebd1d4f 100644
+--- a/libpcsxcore/new_dynarec/new_dynarec.c
++++ b/libpcsxcore/new_dynarec/new_dynarec.c
+@@ -235,7 +235,7 @@ static struct decoded_insn
+   int new_dynarec_hacks_old;
+   int new_dynarec_did_compile;
+-  #define HACK_ENABLED(x) ((new_dynarec_hacks | new_dynarec_hacks_pergame) & (x))
++  #define HACK_ENABLED(x) ((NDHACK_NO_STALLS) & (x))
+   extern int cycle_count; // ... until end of the timeslice, counts -N -> 0
+   extern int last_count;  // last absolute target, often = next_interupt
+@@ -471,6 +471,7 @@ int cycle_multiplier_old;
+ static int CLOCK_ADJUST(int x)
+ {
++  return x * 2;
+   int m = cycle_multiplier_override && cycle_multiplier == CYCLE_MULT_DEFAULT
+         ? cycle_multiplier_override : cycle_multiplier;
+   int s=(x>>31)|1;
+@@ -522,6 +523,9 @@ static int doesnt_expire_soon(void *tcaddr)
+ // This is called from the recompiled JR/JALR instructions
+ void noinline *get_addr(u_int vaddr)
+ {
++#ifdef DRC_DBG
++printf("get_addr %08x, pc=%08x\n", vaddr, psxRegs.pc);
++#endif
+   u_int page=get_page(vaddr);
+   u_int vpage=get_vpage(vaddr);
+   struct ll_entry *head;
+@@ -6248,7 +6252,7 @@ void unneeded_registers(int istart,int iend,int r)
+     // R0 is always unneeded
+     u|=1;
+     // Save it
+-    unneeded_reg[i]=u;
++    unneeded_reg[i]=1;//u;
+     gte_unneeded[i]=gte_u;
+     /*
+     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
+@@ -8794,6 +8798,7 @@ int new_recompile_block(u_int addr)
+   // This allocates registers (if possible) one instruction prior
+   // to use, which can avoid a load-use penalty on certain CPUs.
++#if 0
+   for(i=0;i<slen-1;i++)
+   {
+     if (!i || !dops[i-1].is_jump)
+@@ -8950,6 +8955,7 @@ int new_recompile_block(u_int addr)
+       }
+     }
+   }
++#endif
+   /* Pass 6 - Optimize clean/dirty state */
+   clean_registers(0,slen-1,1);
+@@ -9204,6 +9210,11 @@ int new_recompile_block(u_int addr)
+         load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
+       ds = assemble(i, &regs[i], ccadj[i]);
++#ifdef DRC_DBG
++       // write-out non-consts, consts are likely different because of get_final_value()
++       if (!dops[i].is_jump)
++         wb_dirtys(regs[i].regmap,regs[i].dirty&~regs[i].loadedconst);
++#endif
+       if (dops[i].is_ujump)
+         literal_pool(1024);
+@@ -9439,6 +9450,10 @@ int new_recompile_block(u_int addr)
+   }
+ #ifdef ASSEM_PRINT
+   fflush(stdout);
++#endif
++#ifdef DRC_DBG
++printf("new_recompile_block done\n");
++fflush(stdout);
+ #endif
+   return 0;
+ }
+diff --git a/libpcsxcore/new_dynarec/pcsxmem.c b/libpcsxcore/new_dynarec/pcsxmem.c
+index bb471b6..8f68a3b 100644
+--- a/libpcsxcore/new_dynarec/pcsxmem.c
++++ b/libpcsxcore/new_dynarec/pcsxmem.c
+@@ -272,6 +272,8 @@ static void write_biu(u32 value)
+       if (address != 0xfffe0130)
+               return;
++extern u32 handler_cycle;
++handler_cycle = psxRegs.cycle;
+       switch (value) {
+       case 0x800: case 0x804:
+               unmap_ram_write();
+diff --git a/libpcsxcore/psxcounters.c b/libpcsxcore/psxcounters.c
+index b2cc07b..f916580 100644
+--- a/libpcsxcore/psxcounters.c
++++ b/libpcsxcore/psxcounters.c
+@@ -378,9 +378,12 @@ void psxRcntUpdate()
+ /******************************************************************************/
++extern u32 handler_cycle;
++
+ void psxRcntWcount( u32 index, u32 value )
+ {
+     verboseLog( 2, "[RCNT %i] wcount: %x\n", index, value );
++handler_cycle = psxRegs.cycle;
+     _psxRcntWcount( index, value );
+     psxRcntSet();
+@@ -389,6 +392,7 @@ void psxRcntWcount( u32 index, u32 value )
+ void psxRcntWmode( u32 index, u32 value )
+ {
+     verboseLog( 1, "[RCNT %i] wmode: %x\n", index, value );
++handler_cycle = psxRegs.cycle;
+     _psxRcntWmode( index, value );
+     _psxRcntWcount( index, 0 );
+@@ -400,6 +404,7 @@ void psxRcntWmode( u32 index, u32 value )
+ void psxRcntWtarget( u32 index, u32 value )
+ {
+     verboseLog( 1, "[RCNT %i] wtarget: %x\n", index, value );
++handler_cycle = psxRegs.cycle;
+     rcnts[index].target = value;
+@@ -412,6 +417,7 @@ void psxRcntWtarget( u32 index, u32 value )
+ u32 psxRcntRcount( u32 index )
+ {
+     u32 count;
++handler_cycle = psxRegs.cycle;
+     count = _psxRcntRcount( index );
diff --git a/libpcsxcore/new_dynarec/patches/trace_intr b/libpcsxcore/new_dynarec/patches/trace_intr
new file mode 100644 (file)
index 0000000..fc03e7f
--- /dev/null
@@ -0,0 +1,323 @@
+diff --git a/libpcsxcore/new_dynarec/emu_if.c b/libpcsxcore/new_dynarec/emu_if.c
+index 90c4660..441eaca 100644
+--- a/libpcsxcore/new_dynarec/emu_if.c
++++ b/libpcsxcore/new_dynarec/emu_if.c
+@@ -424,13 +424,17 @@ static void ari64_shutdown()
+ {
+       new_dynarec_cleanup();
+       new_dyna_pcsx_mem_shutdown();
++      (void)ari64_execute;
+ }
++extern void intExecuteT();
++extern void intExecuteBlockT();
++
+ R3000Acpu psxRec = {
+       ari64_init,
+       ari64_reset,
+-      ari64_execute,
+-      ari64_execute_until,
++      intExecuteT,
++      intExecuteBlockT,
+       ari64_clear,
+       ari64_notify,
+       ari64_apply_config,
+@@ -501,7 +505,7 @@ static u32 memcheck_read(u32 a)
+       return *(u32 *)(psxM + (a & 0x1ffffc));
+ }
+-#if 0
++#if 1
+ void do_insn_trace(void)
+ {
+       static psxRegisters oldregs;
+diff --git a/libpcsxcore/new_dynarec/pcsxmem.c b/libpcsxcore/new_dynarec/pcsxmem.c
+index bb471b6..8f68a3b 100644
+--- a/libpcsxcore/new_dynarec/pcsxmem.c
++++ b/libpcsxcore/new_dynarec/pcsxmem.c
+@@ -272,6 +272,8 @@ static void write_biu(u32 value)
+       if (address != 0xfffe0130)
+               return;
++extern u32 handler_cycle;
++handler_cycle = psxRegs.cycle;
+       switch (value) {
+       case 0x800: case 0x804:
+               unmap_ram_write();
+diff --git a/libpcsxcore/psxcounters.c b/libpcsxcore/psxcounters.c
+index b2cc07b..f916580 100644
+--- a/libpcsxcore/psxcounters.c
++++ b/libpcsxcore/psxcounters.c
+@@ -378,9 +378,12 @@ void psxRcntUpdate()
+ /******************************************************************************/
++extern u32 handler_cycle;
++
+ void psxRcntWcount( u32 index, u32 value )
+ {
+     verboseLog( 2, "[RCNT %i] wcount: %x\n", index, value );
++handler_cycle = psxRegs.cycle;
+     _psxRcntWcount( index, value );
+     psxRcntSet();
+@@ -389,6 +392,7 @@ void psxRcntWcount( u32 index, u32 value )
+ void psxRcntWmode( u32 index, u32 value )
+ {
+     verboseLog( 1, "[RCNT %i] wmode: %x\n", index, value );
++handler_cycle = psxRegs.cycle;
+     _psxRcntWmode( index, value );
+     _psxRcntWcount( index, 0 );
+@@ -400,6 +404,7 @@ void psxRcntWmode( u32 index, u32 value )
+ void psxRcntWtarget( u32 index, u32 value )
+ {
+     verboseLog( 1, "[RCNT %i] wtarget: %x\n", index, value );
++handler_cycle = psxRegs.cycle;
+     rcnts[index].target = value;
+@@ -412,6 +417,7 @@ void psxRcntWtarget( u32 index, u32 value )
+ u32 psxRcntRcount( u32 index )
+ {
+     u32 count;
++handler_cycle = psxRegs.cycle;
+     count = _psxRcntRcount( index );
+diff --git a/libpcsxcore/psxhw.c b/libpcsxcore/psxhw.c
+index dbcb989..0716f5e 100644
+--- a/libpcsxcore/psxhw.c
++++ b/libpcsxcore/psxhw.c
+@@ -373,13 +373,14 @@ void psxHwWrite8(u32 add, u8 value) {
+               case 0x1f801803: cdrWrite3(value); break;
+               default:
++                      if (add < 0x1f802000)
+                       psxHu8(add) = value;
+ #ifdef PSXHW_LOG
+                       PSXHW_LOG("*Unknown 8bit write at address %x value %x\n", add, value);
+ #endif
+                       return;
+       }
+-      psxHu8(add) = value;
++      //psxHu8(add) = value;
+ #ifdef PSXHW_LOG
+       PSXHW_LOG("*Known 8bit write at address %x value %x\n", add, value);
+ #endif
+@@ -504,6 +505,7 @@ void psxHwWrite16(u32 add, u16 value) {
+                               return;
+                       }
++                      if (add < 0x1f802000)
+                       psxHu16ref(add) = SWAPu16(value);
+ #ifdef PSXHW_LOG
+                       PSXHW_LOG("*Unknown 16bit write at address %x value %x\n", add, value);
+@@ -699,9 +701,9 @@ void psxHwWrite32(u32 add, u32 value) {
+                       return;
+               case 0x1f801820:
+-                      mdecWrite0(value); break;
++                      mdecWrite0(value); return;
+               case 0x1f801824:
+-                      mdecWrite1(value); break;
++                      mdecWrite1(value); return;
+               case 0x1f801100:
+ #ifdef PSXHW_LOG
+@@ -759,6 +761,7 @@ void psxHwWrite32(u32 add, u32 value) {
+                               return;
+                       }
++                      if (add < 0x1f802000)
+                       psxHu32ref(add) = SWAPu32(value);
+ #ifdef PSXHW_LOG
+                       PSXHW_LOG("*Unknown 32bit write at address %x value %x\n", add, value);
+diff --git a/libpcsxcore/psxinterpreter.c b/libpcsxcore/psxinterpreter.c
+index f7898e9..1f125ed 100644
+--- a/libpcsxcore/psxinterpreter.c
++++ b/libpcsxcore/psxinterpreter.c
+@@ -466,6 +466,8 @@ static void doBranch(u32 tar) {
+       psxRegs.pc += 4;
+       psxRegs.cycle += BIAS;
++      (void)tmp;
++#if 0
+       // check for load delay
+       tmp = psxRegs.code >> 26;
+       switch (tmp) {
+@@ -499,13 +501,15 @@ static void doBranch(u32 tar) {
+                       }
+                       break;
+       }
+-
++#endif
+       psxBSC[psxRegs.code >> 26]();
+       branch = 0;
+       psxRegs.pc = branchPC;
++      psxRegs.cycle += BIAS;
+       psxBranchTest();
++      psxRegs.cycle -= BIAS;
+ }
+ /*********************************************************
+@@ -615,12 +619,13 @@ void psxMULTU_stall() {
+       psxMULTU();
+ }
++#define doBranchNotTaken() do { psxRegs.cycle += BIAS; execI(); psxBranchTest(); psxRegs.cycle -= BIAS; } while(0)
+ /*********************************************************
+ * Register branch logic                                  *
+ * Format:  OP rs, offset                                 *
+ *********************************************************/
+-#define RepZBranchi32(op)      if(_i32(_rRs_) op 0) doBranch(_BranchTarget_);
+-#define RepZBranchLinki32(op)  { _SetLink(31); if(_i32(_rRs_) op 0) { doBranch(_BranchTarget_); } }
++#define RepZBranchi32(op)      if(_i32(_rRs_) op 0) doBranch(_BranchTarget_); else doBranchNotTaken();
++#define RepZBranchLinki32(op)  { _SetLink(31); if(_i32(_rRs_) op 0) { doBranch(_BranchTarget_); } else doBranchNotTaken(); }
+ void psxBGEZ()   { RepZBranchi32(>=) }      // Branch if Rs >= 0
+ void psxBGEZAL() { RepZBranchLinki32(>=) }  // Branch if Rs >= 0 and link
+@@ -702,7 +707,7 @@ void psxRFE() {
+ * Register branch logic                                  *
+ * Format:  OP rs, rt, offset                             *
+ *********************************************************/
+-#define RepBranchi32(op)      if(_i32(_rRs_) op _i32(_rRt_)) doBranch(_BranchTarget_);
++#define RepBranchi32(op)      if(_i32(_rRs_) op _i32(_rRt_)) doBranch(_BranchTarget_); else doBranchNotTaken();
+ void psxBEQ() {       RepBranchi32(==) }  // Branch if Rs == Rt
+ void psxBNE() {       RepBranchi32(!=) }  // Branch if Rs != Rt
+@@ -886,6 +891,7 @@ void MTC0(int reg, u32 val) {
+               case 12: // Status
+                       psxRegs.CP0.r[12] = val;
+                       psxTestSWInts();
++                      //psxBranchTest();
+                       break;
+               case 13: // Cause
+@@ -1027,6 +1033,23 @@ void intExecuteBlock() {
+       while (!branch2) execI();
+ }
++extern void do_insn_trace(void);
++
++void intExecuteT() {
++      for (;;) {
++              do_insn_trace();
++              execI();
++      }
++}
++
++void intExecuteBlockT() {
++      branch2 = 0;
++      while (!branch2) {
++              do_insn_trace();
++              execI();
++      }
++}
++
+ static void intClear(u32 Addr, u32 Size) {
+ }
+@@ -1049,7 +1072,7 @@ void intApplyConfig() {
+       assert(psxSPC[26] == psxDIV   || psxSPC[26] == psxDIV_stall);
+       assert(psxSPC[27] == psxDIVU  || psxSPC[27] == psxDIVU_stall);
+-      if (Config.DisableStalls) {
++      if (1) {
+               psxBSC[18] = psxCOP2;
+               psxBSC[50] = gteLWC2;
+               psxBSC[58] = gteSWC2;
+@@ -1091,9 +1114,10 @@ void execI() {
+       if (Config.Debug) ProcessDebug();
+       psxRegs.pc += 4;
+-      psxRegs.cycle += BIAS;
+       psxBSC[psxRegs.code >> 26]();
++
++      psxRegs.cycle += BIAS;
+ }
+ R3000Acpu psxInt = {
+diff --git a/libpcsxcore/psxmem.c b/libpcsxcore/psxmem.c
+index 04aeec2..710a379 100644
+--- a/libpcsxcore/psxmem.c
++++ b/libpcsxcore/psxmem.c
+@@ -217,11 +217,13 @@ void psxMemShutdown() {
+ }
+ static int writeok = 1;
++extern u32 last_io_addr;
+ u8 psxMemRead8(u32 mem) {
+       char *p;
+       u32 t;
++      last_io_addr = mem;
+       t = mem >> 16;
+       if (t == 0x1f80 || t == 0x9f80 || t == 0xbf80) {
+               if ((mem & 0xffff) < 0x400)
+@@ -247,6 +249,7 @@ u16 psxMemRead16(u32 mem) {
+       char *p;
+       u32 t;
++      last_io_addr = mem;
+       t = mem >> 16;
+       if (t == 0x1f80 || t == 0x9f80 || t == 0xbf80) {
+               if ((mem & 0xffff) < 0x400)
+@@ -272,6 +275,7 @@ u32 psxMemRead32(u32 mem) {
+       char *p;
+       u32 t;
++      last_io_addr = mem;
+       t = mem >> 16;
+       if (t == 0x1f80 || t == 0x9f80 || t == 0xbf80) {
+               if ((mem & 0xffff) < 0x400)
+@@ -297,6 +301,7 @@ void psxMemWrite8(u32 mem, u8 value) {
+       char *p;
+       u32 t;
++      last_io_addr = mem;
+       t = mem >> 16;
+       if (t == 0x1f80 || t == 0x9f80 || t == 0xbf80) {
+               if ((mem & 0xffff) < 0x400)
+@@ -324,6 +329,7 @@ void psxMemWrite16(u32 mem, u16 value) {
+       char *p;
+       u32 t;
++      last_io_addr = mem;
+       t = mem >> 16;
+       if (t == 0x1f80 || t == 0x9f80 || t == 0xbf80) {
+               if ((mem & 0xffff) < 0x400)
+@@ -351,6 +357,7 @@ void psxMemWrite32(u32 mem, u32 value) {
+       char *p;
+       u32 t;
++      last_io_addr = mem;
+ //    if ((mem&0x1fffff) == 0x71E18 || value == 0x48088800) SysPrintf("t2fix!!\n");
+       t = mem >> 16;
+       if (t == 0x1f80 || t == 0x9f80 || t == 0xbf80) {
+@@ -380,6 +387,8 @@ void psxMemWrite32(u32 mem, u32 value) {
+                       } else {
+                               int i;
++extern u32 handler_cycle;
++handler_cycle = psxRegs.cycle;
+                               switch (value) {
+                                       case 0x800: case 0x804:
+                                               if (writeok == 0) break;
+diff --git a/libpcsxcore/r3000a.c b/libpcsxcore/r3000a.c
+index 7e6f16b..0114947 100644
+--- a/libpcsxcore/r3000a.c
++++ b/libpcsxcore/r3000a.c
+@@ -120,6 +120,8 @@ void psxException(u32 code, u32 bd) {
+ }
+ void psxBranchTest() {
++ extern u32 irq_test_cycle;
++ irq_test_cycle = psxRegs.cycle;
+       if ((psxRegs.cycle - psxNextsCounter) >= psxNextCounter)
+               psxRcntUpdate();
similarity index 95%
rename from libpcsxcore/new_dynarec/backends/psx/pcsxmem.c
rename to libpcsxcore/new_dynarec/pcsxmem.c
index 647981e..bb471b6 100644 (file)
@@ -6,11 +6,11 @@
  */
 
 #include <stdio.h>
-#include "../../../psxhw.h"
-#include "../../../cdrom.h"
-#include "../../../mdec.h"
-#include "../../../gpu.h"
-#include "../../../psxmem_map.h"
+#include "../psxhw.h"
+#include "../cdrom.h"
+#include "../mdec.h"
+#include "../gpu.h"
+#include "../psxmem_map.h"
 #include "emu_if.h"
 #include "pcsxmem.h"
 
 //#define memprintf printf
 #define memprintf(...)
 
-static u32 *mem_readtab;
-static u32 *mem_writetab;
-static u32 mem_iortab[(1+2+4) * 0x1000 / 4];
-static u32 mem_iowtab[(1+2+4) * 0x1000 / 4];
-static u32 mem_ffwtab[(1+2+4) * 0x1000 / 4];
-//static u32 mem_unmrtab[(1+2+4) * 0x1000 / 4];
-static u32 mem_unmwtab[(1+2+4) * 0x1000 / 4];
+static uintptr_t *mem_readtab;
+static uintptr_t *mem_writetab;
+static uintptr_t mem_iortab[(1+2+4) * 0x1000 / 4];
+static uintptr_t mem_iowtab[(1+2+4) * 0x1000 / 4];
+static uintptr_t mem_ffwtab[(1+2+4) * 0x1000 / 4];
+//static uintptr_t mem_unmrtab[(1+2+4) * 0x1000 / 4];
+static uintptr_t mem_unmwtab[(1+2+4) * 0x1000 / 4];
 
-// When this is called in a loop, and 'h' is a function pointer, clang will crash.
+static
 #ifdef __clang__
-static __attribute__ ((noinline)) void map_item(u32 *out, const void *h, u32 flag)
-#else
-static void map_item(u32 *out, const void *h, u32 flag)
+// When this is called in a loop, and 'h' is a function pointer, clang will crash.
+__attribute__ ((noinline))
 #endif
+void map_item(uintptr_t *out, const void *h, uintptr_t flag)
 {
-       u32 hv = (u32)h;
+       uintptr_t hv = (uintptr_t)h;
        if (hv & 1) {
                SysPrintf("FATAL: %p has LSB set\n", h);
                abort();
        }
-       *out = (hv >> 1) | (flag << 31);
+       *out = (hv >> 1) | (flag << (sizeof(hv) * 8 - 1));
 }
 
 // size must be power of 2, at least 4k
@@ -90,7 +90,7 @@ static void io_write_sio32(u32 value)
        sioWrite8((unsigned char)(value >> 24));
 }
 
-#ifndef DRC_DBG
+#if !defined(DRC_DBG) && defined(__arm__)
 
 static void map_rcnt_rcount0(u32 mode)
 {
@@ -306,7 +306,7 @@ void new_dyna_pcsx_mem_init(void)
        int i;
 
        // have to map these further to keep tcache close to .text
-       mem_readtab = psxMap(0x08000000, 0x200000 * 4, 0, MAP_TAG_LUTS);
+       mem_readtab = psxMap(0x08000000, 0x200000 * sizeof(mem_readtab[0]), 0, MAP_TAG_LUTS);
        if (mem_readtab == NULL) {
                SysPrintf("failed to map mem tables\n");
                exit(1);
similarity index 76%
rename from libpcsxcore/new_dynarec/backends/psx/pcsxmem.h
rename to libpcsxcore/new_dynarec/pcsxmem.h
index 9d292a6..72892a8 100644 (file)
@@ -1,5 +1,3 @@
-#ifndef __PCSXMEM_H__
-#define __PCSXMEM_H__
 
 extern u8 zero_mem[0x1000];
 
@@ -9,5 +7,3 @@ void new_dyna_pcsx_mem_load_state(void);
 void new_dyna_pcsx_mem_shutdown(void);
 
 int pcsxmem_is_handler_dynamic(unsigned int addr);
-
-#endif /* __PCSXMEM_H__ */
@@ -15,14 +15,16 @@ static int pcsx_direct_read(int type, u_int addr, int cc_adj, int cc, int rs, in
       case 0x1120: // rcnt2 count
         if (rt < 0) goto dont_care;
         if (cc < 0) return 0;
-        emit_readword((int)&rcnts[2].mode, HOST_TEMPREG);
-        emit_readword((int)&rcnts[2].cycleStart, rt);
+        host_tempreg_acquire();
+        emit_readword(&rcnts[2].mode, HOST_TEMPREG);
+        emit_readword(&rcnts[2].cycleStart, rt);
         emit_testimm(HOST_TEMPREG, 0x200);
-        emit_readword((int)&last_count, HOST_TEMPREG);
+        emit_readword(&last_count, HOST_TEMPREG);
         emit_sub(HOST_TEMPREG, rt, HOST_TEMPREG);
         emit_add(HOST_TEMPREG, cc, HOST_TEMPREG);
         if (cc_adj)
           emit_addimm(HOST_TEMPREG, cc_adj, rt);
+        host_tempreg_release();
         emit_shrne_imm(rt, 3, rt);
         mov_loadtype_adj(type!=LOADW_STUB?type:LOADH_STUB, rt, rt);
         goto hit;
@@ -31,9 +33,11 @@ static int pcsx_direct_read(int type, u_int addr, int cc_adj, int cc, int rs, in
       case 0x1124: // rcnt mode
         if (rt < 0) return 0;
         t = (addr >> 4) & 3;
-        emit_readword((int)&rcnts[t].mode, rt);
+        emit_readword(&rcnts[t].mode, rt);
+        host_tempreg_acquire();
         emit_andimm(rt, ~0x1800, HOST_TEMPREG);
-        emit_writeword(HOST_TEMPREG, (int)&rcnts[t].mode);
+        emit_writeword(HOST_TEMPREG, &rcnts[t].mode);
+        host_tempreg_release();
         mov_loadtype_adj(type, rt, rt);
         goto hit;
     }
index 34f2481..8538064 100644 (file)
@@ -777,9 +777,7 @@ unsigned char _PADpoll(int port, unsigned char value) {
 
        //if no new request the pad return 0xff, for signaling connected
        if (reqPos >= respSize
-#ifdef ICACHE_EMULATION
         && writeok
-#endif
         ) return 0xff;
 
        switch(reqPos){
index 46e1595..c191832 100644 (file)
 #include "sio.h"
 #include <zlib.h>
 
+#if (defined(__GNUC__) && __GNUC__ >= 5) || defined(__clang__)
+#pragma GCC diagnostic ignored "-Wpointer-sign"
+#endif
+
 #undef SysPrintf
 #define SysPrintf if (Config.PsxOut) printf
 
@@ -1402,10 +1406,8 @@ void psxBios_FlushCache() { // 44
 #ifdef PSXBIOS_LOG
        PSXBIOS_LOG("psxBios_%s\n", biosA0n[0x44]);
 #endif
-#ifdef ICACHE_EMULATION
-    psxCpu->Notify(R3000ACPU_NOTIFY_CACHE_ISOLATED, NULL);
-    psxCpu->Notify(R3000ACPU_NOTIFY_CACHE_UNISOLATED, NULL);
-#endif
+       psxCpu->Notify(R3000ACPU_NOTIFY_CACHE_ISOLATED, NULL);
+       psxCpu->Notify(R3000ACPU_NOTIFY_CACHE_UNISOLATED, NULL);
        pc0 = ra;
 }
 
index 7e10550..3edab38 100644 (file)
@@ -31,6 +31,13 @@ extern "C" {
 
 #include "config.h"
 
+// XXX: these GCC 8+ warnings are silenced here rather than fixed; maybe fix them someday
+#if defined(__GNUC__) && __GNUC__ >= 8
+#pragma GCC diagnostic ignored "-Wformat-truncation"
+#pragma GCC diagnostic ignored "-Wformat-overflow"
+#pragma GCC diagnostic ignored "-Wstringop-truncation"
+#endif
+
 // System includes
 #include <stdio.h>
 #include <string.h>
@@ -131,6 +138,7 @@ typedef struct {
        boolean UseNet;
        boolean VSyncWA;
        boolean icache_emulation;
+       boolean DisableStalls;
        u8 Cpu; // CPU_DYNAREC or CPU_INTERPRETER
        u8 PsxType; // PSX_TYPE_NTSC or PSX_TYPE_PAL
 #ifdef _WIN32
index e03bc94..b2cc07b 100644 (file)
@@ -60,20 +60,15 @@ static const u32 CountToOverflow  = 0;
 static const u32 CountToTarget    = 1;
 
 static const u32 FrameRate[]      = { 60, 50 };
-static const u32 HSyncTotal[] = { 263, 314 }; // actually one more on odd lines for PAL
+static const u32 HSyncTotal[]     = { 263, 314 }; // actually one more on odd lines for PAL
 #define VBlankStart 240
 
 #define VERBOSE_LEVEL 0
-#if VERBOSE_LEVEL > 0
-static const s32 VerboseLevel     = VERBOSE_LEVEL;
-#endif
 
 /******************************************************************************/
-
-#ifndef NEW_DYNAREC
+#ifdef DRC_DISABLE
 Rcnt rcnts[ CounterQuantity ];
 #endif
-
 u32 hSyncCount = 0;
 u32 frame_counter = 0;
 static u32 hsync_steps = 0;
@@ -93,7 +88,7 @@ static
 void verboseLog( u32 level, const char *str, ... )
 {
 #if VERBOSE_LEVEL > 0
-    if( level <= VerboseLevel )
+    if( level <= VERBOSE_LEVEL )
     {
         va_list va;
         char buf[ 4096 ];
@@ -507,13 +502,16 @@ s32 psxRcntFreeze( void *f, s32 Mode )
     if (Mode == 0)
     {
         // don't trust things from a savestate
+        rcnts[3].rate = 1;
         for( i = 0; i < CounterQuantity; ++i )
         {
             _psxRcntWmode( i, rcnts[i].mode );
             count = (psxRegs.cycle - rcnts[i].cycleStart) / rcnts[i].rate;
             _psxRcntWcount( i, count );
         }
-        hsync_steps = (psxRegs.cycle - rcnts[3].cycleStart) / rcnts[3].target;
+        hsync_steps = 0;
+        if (rcnts[3].target)
+           hsync_steps = (psxRegs.cycle - rcnts[3].cycleStart) / rcnts[3].target;
         psxRcntSet();
 
         base_cycle = 0;
index f9e13bf..5d931a9 100644 (file)
@@ -26,6 +26,8 @@
 #include "gte.h"
 #include "psxhle.h"
 #include "debug.h"
+#include "psxinterpreter.h"
+#include <assert.h>
 
 static int branch = 0;
 static int branch2 = 0;
@@ -47,8 +49,6 @@ void ProcessDebug() {}
 void StopDebugger() {}
 #endif
 
-void execI();
-
 // Subsets
 void (*psxBSC[64])();
 void (*psxSPC[64])();
@@ -57,64 +57,52 @@ void (*psxCP0[32])();
 void (*psxCP2[64])(struct psxCP2Regs *regs);
 void (*psxCP2BSC[32])();
 
-#ifdef ICACHE_EMULATION
+static u32 fetchNoCache(u32 pc)
+{
+       u32 *code = (u32 *)PSXM(pc);
+       return ((code == NULL) ? 0 : SWAP32(*code));
+}
+
 /*
 Formula One 2001 :
 Use old CPU cache code when the RAM location is updated with new code (affects in-game racing)
 */
-static u8* ICache_Addr;
-static u8* ICache_Code;
-uint32_t *Read_ICache(uint32_t pc)
-{
-       uint32_t pc_bank, pc_offset, pc_cache;
-       uint8_t *IAddr, *ICode;
+static struct cache_entry {
+       u32 tag;
+       u32 data[4];
+} ICache[256];
 
-       pc_bank = pc >> 24;
-       pc_offset = pc & 0xffffff;
-       pc_cache = pc & 0xfff;
-
-       IAddr = ICache_Addr;
-       ICode = ICache_Code;
-
-       // cached - RAM
-       if (pc_bank == 0x80 || pc_bank == 0x00)
+static u32 fetchICache(u32 pc)
+{
+       // cached?
+       if (pc < 0xa0000000)
        {
-               if (SWAP32(*(uint32_t *)(IAddr + pc_cache)) == pc_offset)
-               {
-                       // Cache hit - return last opcode used
-                       return (uint32_t *)(ICode + pc_cache);
-               }
-               else
+               // this is not how the hardware works but whatever
+               struct cache_entry *entry = &ICache[(pc & 0xff0) >> 4];
+
+               if (((entry->tag ^ pc) & 0xfffffff0) != 0 || pc < entry->tag)
                {
-                       // Cache miss - addresses don't match
-                       // - default: 0xffffffff (not init)
-
-                       // cache line is 4 bytes wide
-                       pc_offset &= ~0xf;
-                       pc_cache &= ~0xf;
-
-                       // address line
-                       *(uint32_t *)(IAddr + pc_cache + 0x0) = SWAP32(pc_offset + 0x0);
-                       *(uint32_t *)(IAddr + pc_cache + 0x4) = SWAP32(pc_offset + 0x4);
-                       *(uint32_t *)(IAddr + pc_cache + 0x8) = SWAP32(pc_offset + 0x8);
-                       *(uint32_t *)(IAddr + pc_cache + 0xc) = SWAP32(pc_offset + 0xc);
-
-                       // opcode line
-                       pc_offset = pc & ~0xf;
-                       *(uint32_t *)(ICode + pc_cache + 0x0) = psxMu32ref(pc_offset + 0x0);
-                       *(uint32_t *)(ICode + pc_cache + 0x4) = psxMu32ref(pc_offset + 0x4);
-                       *(uint32_t *)(ICode + pc_cache + 0x8) = psxMu32ref(pc_offset + 0x8);
-                       *(uint32_t *)(ICode + pc_cache + 0xc) = psxMu32ref(pc_offset + 0xc);
+                       u32 *code = (u32 *)PSXM(pc & ~0x0f);
+                       if (!code)
+                               return 0;
+
+                       entry->tag = pc;
+                       // treat as 4 words, although other configurations are said to be possible
+                       switch (pc & 0x0c)
+                       {
+                               case 0x00: entry->data[0] = SWAP32(code[0]);
+                               case 0x04: entry->data[1] = SWAP32(code[1]);
+                               case 0x08: entry->data[2] = SWAP32(code[2]);
+                               case 0x0c: entry->data[3] = SWAP32(code[3]);
+                       }
                }
+               return entry->data[(pc & 0x0f) >> 2];
        }
 
-       /*
-       TODO: Probably should add cached BIOS
-       */
-       // default
-       return (uint32_t *)PSXM(pc);
+       return fetchNoCache(pc);
 }
-#endif
+
+u32 (*fetch)(u32 pc) = fetchNoCache;
 
 static void delayRead(int reg, u32 bpc) {
        u32 rold, rnew;
@@ -330,20 +318,7 @@ int psxTestLoadDelay(int reg, u32 tmp) {
 }
 
 void psxDelayTest(int reg, u32 bpc) {
-       u32 *code;
-       u32 tmp;
-
-       #ifdef ICACHE_EMULATION
-       if (Config.icache_emulation)
-       {
-               code = Read_ICache(psxRegs.pc);
-       }
-       else
-       #endif
-       {
-               code = (u32 *)PSXM(psxRegs.pc);
-       }
-       tmp = ((code == NULL) ? 0 : SWAP32(*code));
+       u32 tmp = fetch(psxRegs.pc);
        branch = 1;
 
        switch (psxTestLoadDelay(reg, tmp)) {
@@ -363,20 +338,9 @@ void psxDelayTest(int reg, u32 bpc) {
 }
 
 static u32 psxBranchNoDelay(void) {
-       u32 *code;
        u32 temp;
 
-       #ifdef ICACHE_EMULATION
-       if (Config.icache_emulation)
-       {
-               code = Read_ICache(psxRegs.pc);
-       }
-       else
-       #endif
-       {
-               code = (u32 *)PSXM(psxRegs.pc);
-       }
-       psxRegs.code = ((code == NULL) ? 0 : SWAP32(*code));
+       psxRegs.code = fetch(psxRegs.pc);
        switch (_Op_) {
                case 0x00: // SPECIAL
                        switch (_Funct_) {
@@ -494,7 +458,6 @@ static int psxDelayBranchTest(u32 tar1) {
 }
 
 static void doBranch(u32 tar) {
-       u32 *code;
        u32 tmp;
 
        branch2 = branch = 1;
@@ -504,17 +467,7 @@ static void doBranch(u32 tar) {
        if (psxDelayBranchTest(tar))
                return;
 
-       #ifdef ICACHE_EMULATION
-       if (Config.icache_emulation)
-       {
-               code = Read_ICache(psxRegs.pc);
-       }
-       else
-       #endif
-       {
-               code = (u32 *)PSXM(psxRegs.pc);
-       }
-       psxRegs.code = ((code == NULL) ? 0 : SWAP32(*code));
+       psxRegs.code = fetch(psxRegs.pc);
 
        debugI();
 
@@ -602,15 +555,27 @@ void psxDIV() {
         } else {
             _i32(_rLo_) = 0xFFFFFFFF;
         }
+/*
+ * Notaz said that this check is "not needed" on ARM platforms and could slow them down, so it is disabled for ARM.
+ * This fixes a crash that can happen when running Amidog's CPU test.
+ * (The test still gets stuck on a black screen, but at least it no longer crashes.)
+ */
+#if !defined(__arm__) && !defined(__aarch64__)
     } else if (_i32(_rRs_) == 0x80000000 && _i32(_rRt_) == 0xFFFFFFFF) {
         _i32(_rLo_) = 0x80000000;
         _i32(_rHi_) = 0;
+#endif
     } else {
         _i32(_rLo_) = _i32(_rRs_) / _i32(_rRt_);
         _i32(_rHi_) = _i32(_rRs_) % _i32(_rRt_);
     }
 }
 
+void psxDIV_stall() {
+       psxRegs.muldivBusyCycle = psxRegs.cycle + 37;
+       psxDIV();
+}
+
 void psxDIVU() {
        if (_rRt_ != 0) {
                _rLo_ = _rRs_ / _rRt_;
@@ -622,6 +587,11 @@ void psxDIVU() {
        }
 }
 
+void psxDIVU_stall() {
+       psxRegs.muldivBusyCycle = psxRegs.cycle + 37;
+       psxDIVU();
+}
+
 void psxMULT() {
        u64 res = (s64)((s64)_i32(_rRs_) * (s64)_i32(_rRt_));
 
@@ -629,6 +599,15 @@ void psxMULT() {
        psxRegs.GPR.n.hi = (u32)((res >> 32) & 0xffffffff);
 }
 
+void psxMULT_stall() {
+       // approximate, but maybe good enough
+       u32 rs = _rRs_;
+       u32 lz = __builtin_clz(((rs ^ ((s32)rs >> 21)) | 1));
+       u32 c = 7 + (2 - (lz / 11)) * 4;
+       psxRegs.muldivBusyCycle = psxRegs.cycle + c;
+       psxMULT();
+}
+
 void psxMULTU() {
        u64 res = (u64)((u64)_u32(_rRs_) * (u64)_u32(_rRt_));
 
@@ -636,6 +615,14 @@ void psxMULTU() {
        psxRegs.GPR.n.hi = (u32)((res >> 32) & 0xffffffff);
 }
 
+void psxMULTU_stall() {
+       // approximate, but maybe good enough
+       u32 lz = __builtin_clz(_rRs_ | 1);
+       u32 c = 7 + (2 - (lz / 11)) * 4;
+       psxRegs.muldivBusyCycle = psxRegs.cycle + c;
+       psxMULTU();
+}
+
 /*********************************************************
 * Register branch logic                                  *
 * Format:  OP rs, offset                                 *
@@ -679,6 +666,18 @@ void psxLUI() { if (!_Rt_) return; _u32(_rRt_) = psxRegs.code << 16; } // Upper
 void psxMFHI() { if (!_Rd_) return; _rRd_ = _rHi_; } // Rd = Hi
 void psxMFLO() { if (!_Rd_) return; _rRd_ = _rLo_; } // Rd = Lo
 
+static void mflohiCheckStall(void)
+{
+       u32 left = psxRegs.muldivBusyCycle - psxRegs.cycle;
+       if (left <= 37) {
+               //printf("muldiv stall %u\n", left);
+               psxRegs.cycle = psxRegs.muldivBusyCycle;
+       }
+}
+
+void psxMFHI_stall() { mflohiCheckStall(); psxMFHI(); }
+void psxMFLO_stall() { mflohiCheckStall(); psxMFLO(); }
+
 /*********************************************************
 * Move to GPR to HI/LO & Register jump                   *
 * Format:  OP rs                                         *
@@ -704,6 +703,7 @@ void psxRFE() {
 //     SysPrintf("psxRFE\n");
        psxRegs.CP0.n.Status = (psxRegs.CP0.n.Status & 0xfffffff0) |
                                                  ((psxRegs.CP0.n.Status & 0x3c) >> 2);
+       psxTestSWInts();
 }
 
 /*********************************************************
@@ -727,7 +727,7 @@ void psxJAL() {     _SetLink(31); doBranch(_JumpTarget_); }
 * Format:  OP rs, rd                                     *
 *********************************************************/
 void psxJR()   {
-       doBranch(_u32(_rRs_) & ~3);
+       doBranch(_rRs_ & ~3);
        psxJumpTest();
 }
 
@@ -933,20 +933,14 @@ void psxCOP0() {
        psxCP0[_Rs_]();
 }
 
-void psxCOP1() {
-#ifdef PSXCPU_LOG
-       PSXCPU_LOG("Attempted to use an invalid floating point instruction. Ignored.\n");
-#endif
-}
-
 void psxCOP2() {
        psxCP2[_Funct_]((struct psxCP2Regs *)&psxRegs.CP2D);
 }
 
-void psxCOP3() {
-#ifdef PSXCPU_LOG
-       PSXCPU_LOG("Attempted to access COP3. Ignored\n");
-#endif
+void psxCOP2_stall() {
+       u32 f = _Funct_;
+       gteCheckStall(f);
+       psxCP2[f]((struct psxCP2Regs *)&psxRegs.CP2D);
 }
 
 void psxBASIC(struct psxCP2Regs *regs) {
@@ -967,7 +961,7 @@ void psxHLE() {
 void (*psxBSC[64])() = {
        psxSPECIAL, psxREGIMM, psxJ   , psxJAL  , psxBEQ , psxBNE , psxBLEZ, psxBGTZ,
        psxADDI   , psxADDIU , psxSLTI, psxSLTIU, psxANDI, psxORI , psxXORI, psxLUI ,
-       psxCOP0   , psxCOP1  , psxCOP2, psxCOP3 , psxNULL, psxNULL, psxNULL, psxNULL,
+       psxCOP0   , psxNULL  , psxCOP2, psxNULL , psxNULL, psxNULL, psxNULL, psxNULL,
        psxNULL   , psxNULL  , psxNULL, psxNULL , psxNULL, psxNULL, psxNULL, psxNULL,
        psxLB     , psxLH    , psxLWL , psxLW   , psxLBU , psxLHU , psxLWR , psxNULL,
        psxSB     , psxSH    , psxSWL , psxSW   , psxNULL, psxNULL, psxSWR , psxNULL, 
@@ -1023,35 +1017,11 @@ void (*psxCP2BSC[32])() = {
 ///////////////////////////////////////////
 
 static int intInit() {
-#ifdef ICACHE_EMULATION
-       if (!ICache_Addr)
-       {
-               ICache_Addr = malloc(0x1000);
-               if (!ICache_Addr)
-               {
-                       return -1;
-               }
-       }
-       
-       if (!ICache_Code)
-       {
-               ICache_Code = malloc(0x1000);
-               if (!ICache_Code)
-               {
-                       return -1;
-               }
-       }
-       memset(ICache_Addr, 0xff, 0x1000);
-       memset(ICache_Code, 0xff, 0x1000);
-#endif
        return 0;
 }
 
 static void intReset() {
-#ifdef ICACHE_EMULATION
-       memset(ICache_Addr, 0xff, 0x1000);
-       memset(ICache_Code, 0xff, 0x1000);
-#endif
+       memset(&ICache, 0xff, sizeof(ICache));
 }
 
 void intExecute() {
@@ -1069,41 +1039,60 @@ static void intClear(u32 Addr, u32 Size) {
 }
 
 void intNotify (int note, void *data) {
-#ifdef ICACHE_EMULATION
        /* Gameblabla - Only clear the icache if it's isolated */
        if (note == R3000ACPU_NOTIFY_CACHE_ISOLATED)
        {
-               memset(ICache_Addr, 0xff, 0x1000);
-               memset(ICache_Code, 0xff, 0x1000);
+               memset(&ICache, 0xff, sizeof(ICache));
        }
-#endif
 }
 
-static void intShutdown() {
-#ifdef ICACHE_EMULATION
-       if (ICache_Addr)
-       {
-               free(ICache_Addr);
-               ICache_Addr = NULL;
+void intApplyConfig() {
+       assert(psxBSC[18] == psxCOP2  || psxBSC[18] == psxCOP2_stall);
+       assert(psxBSC[50] == gteLWC2  || psxBSC[50] == gteLWC2_stall);
+       assert(psxBSC[58] == gteSWC2  || psxBSC[58] == gteSWC2_stall);
+       assert(psxSPC[16] == psxMFHI  || psxSPC[16] == psxMFHI_stall);
+       assert(psxSPC[18] == psxMFLO  || psxSPC[18] == psxMFLO_stall);
+       assert(psxSPC[24] == psxMULT  || psxSPC[24] == psxMULT_stall);
+       assert(psxSPC[25] == psxMULTU || psxSPC[25] == psxMULTU_stall);
+       assert(psxSPC[26] == psxDIV   || psxSPC[26] == psxDIV_stall);
+       assert(psxSPC[27] == psxDIVU  || psxSPC[27] == psxDIVU_stall);
+
+       if (Config.DisableStalls) {
+               psxBSC[18] = psxCOP2;
+               psxBSC[50] = gteLWC2;
+               psxBSC[58] = gteSWC2;
+               psxSPC[16] = psxMFHI;
+               psxSPC[18] = psxMFLO;
+               psxSPC[24] = psxMULT;
+               psxSPC[25] = psxMULTU;
+               psxSPC[26] = psxDIV;
+               psxSPC[27] = psxDIVU;
+       } else {
+               psxBSC[18] = psxCOP2_stall;
+               psxBSC[50] = gteLWC2_stall;
+               psxBSC[58] = gteSWC2_stall;
+               psxSPC[16] = psxMFHI_stall;
+               psxSPC[18] = psxMFLO_stall;
+               psxSPC[24] = psxMULT_stall;
+               psxSPC[25] = psxMULTU_stall;
+               psxSPC[26] = psxDIV_stall;
+               psxSPC[27] = psxDIVU_stall;
        }
 
-       if (ICache_Code)
-       {
-               free(ICache_Code);
-               ICache_Code = NULL;
-       }
-#endif
+       // dynarec may occasionally call the interpreter, in such a case the
+       // cache won't work (cache only works right if all fetches go through it)
+       if (!Config.icache_emulation || psxCpu != &psxInt)
+               fetch = fetchNoCache;
+       else
+               fetch = fetchICache;
+}
+
+static void intShutdown() {
 }
 
 // interpreter execution
 void execI() {
-#ifndef ICACHE_EMULATION
-       u32 *code = (u32 *)PSXM(psxRegs.pc);
-#else
-       u32 *code = Read_ICache(psxRegs.pc);
-#endif
-       
-       psxRegs.code = ((code == NULL) ? 0 : SWAP32(*code));
+       psxRegs.code = fetch(psxRegs.pc);
 
        debugI();
 
@@ -1121,8 +1110,7 @@ R3000Acpu psxInt = {
        intExecute,
        intExecuteBlock,
        intClear,
-#ifdef ICACHE_EMULATION
        intNotify,
-#endif
+       intApplyConfig,
        intShutdown
 };
diff --git a/libpcsxcore/psxinterpreter.h b/libpcsxcore/psxinterpreter.h
new file mode 100644 (file)
index 0000000..89dd7ea
--- /dev/null
@@ -0,0 +1,7 @@
+
+extern u32 (*fetch)(u32 pc);
+
+// called by "new_dynarec"
+void execI();
+void psxNULL();
+void intApplyConfig();
index 7d9f8bf..6f85f82 100644 (file)
@@ -54,16 +54,7 @@ void (*psxUnmapHook)(void *ptr, size_t size, enum psxMapTag tag);
 void *psxMap(unsigned long addr, size_t size, int is_fixed,
                enum psxMapTag tag)
 {
-#ifdef LIGHTREC
-#ifdef MAP_FIXED_NOREPLACE
-       int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE;
-#else
-       int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED;
-#endif
-#else
        int flags = MAP_PRIVATE | MAP_ANONYMOUS;
-#endif
-
        int try_ = 0;
        unsigned long mask;
        void *req, *ret;
@@ -156,36 +147,17 @@ int psxMemInit() {
        memset(psxMemRLUT, 0, 0x10000 * sizeof(void *));
        memset(psxMemWLUT, 0, 0x10000 * sizeof(void *));
 
-#ifdef LIGHTREC
-       psxM = psxMap(0x30000000, 0x00210000, 1, MAP_TAG_RAM);
-       if (psxM == NULL)
-               psxM = psxMap(0x70000000, 0x00210000, 1, MAP_TAG_RAM);
-
-#else
        psxM = psxMap(0x80000000, 0x00210000, 1, MAP_TAG_RAM);
-#endif
-#ifndef RAM_FIXED
        if (psxM == NULL)
                psxM = psxMap(0x77000000, 0x00210000, 0, MAP_TAG_RAM);
-#endif
        if (psxM == NULL) {
                SysMessage(_("mapping main RAM failed"));
                return -1;
        }
 
        psxP = &psxM[0x200000];
-#ifdef LIGHTREC
-       psxH = psxMap(0x4f800000, 0x10000, 0, MAP_TAG_OTHER);
-       if (psxH == NULL)
-               psxH = psxMap(0x8f800000, 0x10000, 0, MAP_TAG_OTHER);
-
-       psxR = psxMap(0x4fc00000, 0x80000, 0, MAP_TAG_OTHER);
-       if (psxR == NULL)
-               psxR = psxMap(0x8fc00000, 0x80000, 0, MAP_TAG_OTHER);
-#else
        psxH = psxMap(0x1f800000, 0x10000, 0, MAP_TAG_OTHER);
        psxR = psxMap(0x1fc00000, 0x80000, 0, MAP_TAG_OTHER);
-#endif
 
        if (psxMemRLUT == NULL || psxMemWLUT == NULL || 
            psxR == NULL || psxP == NULL || psxH == NULL) {
@@ -351,7 +323,7 @@ void psxMemWrite8(u32 mem, u8 value) {
                        if (Config.Debug)
                                DebugCheckBP((mem & 0xffffff) | 0x80000000, W1);
                        *(u8 *)(p + (mem & 0xffff)) = value;
-#ifdef PSXREC
+#ifndef DRC_DISABLE
                        psxCpu->Clear((mem & (~3)), 1);
 #endif
                } else {
@@ -378,7 +350,7 @@ void psxMemWrite16(u32 mem, u16 value) {
                        if (Config.Debug)
                                DebugCheckBP((mem & 0xffffff) | 0x80000000, W2);
                        *(u16 *)(p + (mem & 0xffff)) = SWAPu16(value);
-#ifdef PSXREC
+#ifndef DRC_DISABLE
                        psxCpu->Clear((mem & (~3)), 1);
 #endif
                } else {
@@ -391,36 +363,12 @@ void psxMemWrite16(u32 mem, u16 value) {
 
 void psxMemWrite32(u32 mem, u32 value) {
        char *p;
-#if defined(ICACHE_EMULATION)
-       /*  Stores in PS1 code during cache isolation invalidate cachelines.
-        * It is assumed that cache-flush routines write to the lowest 4KB of
-        * address space for Icache, or 1KB for Dcache/scratchpad.
-        *  Originally, stores had to check 'writeok' in psxRegs struct before
-        * writing to RAM. To eliminate this necessity, we could simply patch the
-        * BIOS 0x44 FlushCache() A0 jumptable entry. Unfortunately, this won't
-        * work for some games that use less-buggy non-BIOS cache-flush routines
-        * like '007 Tomorrow Never Dies', often provided by SN-systems, the PS1
-        * toolchain provider.
-        *  Instead, we backup the lowest 64KB PS1 RAM when the cache is isolated.
-        * All stores write to RAM regardless of cache state. Thus, cache-flush
-        * routines temporarily trash the lowest 4KB of PS1 RAM. Fortunately, they
-        * ran in a 'critical section' with interrupts disabled, so there's little
-        * worry of PS1 code ever reading the trashed contents.
-        *  We point the relevant portions of psxMemRLUT[] to the 64KB backup while
-        * cache is isolated. This is in case the dynarec needs to recompile some
-        * code during isolation. As long as it reads code using psxMemRLUT[] ptrs,
-        * it should never see trashed RAM contents.
-        *
-        * -senquack, mips dynarec team, 2017
-        */
-       static u32 mem_bak[0x10000/4];
-#endif
        u32 t;
-       u32 m = mem & 0xffff;
+
 //     if ((mem&0x1fffff) == 0x71E18 || value == 0x48088800) SysPrintf("t2fix!!\n");
        t = mem >> 16;
        if (t == 0x1f80 || t == 0x9f80 || t == 0xbf80) {
-               if (m < 0x400)
+               if ((mem & 0xffff) < 0x400)
                        psxHu32ref(mem) = SWAPu32(value);
                else
                        psxHwWrite32(mem, value);
@@ -430,12 +378,12 @@ void psxMemWrite32(u32 mem, u32 value) {
                        if (Config.Debug)
                                DebugCheckBP((mem & 0xffffff) | 0x80000000, W4);
                        *(u32 *)(p + (mem & 0xffff)) = SWAPu32(value);
-#ifdef PSXREC
+#ifndef DRC_DISABLE
                        psxCpu->Clear(mem, 1);
 #endif
                } else {
                        if (mem != 0xfffe0130) {
-#ifdef PSXREC
+#ifndef DRC_DISABLE
                                if (!writeok)
                                        psxCpu->Clear(mem, 1);
 #endif
@@ -448,39 +396,22 @@ void psxMemWrite32(u32 mem, u32 value) {
 
                                switch (value) {
                                        case 0x800: case 0x804:
-                                               if (writeok == FALSE) break;
-                                               writeok = FALSE;
+                                               if (writeok == 0) break;
+                                               writeok = 0;
                                                memset(psxMemWLUT + 0x0000, 0, 0x80 * sizeof(void *));
                                                memset(psxMemWLUT + 0x8000, 0, 0x80 * sizeof(void *));
                                                memset(psxMemWLUT + 0xa000, 0, 0x80 * sizeof(void *));
-#ifdef ICACHE_EMULATION
-                                               /* Cache is now isolated, pending cache-flush sequence:
-                                               *  Backup lower 64KB of PS1 RAM, adjust psxMemRLUT[].
-                                               */
-                                               memcpy((void*)mem_bak, (void*)psxM, sizeof(mem_bak));
-                                               psxMemRLUT[0x0000] = psxMemRLUT[0x0020] = psxMemRLUT[0x0040] = psxMemRLUT[0x0060] = (u8 *)mem_bak;
-                                               psxMemRLUT[0x8000] = psxMemRLUT[0x8020] = psxMemRLUT[0x8040] = psxMemRLUT[0x8060] = (u8 *)mem_bak;
-                                               psxMemRLUT[0xa000] = psxMemRLUT[0xa020] = psxMemRLUT[0xa040] = psxMemRLUT[0xa060] = (u8 *)mem_bak;
+                                               /* Required for the icache interpreter; otherwise Armored Core won't boot */
                                                psxCpu->Notify(R3000ACPU_NOTIFY_CACHE_ISOLATED, NULL);
-#endif
                                                break;
                                        case 0x00: case 0x1e988:
-                                               if (writeok == TRUE) break;
-                                               writeok = TRUE;
+                                               if (writeok == 1) break;
+                                               writeok = 1;
                                                for (i = 0; i < 0x80; i++) psxMemWLUT[i + 0x0000] = (void *)&psxM[(i & 0x1f) << 16];
                                                memcpy(psxMemWLUT + 0x8000, psxMemWLUT, 0x80 * sizeof(void *));
                                                memcpy(psxMemWLUT + 0xa000, psxMemWLUT, 0x80 * sizeof(void *));
-#ifdef ICACHE_EMULATION
-                                               /* Cache is now unisolated:
-                                               * Restore lower 64KB RAM contents and psxMemRLUT[].
-                                               */
-                                               memcpy((void*)psxM, (void*)mem_bak, sizeof(mem_bak));
-                                               psxMemRLUT[0x0000] = psxMemRLUT[0x0020] = psxMemRLUT[0x0040] = psxMemRLUT[0x0060] = (u8 *)psxM;
-                                               psxMemRLUT[0x8000] = psxMemRLUT[0x8020] = psxMemRLUT[0x8040] = psxMemRLUT[0x8060] = (u8 *)psxM;
-                                               psxMemRLUT[0xa000] = psxMemRLUT[0xa020] = psxMemRLUT[0xa040] = psxMemRLUT[0xa060] = (u8 *)psxM;
                                                /* Dynarecs might take this opportunity to flush their code cache */
                                                psxCpu->Notify(R3000ACPU_NOTIFY_CACHE_UNISOLATED, NULL);
-#endif
                                                break;
                                        default:
 #ifdef PSXMEM_LOG
index 36b4693..fbf5f67 100644 (file)
@@ -122,12 +122,6 @@ extern u8 **psxMemRLUT;
 
 #define PSXMu32ref(mem)        (*(u32 *)PSXM(mem))
 
-#ifndef PSXREC
-#if defined(NEW_DYNAREC) || defined(LIGHTREC)
-#define PSXREC
-#endif
-#endif
-
 int psxMemInit();
 void psxMemReset();
 void psxMemShutdown();
index 3288f5c..3a7c358 100644 (file)
 #include "cdrom.h"
 #include "mdec.h"
 #include "gte.h"
+#include "psxinterpreter.h"
 
 R3000Acpu *psxCpu = NULL;
-#ifndef NEW_DYNAREC
+#ifdef DRC_DISABLE
 psxRegisters psxRegs;
 #endif
 
 int psxInit() {
        SysPrintf(_("Running PCSX Version %s (%s).\n"), PCSX_VERSION, __DATE__);
 
-#if defined(NEW_DYNAREC) || defined(LIGHTREC)
+#ifndef DRC_DISABLE
        if (Config.Cpu == CPU_INTERPRETER) {
                psxCpu = &psxInt;
        } else psxCpu = &psxRec;
 #else
+       Config.Cpu = CPU_INTERPRETER;
        psxCpu = &psxInt;
 #endif
 
@@ -52,8 +54,8 @@ int psxInit() {
 void psxReset() {
        psxMemReset();
 
-       memset(&psxRegs, 0x00, sizeof(psxRegs));
-       writeok = TRUE;
+       memset(&psxRegs, 0, sizeof(psxRegs));
+
        psxRegs.pc = 0xbfc00000; // Start in bootstrap
 
        psxRegs.CP0.r[12] = 0x10900000; // COP0 enabled | BEV = 1 | TS = 1
@@ -81,20 +83,8 @@ void psxShutdown() {
 }
 
 void psxException(u32 code, u32 bd) {
-       #ifdef ICACHE_EMULATION
-       /* Without the CPU_INTERPRETER condition, this will make Lightrec crash.
-        * Hopefully a better solution than this mess is found. - Gameblabla
-       */
-       if (Config.icache_emulation && Config.Cpu == CPU_INTERPRETER)
-       {
-               psxRegs.code = SWAPu32(*Read_ICache(psxRegs.pc));
-       }
-       else
-       #endif
-       {
-               psxRegs.code = PSXMu32(psxRegs.pc);
-       }
-
+       psxRegs.code = fetch(psxRegs.pc);
+       
        if (!Config.HLE && ((((psxRegs.code) >> 24) & 0xfe) == 0x4a)) {
                // "hokuto no ken" / "Crash Bandicot 2" ...
                // BIOS does not allow to return to GTE instructions
@@ -112,7 +102,6 @@ void psxException(u32 code, u32 bd) {
 #ifdef PSXCPU_LOG
                PSXCPU_LOG("bd set!!!\n");
 #endif
-               SysPrintf("bd set!!!\n");
                psxRegs.CP0.n.Cause |= 0x80000000;
                psxRegs.CP0.n.EPC = (psxRegs.pc - 4);
        } else
index a516645..2d7ad40 100644 (file)
@@ -29,14 +29,11 @@ extern "C" {
 #include "psxcounters.h"
 #include "psxbios.h"
 
-#ifdef ICACHE_EMULATION
 enum {
        R3000ACPU_NOTIFY_CACHE_ISOLATED = 0,
        R3000ACPU_NOTIFY_CACHE_UNISOLATED = 1,
        R3000ACPU_NOTIFY_DMA3_EXE_LOAD = 2
 };
-extern uint32_t *Read_ICache(uint32_t pc);
-#endif
 
 typedef struct {
        int  (*Init)();
@@ -44,16 +41,14 @@ typedef struct {
        void (*Execute)();              /* executes up to a break */
        void (*ExecuteBlock)(); /* executes up to a jump */
        void (*Clear)(u32 Addr, u32 Size);
-#ifdef ICACHE_EMULATION
        void (*Notify)(int note, void *data);
-#endif
+       void (*ApplyConfig)();
        void (*Shutdown)();
 } R3000Acpu;
 
 extern R3000Acpu *psxCpu;
 extern R3000Acpu psxInt;
 extern R3000Acpu psxRec;
-#define PSXREC
 
 typedef union {
 #if defined(__BIGENDIAN__)
@@ -194,6 +189,11 @@ typedef struct {
        u32 cycle;
        u32 interrupt;
        struct { u32 sCycle, cycle; } intCycle[32];
+       u32 gteBusyCycle;
+       u32 muldivBusyCycle;
+       // warning: changing anything in psxRegisters requires update of all
+       // asm in libpcsxcore/new_dynarec/, but this member can be replaced
+       u32 reserved[2];
 } psxRegisters;
 
 extern boolean writeok;
index 075e3c3..064b349 100644 (file)
@@ -64,7 +64,7 @@ static void check_mode_change(int force)
 
 void vout_update(void)
 {
-  int x = gpu.screen.x & ~1; // alignment needed by blitter
+  int x = gpu.screen.x;
   int y = gpu.screen.y;
   int w = gpu.screen.w;
   int h = gpu.screen.h;