drc: get rid of RAM_FIXED, revive ROREG
diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c
index 72f18bf..921a2ed 100644
--- a/libpcsxcore/new_dynarec/new_dynarec.c
+++ b/libpcsxcore/new_dynarec/new_dynarec.c
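
Note: the mechanical core of the change is that ram_offset is no longer folded into a scratch register before every fast-path memory access; the offset now lives in the ROREG register slot, which the allocator can keep in a host register, and the emitters apply it with dual-indexed (base register + offset register) addressing. A minimal sketch of the old vs. new access pattern, using only helpers that appear in this patch (illustrative, not the literal emitted sequence):

    /* old: materialize addr + ram_offset in a temp, then use a plain indexed load */
    host_tempreg_acquire();
    emit_addimm(addr, ram_offset, HOST_TEMPREG);   /* temp = addr + ram_offset */
    emit_readword_indexed(0, HOST_TEMPREG, tl);    /* tl = *(u32 *)(temp + 0) */
    host_tempreg_release();

    /* new: ram_offset sits in ROREG (or is loaded into the temp by get_ro_reg) */
    int offset_reg = get_ro_reg(i_regs, 0);        /* host reg holding ram_offset */
    emit_ldr_dualindexed(offset_reg, addr, tl);    /* tl = *(u32 *)(addr + offset) */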
@@ -172,9 +172,12 @@ static struct decoded_insn
   u_char rt2;
   u_char lt1;
   u_char bt:1;
-  u_char likely:1;
   u_char ooo:1;
   u_char is_ds:1;
+  u_char is_jump:1;
+  u_char is_ujump:1;
+  u_char is_load:1;
+  u_char is_store:1;
 } dops[MAXBLOCK];
 
   // used by asm:
@@ -224,11 +227,7 @@ static struct decoded_insn
   static void *copy;
   static int expirep;
   static u_int stop_after_jal;
-#ifndef RAM_FIXED
-  static uintptr_t ram_offset;
-#else
-  static const uintptr_t ram_offset=0;
-#endif
+  static u_int f1_hack; // 0 - off, ~0 - capture address, else addr
 
   int new_dynarec_hacks;
   int new_dynarec_hacks_pergame;
@@ -242,6 +241,7 @@ static struct decoded_insn
   extern int pcaddr;
   extern int pending_exception;
   extern int branch_target;
+  extern uintptr_t ram_offset;
   extern uintptr_t mini_ht[32][2];
   extern u_char restore_candidate[512];
 
@@ -254,7 +254,7 @@ static struct decoded_insn
 #define CCREG 36 // Cycle count
 #define INVCP 37 // Pointer to invalid_code
 //#define MMREG 38 // Pointer to memory_map
-//#define ROREG 39 // ram offset (if rdram!=0x80000000)
+#define ROREG 39 // ram offset (if rdram!=0x80000000)
 #define TEMPREG 40
 #define FTEMP 40 // FPU temporary register
 #define PTEMP 41 // Prefetch temporary register
@@ -344,7 +344,8 @@ static void add_stub(enum stub_type type, void *addr, void *retaddr,
 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
   int i, int addr_reg, const struct regstat *i_regs, int ccadj, u_int reglist);
 static void add_to_linker(void *addr, u_int target, int ext);
-static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override);
+static void *emit_fastpath_cmp_jump(int i, const struct regstat *i_regs,
+  int addr, int *offset_reg, int *addr_reg_override);
 static void *get_direct_memhandler(void *table, u_int addr,
   enum stub_type type, uintptr_t *addr_host);
 static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u_int reglist);
@@ -463,30 +464,18 @@ static void do_clear_cache(void)
 
 #define NO_CYCLE_PENALTY_THR 12
 
-int cycle_multiplier; // 100 for 1.0
+int cycle_multiplier = CYCLE_MULT_DEFAULT; // 100 for 1.0
 int cycle_multiplier_override;
 int cycle_multiplier_old;
 
 static int CLOCK_ADJUST(int x)
 {
-  int m = cycle_multiplier_override
+  int m = cycle_multiplier_override && cycle_multiplier == CYCLE_MULT_DEFAULT
         ? cycle_multiplier_override : cycle_multiplier;
   int s=(x>>31)|1;
   return (x * m + s * 50) / 100;
 }
 
-// is the op an unconditional jump?
-static int is_ujump(int i)
-{
-  return dops[i].itype == UJUMP || dops[i].itype == RJUMP
-    || (source[i] >> 16) == 0x1000; // beq r0, r0, offset // b offset
-}
-
-static int is_jump(int i)
-{
-  return dops[i].itype == RJUMP || dops[i].itype == UJUMP || dops[i].itype == CJUMP || dops[i].itype == SJUMP;
-}
-
 static int ds_writes_rjump_rs(int i)
 {
   return dops[i].rs1 != 0 && (dops[i].rs1 == dops[i+1].rt1 || dops[i].rs1 == dops[i+1].rt2);
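
Note: CLOCK_ADJUST now uses cycle_multiplier_override only while cycle_multiplier is still at CYCLE_MULT_DEFAULT, so an explicitly configured cycle_multiplier takes precedence over the override. The s = (x>>31)|1 term keeps the divide-by-100 rounding-to-nearest symmetric for negative cycle counts, e.g. (values illustrative):

    /* m = 150 (1.5x): */
    CLOCK_ADJUST(7);    /* ( 7*150 + 50) / 100 ==  11 */
    CLOCK_ADJUST(-7);   /* (-7*150 - 50) / 100 == -11   (s == -1) */
    /* m = 100 (1.0x): ( 7*100 + 50) / 100 ==    7, count unchanged */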
@@ -699,7 +688,7 @@ void lsn(u_char hsn[], int i, int *preferred_reg)
       j=slen-i-1;
       break;
     }
-    if (is_ujump(i+j))
+    if (dops[i+j].is_ujump)
     {
       // Don't go past an unconditonal jump
       j++;
@@ -717,11 +706,12 @@ void lsn(u_char hsn[], int i, int *preferred_reg)
       hsn[dops[i+j].rs1]=j;
       hsn[dops[i+j].rs2]=j;
     }
+    if (ram_offset && (dops[i+j].is_load || dops[i+j].is_store))
+      hsn[ROREG] = j;
     // On some architectures stores need invc_ptr
     #if defined(HOST_IMM8)
-    if(dops[i+j].itype==STORE || dops[i+j].itype==STORELR || (dops[i+j].opcode&0x3b)==0x39 || (dops[i+j].opcode&0x3b)==0x3a) {
-      hsn[INVCP]=j;
-    }
+    if (dops[i+j].is_store)
+      hsn[INVCP] = j;
     #endif
     if(i+j>=0&&(dops[i+j].itype==UJUMP||dops[i+j].itype==CJUMP||dops[i+j].itype==SJUMP))
     {
@@ -747,7 +737,7 @@ void lsn(u_char hsn[], int i, int *preferred_reg)
     // TODO: preferred register based on backward branch
   }
   // Delay slot should preferably not overwrite branch conditions or cycle count
-  if (i > 0 && is_jump(i-1)) {
+  if (i > 0 && dops[i-1].is_jump) {
     if(dops[i-1].rs1) if(hsn[dops[i-1].rs1]>1) hsn[dops[i-1].rs1]=1;
     if(dops[i-1].rs2) if(hsn[dops[i-1].rs2]>1) hsn[dops[i-1].rs2]=1;
     hsn[CCREG]=1;
@@ -756,7 +746,7 @@ void lsn(u_char hsn[], int i, int *preferred_reg)
     hsn[RHTBL]=1;
   }
   // Coprocessor load/store needs FTEMP, even if not declared
-  if(dops[i].itype==C1LS||dops[i].itype==C2LS) {
+  if(dops[i].itype==C2LS) {
     hsn[FTEMP]=0;
   }
   // Load L/R also uses FTEMP as a temporary register
@@ -782,7 +772,7 @@ int needed_again(int r, int i)
   int b=-1;
   int rn=10;
 
-  if (i > 0 && is_ujump(i-1))
+  if (i > 0 && dops[i-1].is_ujump)
   {
     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
       return 0; // Don't need any registers if exiting the block
@@ -793,7 +783,7 @@ int needed_again(int r, int i)
       j=slen-i-1;
       break;
     }
-    if (is_ujump(i+j))
+    if (dops[i+j].is_ujump)
     {
       // Don't go past an unconditonal jump
       j++;
@@ -830,7 +820,7 @@ int loop_reg(int i, int r, int hr)
       j=slen-i-1;
       break;
     }
-    if (is_ujump(i+j))
+    if (dops[i+j].is_ujump)
     {
       // Don't go past an unconditonal jump
       j++;
@@ -1456,7 +1446,7 @@ static void alloc_reg(struct regstat *cur,int i,signed char reg)
     // Don't evict the cycle count at entry points, otherwise the entry
     // stub will have to write it.
     if(dops[i].bt&&hsn[CCREG]>2) hsn[CCREG]=2;
-    if(i>1&&hsn[CCREG]>2&&(dops[i-2].itype==RJUMP||dops[i-2].itype==UJUMP||dops[i-2].itype==CJUMP||dops[i-2].itype==SJUMP)) hsn[CCREG]=2;
+    if (i>1 && hsn[CCREG] > 2 && dops[i-2].is_jump) hsn[CCREG]=2;
     for(j=10;j>=3;j--)
     {
       // Alloc preferred register if available
@@ -1562,7 +1552,7 @@ static void alloc_reg_temp(struct regstat *cur,int i,signed char reg)
     // Don't evict the cycle count at entry points, otherwise the entry
     // stub will have to write it.
     if(dops[i].bt&&hsn[CCREG]>2) hsn[CCREG]=2;
-    if(i>1&&hsn[CCREG]>2&&(dops[i-2].itype==RJUMP||dops[i-2].itype==UJUMP||dops[i-2].itype==CJUMP||dops[i-2].itype==SJUMP)) hsn[CCREG]=2;
+    if (i>1 && hsn[CCREG] > 2 && dops[i-2].is_jump) hsn[CCREG]=2;
     for(j=10;j>=3;j--)
     {
       for(r=1;r<=MAXREG;r++)
@@ -1766,7 +1756,10 @@ static void load_alloc(struct regstat *current,int i)
   clear_const(current,dops[i].rt1);
   //if(dops[i].rs1!=dops[i].rt1&&needed_again(dops[i].rs1,i)) clear_const(current,dops[i].rs1); // Does this help or hurt?
   if(!dops[i].rs1) current->u&=~1LL; // Allow allocating r0 if it's the source register
-  if(needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
+  if (needed_again(dops[i].rs1, i))
+    alloc_reg(current, i, dops[i].rs1);
+  if (ram_offset)
+    alloc_reg(current, i, ROREG);
   if(dops[i].rt1&&!((current->u>>dops[i].rt1)&1)) {
     alloc_reg(current,i,dops[i].rt1);
     assert(get_reg(current->regmap,dops[i].rt1)>=0);
@@ -1813,9 +1806,11 @@ void store_alloc(struct regstat *current,int i)
   if(dops[i].opcode==0x2c||dops[i].opcode==0x2d||dops[i].opcode==0x3f) { // 64-bit SDL/SDR/SD
     assert(0);
   }
+  if (ram_offset)
+    alloc_reg(current, i, ROREG);
   #if defined(HOST_IMM8)
   // On CPUs without 32-bit immediates we need a pointer to invalid_code
-  else alloc_reg(current,i,INVCP);
+  alloc_reg(current, i, INVCP);
   #endif
   if(dops[i].opcode==0x2a||dops[i].opcode==0x2e||dops[i].opcode==0x2c||dops[i].opcode==0x2d) { // SWL/SWL/SDL/SDR
     alloc_reg(current,i,FTEMP);
@@ -1827,21 +1822,8 @@ void store_alloc(struct regstat *current,int i)
 
 void c1ls_alloc(struct regstat *current,int i)
 {
-  //clear_const(current,dops[i].rs1); // FIXME
   clear_const(current,dops[i].rt1);
-  if(needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
   alloc_reg(current,i,CSREG); // Status
-  alloc_reg(current,i,FTEMP);
-  if(dops[i].opcode==0x35||dops[i].opcode==0x3d) { // 64-bit LDC1/SDC1
-    assert(0);
-  }
-  #if defined(HOST_IMM8)
-  // On CPUs without 32-bit immediates we need a pointer to invalid_code
-  else if((dops[i].opcode&0x3b)==0x39) // SWC1/SDC1
-    alloc_reg(current,i,INVCP);
-  #endif
-  // We need a temporary register for address generation
-  alloc_reg_temp(current,i,-1);
 }
 
 void c2ls_alloc(struct regstat *current,int i)
@@ -1849,9 +1831,11 @@ void c2ls_alloc(struct regstat *current,int i)
   clear_const(current,dops[i].rt1);
   if(needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
   alloc_reg(current,i,FTEMP);
+  if (ram_offset)
+    alloc_reg(current, i, ROREG);
   #if defined(HOST_IMM8)
   // On CPUs without 32-bit immediates we need a pointer to invalid_code
-  if((dops[i].opcode&0x3b)==0x3a) // SWC2/SDC2
+  if (dops[i].opcode == 0x3a) // SWC2
     alloc_reg(current,i,INVCP);
   #endif
   // We need a temporary register for address generation
@@ -2572,11 +2556,25 @@ static int get_ptr_mem_type(u_int a)
   return MTYPE_8000;
 }
 
-static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override)
+static int get_ro_reg(const struct regstat *i_regs, int host_tempreg_free)
+{
+  int r = get_reg(i_regs->regmap, ROREG);
+  if (r < 0 && host_tempreg_free) {
+    host_tempreg_acquire();
+    emit_loadreg(ROREG, r = HOST_TEMPREG);
+  }
+  if (r < 0)
+    abort();
+  return r;
+}
+
+static void *emit_fastpath_cmp_jump(int i, const struct regstat *i_regs,
+  int addr, int *offset_reg, int *addr_reg_override)
 {
   void *jaddr = NULL;
-  int type=0;
-  int mr=dops[i].rs1;
+  int type = 0;
+  int mr = dops[i].rs1;
+  *offset_reg = -1;
   if(((smrv_strong|smrv_weak)>>mr)&1) {
     type=get_ptr_mem_type(smrv[mr]);
     //printf("set %08x @%08x r%d %d\n", smrv[mr], start+i*4, mr, type);
@@ -2620,22 +2618,19 @@ static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override)
     }
   }
 
-  if(type==0)
+  if (type == 0) // need ram check
   {
     emit_cmpimm(addr,RAM_SIZE);
-    jaddr=out;
+    jaddr = out;
     #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
     // Hint to branch predictor that the branch is unlikely to be taken
-    if(dops[i].rs1>=28)
+    if (dops[i].rs1 >= 28)
       emit_jno_unlikely(0);
     else
     #endif
       emit_jno(0);
-    if(ram_offset!=0) {
-      host_tempreg_acquire();
-      emit_addimm(addr,ram_offset,HOST_TEMPREG);
-      addr=*addr_reg_override=HOST_TEMPREG;
-    }
+    if (ram_offset != 0)
+      *offset_reg = get_ro_reg(i_regs, 0);
   }
 
   return jaddr;
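
Note: emit_fastpath_cmp_jump() now reports the RAM offset register to its caller instead of rewriting the address: *offset_reg stays -1 when no offset is needed, and get_ro_reg() either returns the allocated ROREG or, when permitted, loads it into HOST_TEMPREG. The caller-side pattern, as used by load_assemble/store_assemble/c2ls_assemble below, is roughly:

    int offset_reg = -1, fastio_reg_override = -1;
    void *jaddr = emit_fastpath_cmp_jump(i, i_regs, addr,
                    &offset_reg, &fastio_reg_override);
    /* ... emit the access via do_load_word()/do_store_*() passing offset_reg ... */
    if (fastio_reg_override == HOST_TEMPREG || offset_reg == HOST_TEMPREG)
      host_tempreg_release();  /* only when ROREG had to be loaded into the temp */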
@@ -2645,9 +2640,10 @@ static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override)
 static void *get_direct_memhandler(void *table, u_int addr,
   enum stub_type type, uintptr_t *addr_host)
 {
+  uintptr_t msb = 1ull << (sizeof(uintptr_t)*8 - 1);
   uintptr_t l1, l2 = 0;
   l1 = ((uintptr_t *)table)[addr>>12];
-  if ((l1 & (1ul << (sizeof(l1)*8-1))) == 0) {
+  if (!(l1 & msb)) {
     uintptr_t v = l1 << 1;
     *addr_host = v + addr;
     return NULL;
@@ -2657,10 +2653,10 @@ static void *get_direct_memhandler(void *table, u_int addr,
     if (type == LOADB_STUB || type == LOADBU_STUB || type == STOREB_STUB)
       l2 = ((uintptr_t *)l1)[0x1000/4 + 0x1000/2 + (addr&0xfff)];
     else if (type == LOADH_STUB || type == LOADHU_STUB || type == STOREH_STUB)
-      l2=((uintptr_t *)l1)[0x1000/4 + (addr&0xfff)/2];
+      l2 = ((uintptr_t *)l1)[0x1000/4 + (addr&0xfff)/2];
     else
-      l2=((uintptr_t *)l1)[(addr&0xfff)/4];
-    if ((l2 & (1<<31)) == 0) {
+      l2 = ((uintptr_t *)l1)[(addr&0xfff)/4];
+    if (!(l2 & msb)) {
       uintptr_t v = l2 << 1;
       *addr_host = v + (addr&0xfff);
       return NULL;
@@ -2697,13 +2693,56 @@ static int reglist_find_free(u_int reglist)
   return __builtin_ctz(free_regs);
 }
 
+static void do_load_word(int a, int rt, int offset_reg)
+{
+  if (offset_reg >= 0)
+    emit_ldr_dualindexed(offset_reg, a, rt);
+  else
+    emit_readword_indexed(0, a, rt);
+}
+
+static void do_store_word(int a, int ofs, int rt, int offset_reg, int preserve_a)
+{
+  if (offset_reg < 0) {
+    emit_writeword_indexed(rt, ofs, a);
+    return;
+  }
+  if (ofs != 0)
+    emit_addimm(a, ofs, a);
+  emit_str_dualindexed(offset_reg, a, rt);
+  if (ofs != 0 && preserve_a)
+    emit_addimm(a, -ofs, a);
+}
+
+static void do_store_hword(int a, int ofs, int rt, int offset_reg, int preserve_a)
+{
+  if (offset_reg < 0) {
+    emit_writehword_indexed(rt, ofs, a);
+    return;
+  }
+  if (ofs != 0)
+    emit_addimm(a, ofs, a);
+  emit_strh_dualindexed(offset_reg, a, rt);
+  if (ofs != 0 && preserve_a)
+    emit_addimm(a, -ofs, a);
+}
+
+static void do_store_byte(int a, int rt, int offset_reg)
+{
+  if (offset_reg >= 0)
+    emit_strb_dualindexed(offset_reg, a, rt);
+  else
+    emit_writebyte_indexed(rt, 0, a);
+}
+
 static void load_assemble(int i, const struct regstat *i_regs)
 {
   int s,tl,addr;
   int offset;
   void *jaddr=0;
   int memtarget=0,c=0;
-  int fastio_reg_override=-1;
+  int offset_reg = -1;
+  int fastio_reg_override = -1;
   u_int reglist=get_host_reglist(i_regs->regmap);
   tl=get_reg(i_regs->regmap,dops[i].rt1);
   s=get_reg(i_regs->regmap,dops[i].rs1);
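
Note: do_store_word()/do_store_hword() take an ofs/preserve_a pair presumably because the dual-indexed store emitters accept only a register offset; when offset_reg is in use, the immediate displacement is folded into the base first and optionally undone afterwards. For example (arguments as used by storelr_assemble below):

    do_store_hword(temp, -1, tl, offset_reg, 0);  /* halfword at temp-1; temp may stay adjusted */
    do_store_hword(temp, -2, tl, offset_reg, 1);  /* halfword at temp-2; temp restored (preserve_a) */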
@@ -2740,96 +2779,110 @@ static void load_assemble(int i, const struct regstat *i_regs)
     if(dops[i].rs1!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
     #endif
     {
-      jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
+      jaddr = emit_fastpath_cmp_jump(i, i_regs, addr,
+                &offset_reg, &fastio_reg_override);
     }
   }
-  else if(ram_offset&&memtarget) {
-    host_tempreg_acquire();
-    emit_addimm(addr,ram_offset,HOST_TEMPREG);
-    fastio_reg_override=HOST_TEMPREG;
+  else if (ram_offset && memtarget) {
+    offset_reg = get_ro_reg(i_regs, 0);
   }
   int dummy=(dops[i].rt1==0)||(tl!=get_reg(i_regs->regmap,dops[i].rt1)); // ignore loads to r0 and unneeded reg
-  if (dops[i].opcode==0x20) { // LB
+  switch (dops[i].opcode) {
+  case 0x20: // LB
     if(!c||memtarget) {
       if(!dummy) {
-        {
-          int x=0,a=tl;
-          if(!c) a=addr;
-          if(fastio_reg_override>=0) a=fastio_reg_override;
+        int a = tl;
+        if (!c) a = addr;
+        if (fastio_reg_override >= 0)
+          a = fastio_reg_override;
 
-          emit_movsbl_indexed(x,a,tl);
-        }
+        if (offset_reg >= 0)
+          emit_ldrsb_dualindexed(offset_reg, a, tl);
+        else
+          emit_movsbl_indexed(0, a, tl);
       }
       if(jaddr)
         add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
     }
     else
       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj[i],reglist);
-  }
-  if (dops[i].opcode==0x21) { // LH
+    break;
+  case 0x21: // LH
     if(!c||memtarget) {
       if(!dummy) {
-        int x=0,a=tl;
-        if(!c) a=addr;
-        if(fastio_reg_override>=0) a=fastio_reg_override;
-        emit_movswl_indexed(x,a,tl);
+        int a = tl;
+        if (!c) a = addr;
+        if (fastio_reg_override >= 0)
+          a = fastio_reg_override;
+        if (offset_reg >= 0)
+          emit_ldrsh_dualindexed(offset_reg, a, tl);
+        else
+          emit_movswl_indexed(0, a, tl);
       }
       if(jaddr)
         add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
     }
     else
       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj[i],reglist);
-  }
-  if (dops[i].opcode==0x23) { // LW
+    break;
+  case 0x23: // LW
     if(!c||memtarget) {
       if(!dummy) {
-        int a=addr;
-        if(fastio_reg_override>=0) a=fastio_reg_override;
-        emit_readword_indexed(0,a,tl);
+        int a = addr;
+        if (fastio_reg_override >= 0)
+          a = fastio_reg_override;
+        do_load_word(a, tl, offset_reg);
       }
       if(jaddr)
         add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
     }
     else
       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj[i],reglist);
-  }
-  if (dops[i].opcode==0x24) { // LBU
+    break;
+  case 0x24: // LBU
     if(!c||memtarget) {
       if(!dummy) {
-        int x=0,a=tl;
-        if(!c) a=addr;
-        if(fastio_reg_override>=0) a=fastio_reg_override;
+        int a = tl;
+        if (!c) a = addr;
+        if (fastio_reg_override >= 0)
+          a = fastio_reg_override;
 
-        emit_movzbl_indexed(x,a,tl);
+        if (offset_reg >= 0)
+          emit_ldrb_dualindexed(offset_reg, a, tl);
+        else
+          emit_movzbl_indexed(0, a, tl);
       }
       if(jaddr)
         add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
     }
     else
       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj[i],reglist);
-  }
-  if (dops[i].opcode==0x25) { // LHU
+    break;
+  case 0x25: // LHU
     if(!c||memtarget) {
       if(!dummy) {
-        int x=0,a=tl;
-        if(!c) a=addr;
-        if(fastio_reg_override>=0) a=fastio_reg_override;
-        emit_movzwl_indexed(x,a,tl);
+        int a = tl;
+        if(!c) a = addr;
+        if (fastio_reg_override >= 0)
+          a = fastio_reg_override;
+        if (offset_reg >= 0)
+          emit_ldrh_dualindexed(offset_reg, a, tl);
+        else
+          emit_movzwl_indexed(0, a, tl);
       }
       if(jaddr)
         add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
     }
     else
       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj[i],reglist);
-  }
-  if (dops[i].opcode==0x27) { // LWU
-    assert(0);
-  }
-  if (dops[i].opcode==0x37) { // LD
+    break;
+  case 0x27: // LWU
+  case 0x37: // LD
+  default:
     assert(0);
   }
  }
- if (fastio_reg_override == HOST_TEMPREG)
+ if (fastio_reg_override == HOST_TEMPREG || offset_reg == HOST_TEMPREG)
    host_tempreg_release();
 }
 
@@ -2840,7 +2893,8 @@ static void loadlr_assemble(int i, const struct regstat *i_regs)
   int offset;
   void *jaddr=0;
   int memtarget=0,c=0;
-  int fastio_reg_override=-1;
+  int offset_reg = -1;
+  int fastio_reg_override = -1;
   u_int reglist=get_host_reglist(i_regs->regmap);
   tl=get_reg(i_regs->regmap,dops[i].rt1);
   s=get_reg(i_regs->regmap,dops[i].rs1);
@@ -2865,13 +2919,12 @@ static void loadlr_assemble(int i, const struct regstat *i_regs)
     }else{
       emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR
     }
-    jaddr=emit_fastpath_cmp_jump(i,temp2,&fastio_reg_override);
+    jaddr = emit_fastpath_cmp_jump(i, i_regs, temp2,
+              &offset_reg, &fastio_reg_override);
   }
   else {
-    if(ram_offset&&memtarget) {
-      host_tempreg_acquire();
-      emit_addimm(temp2,ram_offset,HOST_TEMPREG);
-      fastio_reg_override=HOST_TEMPREG;
+    if (ram_offset && memtarget) {
+      offset_reg = get_ro_reg(i_regs, 0);
     }
     if (dops[i].opcode==0x22||dops[i].opcode==0x26) {
       emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR
@@ -2881,10 +2934,12 @@ static void loadlr_assemble(int i, const struct regstat *i_regs)
   }
   if (dops[i].opcode==0x22||dops[i].opcode==0x26) { // LWL/LWR
     if(!c||memtarget) {
-      int a=temp2;
-      if(fastio_reg_override>=0) a=fastio_reg_override;
-      emit_readword_indexed(0,a,temp2);
-      if(fastio_reg_override==HOST_TEMPREG) host_tempreg_release();
+      int a = temp2;
+      if (fastio_reg_override >= 0)
+        a = fastio_reg_override;
+      do_load_word(a, temp2, offset_reg);
+      if (fastio_reg_override == HOST_TEMPREG || offset_reg == HOST_TEMPREG)
+        host_tempreg_release();
       if(jaddr) add_stub_r(LOADW_STUB,jaddr,out,i,temp2,i_regs,ccadj[i],reglist);
     }
     else
@@ -2914,16 +2969,17 @@ static void loadlr_assemble(int i, const struct regstat *i_regs)
 }
 #endif
 
-void store_assemble(int i, const struct regstat *i_regs)
+static void store_assemble(int i, const struct regstat *i_regs)
 {
   int s,tl;
   int addr,temp;
   int offset;
   void *jaddr=0;
-  enum stub_type type;
+  enum stub_type type=0;
   int memtarget=0,c=0;
   int agr=AGEN1+(i&1);
-  int fastio_reg_override=-1;
+  int offset_reg = -1;
+  int fastio_reg_override = -1;
   u_int reglist=get_host_reglist(i_regs->regmap);
   tl=get_reg(i_regs->regmap,dops[i].rs2);
   s=get_reg(i_regs->regmap,dops[i].rs1);
@@ -2941,46 +2997,49 @@ void store_assemble(int i, const struct regstat *i_regs)
   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
   if(offset||s<0||c) addr=temp;
   else addr=s;
-  if(!c) {
-    jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
+  if (!c) {
+    jaddr = emit_fastpath_cmp_jump(i, i_regs, addr,
+              &offset_reg, &fastio_reg_override);
   }
-  else if(ram_offset&&memtarget) {
-    host_tempreg_acquire();
-    emit_addimm(addr,ram_offset,HOST_TEMPREG);
-    fastio_reg_override=HOST_TEMPREG;
+  else if (ram_offset && memtarget) {
+    offset_reg = get_ro_reg(i_regs, 0);
   }
 
-  if (dops[i].opcode==0x28) { // SB
+  switch (dops[i].opcode) {
+  case 0x28: // SB
     if(!c||memtarget) {
-      int x=0,a=temp;
-      if(!c) a=addr;
-      if(fastio_reg_override>=0) a=fastio_reg_override;
-      emit_writebyte_indexed(tl,x,a);
-    }
-    type=STOREB_STUB;
-  }
-  if (dops[i].opcode==0x29) { // SH
+      int a = temp;
+      if (!c) a = addr;
+      if (fastio_reg_override >= 0)
+        a = fastio_reg_override;
+      do_store_byte(a, tl, offset_reg);
+    }
+    type = STOREB_STUB;
+    break;
+  case 0x29: // SH
     if(!c||memtarget) {
-      int x=0,a=temp;
-      if(!c) a=addr;
-      if(fastio_reg_override>=0) a=fastio_reg_override;
-      emit_writehword_indexed(tl,x,a);
-    }
-    type=STOREH_STUB;
-  }
-  if (dops[i].opcode==0x2B) { // SW
+      int a = temp;
+      if (!c) a = addr;
+      if (fastio_reg_override >= 0)
+        a = fastio_reg_override;
+      do_store_hword(a, 0, tl, offset_reg, 1);
+    }
+    type = STOREH_STUB;
+    break;
+  case 0x2B: // SW
     if(!c||memtarget) {
-      int a=addr;
-      if(fastio_reg_override>=0) a=fastio_reg_override;
-      emit_writeword_indexed(tl,0,a);
-    }
-    type=STOREW_STUB;
-  }
-  if (dops[i].opcode==0x3F) { // SD
+      int a = addr;
+      if (fastio_reg_override >= 0)
+        a = fastio_reg_override;
+      do_store_word(a, 0, tl, offset_reg, 1);
+    }
+    type = STOREW_STUB;
+    break;
+  case 0x3F: // SD
+  default:
     assert(0);
-    type=STORED_STUB;
   }
-  if(fastio_reg_override==HOST_TEMPREG)
+  if (fastio_reg_override == HOST_TEMPREG || offset_reg == HOST_TEMPREG)
     host_tempreg_release();
   if(jaddr) {
     // PCSX store handlers don't check invcode again
@@ -3041,10 +3100,11 @@ static void storelr_assemble(int i, const struct regstat *i_regs)
   int temp;
   int offset;
   void *jaddr=0;
-  void *case1, *case2, *case3;
+  void *case1, *case23, *case3;
   void *done0, *done1, *done2;
   int memtarget=0,c=0;
   int agr=AGEN1+(i&1);
+  int offset_reg = -1;
   u_int reglist=get_host_reglist(i_regs->regmap);
   tl=get_reg(i_regs->regmap,dops[i].rs2);
   s=get_reg(i_regs->regmap,dops[i].rs1);
@@ -3072,86 +3132,85 @@ static void storelr_assemble(int i, const struct regstat *i_regs)
       emit_jmp(0);
     }
   }
-  if(ram_offset)
-    emit_addimm_no_flags(ram_offset,temp);
+  if (ram_offset)
+    offset_reg = get_ro_reg(i_regs, 0);
 
   if (dops[i].opcode==0x2C||dops[i].opcode==0x2D) { // SDL/SDR
     assert(0);
   }
 
-  emit_xorimm(temp,3,temp);
   emit_testimm(temp,2);
-  case2=out;
+  case23=out;
   emit_jne(0);
   emit_testimm(temp,1);
   case1=out;
   emit_jne(0);
   // 0
-  if (dops[i].opcode==0x2A) { // SWL
-    emit_writeword_indexed(tl,0,temp);
+  if (dops[i].opcode == 0x2A) { // SWL
+    // Write msb into least significant byte
+    if (dops[i].rs2) emit_rorimm(tl, 24, tl);
+    do_store_byte(temp, tl, offset_reg);
+    if (dops[i].rs2) emit_rorimm(tl, 8, tl);
   }
-  else if (dops[i].opcode==0x2E) { // SWR
-    emit_writebyte_indexed(tl,3,temp);
+  else if (dops[i].opcode == 0x2E) { // SWR
+    // Write entire word
+    do_store_word(temp, 0, tl, offset_reg, 1);
   }
-  else
-    assert(0);
-  done0=out;
+  done0 = out;
   emit_jmp(0);
   // 1
   set_jump_target(case1, out);
-  if (dops[i].opcode==0x2A) { // SWL
-    // Write 3 msb into three least significant bytes
-    if(dops[i].rs2) emit_rorimm(tl,8,tl);
-    emit_writehword_indexed(tl,-1,temp);
-    if(dops[i].rs2) emit_rorimm(tl,16,tl);
-    emit_writebyte_indexed(tl,1,temp);
-    if(dops[i].rs2) emit_rorimm(tl,8,tl);
+  if (dops[i].opcode == 0x2A) { // SWL
+    // Write two msb into two least significant bytes
+    if (dops[i].rs2) emit_rorimm(tl, 16, tl);
+    do_store_hword(temp, -1, tl, offset_reg, 0);
+    if (dops[i].rs2) emit_rorimm(tl, 16, tl);
   }
-  else if (dops[i].opcode==0x2E) { // SWR
-    // Write two lsb into two most significant bytes
-    emit_writehword_indexed(tl,1,temp);
+  else if (dops[i].opcode == 0x2E) { // SWR
+    // Write 3 lsb into three most significant bytes
+    do_store_byte(temp, tl, offset_reg);
+    if (dops[i].rs2) emit_rorimm(tl, 8, tl);
+    do_store_hword(temp, 1, tl, offset_reg, 0);
+    if (dops[i].rs2) emit_rorimm(tl, 24, tl);
   }
   done1=out;
   emit_jmp(0);
-  // 2
-  set_jump_target(case2, out);
+  // 2,3
+  set_jump_target(case23, out);
   emit_testimm(temp,1);
-  case3=out;
+  case3 = out;
   emit_jne(0);
+  // 2
   if (dops[i].opcode==0x2A) { // SWL
-    // Write two msb into two least significant bytes
-    if(dops[i].rs2) emit_rorimm(tl,16,tl);
-    emit_writehword_indexed(tl,-2,temp);
-    if(dops[i].rs2) emit_rorimm(tl,16,tl);
+    // Write 3 msb into three least significant bytes
+    if (dops[i].rs2) emit_rorimm(tl, 8, tl);
+    do_store_hword(temp, -2, tl, offset_reg, 1);
+    if (dops[i].rs2) emit_rorimm(tl, 16, tl);
+    do_store_byte(temp, tl, offset_reg);
+    if (dops[i].rs2) emit_rorimm(tl, 8, tl);
   }
-  else if (dops[i].opcode==0x2E) { // SWR
-    // Write 3 lsb into three most significant bytes
-    emit_writebyte_indexed(tl,-1,temp);
-    if(dops[i].rs2) emit_rorimm(tl,8,tl);
-    emit_writehword_indexed(tl,0,temp);
-    if(dops[i].rs2) emit_rorimm(tl,24,tl);
+  else if (dops[i].opcode == 0x2E) { // SWR
+    // Write two lsb into two most significant bytes
+    do_store_hword(temp, 0, tl, offset_reg, 1);
   }
-  done2=out;
+  done2 = out;
   emit_jmp(0);
   // 3
   set_jump_target(case3, out);
-  if (dops[i].opcode==0x2A) { // SWL
-    // Write msb into least significant byte
-    if(dops[i].rs2) emit_rorimm(tl,24,tl);
-    emit_writebyte_indexed(tl,-3,temp);
-    if(dops[i].rs2) emit_rorimm(tl,8,tl);
+  if (dops[i].opcode == 0x2A) { // SWL
+    do_store_word(temp, -3, tl, offset_reg, 0);
   }
-  else if (dops[i].opcode==0x2E) { // SWR
-    // Write entire word
-    emit_writeword_indexed(tl,-3,temp);
+  else if (dops[i].opcode == 0x2E) { // SWR
+    do_store_byte(temp, tl, offset_reg);
   }
   set_jump_target(done0, out);
   set_jump_target(done1, out);
   set_jump_target(done2, out);
+  if (offset_reg == HOST_TEMPREG)
+    host_tempreg_release();
   if(!c||!memtarget)
     add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
   if(!(i_regs->waswritten&(1<<dops[i].rs1)) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
-    emit_addimm_no_flags(-ram_offset,temp);
     #if defined(HOST_IMM8)
     int ir=get_reg(i_regs->regmap,INVCP);
     assert(ir>=0);
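
Note: storelr_assemble no longer xors the address with 3 up front, so the cases are keyed directly on the low two address bits (the old case 0..3 bodies correspond to the new case 3..0), and every partial store goes through the offset-register-aware helpers. For reference, the little-endian SWL/SWR behaviour the four cases implement:

    /* addr&3 | SWL stores (from rt)             | SWR stores (from rt)
          0   | 1 byte:  rt>>24      at [addr]   | 4 bytes: rt            at [addr]
          1   | 2 bytes: rt>>16      at [addr-1] | 3 bytes: rt & 0xffffff at [addr]
          2   | 3 bytes: rt>>8       at [addr-2] | 2 bytes: rt & 0xffff   at [addr]
          3   | 4 bytes: rt          at [addr-3] | 1 byte:  rt & 0xff     at [addr] */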
@@ -3396,7 +3455,7 @@ static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u
   for (j = i + 1; j < slen; j++) {
     if (cop2_is_stalling_op(j, &other_gte_op_cycles))
       break;
-    if (is_jump(j)) {
+    if (dops[j].is_jump) {
       // check ds
       if (j + 1 < slen && cop2_is_stalling_op(j + 1, &other_gte_op_cycles))
         j++;
@@ -3454,7 +3513,7 @@ static void multdiv_prepare_stall(int i, const struct regstat *i_regs)
       break;
     if ((found = is_mflohi(j)))
       break;
-    if (is_jump(j)) {
+    if (dops[j].is_jump) {
       // check ds
       if (j + 1 < slen && (found = is_mflohi(j + 1)))
         j++;
@@ -3608,7 +3667,8 @@ static void c2ls_assemble(int i, const struct regstat *i_regs)
   void *jaddr2=NULL;
   enum stub_type type;
   int agr=AGEN1+(i&1);
-  int fastio_reg_override=-1;
+  int offset_reg = -1;
+  int fastio_reg_override = -1;
   u_int reglist=get_host_reglist(i_regs->regmap);
   u_int copr=(source[i]>>16)&0x1f;
   s=get_reg(i_regs->regmap,dops[i].rs1);
@@ -3648,28 +3708,35 @@ static void c2ls_assemble(int i, const struct regstat *i_regs)
   }
   else {
     if(!c) {
-      jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
-    }
-    else if(ram_offset&&memtarget) {
-      host_tempreg_acquire();
-      emit_addimm(ar,ram_offset,HOST_TEMPREG);
-      fastio_reg_override=HOST_TEMPREG;
-    }
-    if (dops[i].opcode==0x32) { // LWC2
-      int a=ar;
-      if(fastio_reg_override>=0) a=fastio_reg_override;
-      emit_readword_indexed(0,a,tl);
+      jaddr2 = emit_fastpath_cmp_jump(i, i_regs, ar,
+                &offset_reg, &fastio_reg_override);
+    }
+    else if (ram_offset && memtarget) {
+      offset_reg = get_ro_reg(i_regs, 0);
+    }
+    switch (dops[i].opcode) {
+    case 0x32: { // LWC2
+      int a = ar;
+      if (fastio_reg_override >= 0)
+        a = fastio_reg_override;
+      do_load_word(a, tl, offset_reg);
+      break;
     }
-    if (dops[i].opcode==0x3a) { // SWC2
+    case 0x3a: { // SWC2
       #ifdef DESTRUCTIVE_SHIFT
       if(!offset&&!c&&s>=0) emit_mov(s,ar);
       #endif
-      int a=ar;
-      if(fastio_reg_override>=0) a=fastio_reg_override;
-      emit_writeword_indexed(tl,0,a);
+      int a = ar;
+      if (fastio_reg_override >= 0)
+        a = fastio_reg_override;
+      do_store_word(a, 0, tl, offset_reg, 1);
+      break;
+    }
+    default:
+      assert(0);
     }
   }
-  if(fastio_reg_override==HOST_TEMPREG)
+  if (fastio_reg_override == HOST_TEMPREG || offset_reg == HOST_TEMPREG)
     host_tempreg_release();
   if(jaddr2)
     add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
@@ -4098,7 +4165,7 @@ static void loop_preload(signed char pre[],signed char entry[])
 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
 void address_generation(int i,struct regstat *i_regs,signed char entry[])
 {
-  if(dops[i].itype==LOAD||dops[i].itype==LOADLR||dops[i].itype==STORE||dops[i].itype==STORELR||dops[i].itype==C1LS||dops[i].itype==C2LS) {
+  if (dops[i].is_load || dops[i].is_store) {
     int ra=-1;
     int agr=AGEN1+(i&1);
     if(dops[i].itype==LOAD) {
@@ -4113,7 +4180,7 @@ void address_generation(int i,struct regstat *i_regs,signed char entry[])
       ra=get_reg(i_regs->regmap,agr);
       if(ra<0) ra=get_reg(i_regs->regmap,-1);
     }
-    if(dops[i].itype==C1LS||dops[i].itype==C2LS) {
+    if(dops[i].itype==C2LS) {
       if ((dops[i].opcode&0x3b)==0x31||(dops[i].opcode&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
         ra=get_reg(i_regs->regmap,FTEMP);
       else { // SWC1/SDC1/SWC2/SDC2
@@ -4167,7 +4234,7 @@ void address_generation(int i,struct regstat *i_regs,signed char entry[])
     }
   }
   // Preload constants for next instruction
-  if(dops[i+1].itype==LOAD||dops[i+1].itype==LOADLR||dops[i+1].itype==STORE||dops[i+1].itype==STORELR||dops[i+1].itype==C1LS||dops[i+1].itype==C2LS) {
+  if (dops[i+1].is_load || dops[i+1].is_store) {
     int agr,ra;
     // Actual address
     agr=AGEN1+((i+1)&1);
@@ -4210,12 +4277,12 @@ static int get_final_value(int hr, int i, int *value)
     i++;
   }
   if(i<slen-1) {
-    if(dops[i].itype==UJUMP||dops[i].itype==RJUMP||dops[i].itype==CJUMP||dops[i].itype==SJUMP) {
+    if (dops[i].is_jump) {
       *value=constmap[i][hr];
       return 1;
     }
     if(!dops[i+1].bt) {
-      if(dops[i+1].itype==UJUMP||dops[i+1].itype==RJUMP||dops[i+1].itype==CJUMP||dops[i+1].itype==SJUMP) {
+      if (dops[i+1].is_jump) {
         // Load in delay slot, out-of-order execution
         if(dops[i+2].itype==LOAD&&dops[i+2].rs1==reg&&dops[i+2].rt1==reg&&((regs[i+1].wasconst>>hr)&1))
         {
@@ -4525,7 +4592,7 @@ static int match_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
       }
     }
     // Delay slots are not valid branch targets
-    //if(t>0&&(dops[t-1].itype==RJUMP||dops[t-1].itype==UJUMP||dops[t-1].itype==CJUMP||dops[t-1].itype==SJUMP)) return 0;
+    //if(t>0&&dops[t-1].is_jump) return 0;
     // Delay slots require additional processing, so do not match
     if(dops[t].is_ds) return 0;
   }
@@ -4602,7 +4669,9 @@ static void ds_assemble_entry(int i)
     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty);
   load_regs(regs[t].regmap_entry,regs[t].regmap,dops[t].rs1,dops[t].rs2);
   address_generation(t,&regs[t],regs[t].regmap_entry);
-  if(dops[t].itype==STORE||dops[t].itype==STORELR||(dops[t].opcode&0x3b)==0x39||(dops[t].opcode&0x3b)==0x3a)
+  if (ram_offset && (dops[t].is_load || dops[t].is_store))
+    load_regs(regs[t].regmap_entry,regs[t].regmap,ROREG,ROREG);
+  if (dops[t].is_store)
     load_regs(regs[t].regmap_entry,regs[t].regmap,INVCP,INVCP);
   is_delayslot=0;
   switch(dops[t].itype) {
@@ -5338,9 +5407,6 @@ static void cjump_assemble(int i,struct regstat *i_regs)
   else
   {
     // In-order execution (branch first)
-    //if(dops[i].likely) printf("IOL\n");
-    //else
-    //printf("IOE\n");
     void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
     if(!unconditional&&!nop) {
       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
@@ -5384,6 +5450,8 @@ static void cjump_assemble(int i,struct regstat *i_regs)
       // load regs
       load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2);
       address_generation(i+1,&branch_regs[i],0);
+      if (ram_offset)
+        load_regs(regs[i].regmap,branch_regs[i].regmap,ROREG,ROREG);
       load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
       ds_assemble(i+1,&branch_regs[i]);
       cc=get_reg(branch_regs[i].regmap,CCREG);
@@ -5414,15 +5482,16 @@ static void cjump_assemble(int i,struct regstat *i_regs)
       if(nottaken1) set_jump_target(nottaken1, out);
       set_jump_target(nottaken, out);
       assem_debug("2:\n");
-      if(!dops[i].likely) {
-        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
-        load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2);
-        address_generation(i+1,&branch_regs[i],0);
-        load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
-        ds_assemble(i+1,&branch_regs[i]);
-      }
+      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
+      // load regs
+      load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2);
+      address_generation(i+1,&branch_regs[i],0);
+      if (ram_offset)
+        load_regs(regs[i].regmap,branch_regs[i].regmap,ROREG,ROREG);
+      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
+      ds_assemble(i+1,&branch_regs[i]);
       cc=get_reg(branch_regs[i].regmap,CCREG);
-      if(cc==-1&&!dops[i].likely) {
+      if (cc == -1) {
         // Cycle count isn't in a register, temporarily load it then write it out
         emit_loadreg(CCREG,HOST_CCREG);
         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
@@ -5437,7 +5506,7 @@ static void cjump_assemble(int i,struct regstat *i_regs)
         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
         void *jaddr=out;
         emit_jns(0);
-        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,dops[i].likely?NULLDS:NOTTAKEN,0);
+        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
       }
     }
   }
@@ -5652,6 +5721,8 @@ static void sjump_assemble(int i,struct regstat *i_regs)
       // load regs
       load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2);
       address_generation(i+1,&branch_regs[i],0);
+      if (ram_offset)
+        load_regs(regs[i].regmap,branch_regs[i].regmap,ROREG,ROREG);
       load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
       ds_assemble(i+1,&branch_regs[i]);
       cc=get_reg(branch_regs[i].regmap,CCREG);
@@ -5681,15 +5752,13 @@ static void sjump_assemble(int i,struct regstat *i_regs)
     if(!unconditional) {
       set_jump_target(nottaken, out);
       assem_debug("1:\n");
-      if(!dops[i].likely) {
-        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
-        load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2);
-        address_generation(i+1,&branch_regs[i],0);
-        load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
-        ds_assemble(i+1,&branch_regs[i]);
-      }
+      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
+      load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2);
+      address_generation(i+1,&branch_regs[i],0);
+      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
+      ds_assemble(i+1,&branch_regs[i]);
       cc=get_reg(branch_regs[i].regmap,CCREG);
-      if(cc==-1&&!dops[i].likely) {
+      if (cc == -1) {
         // Cycle count isn't in a register, temporarily load it then write it out
         emit_loadreg(CCREG,HOST_CCREG);
         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
@@ -5704,7 +5773,7 @@ static void sjump_assemble(int i,struct regstat *i_regs)
         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
         void *jaddr=out;
         emit_jns(0);
-        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,dops[i].likely?NULLDS:NOTTAKEN,0);
+        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
       }
     }
   }
@@ -5894,7 +5963,7 @@ static void pagespan_assemble(int i,struct regstat *i_regs)
 
   assert(i_regs->regmap[HOST_CCREG]==CCREG);
   wb_dirtys(regs[i].regmap,regs[i].dirty);
-  if(dops[i].likely||unconditional)
+  if(unconditional)
   {
     emit_movimm(ba[i],HOST_BTREG);
   }
@@ -5913,22 +5982,6 @@ static void pagespan_assemble(int i,struct regstat *i_regs)
     add_jump_out(target_addr,stub);
   }
   else set_jump_target(branch_addr, stub);
-  if(dops[i].likely) {
-    // Not-taken path
-    set_jump_target(nottaken, out);
-    wb_dirtys(regs[i].regmap,regs[i].dirty);
-    void *branch_addr=out;
-    emit_jmp(0);
-    int target_addr=start+i*4+8;
-    void *stub=out;
-    void *compiled_target_addr=check_addr(target_addr);
-    emit_extjump_ds(branch_addr, target_addr);
-    if(compiled_target_addr) {
-      set_jump_target(branch_addr, compiled_target_addr);
-      add_jump_out(target_addr,stub);
-    }
-    else set_jump_target(branch_addr, stub);
-  }
 }
 
 // Assemble the delay slot for the above
@@ -5948,7 +6001,9 @@ static void pagespan_ds()
     emit_writeword(HOST_BTREG,&branch_target);
   load_regs(regs[0].regmap_entry,regs[0].regmap,dops[0].rs1,dops[0].rs2);
   address_generation(0,&regs[0],regs[0].regmap_entry);
-  if(dops[0].itype==STORE||dops[0].itype==STORELR||(dops[0].opcode&0x3b)==0x39||(dops[0].opcode&0x3b)==0x3a)
+  if (ram_offset && (dops[0].is_load || dops[0].is_store))
+    load_regs(regs[0].regmap_entry,regs[0].regmap,ROREG,ROREG);
+  if (dops[0].is_store)
     load_regs(regs[0].regmap_entry,regs[0].regmap,INVCP,INVCP);
   is_delayslot=0;
   switch(dops[0].itype) {
@@ -6041,7 +6096,7 @@ void unneeded_registers(int istart,int iend,int r)
   for (i=iend;i>=istart;i--)
   {
     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
-    if(dops[i].itype==RJUMP||dops[i].itype==UJUMP||dops[i].itype==CJUMP||dops[i].itype==SJUMP)
+    if(dops[i].is_jump)
     {
       // If subroutine call, flag return address as a possible branch target
       if(dops[i].rt1==31 && i<slen-2) dops[i+2].bt=1;
@@ -6058,19 +6113,6 @@ void unneeded_registers(int istart,int iend,int r)
         u|=1;
         gte_u|=gte_rt[i+1];
         gte_u&=~gte_rs[i+1];
-        // If branch is "likely" (and conditional)
-        // then we skip the delay slot on the fall-thru path
-        if(dops[i].likely) {
-          if(i<slen-1) {
-            u&=unneeded_reg[i+2];
-            gte_u&=gte_unneeded[i+2];
-          }
-          else
-          {
-            u=1;
-            gte_u=gte_u_unknown;
-          }
-        }
       }
       else
       {
@@ -6078,7 +6120,7 @@ void unneeded_registers(int istart,int iend,int r)
         dops[(ba[i]-start)>>2].bt=1;
         if(ba[i]<=start+i*4) {
           // Backward branch
-          if(is_ujump(i))
+          if(dops[i].is_ujump)
           {
             // Unconditional branch
             temp_u=1;
@@ -6094,19 +6136,6 @@ void unneeded_registers(int istart,int iend,int r)
           temp_u|=1;
           temp_gte_u|=gte_rt[i+1];
           temp_gte_u&=~gte_rs[i+1];
-          // If branch is "likely" (and conditional)
-          // then we skip the delay slot on the fall-thru path
-          if(dops[i].likely) {
-            if(i<slen-1) {
-              temp_u&=unneeded_reg[i+2];
-              temp_gte_u&=gte_unneeded[i+2];
-            }
-            else
-            {
-              temp_u=1;
-              temp_gte_u=gte_u_unknown;
-            }
-          }
           temp_u|=(1LL<<dops[i].rt1)|(1LL<<dops[i].rt2);
           temp_u&=~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
           temp_u|=1;
@@ -6123,7 +6152,7 @@ void unneeded_registers(int istart,int iend,int r)
             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
           }
         } /*else*/ if(1) {
-          if (is_ujump(i))
+          if (dops[i].is_ujump)
           {
             // Unconditional branch
             u=unneeded_reg[(ba[i]-start)>>2];
@@ -6146,19 +6175,8 @@ void unneeded_registers(int istart,int iend,int r)
             b|=1;
             gte_b|=gte_rt[i+1];
             gte_b&=~gte_rs[i+1];
-            // If branch is "likely" then we skip the
-            // delay slot on the fall-thru path
-            if(dops[i].likely) {
-              u=b;
-              gte_u=gte_b;
-              if(i<slen-1) {
-                u&=unneeded_reg[i+2];
-                gte_u&=gte_unneeded[i+2];
-              }
-            } else {
-              u&=b;
-              gte_u&=gte_b;
-            }
+            u&=b;
+            gte_u&=gte_b;
             if(i<slen-1) {
               branch_unneeded_reg[i]&=unneeded_reg[i+2];
             } else {
@@ -6228,12 +6246,12 @@ void clean_registers(int istart,int iend,int wr)
   }
   for (i=iend;i>=istart;i--)
   {
-    if(dops[i].itype==RJUMP||dops[i].itype==UJUMP||dops[i].itype==CJUMP||dops[i].itype==SJUMP)
+    if(dops[i].is_jump)
     {
       if(ba[i]<start || ba[i]>=(start+slen*4))
       {
         // Branch out of this block, flush all regs
-        if (is_ujump(i))
+        if (dops[i].is_ujump)
         {
           // Unconditional branch
           will_dirty_i=0;
@@ -6266,7 +6284,7 @@ void clean_registers(int istart,int iend,int wr)
           // Merge in delay slot (will dirty)
           for(r=0;r<HOST_REGS;r++) {
             if(r!=EXCLUDE_REG) {
-              if(!dops[i].likely) {
+              if (1) { // !dops[i].likely) {
                 // Might not dirty if likely branch is not taken
                 if((branch_regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
                 if((branch_regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
@@ -6313,7 +6331,7 @@ void clean_registers(int istart,int iend,int wr)
         // Internal branch
         if(ba[i]<=start+i*4) {
           // Backward branch
-          if (is_ujump(i))
+          if (dops[i].is_ujump)
           {
             // Unconditional branch
             temp_will_dirty=0;
@@ -6344,7 +6362,7 @@ void clean_registers(int istart,int iend,int wr)
             // Merge in delay slot (will dirty)
             for(r=0;r<HOST_REGS;r++) {
               if(r!=EXCLUDE_REG) {
-                if(!dops[i].likely) {
+                if (1) { // !dops[i].likely) {
                   // Will not dirty if likely branch is not taken
                   if((branch_regs[i].regmap[r]&63)==dops[i].rt1) temp_will_dirty|=1<<r;
                   if((branch_regs[i].regmap[r]&63)==dops[i].rt2) temp_will_dirty|=1<<r;
@@ -6410,7 +6428,7 @@ void clean_registers(int istart,int iend,int wr)
         }
         /*else*/ if(1)
         {
-          if (is_ujump(i))
+          if (dops[i].is_ujump)
           {
             // Unconditional branch
             will_dirty_i=0;
@@ -6470,7 +6488,7 @@ void clean_registers(int istart,int iend,int wr)
             // Merge in delay slot
             for(r=0;r<HOST_REGS;r++) {
               if(r!=EXCLUDE_REG) {
-                if(!dops[i].likely) {
+                if (1) { // !dops[i].likely) {
                   // Might not dirty if likely branch is not taken
                   if((branch_regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
                   if((branch_regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
@@ -6539,7 +6557,7 @@ void clean_registers(int istart,int iend,int wr)
         if((regs[i].regmap[r]&63)==dops[i].rt2) wont_dirty_i|=1<<r;
         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
         if(i>istart) {
-          if(dops[i].itype!=RJUMP&&dops[i].itype!=UJUMP&&dops[i].itype!=CJUMP&&dops[i].itype!=SJUMP)
+          if (!dops[i].is_jump)
           {
             // Don't store a register immediately after writing it,
             // may prevent dual-issue.
@@ -6557,9 +6575,9 @@ void clean_registers(int istart,int iend,int wr)
         regs[i].dirty|=will_dirty_i;
         #ifndef DESTRUCTIVE_WRITEBACK
         regs[i].dirty&=wont_dirty_i;
-        if(dops[i].itype==RJUMP||dops[i].itype==UJUMP||dops[i].itype==CJUMP||dops[i].itype==SJUMP)
+        if(dops[i].is_jump)
         {
-          if (i < iend-1 && !is_ujump(i)) {
+          if (i < iend-1 && !dops[i].is_ujump) {
             for(r=0;r<HOST_REGS;r++) {
               if(r!=EXCLUDE_REG) {
                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
@@ -6771,6 +6789,7 @@ void new_dynarec_clear_full(void)
   literalcount=0;
   stop_after_jal=0;
   inv_code_start=inv_code_end=~0;
+  f1_hack=0;
   // TLB
   for(n=0;n<4096;n++) ll_clear(jump_in+n);
   for(n=0;n<4096;n++) ll_clear(jump_out+n);
@@ -6823,9 +6842,7 @@ void new_dynarec_init(void)
 #endif
   arch_init();
   new_dynarec_test();
-#ifndef RAM_FIXED
   ram_offset=(uintptr_t)rdram-0x80000000;
-#endif
   if (ram_offset!=0)
     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
 }
@@ -7016,6 +7033,27 @@ int new_recompile_block(u_int addr)
     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
     return 0;
   }
+  else if (f1_hack == ~0u || (f1_hack != 0 && start == f1_hack)) {
+    void *beginning = start_block();
+    u_int page = get_page(start);
+    emit_readword(&psxRegs.GPR.n.sp, 0);
+    emit_readptr(&mem_rtab, 1);
+    emit_shrimm(0, 12, 2);
+    emit_readptr_dualindexedx_ptrlen(1, 2, 1);
+    emit_addimm(0, 0x18, 0);
+    emit_adds_ptr(1, 1, 1);
+    emit_ldr_dualindexed(1, 0, 0);
+    emit_writeword(0, &psxRegs.GPR.r[26]); // lw k0, 0x18(sp)
+    emit_far_call(get_addr_ht);
+    emit_jmpreg(0); // jr k0
+    literal_pool(0);
+    end_block(beginning);
+
+    ll_add_flags(jump_in + page, start, state_rflags, beginning);
+    SysPrintf("F1 hack to   %08x\n", start);
+    f1_hack = start;
+    return 0;
+  }
 
   source = get_source_start(start, &pagelimit);
   if (source == NULL) {
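
Note: when the captured address is recompiled, the stub above performs the equivalent of lw k0, 0x18(sp); jr k0 (as the inline comments note), with the load resolved through mem_rtab and the jump dispatched via get_addr_ht. Roughly, and assuming mem_rtab entries hold the host pointer shifted right by one as in get_direct_memhandler:

    u_int sp = psxRegs.GPR.n.sp;
    u_int k0 = *(u_int *)((((uintptr_t *)mem_rtab)[sp >> 12] << 1) + sp + 0x18);
    psxRegs.GPR.r[26] = k0;                  /* lw k0, 0x18(sp) */
    /* ...then tail-jump to get_addr_ht(k0), i.e. "jr k0" through the translation cache */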
@@ -7044,7 +7082,6 @@ int new_recompile_block(u_int addr)
 
   for(i=0;!done;i++) {
     dops[i].bt=0;
-    dops[i].likely=0;
     dops[i].ooo=0;
     op2=0;
     minimum_free_regs[i]=0;
@@ -7302,7 +7339,6 @@ int new_recompile_block(u_int addr)
         if(op&2) { // BGTZ/BLEZ
           dops[i].rs2=0;
         }
-        dops[i].likely=(op>>4)?1:0;
         break;
       case SJUMP:
         dops[i].rs1=(source[i]>>21)&0x1f;
@@ -7313,7 +7349,6 @@ int new_recompile_block(u_int addr)
           dops[i].rt1=31;
           // NOTE: If the branch is not taken, r31 is still overwritten
         }
-        dops[i].likely=(op2&2)?1:0;
         break;
       case ALU:
         dops[i].rs1=(source[i]>>21)&0x1f; // source
@@ -7457,11 +7492,16 @@ int new_recompile_block(u_int addr)
     else if (type == SJUMP && dops[i].rs1 == 0 && (op2 & 1))
       dops[i].itype = type = UJUMP;
 
+    dops[i].is_jump = (dops[i].itype == RJUMP || dops[i].itype == UJUMP || dops[i].itype == CJUMP || dops[i].itype == SJUMP);
+    dops[i].is_ujump = (dops[i].itype == RJUMP || dops[i].itype == UJUMP); // || (source[i] >> 16) == 0x1000 // beq r0,r0
+    dops[i].is_load = (dops[i].itype == LOAD || dops[i].itype == LOADLR || op == 0x32); // LWC2
+    dops[i].is_store = (dops[i].itype == STORE || dops[i].itype == STORELR || op == 0x3a); // SWC2
+
     /* messy cases to just pass over to the interpreter */
-    if (i > 0 && is_jump(i-1)) {
+    if (i > 0 && dops[i-1].is_jump) {
       int do_in_intrp=0;
       // branch in delay slot?
-      if (is_jump(i)) {
+      if (dops[i].is_jump) {
         // don't handle first branch and call interpreter if it's hit
         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
         do_in_intrp=1;
@@ -7476,7 +7516,7 @@ int new_recompile_block(u_int addr)
           dops[t+1].bt=1; // expected return from interpreter
         }
         else if(i>=2&&dops[i-2].rt1==2&&dops[i].rt1==2&&dops[i].rs1!=2&&dops[i].rs2!=2&&dops[i-1].rs1!=2&&dops[i-1].rs2!=2&&
-              !(i>=3&&is_jump(i-3))) {
+              !(i>=3&&dops[i-3].is_jump)) {
           // v0 overwrite like this is a sign of trouble, bail out
           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
           do_in_intrp=1;
@@ -7493,7 +7533,7 @@ int new_recompile_block(u_int addr)
     }
 
     /* Is this the end of the block? */
-    if (i > 0 && is_ujump(i-1)) {
+    if (i > 0 && dops[i-1].is_ujump) {
       if(dops[i-1].rt1==0) { // Continue past subroutine call (JAL)
         done=2;
       }
@@ -7529,13 +7569,31 @@ int new_recompile_block(u_int addr)
     }
   }
   slen=i;
-  if(dops[i-1].itype==UJUMP||dops[i-1].itype==CJUMP||dops[i-1].itype==SJUMP||dops[i-1].itype==RJUMP) {
+  if (dops[i-1].is_jump) {
     if(start+i*4==pagelimit) {
       dops[i-1].itype=SPAN;
     }
   }
   assert(slen>0);
 
+  /* special hack(s) */
+  if (i > 10 && source[i-1] == 0 && source[i-2] == 0x03e00008
+      && source[i-4] == 0x8fbf0018 && source[i-6] == 0x00c0f809
+      && dops[i-7].itype == STORE)
+  {
+    i = i-8;
+    if (dops[i].itype == IMM16)
+      i--;
+    // swl r2, 15(r6); swr r2, 12(r6); sw r6, *; jalr r6
+    if (dops[i].itype == STORELR && dops[i].rs1 == 6
+      && dops[i-1].itype == STORELR && dops[i-1].rs1 == 6)
+    {
+      SysPrintf("F1 hack from %08x\n", start);
+      if (f1_hack == 0)
+        f1_hack = ~0u;
+    }
+  }
+
   /* Pass 2 - Register dependencies and branch targets */
 
   unneeded_registers(0,slen-1,0);
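
Note: for reference, the raw opcode words matched by the detection block above decode (standard MIPS I encodings) as:

    /* source[i-2] == 0x03e00008   jr   ra
       source[i-4] == 0x8fbf0018   lw   ra, 0x18(sp)
       source[i-6] == 0x00c0f809   jalr ra, a2   (a2 == r6, matching the "jalr r6" in the comment) */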
@@ -7582,7 +7640,7 @@ int new_recompile_block(u_int addr)
     regs[i].wasconst=current.isconst;
     regs[i].wasdirty=current.dirty;
     regs[i].loadedconst=0;
-    if(dops[i].itype!=UJUMP&&dops[i].itype!=CJUMP&&dops[i].itype!=SJUMP&&dops[i].itype!=RJUMP) {
+    if (!dops[i].is_jump) {
       if(i+1<slen) {
         current.u=unneeded_reg[i+1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
         current.u|=1;
@@ -8106,7 +8164,7 @@ int new_recompile_block(u_int addr)
           break;
       }
 
-      if (is_ujump(i-1))
+      if (dops[i-1].is_ujump)
       {
         if(dops[i-1].rt1==31) // JAL/JALR
         {
@@ -8148,7 +8206,7 @@ int new_recompile_block(u_int addr)
 
     // Count cycles in between branches
     ccadj[i]=cc;
-    if(i>0&&(dops[i-1].itype==RJUMP||dops[i-1].itype==UJUMP||dops[i-1].itype==CJUMP||dops[i-1].itype==SJUMP||dops[i].itype==SYSCALL||dops[i].itype==HLECALL))
+    if (i > 0 && (dops[i-1].is_jump || dops[i].itype == SYSCALL || dops[i].itype == HLECALL))
     {
       cc=0;
     }
@@ -8198,7 +8256,7 @@ int new_recompile_block(u_int addr)
   for (i=slen-1;i>=0;i--)
   {
     int hr;
-    if(dops[i].itype==RJUMP||dops[i].itype==UJUMP||dops[i].itype==CJUMP||dops[i].itype==SJUMP)
+    if(dops[i].is_jump)
     {
       if(ba[i]<start || ba[i]>=(start+slen*4))
       {
@@ -8219,7 +8277,7 @@ int new_recompile_block(u_int addr)
         }
       }
       // Conditional branch may need registers for following instructions
-      if (!is_ujump(i))
+      if (!dops[i].is_ujump)
       {
         if(i<slen-2) {
           nr|=needed_reg[i+2];
@@ -8236,17 +8294,17 @@ int new_recompile_block(u_int addr)
       // Merge in delay slot
       for(hr=0;hr<HOST_REGS;hr++)
       {
-        if(!dops[i].likely) {
-          // These are overwritten unless the branch is "likely"
-          // and the delay slot is nullified if not taken
-          if(dops[i+1].rt1&&dops[i+1].rt1==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
-          if(dops[i+1].rt2&&dops[i+1].rt2==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
-        }
+        if(dops[i+1].rt1&&dops[i+1].rt1==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
+        if(dops[i+1].rt2&&dops[i+1].rt2==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
         if(dops[i+1].rs1==regmap_pre[i][hr]) nr|=1<<hr;
         if(dops[i+1].rs2==regmap_pre[i][hr]) nr|=1<<hr;
         if(dops[i+1].rs1==regs[i].regmap_entry[hr]) nr|=1<<hr;
         if(dops[i+1].rs2==regs[i].regmap_entry[hr]) nr|=1<<hr;
-        if(dops[i+1].itype==STORE || dops[i+1].itype==STORELR || (dops[i+1].opcode&0x3b)==0x39 || (dops[i+1].opcode&0x3b)==0x3a) {
+        if(ram_offset && (dops[i+1].is_load || dops[i+1].is_store)) {
+          if(regmap_pre[i][hr]==ROREG) nr|=1<<hr;
+          if(regs[i].regmap_entry[hr]==ROREG) nr|=1<<hr;
+        }
+        if(dops[i+1].is_store) {
           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
         }
@@ -8284,7 +8342,11 @@ int new_recompile_block(u_int addr)
       if(dops[i].rs2==regmap_pre[i][hr]) nr|=1<<hr;
       if(dops[i].rs1==regs[i].regmap_entry[hr]) nr|=1<<hr;
       if(dops[i].rs2==regs[i].regmap_entry[hr]) nr|=1<<hr;
-      if(dops[i].itype==STORE || dops[i].itype==STORELR || (dops[i].opcode&0x3b)==0x39 || (dops[i].opcode&0x3b)==0x3a) {
+      if(ram_offset && (dops[i].is_load || dops[i].is_store)) {
+        if(regmap_pre[i][hr]==ROREG) nr|=1<<hr;
+        if(regs[i].regmap_entry[hr]==ROREG) nr|=1<<hr;
+      }
+      if(dops[i].is_store) {
         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
       }
@@ -8316,32 +8378,15 @@ int new_recompile_block(u_int addr)
     {
       if(!((nr>>hr)&1)) {
         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
-        if((regs[i].regmap[hr]&63)!=dops[i].rs1 && (regs[i].regmap[hr]&63)!=dops[i].rs2 &&
-           (regs[i].regmap[hr]&63)!=dops[i].rt1 && (regs[i].regmap[hr]&63)!=dops[i].rt2 &&
-           (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
-        {
-          if (!is_ujump(i))
-          {
-            if(dops[i].likely) {
-              regs[i].regmap[hr]=-1;
-              regs[i].isconst&=~(1<<hr);
-              if(i<slen-2) {
-                regmap_pre[i+2][hr]=-1;
-                regs[i+2].wasconst&=~(1<<hr);
-              }
-            }
-          }
-        }
-        if(dops[i].itype==RJUMP||dops[i].itype==UJUMP||dops[i].itype==CJUMP||dops[i].itype==SJUMP)
+        if(dops[i].is_jump)
         {
-          int map=0,temp=0;
-          if(dops[i+1].itype==STORE || dops[i+1].itype==STORELR ||
-             (dops[i+1].opcode&0x3b)==0x39 || (dops[i+1].opcode&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
-            map=INVCP;
-          }
-          if(dops[i+1].itype==LOADLR || dops[i+1].itype==STORELR ||
-             dops[i+1].itype==C1LS || dops[i+1].itype==C2LS)
-            temp=FTEMP;
+          int map1 = 0, map2 = 0, temp = 0; // or -1 ??
+          if (dops[i+1].is_load || dops[i+1].is_store)
+            map1 = ROREG;
+          if (dops[i+1].is_store)
+            map2 = INVCP;
+          if(dops[i+1].itype==LOADLR || dops[i+1].itype==STORELR || dops[i+1].itype==C2LS)
+            temp = FTEMP;
           if((regs[i].regmap[hr]&63)!=dops[i].rs1 && (regs[i].regmap[hr]&63)!=dops[i].rs2 &&
              (regs[i].regmap[hr]&63)!=dops[i].rt1 && (regs[i].regmap[hr]&63)!=dops[i].rt2 &&
              (regs[i].regmap[hr]&63)!=dops[i+1].rt1 && (regs[i].regmap[hr]&63)!=dops[i+1].rt2 &&
@@ -8349,7 +8394,7 @@ int new_recompile_block(u_int addr)
              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
-             regs[i].regmap[hr]!=map )
+             regs[i].regmap[hr]!=map1 && regs[i].regmap[hr]!=map2)
           {
             regs[i].regmap[hr]=-1;
             regs[i].isconst&=~(1<<hr);
@@ -8360,13 +8405,13 @@ int new_recompile_block(u_int addr)
                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
-               branch_regs[i].regmap[hr]!=map)
+               branch_regs[i].regmap[hr]!=map1 && branch_regs[i].regmap[hr]!=map2)
             {
               branch_regs[i].regmap[hr]=-1;
               branch_regs[i].regmap_entry[hr]=-1;
-              if (!is_ujump(i))
+              if (!dops[i].is_ujump)
               {
-                if(!dops[i].likely&&i<slen-2) {
+                if (i < slen-2) {
                   regmap_pre[i+2][hr]=-1;
                   regs[i+2].wasconst&=~(1<<hr);
                 }
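
A note on the branch path above: the delay slot at i+1 is assembled together with the branch and still runs under branch_regs[i], so the next instruction with an independent entry map is i+2; that is why regmap_pre[i+2] is the one cleared, and after an unconditional jump nothing falls through to i+2, which the !dops[i].is_ujump guard reflects. The map1/map2/temp selection can be read as the hypothetical helper below (the real code keeps plain locals; ds stands for the decoded delay-slot op dops[i+1]):

    /* not in the patch -- a condensed reading of the selection above */
    static void delay_slot_special_regs(const struct decoded_insn *ds,
                                        int *map1, int *map2, int *temp)
    {
      *map1 = (ds->is_load || ds->is_store) ? ROREG : 0; /* RAM offset base   */
      *map2 = ds->is_store ? INVCP : 0;                  /* invalid_code ptr  */
      *temp = (ds->itype == LOADLR || ds->itype == STORELR || ds->itype == C2LS)
            ? FTEMP : 0;                                 /* unaligned/GTE temp */
    }
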
@@ -8379,17 +8424,16 @@ int new_recompile_block(u_int addr)
           // Non-branch
           if(i>0)
           {
-            int map=-1,temp=-1;
-            if(dops[i].itype==STORE || dops[i].itype==STORELR ||
-                      (dops[i].opcode&0x3b)==0x39 || (dops[i].opcode&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
-              map=INVCP;
-            }
-            if(dops[i].itype==LOADLR || dops[i].itype==STORELR ||
-               dops[i].itype==C1LS || dops[i].itype==C2LS)
-              temp=FTEMP;
+            int map1 = -1, map2 = -1, temp=-1;
+            if (dops[i].is_load || dops[i].is_store)
+              map1 = ROREG;
+            if (dops[i].is_store)
+              map2 = INVCP;
+            if (dops[i].itype==LOADLR || dops[i].itype==STORELR || dops[i].itype==C2LS)
+              temp = FTEMP;
             if((regs[i].regmap[hr]&63)!=dops[i].rt1 && (regs[i].regmap[hr]&63)!=dops[i].rt2 &&
                regs[i].regmap[hr]!=dops[i].rs1 && regs[i].regmap[hr]!=dops[i].rs2 &&
-               (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
+               (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map1 && regs[i].regmap[hr]!=map2 &&
                (dops[i].itype!=SPAN||regs[i].regmap[hr]!=CCREG))
             {
               if(i<slen-1&&!dops[i].is_ds) {
@@ -8433,7 +8477,7 @@ int new_recompile_block(u_int addr)
       ||dops[i+1].itype==COP2||dops[i+1].itype==C2LS||dops[i+1].itype==C2OP)
       {
         int t=(ba[i]-start)>>2;
-        if(t>0&&(dops[t-1].itype!=UJUMP&&dops[t-1].itype!=RJUMP&&dops[t-1].itype!=CJUMP&&dops[t-1].itype!=SJUMP)) // loop_preload can't handle jumps into delay slots
+        if(t > 0 && !dops[t-1].is_jump) // loop_preload can't handle jumps into delay slots
         if(t<2||(dops[t-2].itype!=UJUMP&&dops[t-2].itype!=RJUMP)||dops[t-2].rt1!=31) // call/ret assumes no registers allocated
         for(hr=0;hr<HOST_REGS;hr++)
         {
@@ -8506,7 +8550,7 @@ int new_recompile_block(u_int addr)
                         //printf("no-match due to different register\n");
                         break;
                       }
-                      if(dops[k-2].itype==UJUMP||dops[k-2].itype==RJUMP||dops[k-2].itype==CJUMP||dops[k-2].itype==SJUMP) {
+                      if (dops[k-2].is_jump) {
                         //printf("no-match due to branch\n");
                         break;
                       }
@@ -8555,7 +8599,7 @@ int new_recompile_block(u_int addr)
                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
                       branch_regs[i].wasconst&=~(1<<hr);
                       branch_regs[i].isconst&=~(1<<hr);
-                      if (!is_ujump(i)) {
+                      if (!dops[i].is_ujump) {
                         regmap_pre[i+2][hr]=f_regmap[hr];
                         regs[i+2].wasdirty&=~(1<<hr);
                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
@@ -8570,13 +8614,13 @@ int new_recompile_block(u_int addr)
                     regs[k].dirty&=~(1<<hr);
                     regs[k].wasconst&=~(1<<hr);
                     regs[k].isconst&=~(1<<hr);
-                    if(dops[k].itype==UJUMP||dops[k].itype==RJUMP||dops[k].itype==CJUMP||dops[k].itype==SJUMP) {
+                    if (dops[k].is_jump) {
                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
                       branch_regs[k].regmap[hr]=f_regmap[hr];
                       branch_regs[k].dirty&=~(1<<hr);
                       branch_regs[k].wasconst&=~(1<<hr);
                       branch_regs[k].isconst&=~(1<<hr);
-                      if (!is_ujump(k)) {
+                      if (!dops[k].is_ujump) {
                         regmap_pre[k+2][hr]=f_regmap[hr];
                         regs[k+2].wasdirty&=~(1<<hr);
                       }
@@ -8598,7 +8642,7 @@ int new_recompile_block(u_int addr)
                   //printf("no-match due to different register\n");
                   break;
                 }
-                if (is_ujump(j))
+                if (dops[j].is_ujump)
                 {
                   // Stop on unconditional branch
                   break;
@@ -8714,7 +8758,7 @@ int new_recompile_block(u_int addr)
   // to use, which can avoid a load-use penalty on certain CPUs.
   for(i=0;i<slen-1;i++)
   {
-    if(!i||(dops[i-1].itype!=UJUMP&&dops[i-1].itype!=CJUMP&&dops[i-1].itype!=SJUMP&&dops[i-1].itype!=RJUMP))
+    if (!i || !dops[i-1].is_jump)
     {
       if(!dops[i+1].bt)
       {
@@ -8994,7 +9038,7 @@ int new_recompile_block(u_int addr)
       #endif
       printf("\n");
     }
-    if(dops[i].itype==RJUMP||dops[i].itype==UJUMP||dops[i].itype==CJUMP||dops[i].itype==SJUMP) {
+    if(dops[i].is_jump) {
       #if defined(__i386__) || defined(__x86_64__)
       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
       if(branch_regs[i].dirty&1) printf("eax ");
@@ -9064,18 +9108,18 @@ int new_recompile_block(u_int addr)
     } else {
       speculate_register_values(i);
       #ifndef DESTRUCTIVE_WRITEBACK
-      if (i < 2 || !is_ujump(i-2))
+      if (i < 2 || !dops[i-2].is_ujump)
       {
         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,unneeded_reg[i]);
       }
-      if((dops[i].itype==CJUMP||dops[i].itype==SJUMP)&&!dops[i].likely) {
+      if (dops[i].itype==CJUMP || dops[i].itype==SJUMP) {
         dirty_pre=branch_regs[i].dirty;
       }else{
         dirty_pre=regs[i].dirty;
       }
       #endif
       // write back
-      if (i < 2 || !is_ujump(i-2))
+      if (i < 2 || !dops[i-2].is_ujump)
       {
         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,unneeded_reg[i]);
         loop_preload(regmap_pre[i],regs[i].regmap_entry);
@@ -9091,14 +9135,16 @@ int new_recompile_block(u_int addr)
       load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i].rs1,dops[i].rs2);
       address_generation(i,&regs[i],regs[i].regmap_entry);
       load_consts(regmap_pre[i],regs[i].regmap,i);
-      if(dops[i].itype==RJUMP||dops[i].itype==UJUMP||dops[i].itype==CJUMP||dops[i].itype==SJUMP)
+      if(dops[i].is_jump)
       {
         // Load the delay slot registers if necessary
         if(dops[i+1].rs1!=dops[i].rs1&&dops[i+1].rs1!=dops[i].rs2&&(dops[i+1].rs1!=dops[i].rt1||dops[i].rt1==0))
           load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i+1].rs1,dops[i+1].rs1);
         if(dops[i+1].rs2!=dops[i+1].rs1&&dops[i+1].rs2!=dops[i].rs1&&dops[i+1].rs2!=dops[i].rs2&&(dops[i+1].rs2!=dops[i].rt1||dops[i].rt1==0))
           load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i+1].rs2,dops[i+1].rs2);
-        if(dops[i+1].itype==STORE||dops[i+1].itype==STORELR||(dops[i+1].opcode&0x3b)==0x39||(dops[i+1].opcode&0x3b)==0x3a)
+        if (ram_offset && (dops[i+1].is_load || dops[i+1].is_store))
+          load_regs(regs[i].regmap_entry,regs[i].regmap,ROREG,ROREG);
+        if (dops[i+1].is_store)
           load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
       }
       else if(i+1<slen)
@@ -9112,9 +9158,11 @@ int new_recompile_block(u_int addr)
             load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i+1].rs2,dops[i+1].rs2);
       }
       // TODO: if(is_ooo(i)) address_generation(i+1);
-      if(dops[i].itype==CJUMP)
+      if (dops[i].itype == CJUMP)
         load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
-      if(dops[i].itype==STORE||dops[i].itype==STORELR||(dops[i].opcode&0x3b)==0x39||(dops[i].opcode&0x3b)==0x3a)
+      if (ram_offset && (dops[i].is_load || dops[i].is_store))
+        load_regs(regs[i].regmap_entry,regs[i].regmap,ROREG,ROREG);
+      if (dops[i].is_store)
         load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
       // assemble
       switch(dops[i].itype) {
@@ -9169,7 +9217,7 @@ int new_recompile_block(u_int addr)
         case SPAN:
           pagespan_assemble(i,&regs[i]);break;
       }
-      if (is_ujump(i))
+      if (dops[i].is_ujump)
         literal_pool(1024);
       else
         literal_pool_jumpover(256);
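
The ROREG/INVCP preloading added above follows the same pattern in the branch (delay-slot) and non-branch paths; a hypothetical wrapper makes the intent explicit. load_regs, ram_offset, ROREG and INVCP are the dynarec's own names; the wrapper itself is not in the patch and is only a sketch.

    /* sketch only: condenses the repeated preload pattern above */
    static void preload_memop_bases(signed char entry[], signed char regmap[],
                                    int is_load, int is_store)
    {
      if (ram_offset && (is_load || is_store))
        load_regs(entry, regmap, ROREG, ROREG);  /* base when rdram is not at its default address */
      if (is_store)
        load_regs(entry, regmap, INVCP, INVCP);  /* invalid_code pointer for write-invalidate checks */
    }

    /* the non-branch call site would be:
       preload_memop_bases(regs[i].regmap_entry, regs[i].regmap,
                           dops[i].is_load, dops[i].is_store); */
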
@@ -9183,8 +9231,8 @@ int new_recompile_block(u_int addr)
   // If the block did not end with an unconditional branch,
   // add a jump to the next instruction.
   else if (i > 1) {
-    if(!is_ujump(i-2)&&dops[i-1].itype!=SPAN) {
-      assert(dops[i-1].itype!=UJUMP&&dops[i-1].itype!=CJUMP&&dops[i-1].itype!=SJUMP&&dops[i-1].itype!=RJUMP);
+    if (!dops[i-2].is_ujump && dops[i-1].itype != SPAN) {
+      assert(!dops[i-1].is_jump);
       assert(i==slen);
       if(dops[i-2].itype!=CJUMP&&dops[i-2].itype!=SJUMP) {
         store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
@@ -9192,16 +9240,11 @@ int new_recompile_block(u_int addr)
           emit_loadreg(CCREG,HOST_CCREG);
         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
       }
-      else if(!dops[i-2].likely)
+      else
       {
         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].dirty,start+i*4);
         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
       }
-      else
-      {
-        store_regs_bt(regs[i-2].regmap,regs[i-2].dirty,start+i*4);
-        assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
-      }
       add_to_linker(out,start+i*4,0);
       emit_jmp(0);
     }
@@ -9209,7 +9252,7 @@ int new_recompile_block(u_int addr)
   else
   {
     assert(i>0);
-    assert(dops[i-1].itype!=UJUMP&&dops[i-1].itype!=CJUMP&&dops[i-1].itype!=SJUMP&&dops[i-1].itype!=RJUMP);
+    assert(!dops[i-1].is_jump);
     store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
       emit_loadreg(CCREG,HOST_CCREG);
@@ -9406,6 +9449,9 @@ int new_recompile_block(u_int addr)
     }
     expirep=(expirep+1)&65535;
   }
+#ifdef ASSEM_PRINT
+  fflush(stdout);
+#endif
   return 0;
 }
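
On the final hunk: the fflush is presumably there so that, when ASSEM_PRINT tracing is enabled, the buffered disassembly output reaches the terminal before control jumps into the freshly generated code, where a crash would otherwise swallow it. A minimal illustration, not part of the dynarec:

    #include <stdio.h>

    /* illustration only: trace output still sitting in the stdio buffer is
       lost if the process dies before it is flushed */
    static void trace_block_done(unsigned int addr)
    {
      printf("block at %08x recompiled\n", addr);
      fflush(stdout);   /* force it out before executing the new code */
    }
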