drc: merge Ari64's patch: 15_dirty_registers_fix
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
index 1f33c75..f1bdbfc 100644 (file)
@@ -1,6 +1,6 @@
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  *   Mupen64plus - new_dynarec.c                                           *
- *   Copyright (C) 2009-2010 Ari64                                         *
+ *   Copyright (C) 2009-2011 Ari64                                         *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
@@ -121,7 +121,12 @@ struct ll_entry
   char shadow[1048576]  __attribute__((aligned(16)));
   void *copy;
   int expirep;
+#ifndef PCSX
   u_int using_tlb;
+#else
+  static const u_int using_tlb=0;
+#endif
+  static u_int sp_in_mirror;
   u_int stop_after_jal;
   extern u_char restore_candidate[512];
   extern int cycle_count;
@@ -594,6 +599,7 @@ void clear_const(struct regstat *cur,signed char reg)
 int is_const(struct regstat *cur,signed char reg)
 {
   int hr;
+  if(reg<0) return 0;
   if(!reg) return 1;
   for (hr=0;hr<HOST_REGS;hr++) {
     if((cur->regmap[hr]&63)==reg) {
@@ -714,12 +720,6 @@ int needed_again(int r, int i)
   int j;
   int b=-1;
   int rn=10;
-  int hr;
-  u_char hsn[MAXREG+1];
-  int preferred_reg;
-  
-  memset(hsn,10,sizeof(hsn));
-  lsn(hsn,i,&preferred_reg);
   
   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
   {
@@ -772,11 +772,7 @@ int needed_again(int r, int i)
       }
     }
   }*/
-  for(hr=0;hr<HOST_REGS;hr++) {
-    if(hr!=EXCLUDE_REG) {
-      if(rn<hsn[hr]) return 1;
-    }
-  }
+  if(rn<10) return 1;
   return 0;
 }
 
@@ -1182,6 +1178,9 @@ void invalidate_block(u_int block)
   
   // Don't trap writes
   invalid_code[block]=1;
+#ifdef PCSX
+  invalid_code[((u_int)0x80000000>>12)|page]=1;
+#endif
 #ifndef DISABLE_TLB
   // If there is a valid TLB entry for this page, remove write protect
   if(tlb_LUT_w[block]) {
@@ -1594,13 +1593,9 @@ void load_alloc(struct regstat *current,int i)
   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
-  if(rt1[i]) {
+  if(rt1[i]&&!((current->u>>rt1[i])&1)) {
     alloc_reg(current,i,rt1[i]);
-    if(get_reg(current->regmap,rt1[i])<0) {
-      // dummy load, but we still need a register to calculate the address
-      alloc_reg_temp(current,i,-1);
-      minimum_free_regs[i]=1;
-    }
+    assert(get_reg(current->regmap,rt1[i])>=0);
     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
     {
       current->is32&=~(1LL<<rt1[i]);
@@ -1628,12 +1623,14 @@ void load_alloc(struct regstat *current,int i)
   }
   else
   {
-    // Load to r0 (dummy load)
+    // Load to r0 or unneeded register (dummy load)
     // but we still need a register to calculate the address
     if(opcode[i]==0x22||opcode[i]==0x26)
     {
       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
     }
+    // If using TLB, need a register for pointer to the mapping table
+    if(using_tlb) alloc_reg(current,i,TLREG);
     alloc_reg_temp(current,i,-1);
     minimum_free_regs[i]=1;
     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
@@ -2780,8 +2777,10 @@ void load_assemble(int i,struct regstat *i_regs)
   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
   if(s>=0) {
     c=(i_regs->wasconst>>s)&1;
-    memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
-    if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
+    if (c) {
+      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
+      if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
+    }
   }
   //printf("load_assemble: c=%d\n",c);
   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
@@ -2817,6 +2816,13 @@ void load_assemble(int i,struct regstat *i_regs)
       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
       #endif
       {
+        #ifdef PCSX
+        if(sp_in_mirror&&rs1[i]==29) {
+          emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
+          emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
+        }
+        else
+        #endif
         emit_cmpimm(addr,RAM_SIZE);
         jaddr=(int)out;
         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
@@ -2834,6 +2840,7 @@ void load_assemble(int i,struct regstat *i_regs)
     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
     map=get_reg(i_regs->regmap,TLREG);
     assert(map>=0);
+    reglist&=~(1<<map);
     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
   }
@@ -2856,6 +2863,9 @@ void load_assemble(int i,struct regstat *i_regs)
           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
 #else
           if(!c) a=addr;
+#endif
+#ifdef PCSX
+          if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
 #endif
           emit_movsbl_indexed_tlb(x,a,map,tl);
         }
@@ -2881,6 +2891,9 @@ void load_assemble(int i,struct regstat *i_regs)
           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
 #else
           if(!c) a=addr;
+#endif
+#ifdef PCSX
+          if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
 #endif
           //#ifdef
           //emit_movswl_indexed_tlb(x,tl,map,tl);
@@ -2906,13 +2919,17 @@ void load_assemble(int i,struct regstat *i_regs)
   if (opcode[i]==0x23) { // LW
     if(!c||memtarget) {
       if(!dummy) {
+        int a=addr;
+#ifdef PCSX
+        if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
+#endif
         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
         #ifdef HOST_IMM_ADDR32
         if(c)
           emit_readword_tlb(constmap[i][s]+offset,map,tl);
         else
         #endif
-        emit_readword_indexed_tlb(0,addr,map,tl);
+        emit_readword_indexed_tlb(0,a,map,tl);
       }
       if(jaddr)
         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
@@ -2938,6 +2955,9 @@ void load_assemble(int i,struct regstat *i_regs)
           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
 #else
           if(!c) a=addr;
+#endif
+#ifdef PCSX
+          if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
 #endif
           emit_movzbl_indexed_tlb(x,a,map,tl);
         }
@@ -2963,6 +2983,9 @@ void load_assemble(int i,struct regstat *i_regs)
           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
 #else
           if(!c) a=addr;
+#endif
+#ifdef PCSX
+          if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
 #endif
           //#ifdef
           //emit_movzwl_indexed_tlb(x,tl,map,tl);
@@ -2989,13 +3012,17 @@ void load_assemble(int i,struct regstat *i_regs)
     assert(th>=0);
     if(!c||memtarget) {
       if(!dummy) {
+        int a=addr;
+#ifdef PCSX
+        if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
+#endif
         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
         #ifdef HOST_IMM_ADDR32
         if(c)
           emit_readword_tlb(constmap[i][s]+offset,map,tl);
         else
         #endif
-        emit_readword_indexed_tlb(0,addr,map,tl);
+        emit_readword_indexed_tlb(0,a,map,tl);
       }
       if(jaddr)
         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
@@ -3008,6 +3035,10 @@ void load_assemble(int i,struct regstat *i_regs)
   if (opcode[i]==0x37) { // LD
     if(!c||memtarget) {
       if(!dummy) {
+        int a=addr;
+#ifdef PCSX
+        if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
+#endif
         //gen_tlb_addr_r(tl,map);
         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
@@ -3016,7 +3047,7 @@ void load_assemble(int i,struct regstat *i_regs)
           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
         else
         #endif
-        emit_readdword_indexed_tlb(0,addr,map,th,tl);
+        emit_readdword_indexed_tlb(0,a,map,th,tl);
       }
       if(jaddr)
         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
@@ -3081,8 +3112,10 @@ void store_assemble(int i,struct regstat *i_regs)
   offset=imm[i];
   if(s>=0) {
     c=(i_regs->wasconst>>s)&1;
-    memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
-    if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
+    if(c) {
+      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
+      if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
+    }
   }
   assert(tl>=0);
   assert(temp>=0);
@@ -3094,9 +3127,15 @@ void store_assemble(int i,struct regstat *i_regs)
   else addr=s;
   if(!using_tlb) {
     if(!c) {
+      #ifdef PCSX
+      if(sp_in_mirror&&rs1[i]==29) {
+        emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
+        emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
+      }
+      else
+      #endif
       #ifdef R29_HACK
       // Strmnnrmn's speed hack
-      memtarget=1;
       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
       #endif
       emit_cmpimm(addr,RAM_SIZE);
@@ -3104,6 +3143,7 @@ void store_assemble(int i,struct regstat *i_regs)
       if(s==addr) emit_mov(s,temp);
       #endif
       #ifdef R29_HACK
+      memtarget=1;
       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
       #endif
       {
@@ -3123,65 +3163,79 @@ void store_assemble(int i,struct regstat *i_regs)
     if (opcode[i]==0x29) x=2; // SH
     map=get_reg(i_regs->regmap,TLREG);
     assert(map>=0);
+    reglist&=~(1<<map);
     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
   }
 
   if (opcode[i]==0x28) { // SB
     if(!c||memtarget) {
-      int x=0;
+      int x=0,a=temp;
 #ifdef BIG_ENDIAN_MIPS
       if(!c) emit_xorimm(addr,3,temp);
       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
 #else
-      if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
-      else if (addr!=temp) emit_mov(addr,temp);
+      if(!c) a=addr;
+#endif
+#ifdef PCSX
+      if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
 #endif
       //gen_tlb_addr_w(temp,map);
       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
-      emit_writebyte_indexed_tlb(tl,x,temp,map,temp);
+      emit_writebyte_indexed_tlb(tl,x,a,map,a);
     }
     type=STOREB_STUB;
   }
   if (opcode[i]==0x29) { // SH
     if(!c||memtarget) {
-      int x=0;
+      int x=0,a=temp;
 #ifdef BIG_ENDIAN_MIPS
       if(!c) emit_xorimm(addr,2,temp);
       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
 #else
-      if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
-      else if (addr!=temp) emit_mov(addr,temp);
+      if(!c) a=addr;
+#endif
+#ifdef PCSX
+      if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
 #endif
       //#ifdef
       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
       //#else
       if(map>=0) {
-        gen_tlb_addr_w(temp,map);
-        emit_writehword_indexed(tl,x,temp);
+        gen_tlb_addr_w(a,map);
+        emit_writehword_indexed(tl,x,a);
       }else
-        emit_writehword_indexed(tl,(int)rdram-0x80000000+x,temp);
+        emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
     }
     type=STOREH_STUB;
   }
   if (opcode[i]==0x2B) { // SW
-    if(!c||memtarget)
+    if(!c||memtarget) {
+      int a=addr;
+#ifdef PCSX
+      if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
+#endif
       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
-      emit_writeword_indexed_tlb(tl,0,addr,map,temp);
+      emit_writeword_indexed_tlb(tl,0,a,map,temp);
+    }
     type=STOREW_STUB;
   }
   if (opcode[i]==0x3F) { // SD
     if(!c||memtarget) {
+      int a=addr;
+#ifdef PCSX
+      if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
+#endif
       if(rs2[i]) {
         assert(th>=0);
         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
-        emit_writedword_indexed_tlb(th,tl,0,addr,map,temp);
+        emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
       }else{
         // Store zero
         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
-        emit_writedword_indexed_tlb(tl,tl,0,addr,map,temp);
+        emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
       }
     }
     type=STORED_STUB;
@@ -3254,7 +3308,7 @@ void storelr_assemble(int i,struct regstat *i_regs)
   int jaddr=0,jaddr2;
   int case1,case2,case3;
   int done0,done1,done2;
-  int memtarget,c=0;
+  int memtarget=0,c=0;
   int agr=AGEN1+(i&1);
   u_int hr,reglist=0;
   th=get_reg(i_regs->regmap,rs2[i]|64);
@@ -3265,8 +3319,10 @@ void storelr_assemble(int i,struct regstat *i_regs)
   offset=imm[i];
   if(s>=0) {
     c=(i_regs->isconst>>s)&1;
-    memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
-    if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
+    if(c) {
+      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
+      if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
+    }
   }
   assert(tl>=0);
   for(hr=0;hr<HOST_REGS;hr++) {
@@ -3298,6 +3354,7 @@ void storelr_assemble(int i,struct regstat *i_regs)
   }else{ // using tlb
     int map=get_reg(i_regs->regmap,TLREG);
     assert(map>=0);
+    reglist&=~(1<<map);
     map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
     if(!c&&!offset&&s>=0) emit_mov(s,temp);
     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
@@ -3548,6 +3605,7 @@ void c1ls_assemble(int i,struct regstat *i_regs)
   {
     map=get_reg(i_regs->regmap,TLREG);
     assert(map>=0);
+    reglist&=~(1<<map);
     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
     }
@@ -3679,7 +3737,7 @@ void c2ls_assemble(int i,struct regstat *i_regs)
   int ar;
   int offset;
   int memtarget=0,c=0;
-  int jaddr,jaddr2=0,jaddr3,type;
+  int jaddr2=0,jaddr3,type;
   int agr=AGEN1+(i&1);
   u_int hr,reglist=0;
   u_int copr=(source[i]>>16)&0x1f;
@@ -4049,7 +4107,7 @@ static void loop_preload(signed char pre[],signed char entry[])
 void address_generation(int i,struct regstat *i_regs,signed char entry[])
 {
   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
-    int ra;
+    int ra=-1;
     int agr=AGEN1+(i&1);
     int mgr=MGEN1+(i&1);
     if(itype[i]==LOAD) {
@@ -4425,7 +4483,7 @@ void load_all_regs(signed char i_regmap[])
         emit_zeroreg(hr);
       }
       else
-      if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
+      if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
       {
         emit_loadreg(i_regmap[hr],hr);
       }
@@ -4444,7 +4502,7 @@ void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
           emit_zeroreg(hr);
         }
         else
-        if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
+        if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
         {
           emit_loadreg(i_regmap[hr],hr);
         }
@@ -4464,7 +4522,7 @@ void load_regs_entry(int t)
   }
   // Load 32-bit regs
   for(hr=0;hr<HOST_REGS;hr++) {
-    if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
+    if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
       if(regs[t].regmap_entry[hr]==0) {
         emit_zeroreg(hr);
       }
@@ -4476,7 +4534,7 @@ void load_regs_entry(int t)
   }
   // Load 64-bit regs
   for(hr=0;hr<HOST_REGS;hr++) {
-    if(regs[t].regmap_entry[hr]>=64) {
+    if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
       assert(regs[t].regmap_entry[hr]!=64);
       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
@@ -4556,7 +4614,7 @@ void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int ad
     }
     // Load 32-bit regs
     for(hr=0;hr<HOST_REGS;hr++) {
-      if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
+      if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
         #ifdef DESTRUCTIVE_WRITEBACK
         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
         #else
@@ -4574,7 +4632,7 @@ void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int ad
     }
     //Load 64-bit regs
     for(hr=0;hr<HOST_REGS;hr++) {
-      if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64) {
+      if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
           assert(regs[t].regmap_entry[hr]!=64);
           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
@@ -4615,19 +4673,19 @@ int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
       {
         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
         {
-          if(regs[t].regmap_entry[hr]!=-1)
+          if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
           {
             return 0;
           }
           else 
           if((i_dirty>>hr)&1)
           {
-            if(i_regmap[hr]<64)
+            if(i_regmap[hr]<TEMPREG)
             {
               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
                 return 0;
             }
-            else
+            else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
             {
               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
                 return 0;
@@ -4801,7 +4859,7 @@ void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
   }
   else
   {
-    emit_cmpimm(HOST_CCREG,-2*(count+2));
+    emit_cmpimm(HOST_CCREG,-CLOCK_DIVIDER*(count+2));
     jaddr=(int)out;
     emit_jns(0);
   }
@@ -4867,7 +4925,7 @@ void do_ccstub(int n)
           emit_loadreg(rs2[i],s2l);
       #endif
       int hr=0;
-      int addr,alt,ntaddr;
+      int addr=-1,alt=-1,ntaddr=-1;
       while(hr<HOST_REGS)
       {
         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
@@ -5100,34 +5158,19 @@ void ujump_assemble(int i,struct regstat *i_regs)
     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
   }
   #endif
-  ds_assemble(i+1,i_regs);
-  uint64_t bc_unneeded=branch_regs[i].u;
-  uint64_t bc_unneeded_upper=branch_regs[i].uu;
-  bc_unneeded|=1|(1LL<<rt1[i]);
-  bc_unneeded_upper|=1|(1LL<<rt1[i]);
-  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
-                bc_unneeded,bc_unneeded_upper);
-  load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
   if(rt1[i]==31) {
     int rt;
     unsigned int return_address;
-    assert(rt1[i+1]!=31);
-    assert(rt2[i+1]!=31);
     rt=get_reg(branch_regs[i].regmap,31);
     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
     //assert(rt>=0);
     return_address=start+i*4+8;
     if(rt>=0) {
       #ifdef USE_MINI_HT
-      if(internal_branch(branch_regs[i].is32,return_address)) {
-        int temp=rt+1;
-        if(temp==EXCLUDE_REG||temp>=HOST_REGS||
-           branch_regs[i].regmap[temp]>=0)
-        {
-          temp=get_reg(branch_regs[i].regmap,-1);
-        }
+      if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
+        int temp=-1; // note: must be ds-safe
         #ifdef HOST_TEMPREG
-        if(temp<0) temp=HOST_TEMPREG;
+        temp=HOST_TEMPREG;
         #endif
         if(temp>=0) do_miniht_insert(return_address,rt,temp);
         else emit_movimm(return_address,rt);
@@ -5148,6 +5191,14 @@ void ujump_assemble(int i,struct regstat *i_regs)
       }
     }
   }
+  ds_assemble(i+1,i_regs);
+  uint64_t bc_unneeded=branch_regs[i].u;
+  uint64_t bc_unneeded_upper=branch_regs[i].uu;
+  bc_unneeded|=1|(1LL<<rt1[i]);
+  bc_unneeded_upper|=1|(1LL<<rt1[i]);
+  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
+                bc_unneeded,bc_unneeded_upper);
+  load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
   int cc,adj;
   cc=get_reg(branch_regs[i].regmap,CCREG);
   assert(cc==HOST_CCREG);
@@ -7591,7 +7642,7 @@ void clean_registers(int istart,int iend,int wr)
             regs[i].wasdirty|=will_dirty_i&(1<<r);
           }
         }
-        else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
+        else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
           // Register moved to a different register
           will_dirty_i&=~(1<<r);
           wont_dirty_i&=~(1<<r);
@@ -7709,6 +7760,38 @@ void disassemble_inst(int i)
     }
 }
 
+// clear the state completely, instead of just marking
+// things invalid like invalidate_all_pages() does
+void new_dynarec_clear_full()
+{
+  int n;
+  out=(u_char *)BASE_ADDR;
+  memset(invalid_code,1,sizeof(invalid_code));
+  memset(hash_table,0xff,sizeof(hash_table));
+  memset(mini_ht,-1,sizeof(mini_ht));
+  memset(restore_candidate,0,sizeof(restore_candidate));
+  memset(shadow,0,sizeof(shadow));
+  copy=shadow;
+  expirep=16384; // Expiry pointer, +2 blocks
+  pending_exception=0;
+  literalcount=0;
+  stop_after_jal=0;
+  // TLB
+#ifndef DISABLE_TLB
+  using_tlb=0;
+#endif
+  sp_in_mirror=0;
+  for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
+    memory_map[n]=-1;
+  for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
+    memory_map[n]=((u_int)rdram-0x80000000)>>2;
+  for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
+    memory_map[n]=-1;
+  for(n=0;n<4096;n++) ll_clear(jump_in+n);
+  for(n=0;n<4096;n++) ll_clear(jump_out+n);
+  for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
+}
+
 void new_dynarec_init()
 {
   printf("Init new dynarec\n");
@@ -7724,29 +7807,11 @@ void new_dynarec_init()
   fake_pc.f.r.rd=&readmem_dword;
 #endif
   int n;
-  for(n=0x80000;n<0x80800;n++)
-    invalid_code[n]=1;
-  for(n=0;n<65536;n++)
-    hash_table[n][0]=hash_table[n][2]=-1;
-  memset(mini_ht,-1,sizeof(mini_ht));
-  memset(restore_candidate,0,sizeof(restore_candidate));
-  copy=shadow;
-  expirep=16384; // Expiry pointer, +2 blocks
-  pending_exception=0;
-  literalcount=0;
+  new_dynarec_clear_full();
 #ifdef HOST_IMM8
   // Copy this into local area so we don't have to put it in every literal pool
   invc_ptr=invalid_code;
 #endif
-  stop_after_jal=0;
-  // TLB
-  using_tlb=0;
-  for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
-    memory_map[n]=-1;
-  for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
-    memory_map[n]=((u_int)rdram-0x80000000)>>2;
-  for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
-    memory_map[n]=-1;
 #ifdef MUPEN64
   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
     writemem[n] = write_nomem_new;
@@ -7825,6 +7890,11 @@ int new_recompile_block(int addr)
   start = (u_int)addr&~3;
   //assert(((u_int)addr&1)==0);
 #ifdef PCSX
+  if(!sp_in_mirror&&(signed int)(psxRegs.GPR.n.sp&0xffe00000)>0x80200000&&
+     0x10000<=psxRegs.GPR.n.sp&&(psxRegs.GPR.n.sp&~0xe0e00000)<RAM_SIZE) {
+    printf("SP hack enabled (%08x), @%08x\n", psxRegs.GPR.n.sp, psxRegs.pc);
+    sp_in_mirror=1;
+  }
   if (Config.HLE && start == 0x80001000) // hlecall
   {
     // XXX: is this enough? Maybe check hleSoftCall?
@@ -7940,17 +8010,10 @@ int new_recompile_block(int addr)
           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
-          case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
-          case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
-          case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
-          case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
-          case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
-          case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
-          case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
@@ -7961,22 +8024,31 @@ int new_recompile_block(int addr)
           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
-          case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
-          case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
-          case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
-          case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
+#ifndef FORCE32
+          case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
+          case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
+          case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
+          case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
+          case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
+          case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
+          case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
+          case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
+          case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
+          case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
+          case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
+#endif
         }
         break;
       case 0x01: strcpy(insn[i],"regimm"); type=NI;
@@ -8217,18 +8289,6 @@ int new_recompile_block(int addr)
         printf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
         break;
     }
-#ifdef PCSX
-    /* detect branch in delay slot early */
-    if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
-      opcode[i+1]=source[i+1]>>26;
-      opcode2[i+1]=source[i+1]&0x3f;
-      if((0<opcode[i+1]&&opcode[i+1]<8)||(opcode[i+1]==0&&(opcode2[i+1]==8||opcode2[i+1]==9))) {
-        printf("branch in delay slot @%08x (%08x)\n", addr + i*4+4, addr);
-        // don't handle first branch and call interpreter if it's hit
-        type=INTCALL;
-      }
-    }
-#endif
     itype[i]=type;
     opcode2[i]=op2;
     /* Get registers/immediates */
@@ -8461,19 +8521,43 @@ int new_recompile_block(int addr)
     else if(type==CJUMP||type==SJUMP||type==FJUMP)
       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
     else ba[i]=-1;
-    /* Is this the end of the block? */
-    if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
 #ifdef PCSX
-      // check for link register access in delay slot
-      int rt1_=rt1[i-1];
-      if(rt1_!=0&&(rs1[i]==rt1_||rs2[i]==rt1_||rt1[i]==rt1_||rt2[i]==rt1_)) {
-        printf("link access in delay slot @%08x (%08x)\n", addr + i*4, addr);
+    if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
+      int do_in_intrp=0;
+      // branch in delay slot?
+      if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
+        // don't handle first branch and call interpreter if it's hit
+        printf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
+        do_in_intrp=1;
+      }
+      // basic load delay detection
+      else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
+        int t=(ba[i-1]-start)/4;
+        if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
+          // jump target wants DS result - potential load delay effect
+          printf("load delay @%08x (%08x)\n", addr + i*4, addr);
+          do_in_intrp=1;
+          bt[t+1]=1; // expected return from interpreter
+        }
+        else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
+              !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
+          // v0 overwrite like this is a sign of trouble, bail out
+          printf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
+          do_in_intrp=1;
+        }
+      }
+      if(do_in_intrp) {
+        rs1[i-1]=CCREG;
+        rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
         ba[i-1]=-1;
         itype[i-1]=INTCALL;
         done=2;
+        i--; // don't compile the DS
       }
-      else
+    }
 #endif
+    /* Is this the end of the block? */
+    if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
         done=2;
       }
@@ -8533,7 +8617,7 @@ int new_recompile_block(int addr)
   current.wasconst=0;
   int ds=0;
   int cc=0;
-  int hr;
+  int hr=-1;
 
 #ifndef FORCE32
   provisional_32bit();
@@ -8796,18 +8880,18 @@ int new_recompile_block(int addr)
           clear_const(&current,rt1[i]);
           alloc_cc(&current,i);
           dirty_reg(&current,CCREG);
+          ooo[i]=1;
+          delayslot_alloc(&current,i+1);
           if (rt1[i]==31) {
             alloc_reg(&current,i,31);
             dirty_reg(&current,31);
-            assert(rs1[i+1]!=31&&rs2[i+1]!=31);
-            assert(rt1[i+1]!=rt1[i]);
+            //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
+            //assert(rt1[i+1]!=rt1[i]);
             #ifdef REG_PREFETCH
             alloc_reg(&current,i,PTEMP);
             #endif
             //current.is32|=1LL<<rt1[i];
           }
-          ooo[i]=1;
-          delayslot_alloc(&current,i+1);
           //current.isconst=0; // DEBUG
           ds=1;
           //printf("i=%d, isconst=%x\n",i,current.isconst);
@@ -9550,8 +9634,8 @@ int new_recompile_block(int addr)
         }
       }
       // Don't need stuff which is overwritten
-      if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
-      if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
+      //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
+      //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
       // Merge in delay slot
       for(hr=0;hr<HOST_REGS;hr++)
       {
@@ -9670,7 +9754,10 @@ int new_recompile_block(int addr)
             if(likely[i]) {
               regs[i].regmap[hr]=-1;
               regs[i].isconst&=~(1<<hr);
-              if(i<slen-2) regmap_pre[i+2][hr]=-1;
+              if(i<slen-2) {
+                regmap_pre[i+2][hr]=-1;
+                regs[i+2].wasconst&=~(1<<hr);
+              }
             }
           }
         }
@@ -9725,6 +9812,7 @@ int new_recompile_block(int addr)
               {
                 if(!likely[i]&&i<slen-2) {
                   regmap_pre[i+2][hr]=-1;
+                  regs[i+2].wasconst&=~(1<<hr);
                 }
               }
             }
@@ -9770,6 +9858,7 @@ int new_recompile_block(int addr)
                 }
                 regmap_pre[i+1][hr]=-1;
                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
+                regs[i+1].wasconst&=~(1<<hr);
               }
               regs[i].regmap[hr]=-1;
               regs[i].isconst&=~(1<<hr);
@@ -10446,6 +10535,19 @@ int new_recompile_block(int addr)
     }
     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
   }
+#else
+  for (i=slen-1;i>=0;i--)
+  {
+    if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
+    {
+      // Conditional branch
+      if((source[i]>>16)!=0x1000&&i<slen-2) {
+        // Mark this address as a branch target since it may be called
+        // upon return from interrupt
+        bt[i+2]=1;
+      }
+    }
+  }
 #endif
 
   if(itype[slen-1]==SPAN) {
@@ -10711,8 +10813,13 @@ int new_recompile_block(int addr)
         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
               unneeded_reg[i],unneeded_reg_upper[i]);
       }
-      is32_pre=regs[i].is32;
-      dirty_pre=regs[i].dirty;
+      if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
+        is32_pre=branch_regs[i].is32;
+        dirty_pre=branch_regs[i].dirty;
+      }else{
+        is32_pre=regs[i].is32;
+        dirty_pre=regs[i].dirty;
+      }
       #endif
       // write back
       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
@@ -10733,9 +10840,9 @@ int new_recompile_block(int addr)
       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
       {
         // Load the delay slot registers if necessary
-        if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
+        if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
-        if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
+        if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
@@ -11010,6 +11117,12 @@ int new_recompile_block(int addr)
     }
 #endif
   }
+#ifdef PCSX
+  // PCSX maps all RAM mirror invalid_code tests to 0x80000000..0x80000000+RAM_SIZE
+  if(get_page(start)<(RAM_SIZE>>12))
+    for(i=start>>12;i<=(start+slen*4)>>12;i++)
+      invalid_code[((u_int)0x80000000>>12)|i]=0;
+#endif
   
   /* Pass 10 - Free memory by expiring oldest blocks */