drc: ujump DS $ra overwrite fix?
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
index cfce021..1fe2c0c 100644 (file)
@@ -121,7 +121,12 @@ struct ll_entry
   char shadow[1048576]  __attribute__((aligned(16)));
   void *copy;
   int expirep;
+#ifndef PCSX
   u_int using_tlb;
+#else
+  static const u_int using_tlb=0;
+#endif
+  static u_int sp_in_mirror;
   u_int stop_after_jal;
   extern u_char restore_candidate[512];
   extern int cycle_count;
@@ -1182,6 +1187,9 @@ void invalidate_block(u_int block)
   
   // Don't trap writes
   invalid_code[block]=1;
+#ifdef PCSX
+  invalid_code[((u_int)0x80000000>>12)|page]=1;
+#endif
 #ifndef DISABLE_TLB
   // If there is a valid TLB entry for this page, remove write protect
   if(tlb_LUT_w[block]) {
@@ -2780,8 +2788,10 @@ void load_assemble(int i,struct regstat *i_regs)
   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
   if(s>=0) {
     c=(i_regs->wasconst>>s)&1;
-    memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
-    if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
+    if (c) {
+      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
+      if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
+    }
   }
   //printf("load_assemble: c=%d\n",c);
   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
@@ -2817,6 +2827,13 @@ void load_assemble(int i,struct regstat *i_regs)
       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
       #endif
       {
+        #ifdef PCSX
+        if(sp_in_mirror&&rs1[i]==29) {
+          emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
+          emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
+        }
+        else
+        #endif
         emit_cmpimm(addr,RAM_SIZE);
         jaddr=(int)out;
         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
@@ -2856,6 +2873,9 @@ void load_assemble(int i,struct regstat *i_regs)
           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
 #else
           if(!c) a=addr;
+#endif
+#ifdef PCSX
+          if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
 #endif
           emit_movsbl_indexed_tlb(x,a,map,tl);
         }
@@ -2881,6 +2901,9 @@ void load_assemble(int i,struct regstat *i_regs)
           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
 #else
           if(!c) a=addr;
+#endif
+#ifdef PCSX
+          if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
 #endif
           //#ifdef
           //emit_movswl_indexed_tlb(x,tl,map,tl);
@@ -2906,13 +2929,17 @@ void load_assemble(int i,struct regstat *i_regs)
   if (opcode[i]==0x23) { // LW
     if(!c||memtarget) {
       if(!dummy) {
+        int a=addr;
+#ifdef PCSX
+        if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
+#endif
         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
         #ifdef HOST_IMM_ADDR32
         if(c)
           emit_readword_tlb(constmap[i][s]+offset,map,tl);
         else
         #endif
-        emit_readword_indexed_tlb(0,addr,map,tl);
+        emit_readword_indexed_tlb(0,a,map,tl);
       }
       if(jaddr)
         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
@@ -2938,6 +2965,9 @@ void load_assemble(int i,struct regstat *i_regs)
           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
 #else
           if(!c) a=addr;
+#endif
+#ifdef PCSX
+          if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
 #endif
           emit_movzbl_indexed_tlb(x,a,map,tl);
         }
@@ -2963,6 +2993,9 @@ void load_assemble(int i,struct regstat *i_regs)
           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
 #else
           if(!c) a=addr;
+#endif
+#ifdef PCSX
+          if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
 #endif
           //#ifdef
           //emit_movzwl_indexed_tlb(x,tl,map,tl);
@@ -2989,13 +3022,17 @@ void load_assemble(int i,struct regstat *i_regs)
     assert(th>=0);
     if(!c||memtarget) {
       if(!dummy) {
+        int a=addr;
+#ifdef PCSX
+        if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
+#endif
         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
         #ifdef HOST_IMM_ADDR32
         if(c)
           emit_readword_tlb(constmap[i][s]+offset,map,tl);
         else
         #endif
-        emit_readword_indexed_tlb(0,addr,map,tl);
+        emit_readword_indexed_tlb(0,a,map,tl);
       }
       if(jaddr)
         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
@@ -3008,6 +3045,10 @@ void load_assemble(int i,struct regstat *i_regs)
   if (opcode[i]==0x37) { // LD
     if(!c||memtarget) {
       if(!dummy) {
+        int a=addr;
+#ifdef PCSX
+        if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
+#endif
         //gen_tlb_addr_r(tl,map);
         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
@@ -3016,7 +3057,7 @@ void load_assemble(int i,struct regstat *i_regs)
           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
         else
         #endif
-        emit_readdword_indexed_tlb(0,addr,map,th,tl);
+        emit_readdword_indexed_tlb(0,a,map,th,tl);
       }
       if(jaddr)
         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
@@ -3081,8 +3122,10 @@ void store_assemble(int i,struct regstat *i_regs)
   offset=imm[i];
   if(s>=0) {
     c=(i_regs->wasconst>>s)&1;
-    memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
-    if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
+    if(c) {
+      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
+      if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
+    }
   }
   assert(tl>=0);
   assert(temp>=0);
@@ -3094,9 +3137,15 @@ void store_assemble(int i,struct regstat *i_regs)
   else addr=s;
   if(!using_tlb) {
     if(!c) {
+      #ifdef PCSX
+      if(sp_in_mirror&&rs1[i]==29) {
+        emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
+        emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
+      }
+      else
+      #endif
       #ifdef R29_HACK
       // Strmnnrmn's speed hack
-      memtarget=1;
       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
       #endif
       emit_cmpimm(addr,RAM_SIZE);
@@ -3104,6 +3153,7 @@ void store_assemble(int i,struct regstat *i_regs)
       if(s==addr) emit_mov(s,temp);
       #endif
       #ifdef R29_HACK
+      memtarget=1;
       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
       #endif
       {
@@ -3135,6 +3185,9 @@ void store_assemble(int i,struct regstat *i_regs)
       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
 #else
       if(!c) a=addr;
+#endif
+#ifdef PCSX
+      if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
 #endif
       //gen_tlb_addr_w(temp,map);
       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
@@ -3150,6 +3203,9 @@ void store_assemble(int i,struct regstat *i_regs)
       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
 #else
       if(!c) a=addr;
+#endif
+#ifdef PCSX
+      if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
 #endif
       //#ifdef
       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
@@ -3163,23 +3219,32 @@ void store_assemble(int i,struct regstat *i_regs)
     type=STOREH_STUB;
   }
   if (opcode[i]==0x2B) { // SW
-    if(!c||memtarget)
+    if(!c||memtarget) {
+      int a=addr;
+#ifdef PCSX
+      if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
+#endif
       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
-      emit_writeword_indexed_tlb(tl,0,addr,map,temp);
+      emit_writeword_indexed_tlb(tl,0,a,map,temp);
+    }
     type=STOREW_STUB;
   }
   if (opcode[i]==0x3F) { // SD
     if(!c||memtarget) {
+      int a=addr;
+#ifdef PCSX
+      if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
+#endif
       if(rs2[i]) {
         assert(th>=0);
         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
-        emit_writedword_indexed_tlb(th,tl,0,addr,map,temp);
+        emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
       }else{
         // Store zero
         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
-        emit_writedword_indexed_tlb(tl,tl,0,addr,map,temp);
+        emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
       }
     }
     type=STORED_STUB;
@@ -3252,7 +3317,7 @@ void storelr_assemble(int i,struct regstat *i_regs)
   int jaddr=0,jaddr2;
   int case1,case2,case3;
   int done0,done1,done2;
-  int memtarget,c=0;
+  int memtarget=0,c=0;
   int agr=AGEN1+(i&1);
   u_int hr,reglist=0;
   th=get_reg(i_regs->regmap,rs2[i]|64);
@@ -3263,8 +3328,10 @@ void storelr_assemble(int i,struct regstat *i_regs)
   offset=imm[i];
   if(s>=0) {
     c=(i_regs->isconst>>s)&1;
-    memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
-    if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
+    if(c) {
+      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
+      if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
+    }
   }
   assert(tl>=0);
   for(hr=0;hr<HOST_REGS;hr++) {
@@ -5098,34 +5165,19 @@ void ujump_assemble(int i,struct regstat *i_regs)
     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
   }
   #endif
-  ds_assemble(i+1,i_regs);
-  uint64_t bc_unneeded=branch_regs[i].u;
-  uint64_t bc_unneeded_upper=branch_regs[i].uu;
-  bc_unneeded|=1|(1LL<<rt1[i]);
-  bc_unneeded_upper|=1|(1LL<<rt1[i]);
-  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
-                bc_unneeded,bc_unneeded_upper);
-  load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
   if(rt1[i]==31) {
     int rt;
     unsigned int return_address;
-    assert(rt1[i+1]!=31);
-    assert(rt2[i+1]!=31);
     rt=get_reg(branch_regs[i].regmap,31);
     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
     //assert(rt>=0);
     return_address=start+i*4+8;
     if(rt>=0) {
       #ifdef USE_MINI_HT
-      if(internal_branch(branch_regs[i].is32,return_address)) {
-        int temp=rt+1;
-        if(temp==EXCLUDE_REG||temp>=HOST_REGS||
-           branch_regs[i].regmap[temp]>=0)
-        {
-          temp=get_reg(branch_regs[i].regmap,-1);
-        }
+      if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
+        int temp=-1; // note: must be ds-safe
         #ifdef HOST_TEMPREG
-        if(temp<0) temp=HOST_TEMPREG;
+        temp=HOST_TEMPREG;
         #endif
         if(temp>=0) do_miniht_insert(return_address,rt,temp);
         else emit_movimm(return_address,rt);
@@ -5146,6 +5198,14 @@ void ujump_assemble(int i,struct regstat *i_regs)
       }
     }
   }
+  ds_assemble(i+1,i_regs);
+  uint64_t bc_unneeded=branch_regs[i].u;
+  uint64_t bc_unneeded_upper=branch_regs[i].uu;
+  bc_unneeded|=1|(1LL<<rt1[i]);
+  bc_unneeded_upper|=1|(1LL<<rt1[i]);
+  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
+                bc_unneeded,bc_unneeded_upper);
+  load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
   int cc,adj;
   cc=get_reg(branch_regs[i].regmap,CCREG);
   assert(cc==HOST_CCREG);
@@ -7725,7 +7785,10 @@ void new_dynarec_clear_full()
   literalcount=0;
   stop_after_jal=0;
   // TLB
+#ifndef DISABLE_TLB
   using_tlb=0;
+#endif
+  sp_in_mirror=0;
   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
     memory_map[n]=-1;
   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
@@ -7835,6 +7898,11 @@ int new_recompile_block(int addr)
   start = (u_int)addr&~3;
   //assert(((u_int)addr&1)==0);
 #ifdef PCSX
+  if(!sp_in_mirror&&(signed int)(psxRegs.GPR.n.sp&0xffe00000)>0x80200000&&
+     0x10000<=psxRegs.GPR.n.sp&&(psxRegs.GPR.n.sp&~0xe0e00000)<RAM_SIZE) {
+    printf("SP hack enabled (%08x), @%08x\n", psxRegs.GPR.n.sp);
+    sp_in_mirror=1;
+  }
   if (Config.HLE && start == 0x80001000) // hlecall
   {
     // XXX: is this enough? Maybe check hleSoftCall?
@@ -8227,18 +8295,6 @@ int new_recompile_block(int addr)
         printf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
         break;
     }
-#ifdef PCSX
-    /* detect branch in delay slot early */
-    if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
-      opcode[i+1]=source[i+1]>>26;
-      opcode2[i+1]=source[i+1]&0x3f;
-      if((0<opcode[i+1]&&opcode[i+1]<8)||(opcode[i+1]==0&&(opcode2[i+1]==8||opcode2[i+1]==9))) {
-        printf("branch in delay slot @%08x (%08x)\n", addr + i*4+4, addr);
-        // don't handle first branch and call interpreter if it's hit
-        type=INTCALL;
-      }
-    }
-#endif
     itype[i]=type;
     opcode2[i]=op2;
     /* Get registers/immediates */
@@ -8471,19 +8527,43 @@ int new_recompile_block(int addr)
     else if(type==CJUMP||type==SJUMP||type==FJUMP)
       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
     else ba[i]=-1;
-    /* Is this the end of the block? */
-    if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
 #ifdef PCSX
-      // check for link register access in delay slot
-      int rt1_=rt1[i-1];
-      if(rt1_!=0&&(rs1[i]==rt1_||rs2[i]==rt1_||rt1[i]==rt1_||rt2[i]==rt1_)) {
-        printf("link access in delay slot @%08x (%08x)\n", addr + i*4, addr);
+    if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
+      int do_in_intrp=0;
+      // branch in delay slot?
+      if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
+        // don't handle first branch and call interpreter if it's hit
+        printf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
+        do_in_intrp=1;
+      }
+      // basic load delay detection
+      else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
+        int t=(ba[i-1]-start)/4;
+        if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
+          // jump target wants DS result - potential load delay effect
+          printf("load delay @%08x (%08x)\n", addr + i*4, addr);
+          do_in_intrp=1;
+          bt[t+1]=1; // expected return from interpreter
+        }
+        else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
+              !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
+          // v0 overwrite like this is a sign of trouble, bail out
+          printf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
+          do_in_intrp=1;
+        }
+      }
+      if(do_in_intrp) {
+        rs1[i-1]=CCREG;
+        rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
         ba[i-1]=-1;
         itype[i-1]=INTCALL;
         done=2;
+        i--; // don't compile the DS
       }
-      else
+    }
 #endif
+    /* Is this the end of the block? */
+    if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
         done=2;
       }
@@ -8806,18 +8886,18 @@ int new_recompile_block(int addr)
           clear_const(&current,rt1[i]);
           alloc_cc(&current,i);
           dirty_reg(&current,CCREG);
+          ooo[i]=1;
+          delayslot_alloc(&current,i+1);
           if (rt1[i]==31) {
             alloc_reg(&current,i,31);
             dirty_reg(&current,31);
-            assert(rs1[i+1]!=31&&rs2[i+1]!=31);
-            assert(rt1[i+1]!=rt1[i]);
+            //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
+            //assert(rt1[i+1]!=rt1[i]);
             #ifdef REG_PREFETCH
             alloc_reg(&current,i,PTEMP);
             #endif
             //current.is32|=1LL<<rt1[i];
           }
-          ooo[i]=1;
-          delayslot_alloc(&current,i+1);
           //current.isconst=0; // DEBUG
           ds=1;
           //printf("i=%d, isconst=%x\n",i,current.isconst);
@@ -10743,9 +10823,9 @@ int new_recompile_block(int addr)
       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
       {
         // Load the delay slot registers if necessary
-        if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
+        if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
-        if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
+        if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
@@ -11020,6 +11100,12 @@ int new_recompile_block(int addr)
     }
 #endif
   }
+#ifdef PCSX
+  // PCSX maps all RAM mirror invalid_code tests to 0x80000000..0x80000000+RAM_SIZE
+  if(get_page(start)<(RAM_SIZE>>12))
+    for(i=start>>12;i<=(start+slen*4)>>12;i++)
+      invalid_code[((u_int)0x80000000>>12)|i]=0;
+#endif
   
   /* Pass 10 - Free memory by expiring oldest blocks */