drc: merge Ari64's patch: 21_debugging_stuff
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
index 7d4a3d9..d837138 100644 (file)
@@ -3274,8 +3274,12 @@ void store_assemble(int i,struct regstat *i_regs)
   //if(opcode[i]==0x2B)
   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
   {
-    //emit_pusha();
+    #ifdef __i386__
+    emit_pusha();
+    #endif
+    #ifdef __arm__
     save_regs(0x100f);
+    #endif
         emit_readword((int)&last_count,ECX);
         #ifdef __i386__
         if(get_reg(i_regs->regmap,CCREG)<0)
@@ -3294,8 +3298,12 @@ void store_assemble(int i,struct regstat *i_regs)
         emit_writeword(0,(int)&Count);
         #endif
     emit_call((int)memdebug);
-    //emit_popa();
+    #ifdef __i386__
+    emit_popa();
+    #endif
+    #ifdef __arm__
     restore_regs(0x100f);
+    #endif
   }/**/
 }
 
@@ -7453,6 +7461,10 @@ void clean_registers(int istart,int iend,int wr)
                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                 }
+                if(branch_regs[i].regmap[r]>=0) {
+                  will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
+                  wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
+                }
               }
             }
           //}
@@ -7482,13 +7494,14 @@ void clean_registers(int istart,int iend,int wr)
           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
             for(r=0;r<HOST_REGS;r++) {
               if(r!=EXCLUDE_REG) {
-                if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
+                signed char target_reg=branch_regs[i].regmap[r];
+                if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                 }
-                else
-                {
-                  will_dirty_i&=~(1<<r);
+                else if(target_reg>=0) {
+                  will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
+                  wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                 }
                 // Treat delay slot as part of branch too
                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
@@ -7525,7 +7538,7 @@ void clean_registers(int istart,int iend,int wr)
               }
             }
           }
-          // Merge in delay slot
+          // Merge in delay slot (won't dirty)
           for(r=0;r<HOST_REGS;r++) {
             if(r!=EXCLUDE_REG) {
               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
@@ -7642,7 +7655,7 @@ void clean_registers(int istart,int iend,int wr)
             regs[i].wasdirty|=will_dirty_i&(1<<r);
           }
         }
-        else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
+        else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
           // Register moved to a different register
           will_dirty_i&=~(1<<r);
           wont_dirty_i&=~(1<<r);
@@ -8577,6 +8590,7 @@ int new_recompile_block(int addr)
       // Does the block continue due to a branch?
       for(j=i-1;j>=0;j--)
       {
+        if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
         if(ba[j]==start+i*4+4) done=j=0;
         if(ba[j]==start+i*4+8) done=j=0;
       }
@@ -8682,7 +8696,9 @@ int new_recompile_block(int addr)
       }
       if(temp_is32!=current.is32) {
         //printf("dumping 32-bit regs (%x)\n",start+i*4);
-        #ifdef DESTRUCTIVE_WRITEBACK
+        #ifndef DESTRUCTIVE_WRITEBACK
+        if(ds)
+        #endif
         for(hr=0;hr<HOST_REGS;hr++)
         {
           int r=current.regmap[hr];
@@ -8694,7 +8710,6 @@ int new_recompile_block(int addr)
             }
           }
         }
-        #endif
         current.is32=temp_is32;
       }
     }
@@ -9874,7 +9889,7 @@ int new_recompile_block(int addr)
   // If a register is allocated during a loop, try to allocate it for the
   // entire loop, if possible.  This avoids loading/storing registers
   // inside of the loop.
-
+  
   signed char f_regmap[HOST_REGS];
   clear_all_regs(f_regmap);
   for(i=0;i<slen-1;i++)
@@ -9891,7 +9906,7 @@ int new_recompile_block(int addr)
       {
         int t=(ba[i]-start)>>2;
         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
-        if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated
+        if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
         for(hr=0;hr<HOST_REGS;hr++)
         {
           if(regs[i].regmap[hr]>64) {
@@ -9947,7 +9962,7 @@ int new_recompile_block(int addr)
           // a mov, which is of negligible benefit.  So such cases are
           // skipped below.
           if(f_regmap[hr]>0) {
-            if(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0) {
+            if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
               int r=f_regmap[hr];
               for(j=t;j<=i;j++)
               {
@@ -9986,7 +10001,7 @@ int new_recompile_block(int addr)
                         break;
                       }
                       // call/ret fast path assumes no registers allocated
-                      if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) {
+                      if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
                         break;
                       }
                       if(r>63) {
@@ -10134,7 +10149,7 @@ int new_recompile_block(int addr)
         }
       }
     }else{
-      int count=0;
+      // Non branch or undetermined branch target
       for(hr=0;hr<HOST_REGS;hr++)
       {
         if(hr!=EXCLUDE_REG) {
@@ -10154,7 +10169,6 @@ int new_recompile_block(int addr)
               f_regmap[hr]=regs[i].regmap[hr];
             }
           }
-          else if(regs[i].regmap[hr]<0) count++;
         }
       }
       // Try to restore cycle count at branch targets
@@ -10222,6 +10236,186 @@ int new_recompile_block(int addr)
     }
   }
   
+  // Cache memory offset or tlb map pointer if a register is available
+  #ifndef HOST_IMM_ADDR32
+  #ifndef RAM_OFFSET
+  if(using_tlb)
+  #endif
+  {
+    int earliest_available[HOST_REGS];
+    int loop_start[HOST_REGS];
+    int score[HOST_REGS];
+    int end[HOST_REGS];
+    int reg=using_tlb?MMREG:ROREG;
+
+    // Init
+    for(hr=0;hr<HOST_REGS;hr++) {
+      score[hr]=0;earliest_available[hr]=0;
+      loop_start[hr]=MAXBLOCK;
+    }
+    for(i=0;i<slen-1;i++)
+    {
+      // Can't do anything if no registers are available
+      if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
+        for(hr=0;hr<HOST_REGS;hr++) {
+          score[hr]=0;earliest_available[hr]=i+1;
+          loop_start[hr]=MAXBLOCK;
+        }
+      }
+      if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
+        if(!ooo[i]) {
+          if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
+            for(hr=0;hr<HOST_REGS;hr++) {
+              score[hr]=0;earliest_available[hr]=i+1;
+              loop_start[hr]=MAXBLOCK;
+            }
+          }
+        }else{
+          if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
+            for(hr=0;hr<HOST_REGS;hr++) {
+              score[hr]=0;earliest_available[hr]=i+1;
+              loop_start[hr]=MAXBLOCK;
+            }
+          }
+        }
+      }
+      // Mark unavailable registers
+      for(hr=0;hr<HOST_REGS;hr++) {
+        if(regs[i].regmap[hr]>=0) {
+          score[hr]=0;earliest_available[hr]=i+1;
+          loop_start[hr]=MAXBLOCK;
+        }
+        if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
+          if(branch_regs[i].regmap[hr]>=0) {
+            score[hr]=0;earliest_available[hr]=i+2;
+            loop_start[hr]=MAXBLOCK;
+          }
+        }
+      }
+      // No register allocations after unconditional jumps
+      if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
+      {
+        for(hr=0;hr<HOST_REGS;hr++) {
+          score[hr]=0;earliest_available[hr]=i+2;
+          loop_start[hr]=MAXBLOCK;
+        }
+        i++; // Skip delay slot too
+        //printf("skip delay slot: %x\n",start+i*4);
+      }
+      else
+      // Possible match
+      if(itype[i]==LOAD||itype[i]==LOADLR||
+         itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
+        for(hr=0;hr<HOST_REGS;hr++) {
+          if(hr!=EXCLUDE_REG) {
+            end[hr]=i-1;
+            for(j=i;j<slen-1;j++) {
+              if(regs[j].regmap[hr]>=0) break;
+              if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
+                if(branch_regs[j].regmap[hr]>=0) break;
+                if(ooo[j]) {
+                  if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
+                }else{
+                  if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
+                }
+              }
+              else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
+              if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
+                int t=(ba[j]-start)>>2;
+                if(t<j&&t>=earliest_available[hr]) {
+                  if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
+                    // Score a point for hoisting loop invariant
+                    if(t<loop_start[hr]) loop_start[hr]=t;
+                    //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
+                    score[hr]++;
+                    end[hr]=j;
+                  }
+                }
+                else if(t<j) {
+                  if(regs[t].regmap[hr]==reg) {
+                    // Score a point if the branch target matches this register
+                    score[hr]++;
+                    end[hr]=j;
+                  }
+                }
+                if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
+                   itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
+                  score[hr]++;
+                  end[hr]=j;
+                }
+              }
+              if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
+              {
+                // Stop on unconditional branch
+                break;
+              }
+              else
+              if(itype[j]==LOAD||itype[j]==LOADLR||
+                 itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
+                score[hr]++;
+                end[hr]=j;
+              }
+            }
+          }
+        }
+        // Find highest score and allocate that register
+        int maxscore=0;
+        for(hr=0;hr<HOST_REGS;hr++) {
+          if(hr!=EXCLUDE_REG) {
+            if(score[hr]>score[maxscore]) {
+              maxscore=hr;
+              //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
+            }
+          }
+        }
+        if(score[maxscore]>1)
+        {
+          if(i<loop_start[maxscore]) loop_start[maxscore]=i;
+          for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
+            //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
+            assert(regs[j].regmap[maxscore]<0);
+            if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
+            regs[j].regmap[maxscore]=reg;
+            regs[j].dirty&=~(1<<maxscore);
+            regs[j].wasconst&=~(1<<maxscore);
+            regs[j].isconst&=~(1<<maxscore);
+            if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
+              branch_regs[j].regmap[maxscore]=reg;
+              branch_regs[j].wasdirty&=~(1<<maxscore);
+              branch_regs[j].dirty&=~(1<<maxscore);
+              branch_regs[j].wasconst&=~(1<<maxscore);
+              branch_regs[j].isconst&=~(1<<maxscore);
+              if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
+                regmap_pre[j+2][maxscore]=reg;
+                regs[j+2].wasdirty&=~(1<<maxscore);
+              }
+              // loop optimization (loop_preload)
+              int t=(ba[j]-start)>>2;
+              if(t==loop_start[maxscore]) {
+                if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
+                  regs[t].regmap_entry[maxscore]=reg;
+              }
+            }
+            else
+            {
+              if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
+                regmap_pre[j+1][maxscore]=reg;
+                regs[j+1].wasdirty&=~(1<<maxscore);
+              }
+            }
+          }
+          i=j-1;
+          if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
+          for(hr=0;hr<HOST_REGS;hr++) {
+            score[hr]=0;earliest_available[hr]=i+i;
+            loop_start[hr]=MAXBLOCK;
+          }
+        }
+      }
+    }
+  }
+  #endif
+  
   // This allocates registers (if possible) one instruction prior
   // to use, which can avoid a load-use penalty on certain CPUs.
   for(i=0;i<slen-1;i++)
@@ -10265,6 +10459,7 @@ int new_recompile_block(int addr)
               }
             }
           }
+          // Preload target address for load instruction (non-constant)
           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
             {
@@ -10281,6 +10476,7 @@ int new_recompile_block(int addr)
               }
             }
           }
+          // Load source into target register 
           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
             {
@@ -10297,6 +10493,7 @@ int new_recompile_block(int addr)
               }
             }
           }
+          // Preload map address
           #ifndef HOST_IMM_ADDR32
           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
             hr=get_reg(regs[i+1].regmap,TLREG);
@@ -10336,6 +10533,7 @@ int new_recompile_block(int addr)
             }
           }
           #endif
+          // Address for store instruction (non-constant)
           if(itype[i+1]==STORE||itype[i+1]==STORELR
              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
@@ -10813,8 +11011,13 @@ int new_recompile_block(int addr)
         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
               unneeded_reg[i],unneeded_reg_upper[i]);
       }
-      is32_pre=regs[i].is32;
-      dirty_pre=regs[i].dirty;
+      if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
+        is32_pre=branch_regs[i].is32;
+        dirty_pre=branch_regs[i].dirty;
+      }else{
+        is32_pre=regs[i].is32;
+        dirty_pre=regs[i].dirty;
+      }
       #endif
       // write back
       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))