hsn[RHASH]=1;
hsn[RHTBL]=1;
}
+ // due to the way JAL is currently done we need DS not to evict $ra
+ if(i>0&&itype[i-1]==UJUMP&&rt1[i-1]==31) {
+ hsn[31]=0;
+ }
// Coprocessor load/store needs FTEMP, even if not declared
if(itype[i]==C1LS||itype[i]==C2LS) {
hsn[FTEMP]=0;
int offset;
int jaddr=0;
int memtarget=0,c=0;
+ int fastload_reg_override=0;
u_int hr,reglist=0;
th=get_reg(i_regs->regmap,rt1[i]|64);
tl=get_reg(i_regs->regmap,rt1[i]);
if(sp_in_mirror&&rs1[i]==29) {
emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
+ fastload_reg_override=HOST_TEMPREG;
}
else
#endif
#else
if(!c) a=addr;
#endif
-#ifdef PCSX
- if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
-#endif
+ if(fastload_reg_override) a=fastload_reg_override;
+
emit_movsbl_indexed_tlb(x,a,map,tl);
}
}
#else
if(!c) a=addr;
#endif
-#ifdef PCSX
- if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
-#endif
+ if(fastload_reg_override) a=fastload_reg_override;
//#ifdef
//emit_movswl_indexed_tlb(x,tl,map,tl);
//else
if(!c||memtarget) {
if(!dummy) {
int a=addr;
-#ifdef PCSX
- if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
-#endif
+ if(fastload_reg_override) a=fastload_reg_override;
//emit_readword_indexed((int)rdram-0x80000000,addr,tl);
#ifdef HOST_IMM_ADDR32
if(c)
#else
if(!c) a=addr;
#endif
-#ifdef PCSX
- if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
-#endif
+ if(fastload_reg_override) a=fastload_reg_override;
+
emit_movzbl_indexed_tlb(x,a,map,tl);
}
}
#else
if(!c) a=addr;
#endif
-#ifdef PCSX
- if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
-#endif
+ if(fastload_reg_override) a=fastload_reg_override;
//#ifdef
//emit_movzwl_indexed_tlb(x,tl,map,tl);
//#else
if(!c||memtarget) {
if(!dummy) {
int a=addr;
-#ifdef PCSX
- if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
-#endif
+ if(fastload_reg_override) a=fastload_reg_override;
//emit_readword_indexed((int)rdram-0x80000000,addr,tl);
#ifdef HOST_IMM_ADDR32
if(c)
if(!c||memtarget) {
if(!dummy) {
int a=addr;
-#ifdef PCSX
- if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
-#endif
+ if(fastload_reg_override) a=fastload_reg_override;
//gen_tlb_addr_r(tl,map);
//if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
//emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
int jaddr=0,jaddr2,type;
int memtarget=0,c=0;
int agr=AGEN1+(i&1);
+ int faststore_reg_override=0;
u_int hr,reglist=0;
th=get_reg(i_regs->regmap,rs2[i]|64);
tl=get_reg(i_regs->regmap,rs2[i]);
if(sp_in_mirror&&rs1[i]==29) {
emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
+ faststore_reg_override=HOST_TEMPREG;
}
else
#endif
#else
if(!c) a=addr;
#endif
-#ifdef PCSX
- if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
-#endif
+ if(faststore_reg_override) a=faststore_reg_override;
//gen_tlb_addr_w(temp,map);
//emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
emit_writebyte_indexed_tlb(tl,x,a,map,a);
#else
if(!c) a=addr;
#endif
-#ifdef PCSX
- if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
-#endif
+ if(faststore_reg_override) a=faststore_reg_override;
//#ifdef
//emit_writehword_indexed_tlb(tl,x,temp,map,temp);
//#else
if (opcode[i]==0x2B) { // SW
if(!c||memtarget) {
int a=addr;
-#ifdef PCSX
- if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
-#endif
+ if(faststore_reg_override) a=faststore_reg_override;
//emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
emit_writeword_indexed_tlb(tl,0,a,map,temp);
}
if (opcode[i]==0x3F) { // SD
if(!c||memtarget) {
int a=addr;
-#ifdef PCSX
- if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG;
-#endif
+ if(faststore_reg_override) a=faststore_reg_override;
if(rs2[i]) {
assert(th>=0);
//emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
//if(opcode[i]==0x2B)
/*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
{
- //emit_pusha();
+ #ifdef __i386__
+ emit_pusha();
+ #endif
+ #ifdef __arm__
save_regs(0x100f);
+ #endif
emit_readword((int)&last_count,ECX);
#ifdef __i386__
if(get_reg(i_regs->regmap,CCREG)<0)
emit_writeword(0,(int)&Count);
#endif
emit_call((int)memdebug);
- //emit_popa();
+ #ifdef __i386__
+ emit_popa();
+ #endif
+ #ifdef __arm__
restore_regs(0x100f);
+ #endif
}/**/
}
will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
}
+ if(branch_regs[i].regmap[r]>=0) {
+ will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
+ wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
+ }
}
}
//}
//if(ba[i]>start+i*4) { // Disable recursion (for debugging)
for(r=0;r<HOST_REGS;r++) {
if(r!=EXCLUDE_REG) {
- if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
+ signed char target_reg=branch_regs[i].regmap[r];
+ if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
}
- else
- {
- will_dirty_i&=~(1<<r);
+ else if(target_reg>=0) {
+ will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
+ wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
}
// Treat delay slot as part of branch too
/*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
}
}
}
- // Merge in delay slot
+ // Merge in delay slot (won't dirty)
for(r=0;r<HOST_REGS;r++) {
if(r!=EXCLUDE_REG) {
if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
+#ifndef FORCE32
case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
+#endif
case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
// Does the block continue due to a branch?
for(j=i-1;j>=0;j--)
{
+ if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
if(ba[j]==start+i*4+4) done=j=0;
if(ba[j]==start+i*4+8) done=j=0;
}
}
if(temp_is32!=current.is32) {
//printf("dumping 32-bit regs (%x)\n",start+i*4);
- #ifdef DESTRUCTIVE_WRITEBACK
+ #ifndef DESTRUCTIVE_WRITEBACK
+ if(ds)
+ #endif
for(hr=0;hr<HOST_REGS;hr++)
{
int r=current.regmap[hr];
}
}
}
- #endif
current.is32=temp_is32;
}
}
clear_const(¤t,rt1[i]);
alloc_cc(¤t,i);
dirty_reg(¤t,CCREG);
- ooo[i]=1;
- delayslot_alloc(¤t,i+1);
if (rt1[i]==31) {
alloc_reg(¤t,i,31);
dirty_reg(¤t,31);
#endif
//current.is32|=1LL<<rt1[i];
}
+ ooo[i]=1;
+ delayslot_alloc(¤t,i+1);
//current.isconst=0; // DEBUG
ds=1;
//printf("i=%d, isconst=%x\n",i,current.isconst);
// If a register is allocated during a loop, try to allocate it for the
// entire loop, if possible. This avoids loading/storing registers
// inside of the loop.
-
+
signed char f_regmap[HOST_REGS];
clear_all_regs(f_regmap);
for(i=0;i<slen-1;i++)
{
int t=(ba[i]-start)>>2;
if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
- if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated
+ if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
for(hr=0;hr<HOST_REGS;hr++)
{
if(regs[i].regmap[hr]>64) {
// a mov, which is of negligible benefit. So such cases are
// skipped below.
if(f_regmap[hr]>0) {
- if(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0) {
+ if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
int r=f_regmap[hr];
for(j=t;j<=i;j++)
{
break;
}
// call/ret fast path assumes no registers allocated
- if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) {
+ if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
break;
}
if(r>63) {
}
}
}else{
- int count=0;
+ // Non branch or undetermined branch target
for(hr=0;hr<HOST_REGS;hr++)
{
if(hr!=EXCLUDE_REG) {
f_regmap[hr]=regs[i].regmap[hr];
}
}
- else if(regs[i].regmap[hr]<0) count++;
}
}
// Try to restore cycle count at branch targets
}
}
+ // Cache memory offset or tlb map pointer if a register is available
+ #ifndef HOST_IMM_ADDR32
+ #ifndef RAM_OFFSET
+ if(using_tlb)
+ #endif
+ {
+ int earliest_available[HOST_REGS];
+ int loop_start[HOST_REGS];
+ int score[HOST_REGS];
+ int end[HOST_REGS];
+ int reg=using_tlb?MMREG:ROREG;
+
+ // Init
+ for(hr=0;hr<HOST_REGS;hr++) {
+ score[hr]=0;earliest_available[hr]=0;
+ loop_start[hr]=MAXBLOCK;
+ }
+ for(i=0;i<slen-1;i++)
+ {
+ // Can't do anything if no registers are available
+ if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
+ for(hr=0;hr<HOST_REGS;hr++) {
+ score[hr]=0;earliest_available[hr]=i+1;
+ loop_start[hr]=MAXBLOCK;
+ }
+ }
+ if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
+ if(!ooo[i]) {
+ if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
+ for(hr=0;hr<HOST_REGS;hr++) {
+ score[hr]=0;earliest_available[hr]=i+1;
+ loop_start[hr]=MAXBLOCK;
+ }
+ }
+ }else{
+ if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
+ for(hr=0;hr<HOST_REGS;hr++) {
+ score[hr]=0;earliest_available[hr]=i+1;
+ loop_start[hr]=MAXBLOCK;
+ }
+ }
+ }
+ }
+ // Mark unavailable registers
+ for(hr=0;hr<HOST_REGS;hr++) {
+ if(regs[i].regmap[hr]>=0) {
+ score[hr]=0;earliest_available[hr]=i+1;
+ loop_start[hr]=MAXBLOCK;
+ }
+ if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
+ if(branch_regs[i].regmap[hr]>=0) {
+ score[hr]=0;earliest_available[hr]=i+2;
+ loop_start[hr]=MAXBLOCK;
+ }
+ }
+ }
+ // No register allocations after unconditional jumps
+ if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
+ {
+ for(hr=0;hr<HOST_REGS;hr++) {
+ score[hr]=0;earliest_available[hr]=i+2;
+ loop_start[hr]=MAXBLOCK;
+ }
+ i++; // Skip delay slot too
+ //printf("skip delay slot: %x\n",start+i*4);
+ }
+ else
+ // Possible match
+ if(itype[i]==LOAD||itype[i]==LOADLR||
+ itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
+ for(hr=0;hr<HOST_REGS;hr++) {
+ if(hr!=EXCLUDE_REG) {
+ end[hr]=i-1;
+ for(j=i;j<slen-1;j++) {
+ if(regs[j].regmap[hr]>=0) break;
+ if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
+ if(branch_regs[j].regmap[hr]>=0) break;
+ if(ooo[j]) {
+ if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
+ }else{
+ if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
+ }
+ }
+ else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
+ if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
+ int t=(ba[j]-start)>>2;
+ if(t<j&&t>=earliest_available[hr]) {
+ if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
+ // Score a point for hoisting loop invariant
+ if(t<loop_start[hr]) loop_start[hr]=t;
+ //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
+ score[hr]++;
+ end[hr]=j;
+ }
+ }
+ else if(t<j) {
+ if(regs[t].regmap[hr]==reg) {
+ // Score a point if the branch target matches this register
+ score[hr]++;
+ end[hr]=j;
+ }
+ }
+ if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
+ itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
+ score[hr]++;
+ end[hr]=j;
+ }
+ }
+ if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
+ {
+ // Stop on unconditional branch
+ break;
+ }
+ else
+ if(itype[j]==LOAD||itype[j]==LOADLR||
+ itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
+ score[hr]++;
+ end[hr]=j;
+ }
+ }
+ }
+ }
+ // Find highest score and allocate that register
+ int maxscore=0;
+ for(hr=0;hr<HOST_REGS;hr++) {
+ if(hr!=EXCLUDE_REG) {
+ if(score[hr]>score[maxscore]) {
+ maxscore=hr;
+ //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
+ }
+ }
+ }
+ if(score[maxscore]>1)
+ {
+ if(i<loop_start[maxscore]) loop_start[maxscore]=i;
+ for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
+ //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
+ assert(regs[j].regmap[maxscore]<0);
+ if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
+ regs[j].regmap[maxscore]=reg;
+ regs[j].dirty&=~(1<<maxscore);
+ regs[j].wasconst&=~(1<<maxscore);
+ regs[j].isconst&=~(1<<maxscore);
+ if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
+ branch_regs[j].regmap[maxscore]=reg;
+ branch_regs[j].wasdirty&=~(1<<maxscore);
+ branch_regs[j].dirty&=~(1<<maxscore);
+ branch_regs[j].wasconst&=~(1<<maxscore);
+ branch_regs[j].isconst&=~(1<<maxscore);
+ if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
+ regmap_pre[j+2][maxscore]=reg;
+ regs[j+2].wasdirty&=~(1<<maxscore);
+ }
+ // loop optimization (loop_preload)
+ int t=(ba[j]-start)>>2;
+ if(t==loop_start[maxscore]) {
+ if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
+ regs[t].regmap_entry[maxscore]=reg;
+ }
+ }
+ else
+ {
+ if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
+ regmap_pre[j+1][maxscore]=reg;
+ regs[j+1].wasdirty&=~(1<<maxscore);
+ }
+ }
+ }
+ i=j-1;
+ if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
+ for(hr=0;hr<HOST_REGS;hr++) {
+ score[hr]=0;earliest_available[hr]=i+i;
+ loop_start[hr]=MAXBLOCK;
+ }
+ }
+ }
+ }
+ }
+ #endif
+
// This allocates registers (if possible) one instruction prior
// to use, which can avoid a load-use penalty on certain CPUs.
for(i=0;i<slen-1;i++)
}
}
}
+ // Preload target address for load instruction (non-constant)
if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
{
}
}
}
+ // Load source into target register
if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
{
}
}
}
+ // Preload map address
#ifndef HOST_IMM_ADDR32
if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
hr=get_reg(regs[i+1].regmap,TLREG);
}
}
#endif
+ // Address for store instruction (non-constant)
if(itype[i+1]==STORE||itype[i+1]==STORELR
||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
#ifdef PCSX
if (start == 0x80030000) {
// nasty hack for fastbios thing
+ // override block entry to this code
instr_addr0_override=(u_int)out;
emit_movimm(start,0);
- emit_readword((int)&pcaddr,1);
+ // abuse io address var as a flag that we
+ // have already returned here once
+ emit_readword((int)&address,1);
emit_writeword(0,(int)&pcaddr);
+ emit_writeword(0,(int)&address);
emit_cmp(0,1);
emit_jne((int)new_dyna_leave);
}