X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?p=pcsx_rearmed.git;a=blobdiff_plain;f=libpcsxcore%2Fnew_dynarec%2Fnew_dynarec.c;h=d8371380853bee9a6a5b11b6f0fe436a09c427a4;hp=66603ea8857f33dfafeb15242cfd39b12d4aa7b0;hb=28d74ee8c4a1eade72032aff056ab854d664ce48;hpb=79c75f1b7efbec9dc8bbd6e3546ec1c07040b702 diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index 66603ea8..d8371380 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -1593,13 +1593,9 @@ void load_alloc(struct regstat *current,int i) //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt? if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]); - if(rt1[i]) { + if(rt1[i]&&!((current->u>>rt1[i])&1)) { alloc_reg(current,i,rt1[i]); - if(get_reg(current->regmap,rt1[i])<0) { - // dummy load, but we still need a register to calculate the address - alloc_reg_temp(current,i,-1); - minimum_free_regs[i]=1; - } + assert(get_reg(current->regmap,rt1[i])>=0); if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD { current->is32&=~(1LL<regmap,TLREG); assert(map>=0); + reglist&=~(1<regmap,TLREG); assert(map>=0); + reglist&=~(1<regmap,CCREG)<0) @@ -3294,8 +3298,12 @@ void store_assemble(int i,struct regstat *i_regs) emit_writeword(0,(int)&Count); #endif emit_call((int)memdebug); - //emit_popa(); + #ifdef __i386__ + emit_popa(); + #endif + #ifdef __arm__ restore_regs(0x100f); + #endif }/**/ } @@ -3354,6 +3362,7 @@ void storelr_assemble(int i,struct regstat *i_regs) }else{ // using tlb int map=get_reg(i_regs->regmap,TLREG); assert(map>=0); + reglist&=~(1<=0) emit_mov(s,temp); do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr); @@ -3604,6 +3613,7 @@ void c1ls_assemble(int i,struct regstat *i_regs) { map=get_reg(i_regs->regmap,TLREG); assert(map>=0); + reglist&=~(1<0 && i_regmap[hr]!=CCREG) + if(i_regmap[hr]>0 && (i_regmap[hr]&63)0 && i_regmap[hr]!=CCREG) + if(i_regmap[hr]>0 && (i_regmap[hr]&63)=0&®s[t].regmap_entry[hr]<64) { + if(regs[t].regmap_entry[hr]>=0&®s[t].regmap_entry[hr]=64) { + if(regs[t].regmap_entry[hr]>=64&®s[t].regmap_entry[hr]>(regs[t].regmap_entry[hr]&63))&1) { int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64); @@ -4612,7 +4622,7 @@ void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int ad } // Load 32-bit regs for(hr=0;hr=0&®s[t].regmap_entry[hr]<64) { + if(hr!=EXCLUDE_REG&®s[t].regmap_entry[hr]>=0&®s[t].regmap_entry[hr]>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) { #else @@ -4630,7 +4640,7 @@ void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int ad } //Load 64-bit regs for(hr=0;hr=64) { + if(hr!=EXCLUDE_REG&®s[t].regmap_entry[hr]>=64&®s[t].regmap_entry[hr]>(regs[t].regmap_entry[hr]&63))&1) { @@ -4671,19 +4681,19 @@ int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr) { if(i_regmap[hr]!=regs[t].regmap_entry[hr]) { - if(regs[t].regmap_entry[hr]!=-1) + if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)>hr)&1) { - if(i_regmap[hr]<64) + if(i_regmap[hr]>i_regmap[hr])&1)) return 0; } - else + else if(i_regmap[hr]>=64&&i_regmap[hr]>(i_regmap[hr]&63))&1)) return 0; @@ -4857,7 +4867,7 @@ void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert) } else { - emit_cmpimm(HOST_CCREG,-2*(count+2)); + emit_cmpimm(HOST_CCREG,-CLOCK_DIVIDER*(count+2)); jaddr=(int)out; emit_jns(0); } @@ -7451,6 +7461,10 @@ void clean_registers(int istart,int iend,int wr) will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<>2]&(1<=0) { + will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<>2]>>(branch_regs[i].regmap[r]&63))&1)<start+i*4) { // Disable recursion (for debugging) for(r=0;r>2].regmap_entry[r]) { + signed char target_reg=branch_regs[i].regmap[r]; + if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) { will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<>2]&(1<=0) { + will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<>2]>>(target_reg&63))&1)<>2].regmap_entry[r]) { @@ -7523,7 +7538,7 @@ void clean_registers(int istart,int iend,int wr) } } } - // Merge in delay slot + // Merge in delay slot (won't dirty) for(r=0;r=0) { + else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) { // Register moved to a different register will_dirty_i&=~(1<=0;j--) { + if(ba[j]==start+i*4) done=j=0; // Branch into delay slot if(ba[j]==start+i*4+4) done=j=0; if(ba[j]==start+i*4+8) done=j=0; } @@ -8680,7 +8696,9 @@ int new_recompile_block(int addr) } if(temp_is32!=current.is32) { //printf("dumping 32-bit regs (%x)\n",start+i*4); - #ifdef DESTRUCTIVE_WRITEBACK + #ifndef DESTRUCTIVE_WRITEBACK + if(ds) + #endif for(hr=0;hr>2; if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots - if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated + if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated for(hr=0;hr64) { @@ -9945,7 +9962,7 @@ int new_recompile_block(int addr) // a mov, which is of negligible benefit. So such cases are // skipped below. if(f_regmap[hr]>0) { - if(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0) { + if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) { int r=f_regmap[hr]; for(j=t;j<=i;j++) { @@ -9984,7 +10001,7 @@ int new_recompile_block(int addr) break; } // call/ret fast path assumes no registers allocated - if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) { + if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) { break; } if(r>63) { @@ -10132,7 +10149,7 @@ int new_recompile_block(int addr) } } }else{ - int count=0; + // Non branch or undetermined branch target for(hr=0;hr=0) { + score[hr]=0;earliest_available[hr]=i+1; + loop_start[hr]=MAXBLOCK; + } + if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) { + if(branch_regs[i].regmap[hr]>=0) { + score[hr]=0;earliest_available[hr]=i+2; + loop_start[hr]=MAXBLOCK; + } + } + } + // No register allocations after unconditional jumps + if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000) + { + for(hr=0;hr=0) break; + if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) { + if(branch_regs[j].regmap[hr]>=0) break; + if(ooo[j]) { + if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break; + }else{ + if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break; + } + } + else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break; + if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) { + int t=(ba[j]-start)>>2; + if(t=earliest_available[hr]) { + if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated + // Score a point for hoisting loop invariant + if(t>16)==0x1000) + { + // Stop on unconditional branch + break; + } + else + if(itype[j]==LOAD||itype[j]==LOADLR|| + itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) { + score[hr]++; + end[hr]=j; + } + } + } + } + // Find highest score and allocate that register + int maxscore=0; + for(hr=0;hrscore[maxscore]) { + maxscore=hr; + //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4); + } + } + } + if(score[maxscore]>1) + { + if(i=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);} + assert(regs[j].regmap[maxscore]<0); + if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg; + regs[j].regmap[maxscore]=reg; + regs[j].dirty&=~(1<>16)!=0x1000) { + regmap_pre[j+2][maxscore]=reg; + regs[j+2].wasdirty&=~(1<>2; + if(t==loop_start[maxscore]) { + if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated + regs[t].regmap_entry[maxscore]=reg; + } + } + else + { + if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) { + regmap_pre[j+1][maxscore]=reg; + regs[j+1].wasdirty&=~(1<=0) { @@ -10279,6 +10476,7 @@ int new_recompile_block(int addr) } } } + // Load source into target register if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) { if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0) { @@ -10295,6 +10493,7 @@ int new_recompile_block(int addr) } } } + // Preload map address #ifndef HOST_IMM_ADDR32 if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) { hr=get_reg(regs[i+1].regmap,TLREG); @@ -10334,6 +10533,7 @@ int new_recompile_block(int addr) } } #endif + // Address for store instruction (non-constant) if(itype[i+1]==STORE||itype[i+1]==STORELR ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2 if(get_reg(regs[i+1].regmap,rs1[i+1])<0) { @@ -10811,8 +11011,13 @@ int new_recompile_block(int addr) wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre, unneeded_reg[i],unneeded_reg_upper[i]); } - is32_pre=regs[i].is32; - dirty_pre=regs[i].dirty; + if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) { + is32_pre=branch_regs[i].is32; + dirty_pre=branch_regs[i].dirty; + }else{ + is32_pre=regs[i].is32; + dirty_pre=regs[i].dirty; + } #endif // write back if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))