static u_int smrv_weak; // same, but somewhat less likely
static u_int smrv_strong_next; // same, but after current insn executes
static u_int smrv_weak_next;
- static uint64_t unneeded_reg[MAXBLOCK];
- static uint64_t branch_unneeded_reg[MAXBLOCK];
// see 'struct regstat' for a description
static signed char regmap_pre[MAXBLOCK][HOST_REGS];
// contains 'real' consts at [i] insn, but may differ from what's actually
static uint32_t constmap[MAXBLOCK][HOST_REGS];
static struct regstat regs[MAXBLOCK];
static struct regstat branch_regs[MAXBLOCK];
- static struct code_stub stubs[MAXBLOCK*3];
+ static struct code_stub stubs[MAXBLOCK];
static int stubcount;
static u_int literals[1024][2];
static int literalcount;
int linkcount;
void *instr_addr[MAXBLOCK];
struct link_entry link_addr[MAXBLOCK];
+ uint64_t unneeded_reg[MAXBLOCK];
+ uint64_t branch_unneeded_reg[MAXBLOCK];
};
#define HACK_ENABLED(x) ((ndrc_g.hacks | ndrc_g.hacks_pergame) & (x))
{
if(dops[i+j].rs1==r) rn=j;
if(dops[i+j].rs2==r) rn=j;
- if((unneeded_reg[i+j]>>r)&1) rn=10;
+ if((st->unneeded_reg[i+j]>>r)&1) rn=10;
if(i+j>=0&&(dops[i+j].itype==UJUMP||dops[i+j].itype==CJUMP||dops[i+j].itype==SJUMP))
{
b=j;
for(;k<j;k++)
{
assert(r < 64);
- if((unneeded_reg[i+k]>>r)&1) return hr;
+ if((st->unneeded_reg[i+k]>>r)&1) return hr;
if(i+k>=0&&(dops[i+k].itype==UJUMP||dops[i+k].itype==CJUMP||dops[i+k].itype==SJUMP))
{
if(cinfo[i+k].ba >= st->start && cinfo[i+k].ba < (st->start+i*4))
if(r>=0) {
assert(r < 64);
if((cur->u>>r)&1) {
- if(i==0||((unneeded_reg[i-1]>>r)&1)) {
+ if(i==0||((st->unneeded_reg[i-1]>>r)&1)) {
alloc_set(cur, reg, hr);
return;
}
//printf("c=%lx\n",(long)constmap[i][hr]);
if(i==st->slen-1) return 1;
assert(reg < 64);
- return !((unneeded_reg[i+1]>>reg)&1);
+ return !((st->unneeded_reg[i+1]>>reg)&1);
}
// Load registers with known constants
if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1)) {
if((i_dirty>>hr)&1) {
assert(i_regmap[hr]<64);
- if(!((unneeded_reg[t]>>i_regmap[hr])&1))
+ if(!((st->unneeded_reg[t]>>i_regmap[hr])&1))
emit_storereg(i_regmap[hr],hr);
}
}
{
if(i_regmap[hr]<TEMPREG)
{
- if(!((unneeded_reg[t]>>i_regmap[hr])&1))
+ if(!((st->unneeded_reg[t]>>i_regmap[hr])&1))
return 0;
}
else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
{
if((i_dirty>>hr)&1)
{
- if(!((unneeded_reg[t]>>i_regmap[hr])&1))
+ if(!((st->unneeded_reg[t]>>i_regmap[hr])&1))
{
//printf("%x: dirty no match\n",addr);
return 0;
return;
#endif
printf("D: %x WD: %x U: %"PRIx64" hC: %x hWC: %x hLC: %x\n",
- regs[i].dirty, regs[i].wasdirty, unneeded_reg[i],
+ regs[i].dirty, regs[i].wasdirty, st->unneeded_reg[i],
regs[i].isconst, regs[i].wasconst, regs[i].loadedconst);
print_regmap("pre: ", regmap_pre[i]);
print_regmap("entry: ", regs[i].regmap_entry);
u=1;
gte_u=gte_u_unknown;
}else{
- //u=unneeded_reg[iend+1];
+ //u=st->unneeded_reg[iend+1];
u=1;
gte_u=gte_unneeded[iend+1];
}
// Branch out of this block, flush all regs
u=1;
gte_u=gte_u_unknown;
- branch_unneeded_reg[i]=u;
+ st->branch_unneeded_reg[i]=u;
// Merge in delay slot
u|=(1LL<<dops[i+1].rt1)|(1LL<<dops[i+1].rt2);
u&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
temp_gte_u=0;
} else {
// Conditional branch (not taken case)
- temp_u=unneeded_reg[i+2];
+ temp_u=st->unneeded_reg[i+2];
temp_gte_u&=gte_unneeded[i+2];
}
// Merge in delay slot
temp_u|=1;
temp_gte_u|=gte_rt[i];
temp_gte_u&=~gte_rs[i];
- unneeded_reg[i]=temp_u;
+ st->unneeded_reg[i]=temp_u;
gte_unneeded[i]=temp_gte_u;
// Only go three levels deep. This recursion can take an
// excessive amount of time if there are a lot of nested loops.
if(r<2) {
pass2b_unneeded_regs(st, (cinfo[i].ba - st->start)>>2, i-1, r+1);
}else{
- unneeded_reg[(cinfo[i].ba - st->start)>>2]=1;
+ st->unneeded_reg[(cinfo[i].ba - st->start)>>2]=1;
gte_unneeded[(cinfo[i].ba - st->start)>>2]=gte_u_unknown;
}
} /*else*/ if(1) {
if (dops[i].is_ujump)
{
// Unconditional branch
- u=unneeded_reg[(cinfo[i].ba - st->start)>>2];
+ u=st->unneeded_reg[(cinfo[i].ba - st->start)>>2];
gte_u=gte_unneeded[(cinfo[i].ba - st->start)>>2];
- branch_unneeded_reg[i]=u;
+ st->branch_unneeded_reg[i]=u;
// Merge in delay slot
u|=(1LL<<dops[i+1].rt1)|(1LL<<dops[i+1].rt2);
u&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
gte_u&=~gte_rs[i+1];
} else {
// Conditional branch
- b=unneeded_reg[(cinfo[i].ba - st->start)>>2];
+ b=st->unneeded_reg[(cinfo[i].ba - st->start)>>2];
gte_b=gte_unneeded[(cinfo[i].ba - st->start)>>2];
- branch_unneeded_reg[i]=b;
+ st->branch_unneeded_reg[i]=b;
// Branch delay slot
b|=(1LL<<dops[i+1].rt1)|(1LL<<dops[i+1].rt2);
b&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
u&=b;
gte_u&=gte_b;
if(i<st->slen-1) {
- branch_unneeded_reg[i]&=unneeded_reg[i+2];
+ st->branch_unneeded_reg[i]&=st->unneeded_reg[i+2];
} else {
- branch_unneeded_reg[i]=1;
+ st->branch_unneeded_reg[i]=1;
}
}
}
u&=~(1LL<<dops[i].rs1);
u&=~(1LL<<dops[i].rs2);
gte_u&=~gte_rs[i];
- if(gte_rs[i]&&dops[i].rt1&&(unneeded_reg[i+1]&(1ll<<dops[i].rt1)))
+ if(gte_rs[i]&&dops[i].rt1&&(st->unneeded_reg[i+1]&(1ll<<dops[i].rt1)))
gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
if (dops[i].may_except || dops[i].itype == RFE)
{
// R0 is always unneeded
u|=1;
// Save it
- unneeded_reg[i]=u;
+ st->unneeded_reg[i]=u;
gte_unneeded[i]=gte_u;
/*
printf("ur (%d,%d) %x: ",istart,iend,st->start+i*4);
printf("U:");
int r;
for(r=1;r<=CCREG;r++) {
- if((unneeded_reg[i]>>r)&1) {
+ if((st->unneeded_reg[i]>>r)&1) {
if(r==HIREG) printf(" HI");
else if(r==LOREG) printf(" LO");
else printf(" r%d",r);
clear_all_regs(current.regmap_entry);
clear_all_regs(current.regmap);
current.wasdirty = current.dirty = 0;
- current.u = unneeded_reg[0];
+ current.u = st->unneeded_reg[0];
alloc_reg(st,&current, 0, CCREG);
dirty_reg(&current, CCREG);
current.wasconst = 0;
cc=-1;
dops[1].bt=1;
ds=1;
- unneeded_reg[0]=1;
+ st->unneeded_reg[0]=1;
}
for(i=0;i<st->slen;i++)
regs[i].loadedconst=0;
if (!dops[i].is_jump) {
if(i+1<st->slen) {
- current.u=unneeded_reg[i+1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
+ current.u=st->unneeded_reg[i+1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
current.u|=1;
} else {
current.u=1;
}
} else {
if(i+1<st->slen) {
- current.u=branch_unneeded_reg[i]&~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
+ current.u=st->branch_unneeded_reg[i]&~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
current.u&=~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
current.u|=1;
} else {
ds=0; // Skip delay slot, already allocated as part of branch
// ...but we need to alloc it in case something jumps here
if(i+1<st->slen) {
- current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
+ current.u=st->branch_unneeded_reg[i-1]&st->unneeded_reg[i+1];
}else{
- current.u=branch_unneeded_reg[i-1];
+ current.u=st->branch_unneeded_reg[i-1];
}
current.u&=~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
current.u|=1;
memcpy(&branch_regs[i-1],&current,sizeof(current));
branch_regs[i-1].isconst=0;
branch_regs[i-1].wasconst=0;
- branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<dops[i-1].rs1)|(1LL<<dops[i-1].rs2));
+ branch_regs[i-1].u=st->branch_unneeded_reg[i-1]&~((1LL<<dops[i-1].rs1)|(1LL<<dops[i-1].rs2));
alloc_cc(&branch_regs[i-1],i-1);
dirty_reg(&branch_regs[i-1],CCREG);
if(dops[i-1].rt1==31) { // JAL
memcpy(&branch_regs[i-1],&current,sizeof(current));
branch_regs[i-1].isconst=0;
branch_regs[i-1].wasconst=0;
- branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<dops[i-1].rs1)|(1LL<<dops[i-1].rs2));
+ branch_regs[i-1].u=st->branch_unneeded_reg[i-1]&~((1LL<<dops[i-1].rs1)|(1LL<<dops[i-1].rs2));
alloc_cc(&branch_regs[i-1],i-1);
dirty_reg(&branch_regs[i-1],CCREG);
alloc_reg(st,&branch_regs[i-1],i-1,dops[i-1].rs1);
(dops[i-1].rs2&&(dops[i-1].rs2==dops[i].rt1||dops[i-1].rs2==dops[i].rt2))) {
// The delay slot overwrote one of our conditions
// Delay slot goes after the test (in order)
- current.u=branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
+ current.u=st->branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
current.u|=1;
delayslot_alloc(st,&current,i);
current.isconst=0;
}
else
{
- current.u=branch_unneeded_reg[i-1]&~((1LL<<dops[i-1].rs1)|(1LL<<dops[i-1].rs2));
+ current.u=st->branch_unneeded_reg[i-1]&~((1LL<<dops[i-1].rs1)|(1LL<<dops[i-1].rs2));
// Alloc the branch condition registers
if(dops[i-1].rs1) alloc_reg(st,&current,i-1,dops[i-1].rs1);
if(dops[i-1].rs2) alloc_reg(st,&current,i-1,dops[i-1].rs2);
if(dops[i-1].rs1==dops[i].rt1||dops[i-1].rs1==dops[i].rt2) {
// The delay slot overwrote the branch condition
// Delay slot goes after the test (in order)
- current.u=branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
+ current.u=st->branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
current.u|=1;
delayslot_alloc(st,&current,i);
current.isconst=0;
}
else
{
- current.u=branch_unneeded_reg[i-1]&~(1LL<<dops[i-1].rs1);
+ current.u=st->branch_unneeded_reg[i-1]&~(1LL<<dops[i-1].rs1);
// Alloc the branch condition register
alloc_reg(st,&current,i-1,dops[i-1].rs1);
}
if(dops[i-1].rs1==dops[i].rt1||dops[i-1].rs1==dops[i].rt2) {
// The delay slot overwrote the branch condition
// Delay slot goes after the test (in order)
- current.u=branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
+ current.u=st->branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
current.u|=1;
delayslot_alloc(st,&current,i);
current.isconst=0;
}
else
{
- current.u=branch_unneeded_reg[i-1]&~(1LL<<dops[i-1].rs1);
+ current.u=st->branch_unneeded_reg[i-1]&~(1LL<<dops[i-1].rs1);
// Alloc the branch condition register
alloc_reg(st,&current,i-1,dops[i-1].rs1);
}
// But do so if this is a branch target, otherwise we
// might have to load the register before the branch.
if((regs[i].wasdirty>>hr)&1) {
- if((regmap_pre[i][hr]>0&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1))) {
+ if((regmap_pre[i][hr]>0&&!((st->unneeded_reg[i]>>regmap_pre[i][hr])&1))) {
if(dops[i-1].rt1==regmap_pre[i][hr]) nr|=1<<hr;
if(dops[i-1].rt2==regmap_pre[i][hr]) nr|=1<<hr;
}
- if((regs[i].regmap_entry[hr]>0&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) {
+ if((regs[i].regmap_entry[hr]>0&&!((st->unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) {
if(dops[i-1].rt1==regs[i].regmap_entry[hr]) nr|=1<<hr;
if(dops[i-1].rt2==regs[i].regmap_entry[hr]) nr|=1<<hr;
}
for(j=t;j<=i;j++)
{
//printf("Test %x -> %x, %x %d/%d\n",start+i*4,cinfo[i].ba,start+j*4,hr,r);
- if(r<34&&((unneeded_reg[j]>>r)&1)) break;
+ if(r<34&&((st->unneeded_reg[j]>>r)&1)) break;
assert(r < 64);
if(regs[j].regmap[hr]==f_regmap[hr]&&f_regmap[hr]<TEMPREG) {
//printf("Hit %x -> %x, %x %d/%d\n",start+i*4,cinfo[i].ba,start+j*4,hr,r);
// Write back dirty registers as soon as we will no longer modify them,
// so that we don't end up with lots of writes at the branches.
-static noinline void pass6_clean_registers(struct compile_state *st,
+static noinline void pass6_clean_registers_r(struct compile_state *st,
+ u_int wont_dirty[MAXBLOCK], u_int will_dirty[MAXBLOCK],
int istart, int iend, int wr)
{
- static u_int wont_dirty[MAXBLOCK];
- static u_int will_dirty[MAXBLOCK];
u_int start = st->start;
int i;
int r;
temp_will_dirty&=~(1<<r);
temp_wont_dirty&=~(1<<r);
if(regmap_pre[i][r]>0 && regmap_pre[i][r]<34) {
- temp_will_dirty|=((unneeded_reg[i]>>regmap_pre[i][r])&1)<<r;
- temp_wont_dirty|=((unneeded_reg[i]>>regmap_pre[i][r])&1)<<r;
+ temp_will_dirty|=((st->unneeded_reg[i]>>regmap_pre[i][r])&1)<<r;
+ temp_wont_dirty|=((st->unneeded_reg[i]>>regmap_pre[i][r])&1)<<r;
} else {
temp_will_dirty|=1<<r;
temp_wont_dirty|=1<<r;
if(wr) {
will_dirty[i]=temp_will_dirty;
wont_dirty[i]=temp_wont_dirty;
- pass6_clean_registers(st, (cinfo[i].ba-start)>>2,i-1,0);
+ pass6_clean_registers_r(st, wont_dirty, will_dirty,
+ (cinfo[i].ba - start) >> 2, i-1, 0);
}else{
// Limit recursion. It can take an excessive amount
// of time if there are a lot of nested loops.
wont_dirty_i|=wont_dirty[(cinfo[i].ba-start)>>2]&(1<<r);
}
if(branch_regs[i].regmap[r]>=0) {
- will_dirty_i|=((unneeded_reg[(cinfo[i].ba-start)>>2]>>branch_regs[i].regmap[r])&1)<<r;
- wont_dirty_i|=((unneeded_reg[(cinfo[i].ba-start)>>2]>>branch_regs[i].regmap[r])&1)<<r;
+ will_dirty_i|=((st->unneeded_reg[(cinfo[i].ba-start)>>2]>>branch_regs[i].regmap[r])&1)<<r;
+ wont_dirty_i|=((st->unneeded_reg[(cinfo[i].ba-start)>>2]>>branch_regs[i].regmap[r])&1)<<r;
}
}
}
wont_dirty_i|=wont_dirty[(cinfo[i].ba-start)>>2]&(1<<r);
}
else if(target_reg>=0) {
- will_dirty_i&=((unneeded_reg[(cinfo[i].ba-start)>>2]>>target_reg)&1)<<r;
- wont_dirty_i|=((unneeded_reg[(cinfo[i].ba-start)>>2]>>target_reg)&1)<<r;
+ will_dirty_i&=((st->unneeded_reg[(cinfo[i].ba-start)>>2]>>target_reg)&1)<<r;
+ wont_dirty_i|=((st->unneeded_reg[(cinfo[i].ba-start)>>2]>>target_reg)&1)<<r;
}
}
}
will_dirty_i&=~(1<<r);
wont_dirty_i&=~(1<<r);
if(regmap_pre[i][r]>0 && regmap_pre[i][r]<34) {
- will_dirty_i|=((unneeded_reg[i]>>regmap_pre[i][r])&1)<<r;
- wont_dirty_i|=((unneeded_reg[i]>>regmap_pre[i][r])&1)<<r;
+ will_dirty_i|=((st->unneeded_reg[i]>>regmap_pre[i][r])&1)<<r;
+ wont_dirty_i|=((st->unneeded_reg[i]>>regmap_pre[i][r])&1)<<r;
} else {
wont_dirty_i|=1<<r;
/*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
}
}
+/*
+ * Pass 6 entry point: owns the per-instruction will_dirty/wont_dirty
+ * scratch arrays and starts the recursive worker over [istart, iend]
+ * with wr=1 (the worker re-enters itself with wr=0 on some branches).
+ *
+ * The arrays are zero-initialised on purpose: the worker reads the
+ * entries at branch-target indices, and those slots are not guaranteed
+ * to have been written yet by the current pass.  When these arrays were
+ * 'static' they at least started out as zero; as plain automatic
+ * storage they would hold indeterminate values and make the generated
+ * code non-deterministic.
+ */
+static void pass6_clean_registers(struct compile_state *st,
+  int istart, int iend)
+{
+  u_int wont_dirty[MAXBLOCK] = {0};
+  u_int will_dirty[MAXBLOCK] = {0};
+  pass6_clean_registers_r(st, wont_dirty, will_dirty, istart, iend, 1);
+}
+
static u_int *get_jump_outs(struct block_info *block)
{
return (u_int *)((u_char *)block + sizeof(*block) +
pass5b_preallocate2(&st);
/* Pass 6 - Optimize clean/dirty state */
- pass6_clean_registers(&st, 0, st.slen - 1, 1);
+ pass6_clean_registers(&st, 0, st.slen - 1);
/* Pass 7 */
for (i = st.slen - 1; i >= 0; i--)
#ifndef DESTRUCTIVE_WRITEBACK
if (i < 2 || !dops[i-2].is_ujump)
{
- wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,unneeded_reg[i]);
+ wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,st.unneeded_reg[i]);
}
if((dops[i].itype==CJUMP||dops[i].itype==SJUMP)) {
dirty_pre=branch_regs[i].dirty;
// write back
if (i < 2 || !dops[i-2].is_ujump)
{
- wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,unneeded_reg[i]);
+ wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,st.unneeded_reg[i]);
loop_preload(regmap_pre[i],regs[i].regmap_entry);
}
// branch target entry point