X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?p=pcsx_rearmed.git;a=blobdiff_plain;f=libpcsxcore%2Fnew_dynarec%2Fnew_dynarec.c;h=7d4a3d9261cda4b2a072c42ac2606c969d8fa4a1;hp=28a0245f249063e8a0aa85bccad231500db9bd10;hb=ea3d2e6e638ffd02aee0be8bdd27d8a9babd179f;hpb=535d208a8473e9255919b1e5bfe0b5aa88f6992a diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index 28a0245f..7d4a3d92 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -1,6 +1,6 @@ /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * Mupen64plus - new_dynarec.c * - * Copyright (C) 2009-2010 Ari64 * + * Copyright (C) 2009-2011 Ari64 * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * @@ -84,6 +84,7 @@ struct ll_entry u_int ba[MAXBLOCK]; char likely[MAXBLOCK]; char is_ds[MAXBLOCK]; + char ooo[MAXBLOCK]; uint64_t unneeded_reg[MAXBLOCK]; uint64_t unneeded_reg_upper[MAXBLOCK]; uint64_t branch_unneeded_reg[MAXBLOCK]; @@ -94,10 +95,9 @@ struct ll_entry signed char regmap[MAXBLOCK][HOST_REGS]; signed char regmap_entry[MAXBLOCK][HOST_REGS]; uint64_t constmap[MAXBLOCK][HOST_REGS]; - uint64_t known_value[HOST_REGS]; - u_int known_reg; struct regstat regs[MAXBLOCK]; struct regstat branch_regs[MAXBLOCK]; + signed char minimum_free_regs[MAXBLOCK]; u_int needed_reg[MAXBLOCK]; uint64_t requires_32bit[MAXBLOCK]; u_int wont_dirty[MAXBLOCK]; @@ -121,7 +121,12 @@ struct ll_entry char shadow[1048576] __attribute__((aligned(16))); void *copy; int expirep; +#ifndef PCSX u_int using_tlb; +#else + static const u_int using_tlb=0; +#endif + static u_int sp_in_mirror; u_int stop_after_jal; extern u_char restore_candidate[512]; extern int cycle_count; @@ -134,19 +139,21 @@ struct ll_entry #define CSREG 35 // Coprocessor status #define CCREG 36 // Cycle count #define INVCP 37 // Pointer to invalid_code -#define TEMPREG 38 -#define FTEMP 38 // FPU/LDL/LDR temporary register -#define PTEMP 39 // Prefetch temporary register -#define TLREG 40 // TLB mapping offset -#define RHASH 41 // Return address hash -#define RHTBL 42 // Return address hash table address -#define RTEMP 43 // JR/JALR address register -#define MAXREG 43 -#define AGEN1 44 // Address generation temporary register -#define AGEN2 45 // Address generation temporary register -#define MGEN1 46 // Maptable address generation temporary register -#define MGEN2 47 // Maptable address generation temporary register -#define BTREG 48 // Branch target temporary register +#define MMREG 38 // Pointer to memory_map +#define ROREG 39 // ram offset (if rdram!=0x80000000) +#define TEMPREG 40 +#define FTEMP 40 // FPU temporary register +#define PTEMP 41 // Prefetch temporary register +#define TLREG 42 // TLB mapping offset +#define RHASH 43 // Return address hash +#define RHTBL 44 // Return address hash table address +#define RTEMP 45 // JR/JALR address register +#define MAXREG 45 +#define AGEN1 46 // Address generation temporary register +#define AGEN2 47 // Address generation temporary register +#define MGEN1 48 // Maptable address generation temporary register +#define MGEN2 49 // Maptable address generation temporary register +#define BTREG 50 // Branch target temporary register /* instruction types */ #define NOP 0 // No operation @@ -592,6 +599,7 @@ void clear_const(struct regstat *cur,signed char reg) int is_const(struct regstat *cur,signed char reg) { int hr; + if(reg<0) return 0; if(!reg) return 1; for (hr=0;hrregmap[hr]&63)==reg) { @@ -712,12 +720,6 @@ int needed_again(int r, int i) int j; int b=-1; int rn=10; - int hr; - u_char hsn[MAXREG+1]; - int preferred_reg; - - memset(hsn,10,sizeof(hsn)); - lsn(hsn,i,&preferred_reg); if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) { @@ -770,11 +772,7 @@ int needed_again(int r, int i) } } }*/ - for(hr=0;hraddr); inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr); @@ -1095,20 +1092,12 @@ void ll_kill_pointers(struct ll_entry *head,int addr,int shift) { inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr); u_int host_addr=(u_int)kill_pointer(head->addr); - - if((host_addr>>12)!=(old_host_addr>>12)) { - #ifdef __arm__ - __clear_cache((void *)(old_host_addr&~0xfff),(void *)(old_host_addr|0xfff)); - #endif - old_host_addr=host_addr; - } + #ifdef __arm__ + needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31); + #endif } head=head->next; } - #ifdef __arm__ - if (old_host_addr) - __clear_cache((void *)(old_host_addr&~0xfff),(void *)(old_host_addr|0xfff)); - #endif } // This is called when we write to a compiled block (see do_invstub) @@ -1116,7 +1105,6 @@ void invalidate_page(u_int page) { struct ll_entry *head; struct ll_entry *next; - u_int old_host_addr=0; head=jump_in[page]; jump_in[page]=0; while(head!=NULL) { @@ -1131,21 +1119,13 @@ void invalidate_page(u_int page) while(head!=NULL) { inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr); u_int host_addr=(u_int)kill_pointer(head->addr); - - if((host_addr>>12)!=(old_host_addr>>12)) { - #ifdef __arm__ - __clear_cache((void *)(old_host_addr&~0xfff),(void *)(old_host_addr|0xfff)); - #endif - old_host_addr=host_addr; - } + #ifdef __arm__ + needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31); + #endif next=head->next; free(head); head=next; } - #ifdef __arm__ - if (old_host_addr) - __clear_cache((void *)(old_host_addr&~0xfff),(void *)(old_host_addr|0xfff)); - #endif } void invalidate_block(u_int block) { @@ -1192,9 +1172,15 @@ void invalidate_block(u_int block) for(first=page+1;first>12)|page]=1; +#endif #ifndef DISABLE_TLB // If there is a valid TLB entry for this page, remove write protect if(tlb_LUT_w[block]) { @@ -1216,6 +1202,8 @@ void invalidate_addr(u_int addr) { invalidate_block(addr>>12); } +// This is called when loading a save state. +// Anything could have changed, so invalidate everything. void invalidate_all_pages() { u_int page,n; @@ -1401,7 +1389,10 @@ void shift_alloc(struct regstat *current,int i) if(rs1[i]) alloc_reg(current,i,rs1[i]); if(rs2[i]) alloc_reg(current,i,rs2[i]); alloc_reg(current,i,rt1[i]); - if(rt1[i]==rs2[i]) alloc_reg_temp(current,i,-1); + if(rt1[i]==rs2[i]) { + alloc_reg_temp(current,i,-1); + minimum_free_regs[i]=1; + } current->is32|=1LL<is32&=~(1LL<u&=~1LL; // Allow allocating r0 if it's the source register if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]); - if(rt1[i]) { + if(rt1[i]&&!((current->u>>rt1[i])&1)) { alloc_reg(current,i,rt1[i]); - if(get_reg(current->regmap,rt1[i])<0) { - // dummy load, but we still need a register to calculate the address - alloc_reg_temp(current,i,-1); - } + assert(get_reg(current->regmap,rt1[i])>=0); if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD { current->is32&=~(1LL<is32|=1LL<is32&=~(1LL<isconst=0; } @@ -1938,6 +1944,7 @@ static void pagespan_alloc(struct regstat *current,int i) current->isconst=0; current->wasconst=0; regs[i].wasconst=0; + minimum_free_regs[i]=HOST_REGS; alloc_all(current,i); alloc_cc(current,i); dirty_reg(current,CCREG); @@ -2770,8 +2777,10 @@ void load_assemble(int i,struct regstat *i_regs) if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<=0) { c=(i_regs->wasconst>>s)&1; - memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE; - if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1; + if (c) { + memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE; + if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1; + } } //printf("load_assemble: c=%d\n",c); //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset); @@ -2807,6 +2816,13 @@ void load_assemble(int i,struct regstat *i_regs) if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE) #endif { + #ifdef PCSX + if(sp_in_mirror&&rs1[i]==29) { + emit_andimm(addr,~0x00e00000,HOST_TEMPREG); + emit_cmpimm(HOST_TEMPREG,RAM_SIZE); + } + else + #endif emit_cmpimm(addr,RAM_SIZE); jaddr=(int)out; #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK @@ -2824,6 +2840,7 @@ void load_assemble(int i,struct regstat *i_regs) if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU map=get_reg(i_regs->regmap,TLREG); assert(map>=0); + reglist&=~(1<=0); if(!c||memtarget) { if(!dummy) { + int a=addr; +#ifdef PCSX + if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; +#endif //emit_readword_indexed((int)rdram-0x80000000,addr,tl); #ifdef HOST_IMM_ADDR32 if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl); else #endif - emit_readword_indexed_tlb(0,addr,map,tl); + emit_readword_indexed_tlb(0,a,map,tl); } if(jaddr) add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist); @@ -2998,6 +3035,10 @@ void load_assemble(int i,struct regstat *i_regs) if (opcode[i]==0x37) { // LD if(!c||memtarget) { if(!dummy) { + int a=addr; +#ifdef PCSX + if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; +#endif //gen_tlb_addr_r(tl,map); //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th); //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl); @@ -3006,7 +3047,7 @@ void load_assemble(int i,struct regstat *i_regs) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl); else #endif - emit_readdword_indexed_tlb(0,addr,map,th,tl); + emit_readdword_indexed_tlb(0,a,map,th,tl); } if(jaddr) add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist); @@ -3071,8 +3112,10 @@ void store_assemble(int i,struct regstat *i_regs) offset=imm[i]; if(s>=0) { c=(i_regs->wasconst>>s)&1; - memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE; - if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1; + if(c) { + memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE; + if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1; + } } assert(tl>=0); assert(temp>=0); @@ -3084,9 +3127,15 @@ void store_assemble(int i,struct regstat *i_regs) else addr=s; if(!using_tlb) { if(!c) { + #ifdef PCSX + if(sp_in_mirror&&rs1[i]==29) { + emit_andimm(addr,~0x00e00000,HOST_TEMPREG); + emit_cmpimm(HOST_TEMPREG,RAM_SIZE); + } + else + #endif #ifdef R29_HACK // Strmnnrmn's speed hack - memtarget=1; if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE) #endif emit_cmpimm(addr,RAM_SIZE); @@ -3094,6 +3143,7 @@ void store_assemble(int i,struct regstat *i_regs) if(s==addr) emit_mov(s,temp); #endif #ifdef R29_HACK + memtarget=1; if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE) #endif { @@ -3113,77 +3163,83 @@ void store_assemble(int i,struct regstat *i_regs) if (opcode[i]==0x29) x=2; // SH map=get_reg(i_regs->regmap,TLREG); assert(map>=0); + reglist&=~(1<=0) { - gen_tlb_addr_w(temp,map); - emit_writehword_indexed(tl,x,temp); + gen_tlb_addr_w(a,map); + emit_writehword_indexed(tl,x,a); }else - emit_writehword_indexed(tl,(int)rdram-0x80000000+x,temp); + emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a); } type=STOREH_STUB; } if (opcode[i]==0x2B) { // SW - if(!c||memtarget) + if(!c||memtarget) { + int a=addr; +#ifdef PCSX + if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; +#endif //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr); - emit_writeword_indexed_tlb(tl,0,addr,map,temp); + emit_writeword_indexed_tlb(tl,0,a,map,temp); + } type=STOREW_STUB; } if (opcode[i]==0x3F) { // SD if(!c||memtarget) { + int a=addr; +#ifdef PCSX + if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; +#endif if(rs2[i]) { assert(th>=0); //emit_writeword_indexed(th,(int)rdram-0x80000000,addr); //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr); - emit_writedword_indexed_tlb(th,tl,0,addr,map,temp); + emit_writedword_indexed_tlb(th,tl,0,a,map,temp); }else{ // Store zero //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp); //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp); - emit_writedword_indexed_tlb(tl,tl,0,addr,map,temp); + emit_writedword_indexed_tlb(tl,tl,0,a,map,temp); } } type=STORED_STUB; } - if(!using_tlb&&(!c||memtarget)) - // addr could be a temp, make sure it survives STORE*_STUB - reglist|=1<regmap,rs2[i],ccadj[i],reglist); - } if(!using_tlb) { if(!c||memtarget) { #ifdef DESTRUCTIVE_SHIFT @@ -3198,11 +3254,20 @@ void store_assemble(int i,struct regstat *i_regs) #else emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1); #endif + #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT) + emit_callne(invalidate_addr_reg[addr]); + #else jaddr2=(int)out; emit_jne(0); add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<regmap,rs2[i],ccadj[i],reglist); + } //if(opcode[i]==0x2B || opcode[i]==0x3F) //if(opcode[i]==0x2B || opcode[i]==0x28) //if(opcode[i]==0x2B || opcode[i]==0x29) @@ -3243,7 +3308,7 @@ void storelr_assemble(int i,struct regstat *i_regs) int jaddr=0,jaddr2; int case1,case2,case3; int done0,done1,done2; - int memtarget,c=0; + int memtarget=0,c=0; int agr=AGEN1+(i&1); u_int hr,reglist=0; th=get_reg(i_regs->regmap,rs2[i]|64); @@ -3254,8 +3319,10 @@ void storelr_assemble(int i,struct regstat *i_regs) offset=imm[i]; if(s>=0) { c=(i_regs->isconst>>s)&1; - memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE; - if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1; + if(c) { + memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE; + if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1; + } } assert(tl>=0); for(hr=0;hrregmap,TLREG); assert(map>=0); + reglist&=~(1<=0) emit_mov(s,temp); do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr); @@ -3537,6 +3605,7 @@ void c1ls_assemble(int i,struct regstat *i_regs) { map=get_reg(i_regs->regmap,TLREG); assert(map>=0); + reglist&=~(1<>16)&0x1f; @@ -3735,9 +3808,13 @@ void c2ls_assemble(int i,struct regstat *i_regs) #else emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1); #endif + #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT) + emit_callne(invalidate_addr_reg[ar]); + #else jaddr3=(int)out; emit_jne(0); add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<0 && i_regmap[hr]!=CCREG) + if(i_regmap[hr]>0 && (i_regmap[hr]&63)0 && i_regmap[hr]!=CCREG) + if(i_regmap[hr]>0 && (i_regmap[hr]&63)=0&®s[t].regmap_entry[hr]<64) { + if(regs[t].regmap_entry[hr]>=0&®s[t].regmap_entry[hr]=64) { + if(regs[t].regmap_entry[hr]>=64&®s[t].regmap_entry[hr]>(regs[t].regmap_entry[hr]&63))&1) { int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64); @@ -4537,7 +4614,7 @@ void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int ad } // Load 32-bit regs for(hr=0;hr=0&®s[t].regmap_entry[hr]<64) { + if(hr!=EXCLUDE_REG&®s[t].regmap_entry[hr]>=0&®s[t].regmap_entry[hr]>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) { #else @@ -4555,7 +4632,7 @@ void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int ad } //Load 64-bit regs for(hr=0;hr=64) { + if(hr!=EXCLUDE_REG&®s[t].regmap_entry[hr]>=64&®s[t].regmap_entry[hr]>(regs[t].regmap_entry[hr]&63))&1) { @@ -4596,19 +4673,19 @@ int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr) { if(i_regmap[hr]!=regs[t].regmap_entry[hr]) { - if(regs[t].regmap_entry[hr]!=-1) + if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)>hr)&1) { - if(i_regmap[hr]<64) + if(i_regmap[hr]>i_regmap[hr])&1)) return 0; } - else + else if(i_regmap[hr]>=64&&i_regmap[hr]>(i_regmap[hr]&63))&1)) return 0; @@ -4782,7 +4859,7 @@ void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert) } else { - emit_cmpimm(HOST_CCREG,-2*(count+2)); + emit_cmpimm(HOST_CCREG,-CLOCK_DIVIDER*(count+2)); jaddr=(int)out; emit_jns(0); } @@ -4848,7 +4925,7 @@ void do_ccstub(int n) emit_loadreg(rs2[i],s2l); #endif int hr=0; - int addr,alt,ntaddr; + int addr=-1,alt=-1,ntaddr=-1; while(hr>16)^return_address)&0xFFFF],temp); } #endif - ds_assemble(i+1,i_regs); - uint64_t bc_unneeded=branch_regs[i].u; - uint64_t bc_unneeded_upper=branch_regs[i].uu; - bc_unneeded|=1|(1LL<=0); return_address=start+i*4+8; if(rt>=0) { #ifdef USE_MINI_HT - if(internal_branch(branch_regs[i].is32,return_address)) { - int temp=rt+1; - if(temp==EXCLUDE_REG||temp>=HOST_REGS|| - branch_regs[i].regmap[temp]>=0) - { - temp=get_reg(branch_regs[i].regmap,-1); - } + if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) { + int temp=-1; // note: must be ds-safe #ifdef HOST_TEMPREG - if(temp<0) temp=HOST_TEMPREG; + temp=HOST_TEMPREG; #endif if(temp>=0) do_miniht_insert(return_address,rt,temp); else emit_movimm(return_address,rt); @@ -5129,6 +5191,14 @@ void ujump_assemble(int i,struct regstat *i_regs) } } } + ds_assemble(i+1,i_regs); + uint64_t bc_unneeded=branch_regs[i].u; + uint64_t bc_unneeded_upper=branch_regs[i].uu; + bc_unneeded|=1|(1LL<>2) assem_debug("idle loop\n"); - if(likely[i]) ooo=0; if(!match) invert=1; #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(i>(ba[i]-start)>>2) invert=1; #endif - - if(ooo) - if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))|| - (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) - { - // Write-after-read dependency prevents out of order execution - // First test branch condition, then execute delay slot, then branch - ooo=0; - } - - if(ooo) { + + if(ooo[i]) { s1l=get_reg(branch_regs[i].regmap,rs1[i]); s1h=get_reg(branch_regs[i].regmap,rs1[i]|64); s2l=get_reg(branch_regs[i].regmap,rs2[i]); @@ -5358,7 +5417,7 @@ void cjump_assemble(int i,struct regstat *i_regs) only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1; } - if(ooo) { + if(ooo[i]) { // Out of order execution (delay slot first) //printf("OOOE\n"); address_generation(i+1,i_regs,regs[i].regmap_entry); @@ -5697,11 +5756,9 @@ void sjump_assemble(int i,struct regstat *i_regs) int prev_cop1_usable=cop1_usable; int unconditional=0,nevertaken=0; int only32=0; - int ooo=1; int invert=0; int internal=internal_branch(branch_regs[i].is32,ba[i]); if(i==(ba[i]-start)>>2) assem_debug("idle loop\n"); - if(likely[i]) ooo=0; if(!match) invert=1; #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(i>(ba[i]-start)>>2) invert=1; @@ -5710,19 +5767,7 @@ void sjump_assemble(int i,struct regstat *i_regs) //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL) //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL) - if(ooo) { - if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) - { - // Write-after-read dependency prevents out of order execution - // First test branch condition, then execute delay slot, then branch - ooo=0; - } - if(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31)) - // BxxZAL $ra is available to delay insn, so do it in order - ooo=0; - } - - if(ooo) { + if(ooo[i]) { s1l=get_reg(branch_regs[i].regmap,rs1[i]); s1h=get_reg(branch_regs[i].regmap,rs1[i]|64); } @@ -5744,7 +5789,7 @@ void sjump_assemble(int i,struct regstat *i_regs) only32=(regs[i].was32>>rs1[i])&1; } - if(ooo) { + if(ooo[i]) { // Out of order execution (delay slot first) //printf("OOOE\n"); address_generation(i+1,i_regs,regs[i].regmap_entry); @@ -6037,25 +6082,15 @@ void fjump_assemble(int i,struct regstat *i_regs) assem_debug("fmatch=%d\n",match); int fs,cs; int eaddr; - int ooo=1; int invert=0; int internal=internal_branch(branch_regs[i].is32,ba[i]); if(i==(ba[i]-start)>>2) assem_debug("idle loop\n"); - if(likely[i]) ooo=0; if(!match) invert=1; #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(i>(ba[i]-start)>>2) invert=1; #endif - if(ooo) - if(itype[i+1]==FCOMP) - { - // Write-after-read dependency prevents out of order execution - // First test branch condition, then execute delay slot, then branch - ooo=0; - } - - if(ooo) { + if(ooo[i]) { fs=get_reg(branch_regs[i].regmap,FSREG); address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay? } @@ -6074,7 +6109,7 @@ void fjump_assemble(int i,struct regstat *i_regs) cop1_usable=1; } - if(ooo) { + if(ooo[i]) { // Out of order execution (delay slot first) //printf("OOOE\n"); ds_assemble(i+1,i_regs); @@ -7725,6 +7760,38 @@ void disassemble_inst(int i) } } +// clear the state completely, instead of just marking +// things invalid like invalidate_all_pages() does +void new_dynarec_clear_full() +{ + int n; + out=(u_char *)BASE_ADDR; + memset(invalid_code,1,sizeof(invalid_code)); + memset(hash_table,0xff,sizeof(hash_table)); + memset(mini_ht,-1,sizeof(mini_ht)); + memset(restore_candidate,0,sizeof(restore_candidate)); + memset(shadow,0,sizeof(shadow)); + copy=shadow; + expirep=16384; // Expiry pointer, +2 blocks + pending_exception=0; + literalcount=0; + stop_after_jal=0; + // TLB +#ifndef DISABLE_TLB + using_tlb=0; +#endif + sp_in_mirror=0; + for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF + memory_map[n]=-1; + for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF + memory_map[n]=((u_int)rdram-0x80000000)>>2; + for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF + memory_map[n]=-1; + for(n=0;n<4096;n++) ll_clear(jump_in+n); + for(n=0;n<4096;n++) ll_clear(jump_out+n); + for(n=0;n<4096;n++) ll_clear(jump_dirty+n); +} + void new_dynarec_init() { printf("Init new dynarec\n"); @@ -7740,29 +7807,11 @@ void new_dynarec_init() fake_pc.f.r.rd=&readmem_dword; #endif int n; - for(n=0x80000;n<0x80800;n++) - invalid_code[n]=1; - for(n=0;n<65536;n++) - hash_table[n][0]=hash_table[n][2]=-1; - memset(mini_ht,-1,sizeof(mini_ht)); - memset(restore_candidate,0,sizeof(restore_candidate)); - copy=shadow; - expirep=16384; // Expiry pointer, +2 blocks - pending_exception=0; - literalcount=0; + new_dynarec_clear_full(); #ifdef HOST_IMM8 // Copy this into local area so we don't have to put it in every literal pool invc_ptr=invalid_code; #endif - stop_after_jal=0; - // TLB - using_tlb=0; - for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF - memory_map[n]=-1; - for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF - memory_map[n]=((u_int)rdram-0x80000000)>>2; - for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF - memory_map[n]=-1; #ifdef MUPEN64 for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF writemem[n] = write_nomem_new; @@ -7841,6 +7890,11 @@ int new_recompile_block(int addr) start = (u_int)addr&~3; //assert(((u_int)addr&1)==0); #ifdef PCSX + if(!sp_in_mirror&&(signed int)(psxRegs.GPR.n.sp&0xffe00000)>0x80200000&& + 0x10000<=psxRegs.GPR.n.sp&&(psxRegs.GPR.n.sp&~0xe0e00000)>26; switch(op) { @@ -7955,17 +8010,10 @@ int new_recompile_block(int addr) case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break; case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break; case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break; - case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break; - case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break; - case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break; case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break; case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break; case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break; case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break; - case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break; - case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break; - case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break; - case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break; case 0x20: strcpy(insn[i],"ADD"); type=ALU; break; case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break; case 0x22: strcpy(insn[i],"SUB"); type=ALU; break; @@ -7976,22 +8024,31 @@ int new_recompile_block(int addr) case 0x27: strcpy(insn[i],"NOR"); type=ALU; break; case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break; case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break; - case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break; - case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break; - case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break; - case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break; case 0x30: strcpy(insn[i],"TGE"); type=NI; break; case 0x31: strcpy(insn[i],"TGEU"); type=NI; break; case 0x32: strcpy(insn[i],"TLT"); type=NI; break; case 0x33: strcpy(insn[i],"TLTU"); type=NI; break; case 0x34: strcpy(insn[i],"TEQ"); type=NI; break; case 0x36: strcpy(insn[i],"TNE"); type=NI; break; +#ifndef FORCE32 + case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break; + case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break; + case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break; + case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break; + case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break; + case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break; + case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break; + case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break; + case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break; + case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break; + case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break; case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break; case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break; case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break; case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break; case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break; case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break; +#endif } break; case 0x01: strcpy(insn[i],"regimm"); type=NI; @@ -8232,18 +8289,6 @@ int new_recompile_block(int addr) printf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr); break; } -#ifdef PCSX - /* detect branch in delay slot early */ - if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) { - opcode[i+1]=source[i+1]>>26; - opcode2[i+1]=source[i+1]&0x3f; - if((0>14); else ba[i]=-1; - /* Is this the end of the block? */ - if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) { #ifdef PCSX - // check for link register access in delay slot - int rt1_=rt1[i-1]; - if(rt1_!=0&&(rs1[i]==rt1_||rs2[i]==rt1_||rt1[i]==rt1_||rt2[i]==rt1_)) { - printf("link access in delay slot @%08x (%08x)\n", addr + i*4, addr); + if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) { + int do_in_intrp=0; + // branch in delay slot? + if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) { + // don't handle first branch and call interpreter if it's hit + printf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr); + do_in_intrp=1; + } + // basic load delay detection + else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) { + int t=(ba[i-1]-start)/4; + if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) { + // jump target wants DS result - potential load delay effect + printf("load delay @%08x (%08x)\n", addr + i*4, addr); + do_in_intrp=1; + bt[t+1]=1; // expected return from interpreter + } + else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&& + !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) { + // v0 overwrite like this is a sign of trouble, bail out + printf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr); + do_in_intrp=1; + } + } + if(do_in_intrp) { + rs1[i-1]=CCREG; + rs2[i-1]=rt1[i-1]=rt2[i-1]=0; ba[i-1]=-1; itype[i-1]=INTCALL; done=2; + i--; // don't compile the DS } - else + } #endif + /* Is this the end of the block? */ + if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) { if(rt1[i-1]==0) { // Continue past subroutine call (JAL) done=2; } @@ -8548,7 +8617,7 @@ int new_recompile_block(int addr) current.wasconst=0; int ds=0; int cc=0; - int hr; + int hr=-1; #ifndef FORCE32 provisional_32bit(); @@ -8811,17 +8880,18 @@ int new_recompile_block(int addr) clear_const(¤t,rt1[i]); alloc_cc(¤t,i); dirty_reg(¤t,CCREG); + ooo[i]=1; + delayslot_alloc(¤t,i+1); if (rt1[i]==31) { alloc_reg(¤t,i,31); dirty_reg(¤t,31); - assert(rs1[i+1]!=31&&rs2[i+1]!=31); - assert(rt1[i+1]!=rt1[i]); + //assert(rs1[i+1]!=31&&rs2[i+1]!=31); + //assert(rt1[i+1]!=rt1[i]); #ifdef REG_PREFETCH alloc_reg(¤t,i,PTEMP); #endif //current.is32|=1LL<clean transition - // #ifdef DESTRUCTIVE_WRITEBACK here? + #ifdef DESTRUCTIVE_WRITEBACK if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1; + #endif + // This check is only strictly required in the DESTRUCTIVE_WRITEBACK + // case above, however it's always a good idea. We can't hoist the + // load if the register was already allocated, so there's no point + // wasting time analyzing most of these cases. It only "succeeds" + // when the mapping was different and the load can be replaced with + // a mov, which is of negligible benefit. So such cases are + // skipped below. if(f_regmap[hr]>0) { - if(regs[t].regmap_entry[hr]<0) { + if(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0) { int r=f_regmap[hr]; for(j=t;j<=i;j++) { @@ -9868,6 +9959,7 @@ int new_recompile_block(int addr) // register is lower numbered than the lower-half // register. Not sure if it's worth fixing... if(get_reg(regs[j].regmap,r&63)<0) break; + if(get_reg(regs[j].regmap_entry,r&63)<0) break; if(regs[j].is32&(1LL<<(r&63))) break; } if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)1&®s[k-1].regmap[hr]==-1) { - if(itype[k-1]==STORE||itype[k-1]==STORELR - ||itype[k-1]==C1LS||itype[k-1]==SHIFT||itype[k-1]==COP1 - ||itype[k-1]==FLOAT||itype[k-1]==FCONV||itype[k-1]==FCOMP - ||itype[k-1]==COP2||itype[k-1]==C2LS||itype[k-1]==C2OP) { - if(count_free_regs(regs[k-1].regmap)<2) { - //printf("no free regs for store %x\n",start+(k-1)*4); - break; - } + if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) { + //printf("no free regs for store %x\n",start+(k-1)*4); + break; } - else - if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break; if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) { //printf("no-match due to different register\n"); break; @@ -9968,13 +10053,31 @@ int new_recompile_block(int addr) } } for(k=t;k>16)!=0x1000) { + regmap_pre[k+2][hr]=f_regmap[hr]; + regs[k+2].wasdirty&=~(1<>16)==0x1000) + { + // Stop on unconditional branch + break; + } + if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) + { + if(ooo[j]) { + if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) + break; + }else{ + if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) + break; + } + if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) { + //printf("no-match due to different register (branch)\n"); break; } } - else if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break; + if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) { + //printf("No free regs for store %x\n",start+j*4); + break; + } if(f_regmap[hr]>=64) { if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) { break; @@ -10045,17 +10161,10 @@ int new_recompile_block(int addr) if(bt[i]) { for(j=i;j=0;i--) + { + if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + { + // Conditional branch + if((source[i]>>16)!=0x1000&&i>12)) + for(i=start>>12;i<=(start+slen*4)>>12;i++) + invalid_code[((u_int)0x80000000>>12)|i]=0; +#endif /* Pass 10 - Free memory by expiring oldest blocks */ @@ -11041,6 +11161,10 @@ int new_recompile_block(int addr) break; case 3: // Clear jump_out + #ifdef __arm__ + if((expirep&2047)==0) + do_clear_cache(); + #endif ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift); ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift); break;