X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?p=pcsx_rearmed.git;a=blobdiff_plain;f=libpcsxcore%2Fnew_dynarec%2Fnew_dynarec.c;h=59d42080cb7cac75c0dc8f2923f68f2a46549595;hp=6f45cfc5838ade9192fcbf4ec8d7668726436999;hb=4a35de071887026bb6dcd6b852738a1866959df7;hpb=96186eba5d5bbc011f913e8e63ba2e3bebba81ba diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index 6f45cfc5..59d42080 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -21,10 +21,15 @@ #include #include //include for uint64_t #include +#include #include "emu_if.h" //emulator interface -#include +//#define DISASM +//#define assem_debug printf +//#define inv_debug printf +#define assem_debug(...) +#define inv_debug(...) #ifdef __i386__ #include "assem_x86.h" @@ -38,7 +43,6 @@ #define MAXBLOCK 4096 #define MAX_OUTPUT_BLOCK_SIZE 262144 -#define CLOCK_DIVIDER 2 struct regstat { @@ -52,7 +56,8 @@ struct regstat uint64_t uu; u_int wasconst; u_int isconst; - uint64_t constmap[HOST_REGS]; + u_int loadedconst; // host regs that have constants loaded + u_int waswritten; // MIPS regs that were used as store base before }; struct ll_entry @@ -80,6 +85,14 @@ struct ll_entry u_char dep1[MAXBLOCK]; u_char dep2[MAXBLOCK]; u_char lt1[MAXBLOCK]; + static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs + static uint64_t gte_rt[MAXBLOCK]; + static uint64_t gte_unneeded[MAXBLOCK]; + static u_int smrv[32]; // speculated MIPS register values + static u_int smrv_strong; // mask or regs that are likely to have correct values + static u_int smrv_weak; // same, but somewhat less likely + static u_int smrv_strong_next; // same, but after current insn executes + static u_int smrv_weak_next; int imm[MAXBLOCK]; u_int ba[MAXBLOCK]; char likely[MAXBLOCK]; @@ -92,11 +105,10 @@ struct ll_entry uint64_t p32[MAXBLOCK]; uint64_t pr32[MAXBLOCK]; signed char regmap_pre[MAXBLOCK][HOST_REGS]; - signed char regmap[MAXBLOCK][HOST_REGS]; - signed char regmap_entry[MAXBLOCK][HOST_REGS]; - uint64_t constmap[MAXBLOCK][HOST_REGS]; - struct regstat regs[MAXBLOCK]; - struct regstat branch_regs[MAXBLOCK]; + static uint64_t current_constmap[HOST_REGS]; + static uint64_t constmap[MAXBLOCK][HOST_REGS]; + static struct regstat regs[MAXBLOCK]; + static struct regstat branch_regs[MAXBLOCK]; signed char minimum_free_regs[MAXBLOCK]; u_int needed_reg[MAXBLOCK]; uint64_t requires_32bit[MAXBLOCK]; @@ -126,8 +138,14 @@ struct ll_entry #else static const u_int using_tlb=0; #endif - static u_int sp_in_mirror; + int new_dynarec_did_compile; + int new_dynarec_hacks; u_int stop_after_jal; +#ifndef RAM_FIXED + static u_int ram_offset; +#else + static const u_int ram_offset=0; +#endif extern u_char restore_candidate[512]; extern int cycle_count; @@ -261,11 +279,13 @@ int tracedebug=0; //#define DEBUG_CYCLE_COUNT 1 -void nullf() {} -//#define assem_debug printf -//#define inv_debug printf -#define assem_debug nullf -#define inv_debug nullf +int cycle_multiplier; // 100 for 1.0 + +static int CLOCK_ADJUST(int x) +{ + int s=(x>>31)|1; + return (x * cycle_multiplier + s * 50) / 100; +} static void tlb_hacks() { @@ -331,6 +351,7 @@ static u_int get_page(u_int vaddr) return page; } +#ifndef PCSX static u_int get_vpage(u_int vaddr) { u_int vpage=(vaddr^0x80000000)>>12; @@ -340,6 +361,13 @@ static u_int get_vpage(u_int vaddr) if(vpage>2048) vpage=2048+(vpage&2047); return vpage; } +#else +// no virtual mem in PCSX +static u_int get_vpage(u_int vaddr) +{ + return get_page(vaddr); +} +#endif // Get address from virtual address // This is called from the recompiled JR/JALR instructions @@ -371,7 +399,10 @@ void *get_addr(u_int vaddr) if(verify_dirty(head->addr)) { //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]); invalid_code[vaddr>>12]=0; + inv_code_start=inv_code_end=~0; +#ifndef DISABLE_TLB memory_map[vaddr>>12]|=0x40000000; +#endif if(vpage<2048) { #ifndef DISABLE_TLB if(tlb_LUT_r[vaddr>>12]) { @@ -463,6 +494,7 @@ void *get_addr_32(u_int vaddr,u_int flags) if(verify_dirty(head->addr)) { //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]); invalid_code[vaddr>>12]=0; + inv_code_start=inv_code_end=~0; memory_map[vaddr>>12]|=0x40000000; if(vpage<2048) { #ifndef DISABLE_TLB @@ -576,11 +608,11 @@ void set_const(struct regstat *cur,signed char reg,uint64_t value) for (hr=0;hrregmap[hr]==reg) { cur->isconst|=1<constmap[hr]=value; + current_constmap[hr]=value; } else if((cur->regmap[hr]^64)==reg) { cur->isconst|=1<constmap[hr]=value>>32; + current_constmap[hr]=value>>32; } } } @@ -614,7 +646,7 @@ uint64_t get_const(struct regstat *cur,signed char reg) if(!reg) return 0; for (hr=0;hrregmap[hr]==reg) { - return cur->constmap[hr]; + return current_constmap[hr]; } } printf("Unknown constant in r%d\n",reg); @@ -842,7 +874,7 @@ void alloc_all(struct regstat *cur,int i) } } - +#ifndef FORCE32 void div64(int64_t dividend,int64_t divisor) { lo=dividend/divisor; @@ -953,6 +985,7 @@ uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits) else original=loaded; return original; } +#endif #ifdef __i386__ #include "assem_x86.c" @@ -1127,39 +1160,10 @@ void invalidate_page(u_int page) head=next; } } -void invalidate_block(u_int block) + +static void invalidate_block_range(u_int block, u_int first, u_int last) { u_int page=get_page(block<<12); - u_int vpage=get_vpage(block<<12); - inv_debug("INVALIDATE: %x (%d)\n",block<<12,page); - //inv_debug("invalid_code[block]=%d\n",invalid_code[block]); - u_int first,last; - first=last=page; - struct ll_entry *head; - head=jump_dirty[vpage]; - //printf("page=%d vpage=%d\n",page,vpage); - while(head!=NULL) { - u_int start,end; - if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision - get_bounds((int)head->addr,&start,&end); - //printf("start: %x end: %x\n",start,end); - if(page<2048&&start>=0x80000000&&end<0x80000000+RAM_SIZE) { - if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) { - if((((start-(u_int)rdram)>>12)&2047)>12)&2047; - if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047; - } - } -#ifndef DISABLE_TLB - if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) { - if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) { - if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)>12]-(u_int)rdram)>>12)&2047; - if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047; - } - } -#endif - } - head=head->next; - } //printf("first=%d last=%d\n",first,last); invalidate_page(page); assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages) @@ -1178,9 +1182,6 @@ void invalidate_block(u_int block) // Don't trap writes invalid_code[block]=1; -#ifdef PCSX - invalid_code[((u_int)0x80000000>>12)|page]=1; -#endif #ifndef DISABLE_TLB // If there is a valid TLB entry for this page, remove write protect if(tlb_LUT_w[block]) { @@ -1198,10 +1199,103 @@ void invalidate_block(u_int block) memset(mini_ht,-1,sizeof(mini_ht)); #endif } + +void invalidate_block(u_int block) +{ + u_int page=get_page(block<<12); + u_int vpage=get_vpage(block<<12); + inv_debug("INVALIDATE: %x (%d)\n",block<<12,page); + //inv_debug("invalid_code[block]=%d\n",invalid_code[block]); + u_int first,last; + first=last=page; + struct ll_entry *head; + head=jump_dirty[vpage]; + //printf("page=%d vpage=%d\n",page,vpage); + while(head!=NULL) { + u_int start,end; + if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision + get_bounds((int)head->addr,&start,&end); + //printf("start: %x end: %x\n",start,end); + if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) { + if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) { + if((((start-(u_int)rdram)>>12)&2047)>12)&2047; + if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047; + } + } +#ifndef DISABLE_TLB + if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) { + if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) { + if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)>12]-(u_int)rdram)>>12)&2047; + if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047; + } + } +#endif + } + head=head->next; + } + invalidate_block_range(block,first,last); +} + void invalidate_addr(u_int addr) { +#ifdef PCSX + //static int rhits; + // this check is done by the caller + //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; } + u_int page=get_vpage(addr); + if(page<2048) { // RAM + struct ll_entry *head; + u_int addr_min=~0, addr_max=0; + u_int mask=RAM_SIZE-1; + u_int addr_main=0x80000000|(addr&mask); + int pg1; + inv_code_start=addr_main&~0xfff; + inv_code_end=addr_main|0xfff; + pg1=page; + if (pg1>0) { + // must check previous page too because of spans.. + pg1--; + inv_code_start-=0x1000; + } + for(;pg1<=page;pg1++) { + for(head=jump_dirty[pg1];head!=NULL;head=head->next) { + u_int start,end; + get_bounds((int)head->addr,&start,&end); + if(ram_offset) { + start-=ram_offset; + end-=ram_offset; + } + if(start<=addr_main&&addr_mainaddr_max) addr_max=end; + } + else if(addr_maininv_code_start) + inv_code_start=end; + } + } + } + if (addr_min!=~0) { + inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max); + inv_code_start=inv_code_end=~0; + invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12); + return; + } + else { + inv_code_start=(addr&~mask)|(inv_code_start&mask); + inv_code_end=(addr&~mask)|(inv_code_end&mask); + inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0); + return; + } + } +#endif invalidate_block(addr>>12); } + // This is called when loading a save state. // Anything could have changed, so invalidate everything. void invalidate_all_pages() @@ -1240,6 +1334,8 @@ void add_link(u_int vaddr,void *src) { u_int page=get_page(vaddr); inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page); + int *ptr=(int *)(src+4); + assert((*ptr&0x0fff0000)==0x059f0000); ll_add(jump_out+page,vaddr,src); //int ptr=get_pointer(src); //inv_debug("add_link: Pointer is to %x\n",(int)ptr); @@ -1269,11 +1365,13 @@ void clean_blocks(u_int page) inv|=invalid_code[i]; } } +#ifndef DISABLE_TLB if((signed int)head->vaddr>=(signed int)0xC0000000) { u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2)); //printf("addr=%x start=%x end=%x\n",addr,start,end); if(addr=end) inv=1; } +#endif else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) { inv=1; } @@ -1326,8 +1424,6 @@ void mov_alloc(struct regstat *current,int i) void shiftimm_alloc(struct regstat *current,int i) { - clear_const(current,rs1[i]); - clear_const(current,rt1[i]); if(opcode2[i]<=0x3) // SLL/SRL/SRA { if(rt1[i]) { @@ -1336,8 +1432,21 @@ void shiftimm_alloc(struct regstat *current,int i) alloc_reg(current,i,rt1[i]); current->is32|=1LL<>imm[i]); + if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]); + } + else clear_const(current,rt1[i]); } } + else + { + clear_const(current,rs1[i]); + clear_const(current,rt1[i]); + } + if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA { if(rt1[i]) { @@ -2623,7 +2732,7 @@ void shiftimm_assemble(int i,struct regstat *i_regs) t=get_reg(i_regs->regmap,rt1[i]); s=get_reg(i_regs->regmap,rs1[i]); //assert(t>=0); - if(t>=0){ + if(t>=0&&!((i_regs->isconst>>t)&1)){ if(rs1[i]==0) { emit_zeroreg(t); @@ -2766,6 +2875,7 @@ void load_assemble(int i,struct regstat *i_regs) int offset; int jaddr=0; int memtarget=0,c=0; + int fastload_reg_override=0; u_int hr,reglist=0; th=get_reg(i_regs->regmap,rt1[i]|64); tl=get_reg(i_regs->regmap,rt1[i]); @@ -2816,24 +2926,13 @@ void load_assemble(int i,struct regstat *i_regs) if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE) #endif { - #ifdef PCSX - if(sp_in_mirror&&rs1[i]==29) { - emit_andimm(addr,~0x00e00000,HOST_TEMPREG); - emit_cmpimm(HOST_TEMPREG,RAM_SIZE); - } - else - #endif - emit_cmpimm(addr,RAM_SIZE); - jaddr=(int)out; - #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK - // Hint to branch predictor that the branch is unlikely to be taken - if(rs1[i]>=28) - emit_jno_unlikely(0); - else - #endif - emit_jno(0); + jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override); } } + else if(ram_offset&&memtarget) { + emit_addimm(addr,ram_offset,HOST_TEMPREG); + fastload_reg_override=HOST_TEMPREG; + } }else{ // using tlb int x=0; if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU @@ -2864,9 +2963,8 @@ void load_assemble(int i,struct regstat *i_regs) #else if(!c) a=addr; #endif -#ifdef PCSX - if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; -#endif + if(fastload_reg_override) a=fastload_reg_override; + emit_movsbl_indexed_tlb(x,a,map,tl); } } @@ -2892,9 +2990,7 @@ void load_assemble(int i,struct regstat *i_regs) #else if(!c) a=addr; #endif -#ifdef PCSX - if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; -#endif + if(fastload_reg_override) a=fastload_reg_override; //#ifdef //emit_movswl_indexed_tlb(x,tl,map,tl); //else @@ -2902,7 +2998,7 @@ void load_assemble(int i,struct regstat *i_regs) gen_tlb_addr_r(a,map); emit_movswl_indexed(x,a,tl); }else{ - #ifdef RAM_OFFSET + #if 1 //def RAM_OFFSET emit_movswl_indexed(x,a,tl); #else emit_movswl_indexed((int)rdram-0x80000000+x,a,tl); @@ -2920,9 +3016,7 @@ void load_assemble(int i,struct regstat *i_regs) if(!c||memtarget) { if(!dummy) { int a=addr; -#ifdef PCSX - if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; -#endif + if(fastload_reg_override) a=fastload_reg_override; //emit_readword_indexed((int)rdram-0x80000000,addr,tl); #ifdef HOST_IMM_ADDR32 if(c) @@ -2956,9 +3050,8 @@ void load_assemble(int i,struct regstat *i_regs) #else if(!c) a=addr; #endif -#ifdef PCSX - if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; -#endif + if(fastload_reg_override) a=fastload_reg_override; + emit_movzbl_indexed_tlb(x,a,map,tl); } } @@ -2984,9 +3077,7 @@ void load_assemble(int i,struct regstat *i_regs) #else if(!c) a=addr; #endif -#ifdef PCSX - if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; -#endif + if(fastload_reg_override) a=fastload_reg_override; //#ifdef //emit_movzwl_indexed_tlb(x,tl,map,tl); //#else @@ -2994,7 +3085,7 @@ void load_assemble(int i,struct regstat *i_regs) gen_tlb_addr_r(a,map); emit_movzwl_indexed(x,a,tl); }else{ - #ifdef RAM_OFFSET + #if 1 //def RAM_OFFSET emit_movzwl_indexed(x,a,tl); #else emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl); @@ -3013,9 +3104,7 @@ void load_assemble(int i,struct regstat *i_regs) if(!c||memtarget) { if(!dummy) { int a=addr; -#ifdef PCSX - if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; -#endif + if(fastload_reg_override) a=fastload_reg_override; //emit_readword_indexed((int)rdram-0x80000000,addr,tl); #ifdef HOST_IMM_ADDR32 if(c) @@ -3036,9 +3125,7 @@ void load_assemble(int i,struct regstat *i_regs) if(!c||memtarget) { if(!dummy) { int a=addr; -#ifdef PCSX - if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; -#endif + if(fastload_reg_override) a=fastload_reg_override; //gen_tlb_addr_r(tl,map); //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th); //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl); @@ -3103,6 +3190,7 @@ void store_assemble(int i,struct regstat *i_regs) int jaddr=0,jaddr2,type; int memtarget=0,c=0; int agr=AGEN1+(i&1); + int faststore_reg_override=0; u_int hr,reglist=0; th=get_reg(i_regs->regmap,rs2[i]|64); tl=get_reg(i_regs->regmap,rs2[i]); @@ -3127,13 +3215,7 @@ void store_assemble(int i,struct regstat *i_regs) else addr=s; if(!using_tlb) { if(!c) { - #ifdef PCSX - if(sp_in_mirror&&rs1[i]==29) { - emit_andimm(addr,~0x00e00000,HOST_TEMPREG); - emit_cmpimm(HOST_TEMPREG,RAM_SIZE); - } - else - #endif + #ifndef PCSX #ifdef R29_HACK // Strmnnrmn's speed hack if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE) @@ -3156,6 +3238,13 @@ void store_assemble(int i,struct regstat *i_regs) #endif emit_jno(0); } + #else + jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override); + #endif + } + else if(ram_offset&&memtarget) { + emit_addimm(addr,ram_offset,HOST_TEMPREG); + faststore_reg_override=HOST_TEMPREG; } }else{ // using tlb int x=0; @@ -3177,9 +3266,7 @@ void store_assemble(int i,struct regstat *i_regs) #else if(!c) a=addr; #endif -#ifdef PCSX - if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; -#endif + if(faststore_reg_override) a=faststore_reg_override; //gen_tlb_addr_w(temp,map); //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp); emit_writebyte_indexed_tlb(tl,x,a,map,a); @@ -3195,9 +3282,7 @@ void store_assemble(int i,struct regstat *i_regs) #else if(!c) a=addr; #endif -#ifdef PCSX - if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; -#endif + if(faststore_reg_override) a=faststore_reg_override; //#ifdef //emit_writehword_indexed_tlb(tl,x,temp,map,temp); //#else @@ -3205,16 +3290,15 @@ void store_assemble(int i,struct regstat *i_regs) gen_tlb_addr_w(a,map); emit_writehword_indexed(tl,x,a); }else - emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a); + //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a); + emit_writehword_indexed(tl,x,a); } type=STOREH_STUB; } if (opcode[i]==0x2B) { // SW if(!c||memtarget) { int a=addr; -#ifdef PCSX - if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; -#endif + if(faststore_reg_override) a=faststore_reg_override; //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr); emit_writeword_indexed_tlb(tl,0,a,map,temp); } @@ -3223,9 +3307,7 @@ void store_assemble(int i,struct regstat *i_regs) if (opcode[i]==0x3F) { // SD if(!c||memtarget) { int a=addr; -#ifdef PCSX - if(sp_in_mirror&&rs1[i]==29) a=HOST_TEMPREG; -#endif + if(faststore_reg_override) a=faststore_reg_override; if(rs2[i]) { assert(th>=0); //emit_writeword_indexed(th,(int)rdram-0x80000000,addr); @@ -3240,7 +3322,15 @@ void store_assemble(int i,struct regstat *i_regs) } type=STORED_STUB; } - if(!using_tlb) { +#ifdef PCSX + if(jaddr) { + // PCSX store handlers don't check invcode again + reglist|=1<waswritten&(1<regmap,rs2[i],ccadj[i],reglist); + inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist); + } + // basic current block modification detection.. + // not looking back as that should be in mips cache already + if(c&&start+i*4regmap==regs[i].regmap); // not delay slot + if(i_regs->regmap==regs[i].regmap) { + load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i); + wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty); + emit_movimm(start+i*4+4,0); + emit_writeword(0,(int)&pcaddr); + emit_jmp((int)do_interrupt); + } } //if(opcode[i]==0x2B || opcode[i]==0x3F) //if(opcode[i]==0x2B || opcode[i]==0x28) @@ -3514,7 +3618,7 @@ void storelr_assemble(int i,struct regstat *i_regs) } if(!c||!memtarget) add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist); - if(!using_tlb) { + if(!using_tlb&&!(i_regs->waswritten&(1<regmap,ROREG); if(map<0) map=HOST_TEMPREG; @@ -3692,7 +3796,7 @@ void c1ls_assemble(int i,struct regstat *i_regs) emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp); type=STORED_STUB; } - if(!using_tlb) { + if(!using_tlb&&!(i_regs->waswritten&(1<>16)&0x1f; s=get_reg(i_regs->regmap,rs1[i]); @@ -3788,27 +3893,34 @@ void c2ls_assemble(int i,struct regstat *i_regs) } else { if(!c) { - emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE); - jaddr2=(int)out; - emit_jno(0); + jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override); + } + else if(ram_offset&&memtarget) { + emit_addimm(ar,ram_offset,HOST_TEMPREG); + fastio_reg_override=HOST_TEMPREG; } if (opcode[i]==0x32) { // LWC2 #ifdef HOST_IMM_ADDR32 if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl); else #endif - emit_readword_indexed(0,ar,tl); + int a=ar; + if(fastio_reg_override) a=fastio_reg_override; + emit_readword_indexed(0,a,tl); } if (opcode[i]==0x3a) { // SWC2 #ifdef DESTRUCTIVE_SHIFT if(!offset&&!c&&s>=0) emit_mov(s,ar); #endif - emit_writeword_indexed(tl,0,ar); + int a=ar; + if(fastio_reg_override) a=fastio_reg_override; + emit_writeword_indexed(tl,0,a); } } if(jaddr2) add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist); - if (opcode[i]==0x3a) { // SWC2 + if(opcode[i]==0x3a) // SWC2 + if(!(i_regs->waswritten&(1<regmap,INVCP); assert(ir>=0); @@ -3881,7 +3993,7 @@ void syscall_assemble(int i,struct regstat *i_regs) assert(ccreg==HOST_CCREG); assert(!is_delayslot); emit_movimm(start+i*4,EAX); // Get PC - emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right? There should probably be an extra cycle... + emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right? There should probably be an extra cycle... emit_jmp((int)jump_syscall_hle); // XXX } @@ -3892,7 +4004,7 @@ void hlecall_assemble(int i,struct regstat *i_regs) assert(!is_delayslot); emit_movimm(start+i*4+4,0); // Get PC emit_movimm((int)psxHLEt[source[i]&7],1); - emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // XXX + emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX emit_jmp((int)jump_hlecall); } @@ -3902,12 +4014,13 @@ void intcall_assemble(int i,struct regstat *i_regs) assert(ccreg==HOST_CCREG); assert(!is_delayslot); emit_movimm(start+i*4,0); // Get PC - emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); emit_jmp((int)jump_intcall); } void ds_assemble(int i,struct regstat *i_regs) { + speculate_register_values(i); is_delayslot=1; switch(itype[i]) { case ALU: @@ -4167,6 +4280,7 @@ void address_generation(int i,struct regstat *i_regs,signed char entry[]) // printf("poor load scheduling!\n"); } else if(c) { +#ifndef DISABLE_TLB if(rm>=0) { if(!entry||entry[rm]!=mgr) { if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) { @@ -4181,6 +4295,7 @@ void address_generation(int i,struct regstat *i_regs,signed char entry[]) } } } +#endif if(rs1[i]!=rt1[i]||itype[i]!=LOAD) { if(!entry||entry[ra]!=agr) { if (opcode[i]==0x22||opcode[i]==0x26) { @@ -4193,6 +4308,7 @@ void address_generation(int i,struct regstat *i_regs,signed char entry[]) (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000)) #endif emit_movimm(constmap[i][rs]+offset,ra); + regs[i].loadedconst|=1<regmap,agr); @@ -4253,6 +4369,7 @@ void address_generation(int i,struct regstat *i_regs,signed char entry[]) (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000)) #endif emit_movimm(constmap[i+1][rs]+offset,ra); + regs[i+1].loadedconst|=1<=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr] + &®map[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1)) + { + regs[i].loadedconst|=1<=0) { //if(entry[hr]!=regmap[hr]) { - if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) { + if(!((regs[i].loadedconst>>hr)&1)) { if(((regs[i].isconst>>hr)&1)&®map[hr]<64&®map[hr]>0) { - int value; + int value,similar=0; if(get_final_value(hr,i,&value)) { - if(value==0) { + // see if some other register has similar value + for(hr2=0;hr2>hr2)&1)) { + if(is_similar_value(value,constmap[i][hr2])) { + similar=1; + break; + } + } + } + if(similar) { + int value2; + if(get_final_value(hr2,i,&value2)) // is this needed? + emit_movimm_from(value2,hr2,value,hr); + else + emit_movimm(value,hr); + } + else if(value==0) { emit_zeroreg(hr); } else { emit_movimm(value,hr); } } + regs[i].loadedconst|=1<>2].regmap_entry); @@ -5152,9 +5298,45 @@ add_to_linker(int addr,int target,int ext) linkcount++; } +static void ujump_assemble_write_ra(int i) +{ + int rt; + unsigned int return_address; + rt=get_reg(branch_regs[i].regmap,31); + assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]); + //assert(rt>=0); + return_address=start+i*4+8; + if(rt>=0) { + #ifdef USE_MINI_HT + if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) { + int temp=-1; // note: must be ds-safe + #ifdef HOST_TEMPREG + temp=HOST_TEMPREG; + #endif + if(temp>=0) do_miniht_insert(return_address,rt,temp); + else emit_movimm(return_address,rt); + } + else + #endif + { + #ifdef REG_PREFETCH + if(temp>=0) + { + if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp); + } + #endif + emit_movimm(return_address,rt); // PC into link register + #ifdef IMM_PREFETCH + emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]); + #endif + } + } +} + void ujump_assemble(int i,struct regstat *i_regs) { signed char *i_regmap=i_regs->regmap; + int ra_done=0; if(i==(ba[i]-start)>>2) assem_debug("idle loop\n"); address_generation(i+1,i_regs,regs[i].regmap_entry); #ifdef REG_PREFETCH @@ -5166,38 +5348,9 @@ void ujump_assemble(int i,struct regstat *i_regs) if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp); } #endif - if(rt1[i]==31) { - int rt; - unsigned int return_address; - rt=get_reg(branch_regs[i].regmap,31); - assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]); - //assert(rt>=0); - return_address=start+i*4+8; - if(rt>=0) { - #ifdef USE_MINI_HT - if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) { - int temp=-1; // note: must be ds-safe - #ifdef HOST_TEMPREG - temp=HOST_TEMPREG; - #endif - if(temp>=0) do_miniht_insert(return_address,rt,temp); - else emit_movimm(return_address,rt); - } - else - #endif - { - #ifdef REG_PREFETCH - if(temp>=0) - { - if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp); - } - #endif - emit_movimm(return_address,rt); // PC into link register - #ifdef IMM_PREFETCH - emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]); - #endif - } - } + if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) { + ujump_assemble_write_ra(i); // writeback ra for DS + ra_done=1; } ds_assemble(i+1,i_regs); uint64_t bc_unneeded=branch_regs[i].u; @@ -5207,6 +5360,8 @@ void ujump_assemble(int i,struct regstat *i_regs) wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32, bc_unneeded,bc_unneeded_upper); load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG); + if(!ra_done&&rt1[i]==31) + ujump_assemble_write_ra(i); int cc,adj; cc=get_reg(branch_regs[i].regmap,CCREG); assert(cc==HOST_CCREG); @@ -5215,7 +5370,7 @@ void ujump_assemble(int i,struct regstat *i_regs) if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp); #endif do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0); - if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc); + if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc); load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); if(internal_branch(branch_regs[i].is32,ba[i])) assem_debug("branch: internal\n"); @@ -5230,11 +5385,33 @@ void ujump_assemble(int i,struct regstat *i_regs) } } +static void rjump_assemble_write_ra(int i) +{ + int rt,return_address; + assert(rt1[i+1]!=rt1[i]); + assert(rt2[i+1]!=rt1[i]); + rt=get_reg(branch_regs[i].regmap,rt1[i]); + assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]); + assert(rt>=0); + return_address=start+i*4+8; + #ifdef REG_PREFETCH + if(temp>=0) + { + if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp); + } + #endif + emit_movimm(return_address,rt); // PC into link register + #ifdef IMM_PREFETCH + emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]); + #endif +} + void rjump_assemble(int i,struct regstat *i_regs) { signed char *i_regmap=i_regs->regmap; int temp; int rs,cc,adj; + int ra_done=0; rs=get_reg(branch_regs[i].regmap,rs1[i]); assert(rs>=0); if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) { @@ -5261,6 +5438,10 @@ void rjump_assemble(int i,struct regstat *i_regs) if(rh>=0) do_preload_rhash(rh); } #endif + if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) { + rjump_assemble_write_ra(i); + ra_done=1; + } ds_assemble(i+1,i_regs); uint64_t bc_unneeded=branch_regs[i].u; uint64_t bc_unneeded_upper=branch_regs[i].uu; @@ -5270,25 +5451,8 @@ void rjump_assemble(int i,struct regstat *i_regs) wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32, bc_unneeded,bc_unneeded_upper); load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG); - if(rt1[i]!=0) { - int rt,return_address; - assert(rt1[i+1]!=rt1[i]); - assert(rt2[i+1]!=rt1[i]); - rt=get_reg(branch_regs[i].regmap,rt1[i]); - assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]); - assert(rt>=0); - return_address=start+i*4+8; - #ifdef REG_PREFETCH - if(temp>=0) - { - if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp); - } - #endif - emit_movimm(return_address,rt); // PC into link register - #ifdef IMM_PREFETCH - emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]); - #endif - } + if(!ra_done&&rt1[i]!=0) + rjump_assemble_write_ra(i); cc=get_reg(branch_regs[i].regmap,CCREG); assert(cc==HOST_CCREG); #ifdef USE_MINI_HT @@ -5319,8 +5483,14 @@ void rjump_assemble(int i,struct regstat *i_regs) //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN); //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen //assert(adj==0); - emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG); + emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG); add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0); +#ifdef PCSX + if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10) + // special case for RFE + emit_jmp(0); + else +#endif emit_jns(0); //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1); #ifdef USE_MINI_HT @@ -5450,7 +5620,7 @@ void cjump_assemble(int i,struct regstat *i_regs) if(unconditional) { do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0); if(i!=(ba[i]-start)>>2 || source[i+1]!=0) { - if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc); + if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc); load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); if(internal) assem_debug("branch: internal\n"); @@ -5469,7 +5639,7 @@ void cjump_assemble(int i,struct regstat *i_regs) } } else if(nop) { - emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc); + emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc); int jaddr=(int)out; emit_jns(0); add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0); @@ -5477,7 +5647,7 @@ void cjump_assemble(int i,struct regstat *i_regs) else { int taken=0,nottaken=0,nottaken1=0; do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert); - if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc); + if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc); if(!only32) { assert(s1h>=0); @@ -5569,7 +5739,7 @@ void cjump_assemble(int i,struct regstat *i_regs) #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) { if(adj) { - emit_addimm(cc,-CLOCK_DIVIDER*adj,cc); + emit_addimm(cc,-CLOCK_ADJUST(adj),cc); add_to_linker((int)out,ba[i],internal); }else{ emit_addnop(13); @@ -5579,7 +5749,7 @@ void cjump_assemble(int i,struct regstat *i_regs) }else #endif { - if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc); + if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc); store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); if(internal) @@ -5599,7 +5769,7 @@ void cjump_assemble(int i,struct regstat *i_regs) if(nottaken1) set_jump_target(nottaken1,(int)out); if(adj) { - if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc); + if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc); } } // (!unconditional) } // if(ooo) @@ -5703,7 +5873,7 @@ void cjump_assemble(int i,struct regstat *i_regs) store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); do_cc(i,i_regmap,&adj,ba[i],TAKEN,0); assem_debug("cycle count (adj)\n"); - if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc); + if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc); load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); if(internal) assem_debug("branch: internal\n"); @@ -5735,7 +5905,7 @@ void cjump_assemble(int i,struct regstat *i_regs) if(cc==-1&&!likely[i]) { // Cycle count isn't in a register, temporarily load it then write it out emit_loadreg(CCREG,HOST_CCREG); - emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG); + emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG); int jaddr=(int)out; emit_jns(0); add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0); @@ -5744,7 +5914,7 @@ void cjump_assemble(int i,struct regstat *i_regs) else{ cc=get_reg(i_regmap,CCREG); assert(cc==HOST_CCREG); - emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc); + emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc); int jaddr=(int)out; emit_jns(0); add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0); @@ -5835,7 +6005,7 @@ void sjump_assemble(int i,struct regstat *i_regs) if(unconditional) { do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0); if(i!=(ba[i]-start)>>2 || source[i+1]!=0) { - if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc); + if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc); load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); if(internal) assem_debug("branch: internal\n"); @@ -5854,7 +6024,7 @@ void sjump_assemble(int i,struct regstat *i_regs) } } else if(nevertaken) { - emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc); + emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc); int jaddr=(int)out; emit_jns(0); add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0); @@ -5862,7 +6032,7 @@ void sjump_assemble(int i,struct regstat *i_regs) else { int nottaken=0; do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert); - if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc); + if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc); if(!only32) { assert(s1h>=0); @@ -5920,7 +6090,7 @@ void sjump_assemble(int i,struct regstat *i_regs) #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) { if(adj) { - emit_addimm(cc,-CLOCK_DIVIDER*adj,cc); + emit_addimm(cc,-CLOCK_ADJUST(adj),cc); add_to_linker((int)out,ba[i],internal); }else{ emit_addnop(13); @@ -5930,7 +6100,7 @@ void sjump_assemble(int i,struct regstat *i_regs) }else #endif { - if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc); + if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc); store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); if(internal) @@ -5949,7 +6119,7 @@ void sjump_assemble(int i,struct regstat *i_regs) } if(adj) { - if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc); + if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc); } } // (!unconditional) } // if(ooo) @@ -6032,7 +6202,7 @@ void sjump_assemble(int i,struct regstat *i_regs) store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); do_cc(i,i_regmap,&adj,ba[i],TAKEN,0); assem_debug("cycle count (adj)\n"); - if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc); + if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc); load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); if(internal) assem_debug("branch: internal\n"); @@ -6063,7 +6233,7 @@ void sjump_assemble(int i,struct regstat *i_regs) if(cc==-1&&!likely[i]) { // Cycle count isn't in a register, temporarily load it then write it out emit_loadreg(CCREG,HOST_CCREG); - emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG); + emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG); int jaddr=(int)out; emit_jns(0); add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0); @@ -6072,7 +6242,7 @@ void sjump_assemble(int i,struct regstat *i_regs) else{ cc=get_reg(i_regmap,CCREG); assert(cc==HOST_CCREG); - emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc); + emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc); int jaddr=(int)out; emit_jns(0); add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0); @@ -6138,7 +6308,7 @@ void fjump_assemble(int i,struct regstat *i_regs) assem_debug("cycle count (adj)\n"); if(1) { int nottaken=0; - if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc); + if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc); if(1) { assert(fs>=0); emit_testimm(fs,0x800000); @@ -6165,7 +6335,7 @@ void fjump_assemble(int i,struct regstat *i_regs) } // if(!only32) if(invert) { - if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc); + if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc); #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK else if(match) emit_addnop(13); #endif @@ -6186,7 +6356,7 @@ void fjump_assemble(int i,struct regstat *i_regs) } if(adj) { - if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc); + if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc); } } // (!unconditional) } // if(ooo) @@ -6238,7 +6408,7 @@ void fjump_assemble(int i,struct regstat *i_regs) store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); do_cc(i,i_regmap,&adj,ba[i],TAKEN,0); assem_debug("cycle count (adj)\n"); - if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc); + if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc); load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); if(internal) assem_debug("branch: internal\n"); @@ -6268,7 +6438,7 @@ void fjump_assemble(int i,struct regstat *i_regs) if(cc==-1&&!likely[i]) { // Cycle count isn't in a register, temporarily load it then write it out emit_loadreg(CCREG,HOST_CCREG); - emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG); + emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG); int jaddr=(int)out; emit_jns(0); add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0); @@ -6277,7 +6447,7 @@ void fjump_assemble(int i,struct regstat *i_regs) else{ cc=get_reg(i_regmap,CCREG); assert(cc==HOST_CCREG); - emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc); + emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc); int jaddr=(int)out; emit_jns(0); add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0); @@ -6350,7 +6520,7 @@ static void pagespan_assemble(int i,struct regstat *i_regs) if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1 load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG); } - emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG); if(opcode[i]==2) // J { unconditional=1; @@ -6648,16 +6818,22 @@ static void pagespan_ds() void unneeded_registers(int istart,int iend,int r) { int i; - uint64_t u,uu,b,bu; - uint64_t temp_u,temp_uu; + uint64_t u,uu,gte_u,b,bu,gte_bu; + uint64_t temp_u,temp_uu,temp_gte_u=0; uint64_t tdep; + uint64_t gte_u_unknown=0; + if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED) + gte_u_unknown=~0ll; if(iend==slen-1) { u=1;uu=1; + gte_u=gte_u_unknown; }else{ u=unneeded_reg[iend+1]; uu=unneeded_reg_upper[iend+1]; u=1;uu=1; + gte_u=gte_unneeded[iend+1]; } + for (i=iend;i>=istart;i--) { //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r); @@ -6671,6 +6847,7 @@ void unneeded_registers(int istart,int iend,int r) // Branch out of this block, flush all regs u=1; uu=1; + gte_u=gte_u_unknown; /* Hexagon hack if(itype[i]==UJUMP&&rt1[i]==31) { @@ -6702,17 +6879,21 @@ void unneeded_registers(int istart,int iend,int r) uu&=~((1LL<>rt1[i+1])&1; @@ -6739,17 +6922,21 @@ void unneeded_registers(int istart,int iend,int r) temp_uu&=~((1LL<>rt1[i])&1; @@ -6759,8 +6946,11 @@ void unneeded_registers(int istart,int iend,int r) temp_uu&=~((1LL<>2]=1; unneeded_reg_upper[(ba[i]-start)>>2]=1; + gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown; } } /*else*/ if(1) { if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000) @@ -6775,6 +6966,7 @@ void unneeded_registers(int istart,int iend,int r) // Unconditional branch u=unneeded_reg[(ba[i]-start)>>2]; uu=unneeded_reg_upper[(ba[i]-start)>>2]; + gte_u=gte_unneeded[(ba[i]-start)>>2]; branch_unneeded_reg[i]=u; branch_unneeded_reg_upper[i]=uu; //u=1; @@ -6789,10 +6981,13 @@ void unneeded_registers(int istart,int iend,int r) uu&=~((1LL<>2]; bu=unneeded_reg_upper[(ba[i]-start)>>2]; + gte_bu=gte_unneeded[(ba[i]-start)>>2]; branch_unneeded_reg[i]=b; branch_unneeded_reg_upper[i]=bu; //b=1; @@ -6807,20 +7002,25 @@ void unneeded_registers(int istart,int iend,int r) bu&=~((1LL<>2; for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF memory_map[n]=-1; +#endif for(n=0;n<4096;n++) ll_clear(jump_in+n); for(n=0;n<4096;n++) ll_clear(jump_out+n); for(n=0;n<4096;n++) ll_clear(jump_dirty+n); @@ -7809,10 +8018,16 @@ void new_dynarec_init() { printf("Init new dynarec\n"); out=(u_char *)BASE_ADDR; +#if BASE_ADDR_FIXED if (mmap (out, 1<0x80200000&& - 0x10000<=psxRegs.GPR.n.sp&&(psxRegs.GPR.n.sp&~0xe0e00000)>21)&0x1f; - if (source[i]&0x3f) { + //if (op2 & 0x10) { + if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns if (gte_handlers[source[i]&0x3f]!=NULL) { - snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f); + if (gte_regnames[source[i]&0x3f]!=NULL) + strcpy(insn[i],gte_regnames[source[i]&0x3f]); + else + snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f); type=C2OP; } } @@ -8312,6 +8533,7 @@ int new_recompile_block(int addr) us2[i]=0; dep1[i]=0; dep2[i]=0; + gte_rs[i]=gte_rt[i]=0; switch(type) { case LOAD: rs1[i]=(source[i]>>21)&0x1f; @@ -8475,7 +8697,6 @@ int new_recompile_block(int addr) if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET break; case COP1: - case COP2: rs1[i]=0; rs2[i]=0; rt1[i]=0; @@ -8485,6 +8706,23 @@ int new_recompile_block(int addr) if(op2==5) us1[i]=rs1[i]; // DMTC1 rs2[i]=CSREG; break; + case COP2: + rs1[i]=0; + rs2[i]=0; + rt1[i]=0; + rt2[i]=0; + if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2 + if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2 + rs2[i]=CSREG; + int gr=(source[i]>>11)&0x1F; + switch(op2) + { + case 0x00: gte_rs[i]=1ll<>21)&0x1F; rs2[i]=CSREG; @@ -8498,6 +8736,23 @@ int new_recompile_block(int addr) rt1[i]=0; rt2[i]=0; imm[i]=(short)source[i]; + if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2 + else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2 + break; + case C2OP: + rs1[i]=0; + rs2[i]=0; + rt1[i]=0; + rt2[i]=0; + gte_rs[i]=gte_reg_reads[source[i]&0x3f]; + gte_rt[i]=gte_reg_writes[source[i]&0x3f]; + gte_rt[i]|=1ll<<63; // every op changes flags + if((source[i]&0x3f)==GTE_MVMVA) { + int v = (source[i] >> 15) & 3; + gte_rs[i]&=~0xe3fll; + if(v==3) gte_rs[i]|=0xe00ll; + else gte_rs[i]|=3ll<<(v*2); + } break; case FLOAT: case FCONV: @@ -8631,6 +8886,7 @@ int new_recompile_block(int addr) dirty_reg(¤t,CCREG); current.isconst=0; current.wasconst=0; + current.waswritten=0; int ds=0; int cc=0; int hr=-1; @@ -8659,6 +8915,7 @@ int new_recompile_block(int addr) if(current.regmap[hr]==0) current.regmap[hr]=-1; } current.isconst=0; + current.waswritten=0; } if(i>1) { @@ -8723,6 +8980,7 @@ int new_recompile_block(int addr) regs[i].wasconst=current.isconst; regs[i].was32=current.is32; regs[i].wasdirty=current.dirty; + regs[i].loadedconst=0; #if defined(DESTRUCTIVE_WRITEBACK) && !defined(FORCE32) // To change a dirty register from 32 to 64 bits, we must write // it out during the previous cycle (for branches, 2 cycles) @@ -8897,8 +9155,6 @@ int new_recompile_block(int addr) clear_const(¤t,rt1[i]); alloc_cc(¤t,i); dirty_reg(¤t,CCREG); - ooo[i]=1; - delayslot_alloc(¤t,i+1); if (rt1[i]==31) { alloc_reg(¤t,i,31); dirty_reg(¤t,31); @@ -8909,6 +9165,8 @@ int new_recompile_block(int addr) #endif //current.is32|=1LL<0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800) + current.waswritten|=1<=0x800) + current.waswritten&=~(1<0) { @@ -9305,7 +9571,7 @@ int new_recompile_block(int addr) branch_regs[i-1].is32|=1LL<<31; } memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current.constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); break; case RJUMP: memcpy(&branch_regs[i-1],¤t,sizeof(current)); @@ -9330,7 +9596,7 @@ int new_recompile_block(int addr) } #endif memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current.constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); break; case CJUMP: if((opcode[i-1]&0x3E)==4) // BEQ/BNE @@ -9366,7 +9632,7 @@ int new_recompile_block(int addr) branch_regs[i-1].isconst=0; branch_regs[i-1].wasconst=0; memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current.constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); } else if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ @@ -9399,7 +9665,7 @@ int new_recompile_block(int addr) branch_regs[i-1].isconst=0; branch_regs[i-1].wasconst=0; memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current.constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); } else // Alloc the delay slot in case the branch is taken @@ -9465,7 +9731,7 @@ int new_recompile_block(int addr) branch_regs[i-1].isconst=0; branch_regs[i-1].wasconst=0; memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current.constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); } else // Alloc the delay slot in case the branch is taken @@ -9579,7 +9845,12 @@ int new_recompile_block(int addr) { cc=0; } -#ifdef PCSX +#if defined(PCSX) && !defined(DRC_DBG) + else if(itype[i]==C2OP&>e_cycletab[source[i]&0x3f]>2) + { + // GTE runs in parallel until accessed, divide by 2 for a rough guess + cc+=gte_cycletab[source[i]&0x3f]/2; + } else if(/*itype[i]==LOAD||*/itype[i]==STORE||itype[i]==C1LS) // load causes weird timing issues { cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER) @@ -9599,7 +9870,7 @@ int new_recompile_block(int addr) regs[i].is32=current.is32; regs[i].dirty=current.dirty; regs[i].isconst=current.isconst; - memcpy(constmap[i],current.constmap,sizeof(current.constmap)); + memcpy(constmap[i],current_constmap,sizeof(current_constmap)); } for(hr=0;hr=0) { @@ -9609,6 +9880,7 @@ int new_recompile_block(int addr) } } if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1; + regs[i].waswritten=current.waswritten; } /* Pass 4 - Cull unused host registers */ @@ -10753,9 +11025,9 @@ int new_recompile_block(int addr) if(itype[slen-1]==SPAN) { bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception } - + +#ifdef DISASM /* Debug/disassembly */ - if((void*)assem_debug==(void*)printf) for(i=0;i>16)!=0x1000)) { @@ -11143,7 +11417,7 @@ int new_recompile_block(int addr) store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4); if(regs[i-1].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); - emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG); } else if(!likely[i-2]) { @@ -11166,7 +11440,7 @@ int new_recompile_block(int addr) store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4); if(regs[i-1].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); - emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG); add_to_linker((int)out,start+i*4,0); emit_jmp(0); } @@ -11305,7 +11579,7 @@ int new_recompile_block(int addr) // If we're within 256K of the end of the buffer, // start over from the beginning. (Is 256K enough?) - if((int)out>BASE_ADDR+(1<(u_int)BASE_ADDR+(1<>12;i<=(start+slen*4)>>12;i++) { @@ -11321,20 +11595,23 @@ int new_recompile_block(int addr) } #endif } + inv_code_start=inv_code_end=~0; #ifdef PCSX - // PCSX maps all RAM mirror invalid_code tests to 0x80000000..0x80000000+RAM_SIZE + // for PCSX we need to mark all mirrors too if(get_page(start)<(RAM_SIZE>>12)) for(i=start>>12;i<=(start+slen*4)>>12;i++) - invalid_code[((u_int)0x80000000>>12)|i]=0; + invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]= + invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]= + invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0; #endif /* Pass 10 - Free memory by expiring oldest blocks */ - int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535; + int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535; while(expirep!=end) { int shift=TARGET_SIZE_2-3; // Divide into 8 blocks - int base=BASE_ADDR+((expirep>>13)<>13)<>11)&3) {