X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?p=pcsx_rearmed.git;a=blobdiff_plain;f=libpcsxcore%2Fnew_dynarec%2Fnew_dynarec.c;h=e0cff62ca622b1b5b04dd942626a76e89d4ab5f0;hp=0809a4aa4083b830803f38288a663412051a99e2;hb=e3c6bdb5e46f72f063bb7f588da6588ac1893b17;hpb=be516ebe45e48044b599e9d9f9f2d296c3f3ee62 diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index 0809a4aa..e0cff62c 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -35,12 +35,18 @@ static int sceBlock; #endif #include "new_dynarec_config.h" -#include "../psxhle.h" //emulator interface -#include "emu_if.h" //emulator interface +#include "../psxhle.h" +#include "../psxinterpreter.h" +#include "../gte.h" +#include "emu_if.h" // emulator interface +#define noinline __attribute__((noinline,noclone)) #ifndef ARRAY_SIZE #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) #endif +#ifndef min +#define min(a, b) ((b) < (a) ? (b) : (a)) +#endif //#define DISASM //#define assem_debug printf @@ -61,9 +67,27 @@ static int sceBlock; #include "assem_arm64.h" #endif +#define RAM_SIZE 0x200000 #define MAXBLOCK 4096 #define MAX_OUTPUT_BLOCK_SIZE 262144 +struct ndrc_mem +{ + u_char translation_cache[1 << TARGET_SIZE_2]; + struct + { + struct tramp_insns ops[2048 / sizeof(struct tramp_insns)]; + const void *f[2048 / sizeof(void *)]; + } tramp; +}; + +#ifdef BASE_ADDR_DYNAMIC +static struct ndrc_mem *ndrc; +#else +static struct ndrc_mem ndrc_ __attribute__((aligned(4096))); +static struct ndrc_mem *ndrc = &ndrc_; +#endif + // stubs enum stub_type { CC_STUB = 1, @@ -147,8 +171,6 @@ struct link_entry static u_char rs2[MAXBLOCK]; static u_char rt1[MAXBLOCK]; static u_char rt2[MAXBLOCK]; - static u_char us1[MAXBLOCK]; - static u_char us2[MAXBLOCK]; static u_char dep1[MAXBLOCK]; static u_char dep2[MAXBLOCK]; static u_char lt1[MAXBLOCK]; @@ -167,9 +189,11 @@ struct link_entry static char ooo[MAXBLOCK]; static uint64_t unneeded_reg[MAXBLOCK]; static uint64_t branch_unneeded_reg[MAXBLOCK]; - static signed char regmap_pre[MAXBLOCK][HOST_REGS]; - static uint64_t current_constmap[HOST_REGS]; - static uint64_t constmap[MAXBLOCK][HOST_REGS]; + static signed char regmap_pre[MAXBLOCK][HOST_REGS]; // pre-instruction i? + // contains 'real' consts at [i] insn, but may differ from what's actually + // loaded in host reg as 'final' value is always loaded, see get_final_value() + static uint32_t current_constmap[HOST_REGS]; + static uint32_t constmap[MAXBLOCK][HOST_REGS]; static struct regstat regs[MAXBLOCK]; static struct regstat branch_regs[MAXBLOCK]; static signed char minimum_free_regs[MAXBLOCK]; @@ -197,14 +221,23 @@ struct link_entry #endif int new_dynarec_hacks; + int new_dynarec_hacks_pergame; int new_dynarec_did_compile; + + #define HACK_ENABLED(x) ((new_dynarec_hacks | new_dynarec_hacks_pergame) & (x)) + + extern int cycle_count; // ... until end of the timeslice, counts -N -> 0 + extern int last_count; // last absolute target, often = next_interupt + extern int pcaddr; + extern int pending_exception; + extern int branch_target; + extern uintptr_t mini_ht[32][2]; extern u_char restore_candidate[512]; - extern int cycle_count; /* registers that may be allocated */ /* 1-31 gpr */ -#define HIREG 32 // hi -#define LOREG 33 // lo +#define LOREG 32 // lo +#define HIREG 33 // hi //#define FSREG 34 // FPU status (FCSR) #define CSREG 35 // Coprocessor status #define CCREG 36 // Cycle count @@ -263,8 +296,11 @@ struct link_entry #define NOTTAKEN 2 #define NULLDS 3 +#define DJT_1 (void *)1l // no function, just a label in assem_debug log +#define DJT_2 (void *)2l + // asm linkage -int new_recompile_block(int addr); +int new_recompile_block(u_int addr); void *get_addr_ht(u_int vaddr); void invalidate_block(u_int block); void invalidate_addr(u_int addr); @@ -272,14 +308,12 @@ void remove_hash(int vaddr); void dyna_linker(); void dyna_linker_ds(); void verify_code(); -void verify_code_vm(); void verify_code_ds(); void cc_interrupt(); void fp_exception(); void fp_exception_ds(); -void jump_syscall_hle(); -void jump_hlecall(); -void jump_intcall(); +void jump_to_new_pc(); +void call_gteStall(); void new_dyna_leave(); // Needed by assembler @@ -290,15 +324,22 @@ static void load_all_regs(signed char i_regmap[]); static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]); static void load_regs_entry(int t); static void load_all_consts(signed char regmap[],u_int dirty,int i); +static u_int get_host_reglist(const signed char *regmap); -static int verify_dirty(u_int *ptr); +static int verify_dirty(const u_int *ptr); static int get_final_value(int hr, int i, int *value); static void add_stub(enum stub_type type, void *addr, void *retaddr, u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e); static void add_stub_r(enum stub_type type, void *addr, void *retaddr, - int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist); + int i, int addr_reg, const struct regstat *i_regs, int ccadj, u_int reglist); static void add_to_linker(void *addr, u_int target, int ext); static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override); +static void *get_direct_memhandler(void *table, u_int addr, + enum stub_type type, uintptr_t *addr_host); +static void cop2_call_stall_check(u_int op, int i, const struct regstat *i_regs, u_int reglist); +static void pass_args(int a0, int a1); +static void emit_far_jump(const void *f); +static void emit_far_call(const void *f); static void mprotect_w_x(void *start, void *end, int is_x) { @@ -327,7 +368,7 @@ static void start_tcache_write(void *start, void *end) static void end_tcache_write(void *start, void *end) { -#ifdef __arm__ +#if defined(__arm__) || defined(__aarch64__) size_t len = (char *)end - (char *)start; #if defined(__BLACKBERRY_QNX__) msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE); @@ -337,12 +378,14 @@ static void end_tcache_write(void *start, void *end) sceKernelSyncVMDomain(sceBlock, start, len); #elif defined(_3DS) ctr_flush_invalidate_cache(); + #elif defined(__aarch64__) + // as of 2021, __clear_cache() is still broken on arm64 + // so here is a custom one :( + clear_cache_arm64(start, end); #else __clear_cache(start, end); #endif (void)len; -#else - __clear_cache(start, end); #endif mprotect_w_x(start, end, 1); @@ -351,8 +394,8 @@ static void end_tcache_write(void *start, void *end) static void *start_block(void) { u_char *end = out + MAX_OUTPUT_BLOCK_SIZE; - if (end > translation_cache + (1< ndrc->translation_cache + sizeof(ndrc->translation_cache)) + end = ndrc->translation_cache + sizeof(ndrc->translation_cache); start_tcache_write(out, end); return out; } @@ -362,16 +405,74 @@ static void end_block(void *start) end_tcache_write(start, out); } +// also takes care of w^x mappings when patching code +static u_int needs_clear_cache[1<<(TARGET_SIZE_2-17)]; + +static void mark_clear_cache(void *target) +{ + uintptr_t offset = (u_char *)target - ndrc->translation_cache; + u_int mask = 1u << ((offset >> 12) & 31); + if (!(needs_clear_cache[offset >> 17] & mask)) { + char *start = (char *)((uintptr_t)target & ~4095l); + start_tcache_write(start, start + 4095); + needs_clear_cache[offset >> 17] |= mask; + } +} + +// Clearing the cache is rather slow on ARM Linux, so mark the areas +// that need to be cleared, and then only clear these areas once. +static void do_clear_cache(void) +{ + int i, j; + for (i = 0; i < (1<<(TARGET_SIZE_2-17)); i++) + { + u_int bitmap = needs_clear_cache[i]; + if (!bitmap) + continue; + for (j = 0; j < 32; j++) + { + u_char *start, *end; + if (!(bitmap & (1<translation_cache + i*131072 + j*4096; + end = start + 4095; + for (j++; j < 32; j++) { + if (!(bitmap & (1<>31)|1; - return (x * cycle_multiplier + s * 50) / 100; + return (x * m + s * 50) / 100; +} + +// is the op an unconditional jump? +static int is_ujump(int i) +{ + return itype[i] == UJUMP || itype[i] == RJUMP + || (source[i] >> 16) == 0x1000; // beq r0, r0, offset // b offset +} + +static int is_jump(int i) +{ + return itype[i] == RJUMP || itype[i] == UJUMP || itype[i] == CJUMP || itype[i] == SJUMP; } static u_int get_page(u_int vaddr) @@ -412,7 +513,7 @@ static int doesnt_expire_soon(void *tcaddr) // Get address from virtual address // This is called from the recompiled JR/JALR instructions -void *get_addr(u_int vaddr) +void noinline *get_addr(u_int vaddr) { u_int page=get_page(vaddr); u_int vpage=get_vpage(vaddr); @@ -480,7 +581,7 @@ void clear_all_regs(signed char regmap[]) for (hr=0;hrisconst|=1<regmap[hr]^64)==reg) { - cur->isconst|=1<>32; - } } } -void clear_const(struct regstat *cur,signed char reg) +static void clear_const(struct regstat *cur, signed char reg) { int hr; if(!reg) return; @@ -546,7 +643,7 @@ void clear_const(struct regstat *cur,signed char reg) } } -int is_const(struct regstat *cur,signed char reg) +static int is_const(struct regstat *cur, signed char reg) { int hr; if(reg<0) return 0; @@ -558,7 +655,8 @@ int is_const(struct regstat *cur,signed char reg) } return 0; } -uint64_t get_const(struct regstat *cur,signed char reg) + +static uint32_t get_const(struct regstat *cur, signed char reg) { int hr; if(!reg) return 0; @@ -568,7 +666,7 @@ uint64_t get_const(struct regstat *cur,signed char reg) } } SysPrintf("Unknown constant in r%d\n",reg); - exit(1); + abort(); } // Least soon needed registers @@ -584,7 +682,7 @@ void lsn(u_char hsn[], int i, int *preferred_reg) j=slen-i-1; break; } - if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000) + if (is_ujump(i+j)) { // Don't go past an unconditonal jump j++; @@ -632,7 +730,7 @@ void lsn(u_char hsn[], int i, int *preferred_reg) // TODO: preferred register based on backward branch } // Delay slot should preferably not overwrite branch conditions or cycle count - if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)) { + if (i > 0 && is_jump(i-1)) { if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1; if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1; hsn[CCREG]=1; @@ -667,7 +765,7 @@ int needed_again(int r, int i) int b=-1; int rn=10; - if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) + if (i > 0 && is_ujump(i-1)) { if(ba[i-1]start+slen*4-4) return 0; // Don't need any registers if exiting the block @@ -678,7 +776,7 @@ int needed_again(int r, int i) j=slen-i-1; break; } - if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000) + if (is_ujump(i+j)) { // Don't go past an unconditonal jump j++; @@ -734,7 +832,7 @@ int loop_reg(int i, int r, int hr) j=slen-i-1; break; } - if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000) + if (is_ujump(i+j)) { // Don't go past an unconditonal jump j++; @@ -789,12 +887,30 @@ void alloc_all(struct regstat *cur,int i) } } +#ifndef NDEBUG +static int host_tempreg_in_use; + +static void host_tempreg_acquire(void) +{ + assert(!host_tempreg_in_use); + host_tempreg_in_use = 1; +} + +static void host_tempreg_release(void) +{ + host_tempreg_in_use = 0; +} +#else +static void host_tempreg_acquire(void) {} +static void host_tempreg_release(void) {} +#endif + #ifdef DRC_DBG extern void gen_interupt(); extern void do_insn_cmp(); -#define FUNCNAME(f) { (intptr_t)f, " " #f } +#define FUNCNAME(f) { f, " " #f } static const struct { - intptr_t addr; + void *addr; const char *name; } function_names[] = { FUNCNAME(cc_interrupt), @@ -808,17 +924,18 @@ static const struct { FUNCNAME(jump_handler_write16), FUNCNAME(jump_handler_write32), FUNCNAME(invalidate_addr), - FUNCNAME(verify_code_vm), - FUNCNAME(verify_code), - FUNCNAME(jump_hlecall), - FUNCNAME(jump_syscall_hle), + FUNCNAME(jump_to_new_pc), + FUNCNAME(call_gteStall), FUNCNAME(new_dyna_leave), FUNCNAME(pcsx_mtc0), FUNCNAME(pcsx_mtc0_ds), FUNCNAME(do_insn_cmp), +#ifdef __arm__ + FUNCNAME(verify_code), +#endif }; -static const char *func_name(intptr_t a) +static const char *func_name(const void *a) { int i; for (i = 0; i < sizeof(function_names)/sizeof(function_names[0]); i++) @@ -843,6 +960,48 @@ static const char *func_name(intptr_t a) #include "assem_arm64.c" #endif +static void *get_trampoline(const void *f) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(ndrc->tramp.f); i++) { + if (ndrc->tramp.f[i] == f || ndrc->tramp.f[i] == NULL) + break; + } + if (i == ARRAY_SIZE(ndrc->tramp.f)) { + SysPrintf("trampoline table is full, last func %p\n", f); + abort(); + } + if (ndrc->tramp.f[i] == NULL) { + start_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]); + ndrc->tramp.f[i] = f; + end_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]); + } + return &ndrc->tramp.ops[i]; +} + +static void emit_far_jump(const void *f) +{ + if (can_jump_or_call(f)) { + emit_jmp(f); + return; + } + + f = get_trampoline(f); + emit_jmp(f); +} + +static void emit_far_call(const void *f) +{ + if (can_jump_or_call(f)) { + emit_call(f); + return; + } + + f = get_trampoline(f); + emit_call(f); +} + // Add virtual address mapping to linked list void ll_add(struct ll_entry **head,int vaddr,void *addr) { @@ -970,9 +1129,7 @@ static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift) { inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr); void *host_addr=find_extjump_insn(head->addr); - #ifdef __arm__ - mark_clear_cache(host_addr); - #endif + mark_clear_cache(host_addr); set_jump_target(host_addr, head->addr); } head=head->next; @@ -980,7 +1137,7 @@ static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift) } // This is called when we write to a compiled block (see do_invstub) -void invalidate_page(u_int page) +static void invalidate_page(u_int page) { struct ll_entry *head; struct ll_entry *next; @@ -998,9 +1155,7 @@ void invalidate_page(u_int page) while(head!=NULL) { inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr); void *host_addr=find_extjump_insn(head->addr); - #ifdef __arm__ - mark_clear_cache(host_addr); - #endif + mark_clear_cache(host_addr); set_jump_target(host_addr, head->addr); next=head->next; free(head); @@ -1023,9 +1178,7 @@ static void invalidate_block_range(u_int block, u_int first, u_int last) for(first=page+1;first %x (%d)\n",src,vaddr,page); - int *ptr=(int *)(src+4); - assert((*ptr&0x0fff0000)==0x059f0000); - (void)ptr; + check_extjump2(src); ll_add(jump_out+page,vaddr,src); //void *ptr=get_pointer(src); //inv_debug("add_link: Pointer is to %p\n",ptr); @@ -1307,16 +1472,6 @@ static void alloc_reg(struct regstat *cur,int i,signed char reg) for(r=1;r<=MAXREG;r++) { if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { - for(hr=0;hrregmap[hr]==r+64) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<regmap[hr]==r) { @@ -1336,14 +1491,6 @@ static void alloc_reg(struct regstat *cur,int i,signed char reg) for(r=1;r<=MAXREG;r++) { if(hsn[r]==j) { - for(hr=0;hrregmap[hr]==r+64) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<regmap[hr]==r) { cur->regmap[hr]=reg; @@ -1355,7 +1502,7 @@ static void alloc_reg(struct regstat *cur,int i,signed char reg) } } } - SysPrintf("This shouldn't happen (alloc_reg)");exit(1); + SysPrintf("This shouldn't happen (alloc_reg)");abort(); } // Allocate a temporary register. This is done without regard to @@ -1418,16 +1565,6 @@ static void alloc_reg_temp(struct regstat *cur,int i,signed char reg) for(r=1;r<=MAXREG;r++) { if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { - for(hr=0;hr2) { - if(cur->regmap[hr]==r+64) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<2) { if(cur->regmap[hr]==r) { @@ -1447,14 +1584,6 @@ static void alloc_reg_temp(struct regstat *cur,int i,signed char reg) for(r=1;r<=MAXREG;r++) { if(hsn[r]==j) { - for(hr=0;hrregmap[hr]==r+64) { - cur->regmap[hr]=reg; - cur->dirty&=~(1<isconst&=~(1<regmap[hr]==r) { cur->regmap[hr]=reg; @@ -1466,7 +1595,7 @@ static void alloc_reg_temp(struct regstat *cur,int i,signed char reg) } } } - SysPrintf("This shouldn't happen");exit(1); + SysPrintf("This shouldn't happen");abort(); } static void mov_alloc(struct regstat *current,int i) @@ -1618,7 +1747,7 @@ static void imm16_alloc(struct regstat *current,int i) else clear_const(current,rt1[i]); } else { - set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI + set_const(current,rt1[i],imm[i]<<16); // LUI } dirty_reg(current,rt1[i]); } @@ -1798,19 +1927,19 @@ void cop0_alloc(struct regstat *current,int i) minimum_free_regs[i]=HOST_REGS; } -static void cop12_alloc(struct regstat *current,int i) +static void cop2_alloc(struct regstat *current,int i) { - alloc_reg(current,i,CSREG); // Load status - if(opcode2[i]<3) // MFC1/CFC1 + if (opcode2[i] < 3) // MFC2/CFC2 { + alloc_cc(current,i); // for stalls + dirty_reg(current,CCREG); if(rt1[i]){ clear_const(current,rt1[i]); alloc_reg(current,i,rt1[i]); dirty_reg(current,rt1[i]); } - alloc_reg_temp(current,i,-1); } - else if(opcode2[i]>3) // MTC1/CTC1 + else if (opcode2[i] > 3) // MTC2/CTC2 { if(rs1[i]){ clear_const(current,rs1[i]); @@ -1820,13 +1949,15 @@ static void cop12_alloc(struct regstat *current,int i) current->u&=~1LL; alloc_reg(current,i,0); } - alloc_reg_temp(current,i,-1); } + alloc_reg_temp(current,i,-1); minimum_free_regs[i]=1; } void c2op_alloc(struct regstat *current,int i) { + alloc_cc(current,i); // for stalls + dirty_reg(current,CCREG); alloc_reg_temp(current,i,-1); } @@ -1849,7 +1980,7 @@ void delayslot_alloc(struct regstat *current,int i) case SYSCALL: case HLECALL: case SPAN: - assem_debug("jump in the delay slot. this shouldn't happen.\n");//exit(1); + assem_debug("jump in the delay slot. this shouldn't happen.\n");//abort(); SysPrintf("Disabled speculative precompilation\n"); stop_after_jal=1; break; @@ -1883,8 +2014,9 @@ void delayslot_alloc(struct regstat *current,int i) cop0_alloc(current,i); break; case COP1: + break; case COP2: - cop12_alloc(current,i); + cop2_alloc(current,i); break; case C1LS: c1ls_alloc(current,i); @@ -1937,7 +2069,7 @@ static void pagespan_alloc(struct regstat *current,int i) static void add_stub(enum stub_type type, void *addr, void *retaddr, u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e) { - assert(a < ARRAY_SIZE(stubs)); + assert(stubcount < ARRAY_SIZE(stubs)); stubs[stubcount].type = type; stubs[stubcount].addr = addr; stubs[stubcount].retaddr = retaddr; @@ -1950,7 +2082,7 @@ static void add_stub(enum stub_type type, void *addr, void *retaddr, } static void add_stub_r(enum stub_type type, void *addr, void *retaddr, - int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist) + int i, int addr_reg, const struct regstat *i_regs, int ccadj, u_int reglist) { add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist); } @@ -1994,16 +2126,24 @@ static void wb_valid(signed char pre[],signed char entry[],u_int dirty_pre,u_int } } -void rlist() +// trashes r2 +static void pass_args(int a0, int a1) { - int i; - printf("TRACE: "); - for(i=0;i<32;i++) - printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]); - printf("\n"); + if(a0==1&&a1==0) { + // must swap + emit_mov(a0,2); emit_mov(a1,1); emit_mov(2,0); + } + else if(a0!=0&&a1==0) { + emit_mov(a1,1); + if (a0>=0) emit_mov(a0,0); + } + else { + if(a0>=0&&a0!=0) emit_mov(a0,0); + if(a1>=0&&a1!=1) emit_mov(a1,1); + } } -void alu_assemble(int i,struct regstat *i_regs) +static void alu_assemble(int i,struct regstat *i_regs) { if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU if(rt1[i]) { @@ -2050,10 +2190,11 @@ void alu_assemble(int i,struct regstat *i_regs) s2l=get_reg(i_regs->regmap,rs2[i]); if(rs2[i]==0) // rx=0); - if(opcode2[i]==0x2a) // SLT + if(opcode2[i]==0x2a&&rs1[i]!=0) { // SLT + assert(s1l>=0); emit_shrimm(s1l,31,t); - else // SLTU (unsigned can not be less than zero) + } + else // SLTU (unsigned can not be less than zero, 0<0) emit_zeroreg(t); } else if(rs1[i]==0) // r0regmap,rt1[i]|64); + signed char sl,tl; tl=get_reg(i_regs->regmap,rt1[i]); - sh=get_reg(i_regs->regmap,rs1[i]|64); sl=get_reg(i_regs->regmap,rs1[i]); if(tl>=0) { if(rs1[i]) { - assert(sh>=0); assert(sl>=0); - if(th>=0) { - emit_addimm64_32(sh,sl,imm[i],th,tl); - } - else { - emit_addimm(sl,imm[i],tl); - } + emit_addimm(sl,imm[i],tl); } else { emit_movimm(imm[i],tl); - if(th>=0) emit_movimm(((signed int)imm[i])>>31,th); } } } @@ -2250,10 +2382,8 @@ void imm16_assemble(int i,struct regstat *i_regs) } else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI if(rt1[i]) { - signed char sh,sl,th,tl; - th=get_reg(i_regs->regmap,rt1[i]|64); + signed char sl,tl; tl=get_reg(i_regs->regmap,rt1[i]); - sh=get_reg(i_regs->regmap,rs1[i]|64); sl=get_reg(i_regs->regmap,rs1[i]); if(tl>=0 && !((i_regs->isconst>>tl)&1)) { if(opcode[i]==0x0c) //ANDI @@ -2271,7 +2401,6 @@ void imm16_assemble(int i,struct regstat *i_regs) } else emit_zeroreg(tl); - if(th>=0) emit_zeroreg(th); } else { @@ -2279,13 +2408,6 @@ void imm16_assemble(int i,struct regstat *i_regs) if(sl<0) { if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl); } - if(th>=0) { - if(sh<0) { - emit_loadreg(rs1[i]|64,th); - }else{ - emit_mov(sh,th); - } - } if(opcode[i]==0x0d) { // ORI if(sl<0) { emit_orimm(tl,imm[i],tl); @@ -2309,7 +2431,6 @@ void imm16_assemble(int i,struct regstat *i_regs) } else { emit_movimm(imm[i],tl); - if(th>=0) emit_zeroreg(th); } } } @@ -2375,11 +2496,44 @@ void shiftimm_assemble(int i,struct regstat *i_regs) } #ifndef shift_assemble -void shift_assemble(int i,struct regstat *i_regs) +static void shift_assemble(int i,struct regstat *i_regs) { - printf("Need shift_assemble for this architecture.\n"); - exit(1); + signed char s,t,shift; + if (rt1[i] == 0) + return; + assert(opcode2[i]<=0x07); // SLLV/SRLV/SRAV + t = get_reg(i_regs->regmap, rt1[i]); + s = get_reg(i_regs->regmap, rs1[i]); + shift = get_reg(i_regs->regmap, rs2[i]); + if (t < 0) + return; + + if(rs1[i]==0) + emit_zeroreg(t); + else if(rs2[i]==0) { + assert(s>=0); + if(s!=t) emit_mov(s,t); + } + else { + host_tempreg_acquire(); + emit_andimm(shift,31,HOST_TEMPREG); + switch(opcode2[i]) { + case 4: // SLLV + emit_shl(s,HOST_TEMPREG,t); + break; + case 6: // SRLV + emit_shr(s,HOST_TEMPREG,t); + break; + case 7: // SRAV + emit_sar(s,HOST_TEMPREG,t); + break; + default: + assert(0); + } + host_tempreg_release(); + } } + #endif enum { @@ -2424,24 +2578,29 @@ static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override) } if(type==MTYPE_8020) { // RAM 80200000+ mirror + host_tempreg_acquire(); emit_andimm(addr,~0x00e00000,HOST_TEMPREG); addr=*addr_reg_override=HOST_TEMPREG; type=0; } else if(type==MTYPE_0000) { // RAM 0 mirror + host_tempreg_acquire(); emit_orimm(addr,0x80000000,HOST_TEMPREG); addr=*addr_reg_override=HOST_TEMPREG; type=0; } else if(type==MTYPE_A000) { // RAM A mirror + host_tempreg_acquire(); emit_andimm(addr,~0x20000000,HOST_TEMPREG); addr=*addr_reg_override=HOST_TEMPREG; type=0; } else if(type==MTYPE_1F80) { // scratchpad if (psxH == (void *)0x1f800000) { - emit_addimm(addr,-0x1f800000,HOST_TEMPREG); + host_tempreg_acquire(); + emit_xorimm(addr,0x1f800000,HOST_TEMPREG); emit_cmpimm(HOST_TEMPREG,0x1000); + host_tempreg_release(); jaddr=out; emit_jc(0); } @@ -2463,6 +2622,7 @@ static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override) #endif emit_jno(0); if(ram_offset!=0) { + host_tempreg_acquire(); emit_addimm(addr,ram_offset,HOST_TEMPREG); addr=*addr_reg_override=HOST_TEMPREG; } @@ -2471,21 +2631,73 @@ static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override) return jaddr; } -static void load_assemble(int i,struct regstat *i_regs) +// return memhandler, or get directly accessable address and return 0 +static void *get_direct_memhandler(void *table, u_int addr, + enum stub_type type, uintptr_t *addr_host) { - int s,th,tl,addr; + uintptr_t l1, l2 = 0; + l1 = ((uintptr_t *)table)[addr>>12]; + if ((l1 & (1ul << (sizeof(l1)*8-1))) == 0) { + uintptr_t v = l1 << 1; + *addr_host = v + addr; + return NULL; + } + else { + l1 <<= 1; + if (type == LOADB_STUB || type == LOADBU_STUB || type == STOREB_STUB) + l2 = ((uintptr_t *)l1)[0x1000/4 + 0x1000/2 + (addr&0xfff)]; + else if (type == LOADH_STUB || type == LOADHU_STUB || type == STOREH_STUB) + l2=((uintptr_t *)l1)[0x1000/4 + (addr&0xfff)/2]; + else + l2=((uintptr_t *)l1)[(addr&0xfff)/4]; + if ((l2 & (1<<31)) == 0) { + uintptr_t v = l2 << 1; + *addr_host = v + (addr&0xfff); + return NULL; + } + return (void *)(l2 << 1); + } +} + +static u_int get_host_reglist(const signed char *regmap) +{ + u_int reglist = 0, hr; + for (hr = 0; hr < HOST_REGS; hr++) { + if (hr != EXCLUDE_REG && regmap[hr] >= 0) + reglist |= 1 << hr; + } + return reglist; +} + +static u_int reglist_exclude(u_int reglist, int r1, int r2) +{ + if (r1 >= 0) + reglist &= ~(1u << r1); + if (r2 >= 0) + reglist &= ~(1u << r2); + return reglist; +} + +// find a temp caller-saved register not in reglist (so assumed to be free) +static int reglist_find_free(u_int reglist) +{ + u_int free_regs = ~reglist & CALLER_SAVE_REGS; + if (free_regs == 0) + return -1; + return __builtin_ctz(free_regs); +} + +static void load_assemble(int i, const struct regstat *i_regs) +{ + int s,tl,addr; int offset; void *jaddr=0; int memtarget=0,c=0; - int fastload_reg_override=0; - u_int hr,reglist=0; - th=get_reg(i_regs->regmap,rt1[i]|64); + int fastio_reg_override=-1; + u_int reglist=get_host_reglist(i_regs->regmap); tl=get_reg(i_regs->regmap,rt1[i]); s=get_reg(i_regs->regmap,rs1[i]); offset=imm[i]; - for(hr=0;hrregmap[hr]>=0) reglist|=1<regmap[HOST_CCREG]==CCREG) reglist&=~(1<=0) { c=(i_regs->wasconst>>s)&1; @@ -2512,19 +2724,19 @@ static void load_assemble(int i,struct regstat *i_regs) //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset); assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O reglist&=~(1<=0) reglist&=~(1<=0x80000000+RAM_SIZE) #endif { - jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override); + jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override); } } else if(ram_offset&&memtarget) { + host_tempreg_acquire(); emit_addimm(addr,ram_offset,HOST_TEMPREG); - fastload_reg_override=HOST_TEMPREG; + fastio_reg_override=HOST_TEMPREG; } int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg if (opcode[i]==0x20) { // LB @@ -2533,7 +2745,7 @@ static void load_assemble(int i,struct regstat *i_regs) { int x=0,a=tl; if(!c) a=addr; - if(fastload_reg_override) a=fastload_reg_override; + if(fastio_reg_override>=0) a=fastio_reg_override; emit_movsbl_indexed(x,a,tl); } @@ -2549,7 +2761,7 @@ static void load_assemble(int i,struct regstat *i_regs) if(!dummy) { int x=0,a=tl; if(!c) a=addr; - if(fastload_reg_override) a=fastload_reg_override; + if(fastio_reg_override>=0) a=fastio_reg_override; emit_movswl_indexed(x,a,tl); } if(jaddr) @@ -2562,7 +2774,7 @@ static void load_assemble(int i,struct regstat *i_regs) if(!c||memtarget) { if(!dummy) { int a=addr; - if(fastload_reg_override) a=fastload_reg_override; + if(fastio_reg_override>=0) a=fastio_reg_override; emit_readword_indexed(0,a,tl); } if(jaddr) @@ -2576,7 +2788,7 @@ static void load_assemble(int i,struct regstat *i_regs) if(!dummy) { int x=0,a=tl; if(!c) a=addr; - if(fastload_reg_override) a=fastload_reg_override; + if(fastio_reg_override>=0) a=fastio_reg_override; emit_movzbl_indexed(x,a,tl); } @@ -2591,7 +2803,7 @@ static void load_assemble(int i,struct regstat *i_regs) if(!dummy) { int x=0,a=tl; if(!c) a=addr; - if(fastload_reg_override) a=fastload_reg_override; + if(fastio_reg_override>=0) a=fastio_reg_override; emit_movzwl_indexed(x,a,tl); } if(jaddr) @@ -2601,36 +2813,98 @@ static void load_assemble(int i,struct regstat *i_regs) inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist); } if (opcode[i]==0x27) { // LWU - assert(th>=0); - if(!c||memtarget) { - if(!dummy) { - int a=addr; - if(fastload_reg_override) a=fastload_reg_override; - emit_readword_indexed(0,a,tl); - } - if(jaddr) - add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist); - } - else { - inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist); - } - emit_zeroreg(th); + assert(0); } if (opcode[i]==0x37) { // LD assert(0); } } + if (fastio_reg_override == HOST_TEMPREG) + host_tempreg_release(); } #ifndef loadlr_assemble -void loadlr_assemble(int i,struct regstat *i_regs) +static void loadlr_assemble(int i, const struct regstat *i_regs) { - printf("Need loadlr_assemble for this architecture.\n"); - exit(1); + int s,tl,temp,temp2,addr; + int offset; + void *jaddr=0; + int memtarget=0,c=0; + int fastio_reg_override=-1; + u_int reglist=get_host_reglist(i_regs->regmap); + tl=get_reg(i_regs->regmap,rt1[i]); + s=get_reg(i_regs->regmap,rs1[i]); + temp=get_reg(i_regs->regmap,-1); + temp2=get_reg(i_regs->regmap,FTEMP); + addr=get_reg(i_regs->regmap,AGEN1+(i&1)); + assert(addr<0); + offset=imm[i]; + reglist|=1<=0) { + c=(i_regs->wasconst>>s)&1; + if(c) { + memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE; + } + } + if(!c) { + emit_shlimm(addr,3,temp); + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR + }else{ + emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR + } + jaddr=emit_fastpath_cmp_jump(i,temp2,&fastio_reg_override); + } + else { + if(ram_offset&&memtarget) { + host_tempreg_acquire(); + emit_addimm(temp2,ram_offset,HOST_TEMPREG); + fastio_reg_override=HOST_TEMPREG; + } + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR + }else{ + emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR + } + } + if (opcode[i]==0x22||opcode[i]==0x26) { // LWL/LWR + if(!c||memtarget) { + int a=temp2; + if(fastio_reg_override>=0) a=fastio_reg_override; + emit_readword_indexed(0,a,temp2); + if(fastio_reg_override==HOST_TEMPREG) host_tempreg_release(); + if(jaddr) add_stub_r(LOADW_STUB,jaddr,out,i,temp2,i_regs,ccadj[i],reglist); + } + else + inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj[i],reglist); + if(rt1[i]) { + assert(tl>=0); + emit_andimm(temp,24,temp); + if (opcode[i]==0x22) // LWL + emit_xorimm(temp,24,temp); + host_tempreg_acquire(); + emit_movimm(-1,HOST_TEMPREG); + if (opcode[i]==0x26) { + emit_shr(temp2,temp,temp2); + emit_bic_lsr(tl,HOST_TEMPREG,temp,tl); + }else{ + emit_shl(temp2,temp,temp2); + emit_bic_lsl(tl,HOST_TEMPREG,temp,tl); + } + host_tempreg_release(); + emit_or(temp2,tl,tl); + } + //emit_storereg(rt1[i],tl); // DEBUG + } + if (opcode[i]==0x1A||opcode[i]==0x1B) { // LDL/LDR + assert(0); + } } #endif -void store_assemble(int i,struct regstat *i_regs) +void store_assemble(int i, const struct regstat *i_regs) { int s,tl; int addr,temp; @@ -2639,8 +2913,8 @@ void store_assemble(int i,struct regstat *i_regs) enum stub_type type; int memtarget=0,c=0; int agr=AGEN1+(i&1); - int faststore_reg_override=0; - u_int hr,reglist=0; + int fastio_reg_override=-1; + u_int reglist=get_host_reglist(i_regs->regmap); tl=get_reg(i_regs->regmap,rs2[i]); s=get_reg(i_regs->regmap,rs1[i]); temp=get_reg(i_regs->regmap,agr); @@ -2654,25 +2928,23 @@ void store_assemble(int i,struct regstat *i_regs) } assert(tl>=0); assert(temp>=0); - for(hr=0;hrregmap[hr]>=0) reglist|=1<regmap[HOST_CCREG]==CCREG) reglist&=~(1<=0) a=fastio_reg_override; emit_writebyte_indexed(tl,x,a); } type=STOREB_STUB; @@ -2681,7 +2953,7 @@ void store_assemble(int i,struct regstat *i_regs) if(!c||memtarget) { int x=0,a=temp; if(!c) a=addr; - if(faststore_reg_override) a=faststore_reg_override; + if(fastio_reg_override>=0) a=fastio_reg_override; emit_writehword_indexed(tl,x,a); } type=STOREH_STUB; @@ -2689,7 +2961,7 @@ void store_assemble(int i,struct regstat *i_regs) if (opcode[i]==0x2B) { // SW if(!c||memtarget) { int a=addr; - if(faststore_reg_override) a=faststore_reg_override; + if(fastio_reg_override>=0) a=fastio_reg_override; emit_writeword_indexed(tl,0,a); } type=STOREW_STUB; @@ -2698,13 +2970,15 @@ void store_assemble(int i,struct regstat *i_regs) assert(0); type=STORED_STUB; } + if(fastio_reg_override==HOST_TEMPREG) + host_tempreg_release(); if(jaddr) { // PCSX store handlers don't check invcode again reglist|=1<waswritten&(1<waswritten&(1<attract mode) if(c&&start+i*4regmap==regs[i].regmap); // not delay slot @@ -2743,12 +3018,14 @@ void store_assemble(int i,struct regstat *i_regs) wb_dirtys(regs[i].regmap_entry,regs[i].wasdirty); emit_movimm(start+i*4+4,0); emit_writeword(0,&pcaddr); - emit_jmp(do_interrupt); + emit_addimm(HOST_CCREG,2,HOST_CCREG); + emit_far_call(get_addr_ht); + emit_jmpreg(0); } } } -void storelr_assemble(int i,struct regstat *i_regs) +static void storelr_assemble(int i, const struct regstat *i_regs) { int s,tl; int temp; @@ -2758,7 +3035,7 @@ void storelr_assemble(int i,struct regstat *i_regs) void *done0, *done1, *done2; int memtarget=0,c=0; int agr=AGEN1+(i&1); - u_int hr,reglist=0; + u_int reglist=get_host_reglist(i_regs->regmap); tl=get_reg(i_regs->regmap,rs2[i]); s=get_reg(i_regs->regmap,rs1[i]); temp=get_reg(i_regs->regmap,agr); @@ -2771,9 +3048,6 @@ void storelr_assemble(int i,struct regstat *i_regs) } } assert(tl>=0); - for(hr=0;hrregmap[hr]>=0) reglist|=1<=0); if(!c) { emit_cmpimm(s<0||offset?temp:s,RAM_SIZE); @@ -2788,7 +3062,8 @@ void storelr_assemble(int i,struct regstat *i_regs) emit_jmp(0); } } - emit_addimm_no_flags(ram_offset,temp); + if(ram_offset) + emit_addimm_no_flags(ram_offset,temp); if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR assert(0); @@ -2805,15 +3080,11 @@ void storelr_assemble(int i,struct regstat *i_regs) if (opcode[i]==0x2A) { // SWL emit_writeword_indexed(tl,0,temp); } - if (opcode[i]==0x2E) { // SWR + else if (opcode[i]==0x2E) { // SWR emit_writebyte_indexed(tl,3,temp); } - if (opcode[i]==0x2C) { // SDL - assert(0); - } - if (opcode[i]==0x2D) { // SDR + else assert(0); - } done0=out; emit_jmp(0); // 1 @@ -2826,16 +3097,10 @@ void storelr_assemble(int i,struct regstat *i_regs) emit_writebyte_indexed(tl,1,temp); if(rs2[i]) emit_rorimm(tl,8,tl); } - if (opcode[i]==0x2E) { // SWR + else if (opcode[i]==0x2E) { // SWR // Write two lsb into two most significant bytes emit_writehword_indexed(tl,1,temp); } - if (opcode[i]==0x2C) { // SDL - assert(0); - } - if (opcode[i]==0x2D) { // SDR - assert(0); - } done1=out; emit_jmp(0); // 2 @@ -2849,19 +3114,13 @@ void storelr_assemble(int i,struct regstat *i_regs) emit_writehword_indexed(tl,-2,temp); if(rs2[i]) emit_rorimm(tl,16,tl); } - if (opcode[i]==0x2E) { // SWR + else if (opcode[i]==0x2E) { // SWR // Write 3 lsb into three most significant bytes emit_writebyte_indexed(tl,-1,temp); if(rs2[i]) emit_rorimm(tl,8,tl); emit_writehword_indexed(tl,0,temp); if(rs2[i]) emit_rorimm(tl,24,tl); } - if (opcode[i]==0x2C) { // SDL - assert(0); - } - if (opcode[i]==0x2D) { // SDR - assert(0); - } done2=out; emit_jmp(0); // 3 @@ -2872,28 +3131,16 @@ void storelr_assemble(int i,struct regstat *i_regs) emit_writebyte_indexed(tl,-3,temp); if(rs2[i]) emit_rorimm(tl,8,tl); } - if (opcode[i]==0x2E) { // SWR + else if (opcode[i]==0x2E) { // SWR // Write entire word emit_writeword_indexed(tl,-3,temp); } - if (opcode[i]==0x2C) { // SDL - assert(0); - } - if (opcode[i]==0x2D) { // SDR - assert(0); - } set_jump_target(done0, out); set_jump_target(done1, out); set_jump_target(done2, out); - if (opcode[i]==0x2C) { // SDL - assert(0); - } - if (opcode[i]==0x2D) { // SDR - assert(0); - } if(!c||!memtarget) add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist); - if(!(i_regs->waswritten&(1<waswritten&(1<regmap,INVCP); @@ -2950,7 +3197,7 @@ static void cop0_assemble(int i,struct regstat *i_regs) emit_storereg(CCREG,HOST_CCREG); emit_loadreg(rs1[i],1); emit_movimm(copr,0); - emit_call(pcsx_mtc0_ds); + emit_far_call(pcsx_mtc0_ds); emit_loadreg(rs1[i],s); return; } @@ -2959,14 +3206,12 @@ static void cop0_assemble(int i,struct regstat *i_regs) emit_movimm(0,HOST_TEMPREG); emit_writeword(HOST_TEMPREG,&pending_exception); } - //else if(copr==12&&is_delayslot) emit_call((int)MTC0_R12); - //else if(s==HOST_CCREG) emit_loadreg(rs1[i],1); else if(s!=1) emit_mov(s,1); emit_movimm(copr,0); - emit_call(pcsx_mtc0); + emit_far_call(pcsx_mtc0); if(copr==9||copr==11||copr==12||copr==13) { emit_readword(&Count,HOST_CCREG); emit_readword(&next_interupt,HOST_TEMPREG); @@ -2979,11 +3224,15 @@ static void cop0_assemble(int i,struct regstat *i_regs) assert(!is_delayslot); emit_readword(&pending_exception,14); emit_test(14,14); - emit_jne(&do_interrupt); + void *jaddr = out; + emit_jeq(0); + emit_readword(&pcaddr, 0); + emit_addimm(HOST_CCREG,2,HOST_CCREG); + emit_far_call(get_addr_ht); + emit_jmpreg(0); + set_jump_target(jaddr, out); } emit_loadreg(rs1[i],s); - if(get_reg(i_regs->regmap,rs1[i]|64)>=0) - emit_loadreg(rs1[i]|64,get_reg(i_regs->regmap,rs1[i]|64)); } else { @@ -3039,7 +3288,131 @@ static void do_cop1stub(int n) if(regs[i].regmap_entry[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); emit_movimm(start+(i-ds)*4,EAX); // Get PC emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right? There should probably be an extra cycle... - emit_jmp(ds?fp_exception_ds:fp_exception); + emit_far_jump(ds?fp_exception_ds:fp_exception); +} + +static int cop2_is_stalling_op(int i, int *cycles) +{ + if (opcode[i] == 0x3a) { // SWC2 + *cycles = 0; + return 1; + } + if (itype[i] == COP2 && (opcode2[i] == 0 || opcode2[i] == 2)) { // MFC2/CFC2 + *cycles = 0; + return 1; + } + if (itype[i] == C2OP) { + *cycles = gte_cycletab[source[i] & 0x3f]; + return 1; + } + // ... what about MTC2/CTC2/LWC2? + return 0; +} + +#if 0 +static void log_gte_stall(int stall, u_int cycle) +{ + if ((u_int)stall <= 44) + printf("x stall %2d %u\n", stall, cycle + last_count); + if (cycle + last_count > 1215348544) exit(1); +} + +static void emit_log_gte_stall(int i, int stall, u_int reglist) +{ + save_regs(reglist); + if (stall > 0) + emit_movimm(stall, 0); + else + emit_mov(HOST_TEMPREG, 0); + emit_addimm(HOST_CCREG, CLOCK_ADJUST(ccadj[i]), 1); + emit_far_call(log_gte_stall); + restore_regs(reglist); +} +#endif + +static void cop2_call_stall_check(u_int op, int i, const struct regstat *i_regs, u_int reglist) +{ + int j = i, other_gte_op_cycles = -1, stall = -MAXBLOCK, cycles_passed; + int rtmp = reglist_find_free(reglist); + + if (HACK_ENABLED(NDHACK_GTE_NO_STALL)) + return; + //assert(get_reg(i_regs->regmap, CCREG) == HOST_CCREG); + if (get_reg(i_regs->regmap, CCREG) != HOST_CCREG) { + // happens occasionally... cc evicted? Don't bother then + //printf("no cc %08x\n", start + i*4); + return; + } + if (!bt[i]) { + for (j = i - 1; j >= 0; j--) { + //if (is_ds[j]) break; + if (cop2_is_stalling_op(j, &other_gte_op_cycles) || bt[j]) + break; + } + } + cycles_passed = CLOCK_ADJUST(ccadj[i] - ccadj[j]); + if (other_gte_op_cycles >= 0) + stall = other_gte_op_cycles - cycles_passed; + else if (cycles_passed >= 44) + stall = 0; // can't stall + if (stall == -MAXBLOCK && rtmp >= 0) { + // unknown stall, do the expensive runtime check + assem_debug("; cop2_call_stall_check\n"); +#if 0 // too slow + save_regs(reglist); + emit_movimm(gte_cycletab[op], 0); + emit_addimm(HOST_CCREG, CLOCK_ADJUST(ccadj[i]), 1); + emit_far_call(call_gteStall); + restore_regs(reglist); +#else + host_tempreg_acquire(); + emit_readword(&psxRegs.gteBusyCycle, rtmp); + emit_addimm(rtmp, -CLOCK_ADJUST(ccadj[i]), rtmp); + emit_sub(rtmp, HOST_CCREG, HOST_TEMPREG); + emit_cmpimm(HOST_TEMPREG, 44); + emit_cmovb_reg(rtmp, HOST_CCREG); + //emit_log_gte_stall(i, 0, reglist); + host_tempreg_release(); +#endif + } + else if (stall > 0) { + //emit_log_gte_stall(i, stall, reglist); + emit_addimm(HOST_CCREG, stall, HOST_CCREG); + } + + // save gteBusyCycle, if needed + if (gte_cycletab[op] == 0) + return; + other_gte_op_cycles = -1; + for (j = i + 1; j < slen; j++) { + if (cop2_is_stalling_op(j, &other_gte_op_cycles)) + break; + if (is_jump(j)) { + // check ds + if (j + 1 < slen && cop2_is_stalling_op(j + 1, &other_gte_op_cycles)) + j++; + break; + } + } + if (other_gte_op_cycles >= 0) + // will handle stall when assembling that op + return; + cycles_passed = CLOCK_ADJUST(ccadj[min(j, slen -1)] - ccadj[i]); + if (cycles_passed >= 44) + return; + assem_debug("; save gteBusyCycle\n"); + host_tempreg_acquire(); +#if 0 + emit_readword(&last_count, HOST_TEMPREG); + emit_add(HOST_TEMPREG, HOST_CCREG, HOST_TEMPREG); + emit_addimm(HOST_TEMPREG, CLOCK_ADJUST(ccadj[i]), HOST_TEMPREG); + emit_addimm(HOST_TEMPREG, gte_cycletab[op]), HOST_TEMPREG); + emit_writeword(HOST_TEMPREG, &psxRegs.gteBusyCycle); +#else + emit_addimm(HOST_CCREG, CLOCK_ADJUST(ccadj[i]) + gte_cycletab[op], HOST_TEMPREG); + emit_writeword(HOST_TEMPREG, &psxRegs.gteBusyCycle); +#endif + host_tempreg_release(); } static void cop2_get_dreg(u_int copr,signed char tl,signed char temp) @@ -3071,22 +3444,7 @@ static void cop2_get_dreg(u_int copr,signed char tl,signed char temp) break; case 28: case 29: - emit_readword(®_cop2d[9],temp); - emit_testimm(temp,0x8000); // do we need this? - emit_andimm(temp,0xf80,temp); - emit_andne_imm(temp,0,temp); - emit_shrimm(temp,7,tl); - emit_readword(®_cop2d[10],temp); - emit_testimm(temp,0x8000); - emit_andimm(temp,0xf80,temp); - emit_andne_imm(temp,0,temp); - emit_orrshr_imm(temp,2,tl); - emit_readword(®_cop2d[11],temp); - emit_testimm(temp,0x8000); - emit_andimm(temp,0xf80,temp); - emit_andne_imm(temp,0,temp); - emit_orrshl_imm(temp,3,tl); - emit_writeword(tl,®_cop2d[copr]); + c2op_mfc2_29_assemble(tl,temp); break; default: emit_readword(®_cop2d[copr],tl); @@ -3118,8 +3476,7 @@ static void cop2_put_dreg(u_int copr,signed char sl,signed char temp) emit_writeword(sl,®_cop2d[28]); break; case 30: - emit_movs(sl,temp); - emit_mvnmi(temp,temp); + emit_xorsar_imm(sl,sl,31,temp); #if defined(HAVE_ARMV5) || defined(__aarch64__) emit_clz(temp,temp); #else @@ -3141,7 +3498,7 @@ static void cop2_put_dreg(u_int copr,signed char sl,signed char temp) } } -static void c2ls_assemble(int i,struct regstat *i_regs) +static void c2ls_assemble(int i, const struct regstat *i_regs) { int s,tl; int ar; @@ -3150,8 +3507,8 @@ static void c2ls_assemble(int i,struct regstat *i_regs) void *jaddr2=NULL; enum stub_type type; int agr=AGEN1+(i&1); - int fastio_reg_override=0; - u_int hr,reglist=0; + int fastio_reg_override=-1; + u_int reglist=get_host_reglist(i_regs->regmap); u_int copr=(source[i]>>16)&0x1f; s=get_reg(i_regs->regmap,rs1[i]); tl=get_reg(i_regs->regmap,FTEMP); @@ -3159,9 +3516,6 @@ static void c2ls_assemble(int i,struct regstat *i_regs) assert(rs1[i]>0); assert(tl>=0); - for(hr=0;hrregmap[hr]>=0) reglist|=1<regmap[HOST_CCREG]==CCREG) reglist&=~(1<=0); if (opcode[i]==0x3a) { // SWC2 - cop2_get_dreg(copr,tl,HOST_TEMPREG); + cop2_call_stall_check(0, i, i_regs, reglist_exclude(reglist, tl, -1)); + cop2_get_dreg(copr,tl,-1); type=STOREW_STUB; } else @@ -3194,12 +3549,13 @@ static void c2ls_assemble(int i,struct regstat *i_regs) jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override); } else if(ram_offset&&memtarget) { + host_tempreg_acquire(); emit_addimm(ar,ram_offset,HOST_TEMPREG); fastio_reg_override=HOST_TEMPREG; } if (opcode[i]==0x32) { // LWC2 int a=ar; - if(fastio_reg_override) a=fastio_reg_override; + if(fastio_reg_override>=0) a=fastio_reg_override; emit_readword_indexed(0,a,tl); } if (opcode[i]==0x3a) { // SWC2 @@ -3207,14 +3563,16 @@ static void c2ls_assemble(int i,struct regstat *i_regs) if(!offset&&!c&&s>=0) emit_mov(s,ar); #endif int a=ar; - if(fastio_reg_override) a=fastio_reg_override; + if(fastio_reg_override>=0) a=fastio_reg_override; emit_writeword_indexed(tl,0,a); } } + if(fastio_reg_override==HOST_TEMPREG) + host_tempreg_release(); if(jaddr2) add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist); if(opcode[i]==0x3a) // SWC2 - if(!(i_regs->waswritten&(1<waswritten&(1<regmap,INVCP); assert(ir>=0); @@ -3231,14 +3589,24 @@ static void c2ls_assemble(int i,struct regstat *i_regs) #endif } if (opcode[i]==0x32) { // LWC2 + host_tempreg_acquire(); cop2_put_dreg(copr,tl,HOST_TEMPREG); + host_tempreg_release(); } } -static void cop2_assemble(int i,struct regstat *i_regs) +static void cop2_assemble(int i, const struct regstat *i_regs) { - u_int copr=(source[i]>>11)&0x1f; - signed char temp=get_reg(i_regs->regmap,-1); + u_int copr = (source[i]>>11) & 0x1f; + signed char temp = get_reg(i_regs->regmap, -1); + + if (opcode2[i] == 0 || opcode2[i] == 2) { // MFC2/CFC2 + if (!HACK_ENABLED(NDHACK_GTE_NO_STALL)) { + signed char tl = get_reg(i_regs->regmap, rt1[i]); + u_int reglist = reglist_exclude(get_host_reglist(i_regs->regmap), tl, temp); + cop2_call_stall_check(0, i, i_regs, reglist); + } + } if (opcode2[i]==0) { // MFC2 signed char tl=get_reg(i_regs->regmap,rt1[i]); if(tl>=0&&rt1[i]!=0) @@ -3268,14 +3636,7 @@ static void cop2_assemble(int i,struct regstat *i_regs) emit_signextend16(sl,temp); break; case 31: - //value = value & 0x7ffff000; - //if (value & 0x7f87e000) value |= 0x80000000; - emit_shrimm(sl,12,temp); - emit_shlimm(temp,12,temp); - emit_testimm(temp,0x7f000000); - emit_testeqimm(temp,0x00870000); - emit_testeqimm(temp,0x0000e000); - emit_orrne_imm(temp,0x80000000,temp); + c2op_ctc2_31_assemble(sl,temp); break; default: temp=sl; @@ -3286,73 +3647,152 @@ static void cop2_assemble(int i,struct regstat *i_regs) } } +static void do_unalignedwritestub(int n) +{ + assem_debug("do_unalignedwritestub %x\n",start+stubs[n].a*4); + literal_pool(256); + set_jump_target(stubs[n].addr, out); + + int i=stubs[n].a; + struct regstat *i_regs=(struct regstat *)stubs[n].c; + int addr=stubs[n].b; + u_int reglist=stubs[n].e; + signed char *i_regmap=i_regs->regmap; + int temp2=get_reg(i_regmap,FTEMP); + int rt; + rt=get_reg(i_regmap,rs2[i]); + assert(rt>=0); + assert(addr>=0); + assert(opcode[i]==0x2a||opcode[i]==0x2e); // SWL/SWR only implemented + reglist|=(1<regmap,rt1[i]|64); + signed char sl,tl; tl=get_reg(i_regs->regmap,rt1[i]); //assert(tl>=0); if(tl>=0) { - sh=get_reg(i_regs->regmap,rs1[i]|64); sl=get_reg(i_regs->regmap,rs1[i]); if(sl>=0) emit_mov(sl,tl); else emit_loadreg(rs1[i],tl); - if(th>=0) { - if(sh>=0) emit_mov(sh,th); - else emit_loadreg(rs1[i]|64,th); - } } } } -void syscall_assemble(int i,struct regstat *i_regs) +// call interpreter, exception handler, things that change pc/regs/cycles ... +static void call_c_cpu_handler(int i, const struct regstat *i_regs, u_int pc, void *func) { signed char ccreg=get_reg(i_regs->regmap,CCREG); assert(ccreg==HOST_CCREG); assert(!is_delayslot); (void)ccreg; - emit_movimm(start+i*4,EAX); // Get PC - emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right? There should probably be an extra cycle... - emit_jmp(jump_syscall_hle); // XXX + + emit_movimm(pc,3); // Get PC + emit_readword(&last_count,2); + emit_writeword(3,&psxRegs.pc); + emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX + emit_add(2,HOST_CCREG,2); + emit_writeword(2,&psxRegs.cycle); + emit_far_call(func); + emit_far_jump(jump_to_new_pc); } -void hlecall_assemble(int i,struct regstat *i_regs) +static void syscall_assemble(int i,struct regstat *i_regs) { - extern void psxNULL(); - signed char ccreg=get_reg(i_regs->regmap,CCREG); - assert(ccreg==HOST_CCREG); - assert(!is_delayslot); - (void)ccreg; - emit_movimm(start+i*4+4,0); // Get PC + emit_movimm(0x20,0); // cause code + emit_movimm(0,1); // not in delay slot + call_c_cpu_handler(i,i_regs,start+i*4,psxException); +} + +static void hlecall_assemble(int i,struct regstat *i_regs) +{ + void *hlefunc = psxNULL; uint32_t hleCode = source[i] & 0x03ffffff; - if (hleCode >= ARRAY_SIZE(psxHLEt)) - emit_movimm((uintptr_t)psxNULL,1); - else - emit_movimm((uintptr_t)psxHLEt[hleCode],1); - emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX - emit_jmp(jump_hlecall); + if (hleCode < ARRAY_SIZE(psxHLEt)) + hlefunc = psxHLEt[hleCode]; + + call_c_cpu_handler(i,i_regs,start+i*4+4,hlefunc); } -void intcall_assemble(int i,struct regstat *i_regs) +static void intcall_assemble(int i,struct regstat *i_regs) { - signed char ccreg=get_reg(i_regs->regmap,CCREG); - assert(ccreg==HOST_CCREG); - assert(!is_delayslot); - (void)ccreg; - emit_movimm(start+i*4,0); // Get PC - emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); - emit_jmp(jump_intcall); + call_c_cpu_handler(i,i_regs,start+i*4,execI); } static void speculate_mov(int rs,int rt) @@ -3447,7 +3887,7 @@ static void speculate_register_values(int i) #endif } -void ds_assemble(int i,struct regstat *i_regs) +static void ds_assemble(int i,struct regstat *i_regs) { speculate_register_values(i); is_delayslot=1; @@ -4058,26 +4498,41 @@ static int match_bt(signed char i_regmap[],uint64_t i_dirty,int addr) static void drc_dbg_emit_do_cmp(int i) { extern void do_insn_cmp(); - extern int cycle; - u_int hr,reglist=0; + //extern int cycle; + u_int hr, reglist = get_host_reglist(regs[i].regmap); - for(hr=0;hr=0) reglist|=1< 0 && !bt[i]) { + for (hr = 0; hr < HOST_REGS; hr++) { + int reg = regs[i-1].regmap[hr]; + if (hr == EXCLUDE_REG || reg < 0) + continue; + if (!((regs[i-1].isconst >> hr) & 1)) + continue; + if (i > 1 && reg == regs[i-2].regmap[hr] && constmap[i-1][hr] == constmap[i-2][hr]) + continue; + emit_movimm(constmap[i-1][hr],0); + emit_storereg(reg, 0); + } + } emit_movimm(start+i*4,0); emit_writeword(0,&pcaddr); - emit_call(do_insn_cmp); + emit_far_call(do_insn_cmp); //emit_readword(&cycle,0); //emit_addimm(0,2,0); //emit_writeword(0,&cycle); + (void)get_reg2; restore_regs(reglist); + assem_debug("\\\\do_insn_cmp\n"); } #else #define drc_dbg_emit_do_cmp(x) #endif // Used when a branch jumps into the delay slot of another branch -void ds_assemble_entry(int i) +static void ds_assemble_entry(int i) { int t=(ba[i]-start)>>2; if (!instr_addr[t]) @@ -4146,6 +4601,23 @@ void ds_assemble_entry(int i) emit_jmp(0); } +static void emit_extjump(void *addr, u_int target) +{ + emit_extjump2(addr, target, dyna_linker); +} + +static void emit_extjump_ds(void *addr, u_int target) +{ + emit_extjump2(addr, target, dyna_linker_ds); +} + +// Load 2 immediates optimizing for small code size +static void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2) +{ + emit_movimm(imm1,rt1); + emit_movimm_from(imm1,rt1,imm2,rt2); +} + void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert) { int count; @@ -4180,11 +4652,13 @@ void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert) else if(*adj==0||invert) { int cycles=CLOCK_ADJUST(count+2); // faster loop HACK +#if 0 if (t&&*adj) { int rel=t-i; if(-NO_CYCLE_PENALTY_THR>2) assem_debug("idle loop\n"); @@ -4517,7 +4994,7 @@ static void rjump_assemble_write_ra(int i) #endif } -void rjump_assemble(int i,struct regstat *i_regs) +static void rjump_assemble(int i,struct regstat *i_regs) { int temp; int rs,cc; @@ -4593,7 +5070,7 @@ void rjump_assemble(int i,struct regstat *i_regs) //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen //assert(adj==0); emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG); - add_stub(CC_STUB,out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0); + add_stub(CC_STUB,out,NULL,0,i,-1,TAKEN,rs); if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10) // special case for RFE emit_jmp(0); @@ -4607,14 +5084,14 @@ void rjump_assemble(int i,struct regstat *i_regs) else #endif { - emit_jmp(jump_vaddr_reg[rs]); + do_jump_vaddr(rs); } #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(rt1[i]!=31&&iregmap; int cc; @@ -4630,6 +5107,9 @@ void cjump_assemble(int i,struct regstat *i_regs) #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(i>(ba[i]-start)>>2) invert=1; #endif + #ifdef __aarch64__ + invert=1; // because of near cond. branches + #endif if(ooo[i]) { s1l=get_reg(branch_regs[i].regmap,rs1[i]); @@ -4716,7 +5196,7 @@ void cjump_assemble(int i,struct regstat *i_regs) else emit_test(s1l,s1l); if(invert){ nottaken=out; - emit_jne((void *)1l); + emit_jne(DJT_1); }else{ add_to_linker(out,ba[i],internal); emit_jeq(0); @@ -4728,7 +5208,7 @@ void cjump_assemble(int i,struct regstat *i_regs) else emit_test(s1l,s1l); if(invert){ nottaken=out; - emit_jeq(1); + emit_jeq(DJT_1); }else{ add_to_linker(out,ba[i],internal); emit_jne(0); @@ -4739,7 +5219,7 @@ void cjump_assemble(int i,struct regstat *i_regs) emit_cmpimm(s1l,1); if(invert){ nottaken=out; - emit_jge(1); + emit_jge(DJT_1); }else{ add_to_linker(out,ba[i],internal); emit_jl(0); @@ -4750,7 +5230,7 @@ void cjump_assemble(int i,struct regstat *i_regs) emit_cmpimm(s1l,1); if(invert){ nottaken=out; - emit_jl(1); + emit_jl(DJT_1); }else{ add_to_linker(out,ba[i],internal); emit_jge(0); @@ -4810,26 +5290,26 @@ void cjump_assemble(int i,struct regstat *i_regs) if(s2l>=0) emit_cmp(s1l,s2l); else emit_test(s1l,s1l); nottaken=out; - emit_jne((void *)2l); + emit_jne(DJT_2); } if((opcode[i]&0x2f)==5) // BNE { if(s2l>=0) emit_cmp(s1l,s2l); else emit_test(s1l,s1l); nottaken=out; - emit_jeq(2); + emit_jeq(DJT_2); } if((opcode[i]&0x2f)==6) // BLEZ { emit_cmpimm(s1l,1); nottaken=out; - emit_jge(2); + emit_jge(DJT_2); } if((opcode[i]&0x2f)==7) // BGTZ { emit_cmpimm(s1l,1); nottaken=out; - emit_jl(2); + emit_jl(DJT_2); } } // if(!unconditional) int adj; @@ -4903,7 +5383,7 @@ void cjump_assemble(int i,struct regstat *i_regs) } } -void sjump_assemble(int i,struct regstat *i_regs) +static void sjump_assemble(int i,struct regstat *i_regs) { signed char *i_regmap=i_regs->regmap; int cc; @@ -4919,6 +5399,9 @@ void sjump_assemble(int i,struct regstat *i_regs) #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(i>(ba[i]-start)>>2) invert=1; #endif + #ifdef __aarch64__ + invert=1; // because of near cond. branches + #endif //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL) //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL) @@ -5009,7 +5492,7 @@ void sjump_assemble(int i,struct regstat *i_regs) emit_test(s1l,s1l); if(invert){ nottaken=out; - emit_jns(1); + emit_jns(DJT_1); }else{ add_to_linker(out,ba[i],internal); emit_js(0); @@ -5020,7 +5503,7 @@ void sjump_assemble(int i,struct regstat *i_regs) emit_test(s1l,s1l); if(invert){ nottaken=out; - emit_js(1); + emit_js(DJT_1); }else{ add_to_linker(out,ba[i],internal); emit_jns(0); @@ -5089,13 +5572,13 @@ void sjump_assemble(int i,struct regstat *i_regs) { emit_test(s1l,s1l); nottaken=out; - emit_jns(1); + emit_jns(DJT_1); } if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL { emit_test(s1l,s1l); nottaken=out; - emit_js(1); + emit_js(DJT_1); } } // if(!unconditional) int adj; @@ -5459,15 +5942,17 @@ static void pagespan_ds() assert(btaddr!=HOST_CCREG); if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); #ifdef HOST_IMM8 + host_tempreg_acquire(); emit_movimm(start+4,HOST_TEMPREG); emit_cmp(btaddr,HOST_TEMPREG); + host_tempreg_release(); #else emit_cmpimm(btaddr,start+4); #endif void *branch = out; emit_jeq(0); store_regs_bt(regs[0].regmap,regs[0].dirty,-1); - emit_jmp(jump_vaddr_reg[btaddr]); + do_jump_vaddr(btaddr); set_jump_target(branch, out); store_regs_bt(regs[0].regmap,regs[0].dirty,start+4); load_regs_bt(regs[0].regmap,regs[0].dirty,start+4); @@ -5480,7 +5965,7 @@ void unneeded_registers(int istart,int iend,int r) uint64_t u,gte_u,b,gte_b; uint64_t temp_u,temp_gte_u=0; uint64_t gte_u_unknown=0; - if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED) + if (HACK_ENABLED(NDHACK_GTE_UNNEEDED)) gte_u_unknown=~0ll; if(iend==slen-1) { u=1; @@ -5531,7 +6016,7 @@ void unneeded_registers(int istart,int iend,int r) bt[(ba[i]-start)>>2]=1; if(ba[i]<=start+i*4) { // Backward branch - if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000) + if(is_ujump(i)) { // Unconditional branch temp_u=1; @@ -5576,7 +6061,7 @@ void unneeded_registers(int istart,int iend,int r) gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown; } } /*else*/ if(1) { - if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000) + if (is_ujump(i)) { // Unconditional branch u=unneeded_reg[(ba[i]-start)>>2]; @@ -5686,7 +6171,7 @@ void clean_registers(int istart,int iend,int wr) if(ba[i]=(start+slen*4)) { // Branch out of this block, flush all regs - if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000) + if (is_ujump(i)) { // Unconditional branch will_dirty_i=0; @@ -5766,7 +6251,7 @@ void clean_registers(int istart,int iend,int wr) // Internal branch if(ba[i]<=start+i*4) { // Backward branch - if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000) + if (is_ujump(i)) { // Unconditional branch temp_will_dirty=0; @@ -5863,7 +6348,7 @@ void clean_registers(int istart,int iend,int wr) } /*else*/ if(1) { - if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000) + if (is_ujump(i)) { // Unconditional branch will_dirty_i=0; @@ -6030,7 +6515,7 @@ void clean_registers(int istart,int iend,int wr) regs[i].dirty&=wont_dirty_i; if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP) { - if(i>16)!=0x1000) { + if (i < iend-1 && !is_ujump(i)) { for(r=0;rtranslation_cache; beginning = start_block(); emit_movimm(DRC_TEST_VAL + i, 0); // test emit_ret(); @@ -6216,15 +6707,15 @@ static void new_dynarec_test(void) SysPrintf("test passed.\n"); else SysPrintf("test failed, will likely crash soon (r=%08x %08x)\n", ret[0], ret[1]); - out = translation_cache; + out = ndrc->translation_cache; } // clear the state completely, instead of just marking // things invalid like invalidate_all_pages() does -void new_dynarec_clear_full() +void new_dynarec_clear_full(void) { int n; - out = translation_cache; + out = ndrc->translation_cache; memset(invalid_code,1,sizeof(invalid_code)); memset(hash_table,0xff,sizeof(hash_table)); memset(mini_ht,-1,sizeof(mini_ht)); @@ -6242,34 +6733,28 @@ void new_dynarec_clear_full() for(n=0;n<4096;n++) ll_clear(jump_dirty+n); } -void new_dynarec_init() +void new_dynarec_init(void) { SysPrintf("Init new dynarec\n"); - // allocate/prepare a buffer for translation cache - // see assem_arm.h for some explanation -#if defined(BASE_ADDR_FIXED) - if (mmap(translation_cache, 1 << TARGET_SIZE_2, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_PRIVATE | MAP_ANONYMOUS, - -1, 0) != translation_cache) { - SysPrintf("mmap() failed: %s\n", strerror(errno)); - SysPrintf("disable BASE_ADDR_FIXED and recompile\n"); - abort(); - } -#elif defined(BASE_ADDR_DYNAMIC) +#ifdef BASE_ADDR_DYNAMIC #ifdef VITA sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2); if (sceBlock < 0) SysPrintf("sceKernelAllocMemBlockForVM failed\n"); - int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache); + int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&ndrc); if (ret < 0) SysPrintf("sceKernelGetMemBlockBase failed\n"); #else - translation_cache = mmap (NULL, 1 << TARGET_SIZE_2, + uintptr_t desired_addr = 0; + #ifdef __ELF__ + extern char _end; + desired_addr = ((uintptr_t)&_end + 0xffffff) & ~0xffffffl; + #endif + ndrc = mmap((void *)desired_addr, sizeof(*ndrc), PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (translation_cache == MAP_FAILED) { + if (ndrc == MAP_FAILED) { SysPrintf("mmap() failed: %s\n", strerror(errno)); abort(); } @@ -6277,11 +6762,12 @@ void new_dynarec_init() #else #ifndef NO_WRITE_EXEC // not all systems allow execute in data segment by default - if (mprotect(translation_cache, 1<translation_cache) + sizeof(ndrc->tramp.ops), + PROT_READ | PROT_WRITE | PROT_EXEC) != 0) SysPrintf("mprotect() failed: %s\n", strerror(errno)); #endif #endif - out = translation_cache; + out = ndrc->translation_cache; cycle_multiplier=200; new_dynarec_clear_full(); #ifdef HOST_IMM8 @@ -6297,15 +6783,15 @@ void new_dynarec_init() SysPrintf("warning: RAM is not directly mapped, performance will suffer\n"); } -void new_dynarec_cleanup() +void new_dynarec_cleanup(void) { int n; -#if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC) +#ifdef BASE_ADDR_DYNAMIC #ifdef VITA sceKernelFreeMemBlock(sceBlock); sceBlock = -1; #else - if (munmap(translation_cache, 1<>12]=0; emit_movimm(start,0); emit_writeword(0,&pcaddr); - emit_jmp(new_dyna_leave); + emit_far_jump(new_dyna_leave); literal_pool(0); end_block(beginning); ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning); @@ -6478,7 +6973,7 @@ int new_recompile_block(int addr) source = get_source_start(start, &pagelimit); if (source == NULL) { SysPrintf("Compile at bogus memory address: %08x\n", addr); - exit(1); + abort(); } /* Pass 1: disassemble */ @@ -6689,8 +7184,6 @@ int new_recompile_block(int addr) opcode2[i]=op2; /* Get registers/immediates */ lt1[i]=0; - us1[i]=0; - us2[i]=0; dep1[i]=0; dep2[i]=0; gte_rs[i]=gte_rt[i]=0; @@ -6709,7 +7202,6 @@ int new_recompile_block(int addr) rt1[i]=0; rt2[i]=0; imm[i]=(short)source[i]; - if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD break; case LOADLR: // LWL/LWR only load part of the register, @@ -6719,7 +7211,6 @@ int new_recompile_block(int addr) rt1[i]=(source[i]>>16)&0x1f; rt2[i]=0; imm[i]=(short)source[i]; - if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL if(op==0x26) dep1[i]=rt1[i]; // LWR break; case IMM16: @@ -6733,8 +7224,6 @@ int new_recompile_block(int addr) }else{ imm[i]=(short)source[i]; } - if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU - if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI break; case UJUMP: @@ -6767,8 +7256,6 @@ int new_recompile_block(int addr) if(op&2) { // BGTZ/BLEZ rs2[i]=0; } - us1[i]=rs1[i]; - us2[i]=rs2[i]; likely[i]=op>>4; break; case SJUMP: @@ -6776,7 +7263,6 @@ int new_recompile_block(int addr) rs2[i]=CCREG; rt1[i]=0; rt2[i]=0; - us1[i]=rs1[i]; if(op2&0x10) { // BxxAL rt1[i]=31; // NOTE: If the branch is not taken, r31 is still overwritten @@ -6788,10 +7274,7 @@ int new_recompile_block(int addr) rs2[i]=(source[i]>>16)&0x1f; // subtract amount rt1[i]=(source[i]>>11)&0x1f; // destination rt2[i]=0; - if(op2==0x2a||op2==0x2b) { // SLT/SLTU - us1[i]=rs1[i];us2[i]=rs2[i]; - } - else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR + if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR dep1[i]=rs1[i];dep2[i]=rs2[i]; } else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB @@ -6803,9 +7286,6 @@ int new_recompile_block(int addr) rs2[i]=(source[i]>>16)&0x1f; // divisor rt1[i]=HIREG; rt2[i]=LOREG; - if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU - us1[i]=rs1[i];us2[i]=rs2[i]; - } break; case MOV: rs1[i]=0; @@ -6825,8 +7305,6 @@ int new_recompile_block(int addr) rs2[i]=(source[i]>>21)&0x1f; // shift amount rt1[i]=(source[i]>>11)&0x1f; // destination rt2[i]=0; - // DSLLV/DSRLV/DSRAV are 64-bit - if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i]; break; case SHIFTIMM: rs1[i]=(source[i]>>16)&0x1f; @@ -6836,8 +7314,6 @@ int new_recompile_block(int addr) imm[i]=(source[i]>>6)&0x1f; // DSxx32 instructions if(op2>=0x3c) imm[i]|=0x20; - // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source - if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i]; break; case COP0: rs1[i]=0; @@ -6856,7 +7332,6 @@ int new_recompile_block(int addr) rt2[i]=0; if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1 if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1 - if(op2==5) us1[i]=rs1[i]; // DMTC1 rs2[i]=CSREG; break; case COP2: @@ -6931,7 +7406,7 @@ int new_recompile_block(int addr) else if(type==CJUMP||type==SJUMP) ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14); else ba[i]=-1; - if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)) { + if (i > 0 && is_jump(i-1)) { int do_in_intrp=0; // branch in delay slot? if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP) { @@ -6949,7 +7424,7 @@ int new_recompile_block(int addr) bt[t+1]=1; // expected return from interpreter } else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&& - !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) { + !(i>=3&&is_jump(i-3))) { // v0 overwrite like this is a sign of trouble, bail out SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr); do_in_intrp=1; @@ -6965,7 +7440,7 @@ int new_recompile_block(int addr) } } /* Is this the end of the block? */ - if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) { + if (i > 0 && is_ujump(i-1)) { if(rt1[i-1]==0) { // Continue past subroutine call (JAL) done=2; } @@ -7049,23 +7524,6 @@ int new_recompile_block(int addr) current.isconst=0; current.waswritten=0; } - if(i>1) - { - if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL - { - if(rs1[i-2]==0||rs2[i-2]==0) - { - if(rs1[i-2]) { - int hr=get_reg(current.regmap,rs1[i-2]|64); - if(hr>=0) current.regmap[hr]=-1; - } - if(rs2[i-2]) { - int hr=get_reg(current.regmap,rs2[i-2]|64); - if(hr>=0) current.regmap[hr]=-1; - } - } - } - } memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap)); regs[i].wasconst=current.isconst; @@ -7083,7 +7541,7 @@ int new_recompile_block(int addr) current.u=branch_unneeded_reg[i]&~((1LL<>r)&1) { regs[i].regmap_entry[hr]=-1; regs[i].regmap[hr]=-1; @@ -7125,10 +7583,6 @@ int new_recompile_block(int addr) //current.regmap[hr]=-1; }else regs[i].regmap_entry[hr]=r; - } - else { - assert(0); - } } } else { // First instruction expects CCREG to be allocated @@ -7353,8 +7807,9 @@ int new_recompile_block(int addr) cop0_alloc(¤t,i); break; case COP1: + break; case COP2: - cop12_alloc(¤t,i); + cop2_alloc(¤t,i); break; case C1LS: c1ls_alloc(¤t,i); @@ -7402,7 +7857,8 @@ int new_recompile_block(int addr) regs[i].regmap_entry[hr]=0; } else - if(r<64){ + { + assert(r<64); if((current.u>>r)&1) { regs[i].regmap_entry[hr]=-1; //regs[i].regmap[hr]=-1; @@ -7410,9 +7866,6 @@ int new_recompile_block(int addr) }else regs[i].regmap_entry[hr]=r; } - else { - assert(0); - } } } else { // Branches expect CCREG to be allocated at the target @@ -7449,7 +7902,7 @@ int new_recompile_block(int addr) dirty_reg(&branch_regs[i-1],31); } memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(constmap[i])); break; case RJUMP: memcpy(&branch_regs[i-1],¤t,sizeof(current)); @@ -7470,7 +7923,7 @@ int new_recompile_block(int addr) } #endif memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(constmap[i])); break; case CJUMP: if((opcode[i-1]&0x3E)==4) // BEQ/BNE @@ -7497,7 +7950,7 @@ int new_recompile_block(int addr) branch_regs[i-1].isconst=0; branch_regs[i-1].wasconst=0; memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(constmap[i])); } else if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ @@ -7522,7 +7975,7 @@ int new_recompile_block(int addr) branch_regs[i-1].isconst=0; branch_regs[i-1].wasconst=0; memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(constmap[i])); } else // Alloc the delay slot in case the branch is taken @@ -7576,7 +8029,7 @@ int new_recompile_block(int addr) branch_regs[i-1].isconst=0; branch_regs[i-1].wasconst=0; memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); - memcpy(constmap[i],constmap[i-1],sizeof(current_constmap)); + memcpy(constmap[i],constmap[i-1],sizeof(constmap[i])); } else // Alloc the delay slot in case the branch is taken @@ -7600,7 +8053,7 @@ int new_recompile_block(int addr) break; } - if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000) + if (is_ujump(i-1)) { if(rt1[i-1]==31) // JAL/JALR { @@ -7649,12 +8102,10 @@ int new_recompile_block(int addr) #if !defined(DRC_DBG) else if(itype[i]==C2OP&>e_cycletab[source[i]&0x3f]>2) { - // GTE runs in parallel until accessed, divide by 2 for a rough guess - cc+=gte_cycletab[source[i]&0x3f]/2; - } - else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues - { - cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER) + // this should really be removed since the real stalls have been implemented, + // but doing so causes sizeable perf regression against the older version + u_int gtec = gte_cycletab[source[i] & 0x3f]; + cc += HACK_ENABLED(NDHACK_GTE_NO_STALL) ? gtec/2 : 2; } else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i]) { @@ -7662,7 +8113,8 @@ int new_recompile_block(int addr) } else if(itype[i]==C2LS) { - cc+=4; + // same as with C2OP + cc += HACK_ENABLED(NDHACK_GTE_NO_STALL) ? 4 : 2; } #endif else @@ -7673,7 +8125,7 @@ int new_recompile_block(int addr) if(!is_ds[i]) { regs[i].dirty=current.dirty; regs[i].isconst=current.isconst; - memcpy(constmap[i],current_constmap,sizeof(current_constmap)); + memcpy(constmap[i],current_constmap,sizeof(constmap[i])); } for(hr=0;hr=0) { @@ -7714,7 +8166,7 @@ int new_recompile_block(int addr) } } // Conditional branch may need registers for following instructions - if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) + if (!is_ujump(i)) { if(i0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) { - if((regmap_pre[i][hr]>0&®map_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1))) { + if((regmap_pre[i][hr]>0&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1))) { if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<0&®s[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) { + if((regs[i].regmap_entry[hr]>0&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) { if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<>16)!=0x1000) + if (!is_ujump(i)) { if(likely[i]) { regs[i].regmap[hr]=-1; @@ -7837,12 +8281,7 @@ int new_recompile_block(int addr) } if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP) { - int d1=0,d2=0,map=0,temp=0; - if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0) - { - d1=dep1[i+1]; - d2=dep2[i+1]; - } + int map=0,temp=0; if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2 map=INVCP; @@ -7853,8 +8292,6 @@ int new_recompile_block(int addr) if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] && (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] && (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] && - (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] && - (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 && regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] && (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP && regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL && @@ -7866,8 +8303,6 @@ int new_recompile_block(int addr) if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] && (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] && (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] && - (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] && - (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 && branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] && (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP && branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL && @@ -7876,7 +8311,7 @@ int new_recompile_block(int addr) { branch_regs[i].regmap[hr]=-1; branch_regs[i].regmap_entry[hr]=-1; - if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) + if (!is_ujump(i)) { if(!likely[i]&&i0) { - int d1=0,d2=0,map=-1,temp=-1; - if(get_reg(regs[i].regmap,rt1[i]|64)>=0) - { - d1=dep1[i]; - d2=dep2[i]; - } + int map=-1,temp=-1; if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2 map=INVCP; @@ -7905,15 +8335,13 @@ int new_recompile_block(int addr) itype[i]==C1LS || itype[i]==C2LS) temp=FTEMP; if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] && - (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] && - (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 && regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] && (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map && (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG)) { if(i0) if(regmap_pre[i+1][hr]!=regs[i].regmap[hr]) { SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]); @@ -7928,8 +8356,8 @@ int new_recompile_block(int addr) } } } - } - } + } // if needed + } // for hr } /* Pass 5 - Pre-allocate registers */ @@ -7956,12 +8384,7 @@ int new_recompile_block(int addr) if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated for(hr=0;hr64) { - if(!((regs[i].dirty>>hr)&1)) - f_regmap[hr]=regs[i].regmap[hr]; - else f_regmap[hr]=-1; - } - else if(regs[i].regmap[hr]>=0) { + if(regs[i].regmap[hr]>=0) { if(f_regmap[hr]!=regs[i].regmap[hr]) { // dealloc old register int n; @@ -7973,12 +8396,7 @@ int new_recompile_block(int addr) f_regmap[hr]=regs[i].regmap[hr]; } } - if(branch_regs[i].regmap[hr]>64) { - if(!((branch_regs[i].dirty>>hr)&1)) - f_regmap[hr]=branch_regs[i].regmap[hr]; - else f_regmap[hr]=-1; - } - else if(branch_regs[i].regmap[hr]>=0) { + if(branch_regs[i].regmap[hr]>=0) { if(f_regmap[hr]!=branch_regs[i].regmap[hr]) { // dealloc old register int n; @@ -8084,7 +8502,7 @@ int new_recompile_block(int addr) branch_regs[i].dirty|=(1<>16)!=0x1000) { + if (!is_ujump(i)) { regmap_pre[i+2][hr]=f_regmap[hr]; regs[i+2].wasdirty&=~(1<>16)!=0x1000) { + if (!is_ujump(k)) { regmap_pre[k+2][hr]=f_regmap[hr]; regs[k+2].wasdirty&=~(1<>16)==0x1000) + if (is_ujump(j)) { // Stop on unconditional branch break; @@ -8161,11 +8579,7 @@ int new_recompile_block(int addr) for(hr=0;hr64) { - if(!((regs[i].dirty>>hr)&1)) - f_regmap[hr]=regs[i].regmap[hr]; - } - else if(regs[i].regmap[hr]>=0) { + if(regs[i].regmap[hr]>=0) { if(f_regmap[hr]!=regs[i].regmap[hr]) { // dealloc old register int n; @@ -8443,6 +8857,7 @@ int new_recompile_block(int addr) #ifdef __arm__ printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]); #endif + #if defined(__i386__) || defined(__x86_64__) printf("needs: "); if(needed_reg[i]&1) printf("eax "); if((needed_reg[i]>>1)&1) printf("ecx "); @@ -8452,7 +8867,6 @@ int new_recompile_block(int addr) if((needed_reg[i]>>6)&1) printf("esi "); if((needed_reg[i]>>7)&1) printf("edi "); printf("\n"); - #if defined(__i386__) || defined(__x86_64__) printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]); printf("dirty: "); if(regs[i].wasdirty&1) printf("eax "); @@ -8519,7 +8933,7 @@ int new_recompile_block(int addr) if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]); if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]); #endif - #ifdef __arm__ + #if defined(__arm__) || defined(__aarch64__) int r; for (r = 0; r < ARRAY_SIZE(constmap[i]); r++) if ((regs[i].isconst >> r) & 1) @@ -8569,7 +8983,7 @@ int new_recompile_block(int addr) void *instr_addr0_override = NULL; if (start == 0x80030000) { - // nasty hack for fastbios thing + // nasty hack for the fastbios thing // override block entry to this code instr_addr0_override = out; emit_movimm(start,0); @@ -8579,7 +8993,12 @@ int new_recompile_block(int addr) emit_writeword(0,&pcaddr); emit_writeword(0,&address); emit_cmp(0,1); + #ifdef __aarch64__ + emit_jeq(out + 4*2); + emit_far_jump(new_dyna_leave); + #else emit_jne(new_dyna_leave); + #endif } for(i=0;i>16)!=0x1000)) + if (i < 2 || !is_ujump(i-2)) { wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,unneeded_reg[i]); } @@ -8603,7 +9022,7 @@ int new_recompile_block(int addr) } #endif // write back - if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000)) + if (i < 2 || !is_ujump(i-2)) { wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,unneeded_reg[i]); loop_preload(regmap_pre[i],regs[i].regmap_entry); @@ -8695,17 +9114,17 @@ int new_recompile_block(int addr) case SPAN: pagespan_assemble(i,®s[i]);break; } - if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000) + if (is_ujump(i)) literal_pool(1024); else literal_pool_jumpover(256); } } - //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000); + //assert(is_ujump(i-2)); // If the block did not end with an unconditional branch, // add a jump to the next instruction. if(i>1) { - if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) { + if(!is_ujump(i-2)&&itype[i-1]!=SPAN) { assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP); assert(i==slen); if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP) { @@ -8848,8 +9267,8 @@ int new_recompile_block(int addr) // If we're within 256K of the end of the buffer, // start over from the beginning. (Is 256K enough?) - if (out > translation_cache+(1< ndrc->translation_cache + sizeof(ndrc->translation_cache) - MAX_OUTPUT_BLOCK_SIZE) + out = ndrc->translation_cache; // Trap writes to any of the pages we compiled for(i=start>>12;i<=(start+slen*4)>>12;i++) { @@ -8866,11 +9285,11 @@ int new_recompile_block(int addr) /* Pass 10 - Free memory by expiring oldest blocks */ - int end=(((out-translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535; + int end=(((out-ndrc->translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535; while(expirep!=end) { int shift=TARGET_SIZE_2-3; // Divide into 8 blocks - uintptr_t base=(uintptr_t)translation_cache+((expirep>>13)<translation_cache+((expirep>>13)<>11)&3) { @@ -8908,10 +9327,8 @@ int new_recompile_block(int addr) break; case 3: // Clear jump_out - #if defined(__arm__) || defined(__aarch64__) if((expirep&2047)==0) do_clear_cache(); - #endif ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift); ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift); break;