drc: new far call mechanism
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h"
39 #include "../psxinterpreter.h"
40 #include "emu_if.h" //emulator interface
41
42 #define noinline __attribute__((noinline,noclone))
43 #ifndef ARRAY_SIZE
44 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
45 #endif
46
47 //#define DISASM
48 //#define assem_debug printf
49 //#define inv_debug printf
50 #define assem_debug(...)
51 #define inv_debug(...)
52
53 #ifdef __i386__
54 #include "assem_x86.h"
55 #endif
56 #ifdef __x86_64__
57 #include "assem_x64.h"
58 #endif
59 #ifdef __arm__
60 #include "assem_arm.h"
61 #endif
62 #ifdef __aarch64__
63 #include "assem_arm64.h"
64 #endif
65
66 #define MAXBLOCK 4096
67 #define MAX_OUTPUT_BLOCK_SIZE 262144
68
69 struct ndrc_mem
70 {
71   u_char translation_cache[1 << TARGET_SIZE_2];
72   struct
73   {
74     struct tramp_insns ops[2048 / sizeof(struct tramp_insns)];
75     const void *f[2048 / sizeof(void *)];
76   } tramp;
77 };
78
79 #ifdef BASE_ADDR_DYNAMIC
80 static struct ndrc_mem *ndrc;
81 #else
82 static struct ndrc_mem ndrc_ __attribute__((aligned(4096)));
83 static struct ndrc_mem *ndrc = &ndrc_;
84 #endif
85
86 // stubs
87 enum stub_type {
88   CC_STUB = 1,
89   FP_STUB = 2,
90   LOADB_STUB = 3,
91   LOADH_STUB = 4,
92   LOADW_STUB = 5,
93   LOADD_STUB = 6,
94   LOADBU_STUB = 7,
95   LOADHU_STUB = 8,
96   STOREB_STUB = 9,
97   STOREH_STUB = 10,
98   STOREW_STUB = 11,
99   STORED_STUB = 12,
100   STORELR_STUB = 13,
101   INVCODE_STUB = 14,
102 };
103
104 struct regstat
105 {
106   signed char regmap_entry[HOST_REGS];
107   signed char regmap[HOST_REGS];
108   uint64_t wasdirty;
109   uint64_t dirty;
110   uint64_t u;
111   u_int wasconst;
112   u_int isconst;
113   u_int loadedconst;             // host regs that have constants loaded
114   u_int waswritten;              // MIPS regs that were used as store base before
115 };
116
117 // note: asm depends on this layout
118 struct ll_entry
119 {
120   u_int vaddr;
121   u_int reg_sv_flags;
122   void *addr;
123   struct ll_entry *next;
124 };
125
126 struct ht_entry
127 {
128   u_int vaddr[2];
129   void *tcaddr[2];
130 };
131
132 struct code_stub
133 {
134   enum stub_type type;
135   void *addr;
136   void *retaddr;
137   u_int a;
138   uintptr_t b;
139   uintptr_t c;
140   u_int d;
141   u_int e;
142 };
143
144 struct link_entry
145 {
146   void *addr;
147   u_int target;
148   u_int ext;
149 };
150
151   // used by asm:
152   u_char *out;
153   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
154   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
155   struct ll_entry *jump_dirty[4096];
156
157   static struct ll_entry *jump_out[4096];
158   static u_int start;
159   static u_int *source;
160   static char insn[MAXBLOCK][10];
161   static u_char itype[MAXBLOCK];
162   static u_char opcode[MAXBLOCK];
163   static u_char opcode2[MAXBLOCK];
164   static u_char bt[MAXBLOCK];
165   static u_char rs1[MAXBLOCK];
166   static u_char rs2[MAXBLOCK];
167   static u_char rt1[MAXBLOCK];
168   static u_char rt2[MAXBLOCK];
169   static u_char dep1[MAXBLOCK];
170   static u_char dep2[MAXBLOCK];
171   static u_char lt1[MAXBLOCK];
172   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
173   static uint64_t gte_rt[MAXBLOCK];
174   static uint64_t gte_unneeded[MAXBLOCK];
175   static u_int smrv[32]; // speculated MIPS register values
176   static u_int smrv_strong; // mask or regs that are likely to have correct values
177   static u_int smrv_weak; // same, but somewhat less likely
178   static u_int smrv_strong_next; // same, but after current insn executes
179   static u_int smrv_weak_next;
180   static int imm[MAXBLOCK];
181   static u_int ba[MAXBLOCK];
182   static char likely[MAXBLOCK];
183   static char is_ds[MAXBLOCK];
184   static char ooo[MAXBLOCK];
185   static uint64_t unneeded_reg[MAXBLOCK];
186   static uint64_t branch_unneeded_reg[MAXBLOCK];
187   static signed char regmap_pre[MAXBLOCK][HOST_REGS]; // pre-instruction i?
188   static uint64_t current_constmap[HOST_REGS];
189   static uint64_t constmap[MAXBLOCK][HOST_REGS];
190   static struct regstat regs[MAXBLOCK];
191   static struct regstat branch_regs[MAXBLOCK];
192   static signed char minimum_free_regs[MAXBLOCK];
193   static u_int needed_reg[MAXBLOCK];
194   static u_int wont_dirty[MAXBLOCK];
195   static u_int will_dirty[MAXBLOCK];
196   static int ccadj[MAXBLOCK];
197   static int slen;
198   static void *instr_addr[MAXBLOCK];
199   static struct link_entry link_addr[MAXBLOCK];
200   static int linkcount;
201   static struct code_stub stubs[MAXBLOCK*3];
202   static int stubcount;
203   static u_int literals[1024][2];
204   static int literalcount;
205   static int is_delayslot;
206   static char shadow[1048576]  __attribute__((aligned(16)));
207   static void *copy;
208   static int expirep;
209   static u_int stop_after_jal;
210 #ifndef RAM_FIXED
211   static uintptr_t ram_offset;
212 #else
213   static const uintptr_t ram_offset=0;
214 #endif
215
216   int new_dynarec_hacks;
217   int new_dynarec_did_compile;
218
219   extern int cycle_count; // ... until end of the timeslice, counts -N -> 0
220   extern int last_count;  // last absolute target, often = next_interupt
221   extern int pcaddr;
222   extern int pending_exception;
223   extern int branch_target;
224   extern uintptr_t mini_ht[32][2];
225   extern u_char restore_candidate[512];
226
227   /* registers that may be allocated */
228   /* 1-31 gpr */
229 #define LOREG 32 // lo
230 #define HIREG 33 // hi
231 //#define FSREG 34 // FPU status (FCSR)
232 #define CSREG 35 // Coprocessor status
233 #define CCREG 36 // Cycle count
234 #define INVCP 37 // Pointer to invalid_code
235 //#define MMREG 38 // Pointer to memory_map
236 //#define ROREG 39 // ram offset (if rdram!=0x80000000)
237 #define TEMPREG 40
238 #define FTEMP 40 // FPU temporary register
239 #define PTEMP 41 // Prefetch temporary register
240 //#define TLREG 42 // TLB mapping offset
241 #define RHASH 43 // Return address hash
242 #define RHTBL 44 // Return address hash table address
243 #define RTEMP 45 // JR/JALR address register
244 #define MAXREG 45
245 #define AGEN1 46 // Address generation temporary register
246 //#define AGEN2 47 // Address generation temporary register
247 //#define MGEN1 48 // Maptable address generation temporary register
248 //#define MGEN2 49 // Maptable address generation temporary register
249 #define BTREG 50 // Branch target temporary register
250
251   /* instruction types */
252 #define NOP 0     // No operation
253 #define LOAD 1    // Load
254 #define STORE 2   // Store
255 #define LOADLR 3  // Unaligned load
256 #define STORELR 4 // Unaligned store
257 #define MOV 5     // Move
258 #define ALU 6     // Arithmetic/logic
259 #define MULTDIV 7 // Multiply/divide
260 #define SHIFT 8   // Shift by register
261 #define SHIFTIMM 9// Shift by immediate
262 #define IMM16 10  // 16-bit immediate
263 #define RJUMP 11  // Unconditional jump to register
264 #define UJUMP 12  // Unconditional jump
265 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
266 #define SJUMP 14  // Conditional branch (regimm format)
267 #define COP0 15   // Coprocessor 0
268 #define COP1 16   // Coprocessor 1
269 #define C1LS 17   // Coprocessor 1 load/store
270 //#define FJUMP 18  // Conditional branch (floating point)
271 //#define FLOAT 19  // Floating point unit
272 //#define FCONV 20  // Convert integer to float
273 //#define FCOMP 21  // Floating point compare (sets FSREG)
274 #define SYSCALL 22// SYSCALL
275 #define OTHER 23  // Other
276 #define SPAN 24   // Branch/delay slot spans 2 pages
277 #define NI 25     // Not implemented
278 #define HLECALL 26// PCSX fake opcodes for HLE
279 #define COP2 27   // Coprocessor 2 move
280 #define C2LS 28   // Coprocessor 2 load/store
281 #define C2OP 29   // Coprocessor 2 operation
282 #define INTCALL 30// Call interpreter to handle rare corner cases
283
284   /* branch codes */
285 #define TAKEN 1
286 #define NOTTAKEN 2
287 #define NULLDS 3
288
289 #define DJT_1 (void *)1l // no function, just a label in assem_debug log
290 #define DJT_2 (void *)2l
291
292 // asm linkage
293 int new_recompile_block(u_int addr);
294 void *get_addr_ht(u_int vaddr);
295 void invalidate_block(u_int block);
296 void invalidate_addr(u_int addr);
297 void remove_hash(int vaddr);
298 void dyna_linker();
299 void dyna_linker_ds();
300 void verify_code();
301 void verify_code_ds();
302 void cc_interrupt();
303 void fp_exception();
304 void fp_exception_ds();
305 void jump_to_new_pc();
306 void new_dyna_leave();
307
308 // Needed by assembler
309 static void wb_register(signed char r,signed char regmap[],uint64_t dirty);
310 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty);
311 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr);
312 static void load_all_regs(signed char i_regmap[]);
313 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
314 static void load_regs_entry(int t);
315 static void load_all_consts(signed char regmap[],u_int dirty,int i);
316
317 static int verify_dirty(const u_int *ptr);
318 static int get_final_value(int hr, int i, int *value);
319 static void add_stub(enum stub_type type, void *addr, void *retaddr,
320   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
321 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
322   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist);
323 static void add_to_linker(void *addr, u_int target, int ext);
324 static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override);
325 static void *get_direct_memhandler(void *table, u_int addr,
326   enum stub_type type, uintptr_t *addr_host);
327 static void pass_args(int a0, int a1);
328 static void emit_far_jump(const void *f);
329 static void emit_far_call(const void *f);
330
// Toggle the translation cache region [start,end) between writable and
// executable on platforms that forbid W+X mappings (NO_WRITE_EXEC);
// a no-op everywhere else. is_x != 0 -> executable, else writable.
static void mprotect_w_x(void *start, void *end, int is_x)
{
#ifdef NO_WRITE_EXEC
  #if defined(VITA)
  // *Open* enables write on all memory that was
  // allocated by sceKernelAllocMemBlockForVM()?
  if (is_x)
    sceKernelCloseVMDomain();
  else
    sceKernelOpenVMDomain();
  #else
  u_long mstart = (u_long)start & ~4095ul;  // round start down to a 4K page boundary
  u_long mend = (u_long)end;
  if (mprotect((void *)mstart, mend - mstart,
               PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
    SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
  #endif
#endif
}
350
// Make [start,end) of the translation cache writable before emitting code.
static void start_tcache_write(void *start, void *end)
{
  mprotect_w_x(start, end, 0);
}
355
// Finish writing generated code: flush/invalidate the instruction cache
// for [start,end) (mechanism differs per platform), then flip the region
// back to executable.
static void end_tcache_write(void *start, void *end)
{
#ifdef __arm__
  size_t len = (char *)end - (char *)start;
  #if   defined(__BLACKBERRY_QNX__)
  msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
  #elif defined(__MACH__)
  sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
  #elif defined(VITA)
  sceKernelSyncVMDomain(sceBlock, start, len);
  #elif defined(_3DS)
  ctr_flush_invalidate_cache();
  #else
  __clear_cache(start, end);
  #endif
  (void)len;  // some branches above don't use it
#else
  __clear_cache(start, end);
#endif

  mprotect_w_x(start, end, 1);
}
378
// Open a write window of up to MAX_OUTPUT_BLOCK_SIZE bytes at the current
// output pointer, clamped to the end of the translation cache.
// Returns the start of the window (== out).
static void *start_block(void)
{
  u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
  if (end > ndrc->translation_cache + sizeof(ndrc->translation_cache))
    end = ndrc->translation_cache + sizeof(ndrc->translation_cache);
  start_tcache_write(out, end);
  return out;
}
387
// Close the window opened by start_block(): flush caches and re-protect
// everything actually emitted, i.e. [start, out).
static void end_block(void *start)
{
  end_tcache_write(start, out);
}
392
393 //#define DEBUG_CYCLE_COUNT 1
394
395 #define NO_CYCLE_PENALTY_THR 12
396
int cycle_multiplier; // 100 for 1.0

// Scale a cycle count by cycle_multiplier (a percentage), rounding the
// magnitude to the nearest whole cycle (half away from zero).
static int CLOCK_ADJUST(int x)
{
  int rounding = (x < 0) ? -50 : 50;
  return (x * cycle_multiplier + rounding) / 100;
}
404
405 static u_int get_page(u_int vaddr)
406 {
407   u_int page=vaddr&~0xe0000000;
408   if (page < 0x1000000)
409     page &= ~0x0e00000; // RAM mirrors
410   page>>=12;
411   if(page>2048) page=2048+(page&2047);
412   return page;
413 }
414
// no virtual mem in PCSX
// The "virtual" page is just the physical page (no TLB to consider).
static u_int get_vpage(u_int vaddr)
{
  return get_page(vaddr);
}
420
// Select the hash table bin for a virtual address by XOR-folding the
// top halfword into the bottom one.
static struct ht_entry *hash_table_get(u_int vaddr)
{
  return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
}
425
// Insert (vaddr -> tcaddr) into slot 0 of the bin; the previous slot-0
// entry is demoted to slot 1 and the old slot-1 entry is dropped.
static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
{
  ht_bin->vaddr[1] = ht_bin->vaddr[0];
  ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
  ht_bin->vaddr[0] = vaddr;
  ht_bin->tcaddr[0] = tcaddr;
}
433
// some messy ari64's code, seems to rely on unsigned 32bit overflow
// Nonzero when tcaddr is far enough ahead of the output pointer that it
// won't be reclaimed by the cache-expiry sweep in the near future.
static int doesnt_expire_soon(void *tcaddr)
{
  u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
  return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
}
440
// Get address from virtual address
// This is called from the recompiled JR/JALR instructions
// Lookup order: clean blocks (jump_in), then dirty blocks (jump_dirty,
// restored if they verify), else recompile; on total failure raise an
// address-error style exception and jump to the exception vector.
void noinline *get_addr(u_int vaddr)
{
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  struct ll_entry *head;
  //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
  //printf("TRACE: count=%d next=%d (get_addr match %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // cache the hit in the hash table for the fast path (get_addr_ht)
      hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
      return head->addr;
    }
    head=head->next;
  }
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr))
      if (verify_dirty(head->addr)) {
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        invalid_code[vaddr>>12]=0;
        inv_code_start=inv_code_end=~0;
        if(vpage<2048) {
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        struct ht_entry *ht_bin = hash_table_get(vaddr);
        if (ht_bin->vaddr[0] == vaddr)
          ht_bin->tcaddr[0] = head->addr; // Replace existing entry
        else
          hash_table_add(ht_bin, vaddr, head->addr);

        return head->addr;
      }
    }
    head=head->next;
  }
  //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);  // compiled OK, retry the lookup
  // Execute in unmapped page, generate pagefault exception
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}
// Look up address in hash table first
// Fast path for translated-code lookup: check the two hash slots, fall
// back to the full get_addr() scan (which also recompiles) on miss.
void *get_addr_ht(u_int vaddr)
{
  //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
  const struct ht_entry *ht_bin = hash_table_get(vaddr);
  if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
  if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
  return get_addr(vaddr);
}
504
505 void clear_all_regs(signed char regmap[])
506 {
507   int hr;
508   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
509 }
510
// Return the host register currently mapped to guest register r,
// or -1 if r is not allocated. EXCLUDE_REG is never considered.
static signed char get_reg(const signed char regmap[],int r)
{
  int hr;
  for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
  return -1;
}
517
// Find a register that is available for two consecutive cycles
// i.e. one mapped to r in both regmap1 and regmap2; -1 if none.
static signed char get_reg2(signed char regmap1[], const signed char regmap2[], int r)
{
  int hr;
  for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
  return -1;
}
525
526 int count_free_regs(signed char regmap[])
527 {
528   int count=0;
529   int hr;
530   for(hr=0;hr<HOST_REGS;hr++)
531   {
532     if(hr!=EXCLUDE_REG) {
533       if(regmap[hr]<0) count++;
534     }
535   }
536   return count;
537 }
538
// Mark every host register holding guest register reg as dirty (needs
// writeback). reg 0 ($zero) is never marked.
void dirty_reg(struct regstat *cur,signed char reg)
{
  int hr;
  if(!reg) return;
  for (hr=0;hr<HOST_REGS;hr++) {
    if((cur->regmap[hr]&63)==reg) {  // &63 strips flag bits from the mapping
      cur->dirty|=1<<hr;
    }
  }
}
549
// Record that guest register reg holds the compile-time constant
// 'value': flag every host reg mapped to it and store the value in
// current_constmap. No-op for $zero.
void set_const(struct regstat *cur,signed char reg,uint64_t value)
{
  int hr;
  if(!reg) return;
  for (hr=0;hr<HOST_REGS;hr++) {
    if(cur->regmap[hr]==reg) {
      cur->isconst|=1<<hr;
      current_constmap[hr]=value;
    }
  }
}
561
// Forget any compile-time constant tracked for guest register reg
// (clears the isconst bit of every host reg mapped to it).
void clear_const(struct regstat *cur,signed char reg)
{
  int hr;
  if(!reg) return;
  for (hr=0;hr<HOST_REGS;hr++) {
    if((cur->regmap[hr]&63)==reg) {  // &63 strips flag bits from the mapping
      cur->isconst&=~(1<<hr);
    }
  }
}
572
// Is the value of guest register reg known at compile time?
// reg<0 -> 0; $zero -> 1 (always constant); otherwise return the
// isconst bit of the first host reg mapped to it, or 0 if unmapped.
int is_const(struct regstat *cur,signed char reg)
{
  int hr;
  if(reg<0) return 0;
  if(!reg) return 1;
  for (hr=0;hr<HOST_REGS;hr++) {
    if((cur->regmap[hr]&63)==reg) {
      return (cur->isconst>>hr)&1;
    }
  }
  return 0;
}
// Fetch the tracked compile-time constant of guest register reg
// ($zero -> 0). Aborts when reg has no mapping; callers are expected
// to have checked is_const() first.
uint64_t get_const(struct regstat *cur,signed char reg)
{
  int hr;
  if(!reg) return 0;
  for (hr=0;hr<HOST_REGS;hr++) {
    if(cur->regmap[hr]==reg) {
      return current_constmap[hr];
    }
  }
  SysPrintf("Unknown constant in r%d\n",reg);
  abort();
}
597
// Least soon needed registers
// Look at the next ten instructions and see which registers
// will be used.  Try not to reallocate these.
// On return, hsn[reg] holds the distance (in instructions) to that
// register's next use; smaller means needed sooner. Temporaries the
// assembler will need (CCREG, FTEMP, RHASH, ...) are pinned too.
void lsn(u_char hsn[], int i, int *preferred_reg)
{
  int j;
  int b=-1;
  // Find how far ahead we may scan without leaving the block
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
  }
  // Walk backwards so nearer uses overwrite farther ones
  for(;j>=0;j--)
  {
    if(rs1[i+j]) hsn[rs1[i+j]]=j;
    if(rs2[i+j]) hsn[rs2[i+j]]=j;
    if(rt1[i+j]) hsn[rt1[i+j]]=j;
    if(rt2[i+j]) hsn[rt2[i+j]]=j;
    if(itype[i+j]==STORE || itype[i+j]==STORELR) {
      // Stores can allocate zero
      hsn[rs1[i+j]]=j;
      hsn[rs2[i+j]]=j;
    }
    // On some architectures stores need invc_ptr
    #if defined(HOST_IMM8)
    if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
      hsn[INVCP]=j;
    }
    #endif
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
    {
      hsn[CCREG]=j;
      b=j;  // remember the nearest branch
    }
  }
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
        if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
        //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
        //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
      }
    }
    // TODO: preferred register based on backward branch
  }
  // Delay slot should preferably not overwrite branch conditions or cycle count
  if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)) {
    if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
    if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
    hsn[CCREG]=1;
    // ...or hash tables
    hsn[RHASH]=1;
    hsn[RHTBL]=1;
  }
  // Coprocessor load/store needs FTEMP, even if not declared
  if(itype[i]==C1LS||itype[i]==C2LS) {
    hsn[FTEMP]=0;
  }
  // Load L/R also uses FTEMP as a temporary register
  if(itype[i]==LOADLR) {
    hsn[FTEMP]=0;
  }
  // Also SWL/SWR/SDL/SDR
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
    hsn[FTEMP]=0;
  }
  // Don't remove the miniht registers
  if(itype[i]==UJUMP||itype[i]==RJUMP)
  {
    hsn[RHASH]=0;
    hsn[RHTBL]=0;
  }
}
686
// We only want to allocate registers if we're going to use them again soon
// Returns 1 if guest register r is read within the next few instructions
// (and not marked unneeded before that), 0 otherwise.
int needed_again(int r, int i)
{
  int j;
  int b=-1;
  int rn=10;  // distance to next use; 10 = "not needed"

  if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
  {
    if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
      return 0; // Don't need any registers if exiting the block
  }
  // Find how far ahead we may scan without leaving the block
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
    if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
    {
      break;
    }
  }
  // Walk backwards so the nearest use wins
  for(;j>=1;j--)
  {
    if(rs1[i+j]==r) rn=j;
    if(rs2[i+j]==r) rn=j;
    if((unneeded_reg[i+j]>>r)&1) rn=10;
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
    {
      b=j;
    }
  }
  /*
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int o=rn;
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(!((unneeded_reg[t+j]>>r)&1)) {
          if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
          if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
        }
        else rn=o;
      }
    }
  }*/
  if(rn<10) return 1;
  (void)b;
  return 0;
}
749
// Try to match register allocations at the end of a loop with those
// at the beginning
// If a backward branch within range targets an earlier point of this
// block, return the host reg that r occupies at the branch target's
// entry; otherwise return the suggested hr unchanged.
int loop_reg(int i, int r, int hr)
{
  int j,k;
  // Find how far ahead we may scan without leaving the block
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
  }
  k=0;
  if(i>0){
    // Start one insn earlier when i is a branch delay slot
    if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)
      k--;
  }
  for(;k<j;k++)
  {
    assert(r < 64);
    if((unneeded_reg[i+k]>>r)&1) return hr;
    if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP))
    {
      if(ba[i+k]>=start && ba[i+k]<(start+i*4))  // backward branch only
      {
        int t=(ba[i+k]-start)>>2;
        int reg=get_reg(regs[t].regmap_entry,r);
        if(reg>=0) return reg;
        //reg=get_reg(regs[t+1].regmap_entry,r);
        //if(reg>=0) return reg;
      }
    }
  }
  return hr;
}
791
792
// Allocate every register, preserving source/target regs
// Frees (unmaps and cleans) every host register except those holding
// the current instruction's sources/targets; mappings of $zero are
// freed too.
void alloc_all(struct regstat *cur,int i)
{
  int hr;

  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
         ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
      {
        cur->regmap[hr]=-1;
        cur->dirty&=~(1<<hr);
      }
      // Don't need zeros
      if((cur->regmap[hr]&63)==0)
      {
        cur->regmap[hr]=-1;
        cur->dirty&=~(1<<hr);
      }
    }
  }
}
815
#ifndef NDEBUG
// Debug-build bookkeeping for the assembler's temporary host register:
// catches nested acquisition with an assert.
static int host_tempreg_in_use;

static void host_tempreg_acquire(void)
{
  assert(!host_tempreg_in_use);
  host_tempreg_in_use = 1;
}

static void host_tempreg_release(void)
{
  host_tempreg_in_use = 0;
}
#else
// Release builds: no tracking.
static void host_tempreg_acquire(void) {}
static void host_tempreg_release(void) {}
#endif
833
#ifdef DRC_DBG
extern void gen_interupt();
extern void do_insn_cmp();
#define FUNCNAME(f) { f, " " #f }
// Address -> name table used only for debug disassembly/tracing.
static const struct {
  void *addr;
  const char *name;
} function_names[] = {
  FUNCNAME(cc_interrupt),
  FUNCNAME(gen_interupt),
  FUNCNAME(get_addr_ht),
  FUNCNAME(get_addr),
  FUNCNAME(jump_handler_read8),
  FUNCNAME(jump_handler_read16),
  FUNCNAME(jump_handler_read32),
  FUNCNAME(jump_handler_write8),
  FUNCNAME(jump_handler_write16),
  FUNCNAME(jump_handler_write32),
  FUNCNAME(invalidate_addr),
  FUNCNAME(jump_to_new_pc),
  FUNCNAME(new_dyna_leave),
  FUNCNAME(pcsx_mtc0),
  FUNCNAME(pcsx_mtc0_ds),
  FUNCNAME(do_insn_cmp),
#ifdef __arm__
  FUNCNAME(verify_code),
#endif
};

// Look up a human-readable name for a code address; "" when unknown.
static const char *func_name(const void *a)
{
  size_t i;
  // size_t index (was int): avoids signed/unsigned comparison with the
  // array size; use the file's ARRAY_SIZE helper for consistency.
  for (i = 0; i < ARRAY_SIZE(function_names); i++)
    if (function_names[i].addr == a)
      return function_names[i].name;
  return "";
}
#else
#define func_name(x) ""
#endif
874
875 #ifdef __i386__
876 #include "assem_x86.c"
877 #endif
878 #ifdef __x86_64__
879 #include "assem_x64.c"
880 #endif
881 #ifdef __arm__
882 #include "assem_arm.c"
883 #endif
884 #ifdef __aarch64__
885 #include "assem_arm64.c"
886 #endif
887
888 static void *get_trampoline(const void *f)
889 {
890   size_t i;
891
892   for (i = 0; i < ARRAY_SIZE(ndrc->tramp.f); i++) {
893     if (ndrc->tramp.f[i] == f || ndrc->tramp.f[i] == NULL)
894       break;
895   }
896   if (i == ARRAY_SIZE(ndrc->tramp.f)) {
897     SysPrintf("trampoline table is full, last func %p\n", f);
898     abort();
899   }
900   if (ndrc->tramp.f[i] == NULL) {
901     start_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
902     ndrc->tramp.f[i] = f;
903     end_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
904   }
905   return &ndrc->tramp.ops[i];
906 }
907
// Emit a jump to f, routing through a trampoline when f is outside the
// direct branch range of the emitted code.
static void emit_far_jump(const void *f)
{
  if (can_jump_or_call(f)) {
    emit_jmp(f);
    return;
  }

  f = get_trampoline(f);
  emit_jmp(f);
}
918
// Emit a call to f, routing through a trampoline when f is outside the
// direct call range of the emitted code.
static void emit_far_call(const void *f)
{
  if (can_jump_or_call(f)) {
    emit_call(f);
    return;
  }

  f = get_trampoline(f);
  emit_call(f);
}
929
930 // Add virtual address mapping to linked list
931 void ll_add(struct ll_entry **head,int vaddr,void *addr)
932 {
933   struct ll_entry *new_entry;
934   new_entry=malloc(sizeof(struct ll_entry));
935   assert(new_entry!=NULL);
936   new_entry->vaddr=vaddr;
937   new_entry->reg_sv_flags=0;
938   new_entry->addr=addr;
939   new_entry->next=*head;
940   *head=new_entry;
941 }
942
// Same as ll_add(), but also sets reg_sv_flags on the new head node.
void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
{
  ll_add(head,vaddr,addr);
  (*head)->reg_sv_flags=reg_sv_flags;
}
948
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
// Returns the translation-cache address, or 0 if not compiled (or too
// close to expiry). Hash table is consulted first, then jump_in.
void *check_addr(u_int vaddr)
{
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  size_t i;
  for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
    if (ht_bin->vaddr[i] == vaddr)
      if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
        if (isclean(ht_bin->tcaddr[i]))
          return ht_bin->tcaddr[i];
  }
  u_int page=get_page(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while (head != NULL) {
    if (head->vaddr == vaddr) {
      if (doesnt_expire_soon(head->addr)) {
        // Update existing entry with current address
        if (ht_bin->vaddr[0] == vaddr) {
          ht_bin->tcaddr[0] = head->addr;
          return head->addr;
        }
        if (ht_bin->vaddr[1] == vaddr) {
          ht_bin->tcaddr[1] = head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        if (ht_bin->vaddr[0] == -1) {
          ht_bin->vaddr[0] = vaddr;
          ht_bin->tcaddr[0] = head->addr;
        }
        else if (ht_bin->vaddr[1] == -1) {
          ht_bin->vaddr[1] = vaddr;
          ht_bin->tcaddr[1] = head->addr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  return 0;
}
994
// Remove vaddr from its hash table bin; a surviving slot-1 entry is
// promoted to slot 0 so the bin stays packed.
void remove_hash(int vaddr)
{
  //printf("remove hash: %x\n",vaddr);
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  if (ht_bin->vaddr[1] == vaddr) {
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
  if (ht_bin->vaddr[0] == vaddr) {
    ht_bin->vaddr[0] = ht_bin->vaddr[1];
    ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
}
1010
// Remove (free) all list entries whose translation-cache address falls
// in the (addr >> shift) bucket, or in it after backing off by
// MAX_OUTPUT_BLOCK_SIZE; their hash table entries are dropped too.
// Used when expiring old blocks from the cache.
void ll_remove_matching_addrs(struct ll_entry **head,uintptr_t addr,int shift)
{
  struct ll_entry *next;
  while(*head) {
    if(((uintptr_t)((*head)->addr)>>shift)==(addr>>shift) ||
       ((uintptr_t)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
    {
      inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr);
      remove_hash((*head)->vaddr);
      next=(*head)->next;
      free(*head);
      *head=next;  // unlink in place; stay on the same slot
    }
    else
    {
      head=&((*head)->next);
    }
  }
}
1030
1031 // Remove all entries from linked list
1032 void ll_clear(struct ll_entry **head)
1033 {
1034   struct ll_entry *cur;
1035   struct ll_entry *next;
1036   if((cur=*head)) {
1037     *head=0;
1038     while(cur) {
1039       next=cur->next;
1040       free(cur);
1041       cur=next;
1042     }
1043   }
1044 }
1045
1046 // Dereference the pointers and remove if it matches
// Dereference the pointers and remove if it matches
// For each jump_out entry, look up where its emitted jump currently
// points; if the target lies in the (addr>>shift) bucket that is being
// expired (allowing for MAX_OUTPUT_BLOCK_SIZE overlap), re-point the
// jump back at its own extjump stub so it re-resolves instead of
// jumping into freed translation cache.
static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift)
{
  while(head) {
    uintptr_t ptr = (uintptr_t)get_pointer(head->addr);
    inv_debug("EXP: Lookup pointer to %lx at %p (%x)\n",(long)ptr,head->addr,head->vaddr);
    if(((ptr>>shift)==(addr>>shift)) ||
       (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
    {
      inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr);
      void *host_addr=find_extjump_insn(head->addr);
      #if defined(__arm__) || defined(__aarch64__)
        // Patched code must be flushed from the icache before execution.
        mark_clear_cache(host_addr);
      #endif
      set_jump_target(host_addr, head->addr);
    }
    head=head->next;
  }
}
1065
1066 // This is called when we write to a compiled block (see do_invstub)
// Throw away all translated code for one 4K page: free the jump_in
// entries (removing their hash table references), then redirect every
// external jump recorded in jump_out back to its stub so that linked
// callers re-resolve instead of jumping into stale code.
static void invalidate_page(u_int page)
{
  struct ll_entry *head;
  struct ll_entry *next;
  head=jump_in[page];
  jump_in[page]=0;
  while(head!=NULL) {
    inv_debug("INVALIDATE: %x\n",head->vaddr);
    remove_hash(head->vaddr);
    next=head->next;
    free(head);
    head=next;
  }
  head=jump_out[page];
  jump_out[page]=0;
  while(head!=NULL) {
    inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr);
    // Unlink: point the emitted jump back at its extjump stub.
    void *host_addr=find_extjump_insn(head->addr);
    #if defined(__arm__) || defined(__aarch64__)
      mark_clear_cache(host_addr);
    #endif
    set_jump_target(host_addr, head->addr);
    next=head->next;
    free(head);
    head=next;
  }
}
1094
// Invalidate the page containing 'block' plus the pages [first,last]
// that dirty blocks were found to span, then mark the block's code as
// invalid so writes to it stop trapping.
static void invalidate_block_range(u_int block, u_int first, u_int last)
{
  u_int page=get_page(block<<12);
  //printf("first=%d last=%d\n",first,last);
  invalidate_page(page);
  assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
  assert(last<page+5);
  // Invalidate the adjacent pages if a block crosses a 4K boundary
  while(first<page) {
    invalidate_page(first);
    first++;
  }
  for(first=page+1;first<last;first++) {
    invalidate_page(first);
  }
  #if defined(__arm__) || defined(__aarch64__)
    // Flush all code patched by the invalidate_page calls above.
    do_clear_cache();
  #endif

  // Don't trap writes
  invalid_code[block]=1;

  #ifdef USE_MINI_HT
  memset(mini_ht,-1,sizeof(mini_ht));
  #endif
}
1121
// Invalidate all translated code for the 4K block at (block<<12).
// Scans jump_dirty for blocks overlapping this page to find how far the
// invalidation must extend (a compiled block can span page boundaries),
// then delegates to invalidate_block_range().
void invalidate_block(u_int block)
{
  u_int page=get_page(block<<12);
  u_int vpage=get_vpage(block<<12);
  inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
  //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
  u_int first,last;
  first=last=page;
  struct ll_entry *head;
  head=jump_dirty[vpage];
  //printf("page=%d vpage=%d\n",page,vpage);
  while(head!=NULL) {
    if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
      u_char *start, *end;
      get_bounds(head->addr, &start, &end);
      //printf("start: %p end: %p\n", start, end);
      if (page < 2048 && start >= rdram && end < rdram+RAM_SIZE) {
        // Widen [first,last] to cover every RAM page this block touches.
        if (((start-rdram)>>12) <= page && ((end-1-rdram)>>12) >= page) {
          if ((((start-rdram)>>12)&2047) < first) first = ((start-rdram)>>12)&2047;
          if ((((end-1-rdram)>>12)&2047) > last)  last = ((end-1-rdram)>>12)&2047;
        }
      }
    }
    head=head->next;
  }
  invalidate_block_range(block,first,last);
}
1149
// Handle a write into guest memory: invalidate any compiled block that
// covers 'addr'.  For RAM writes that hit no code, records the
// surrounding code-free range in inv_code_start/inv_code_end so the
// caller can cheaply skip future invalidations in that window.
void invalidate_addr(u_int addr)
{
  //static int rhits;
  // this check is done by the caller
  //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
  u_int page=get_vpage(addr);
  if(page<2048) { // RAM
    struct ll_entry *head;
    u_int addr_min=~0, addr_max=0;
    u_int mask=RAM_SIZE-1;
    u_int addr_main=0x80000000|(addr&mask);
    int pg1;
    // Optimistically assume the whole surrounding 4K window is free of
    // code; shrunk below as dirty blocks are found nearby.
    inv_code_start=addr_main&~0xfff;
    inv_code_end=addr_main|0xfff;
    pg1=page;
    if (pg1>0) {
      // must check previous page too because of spans..
      pg1--;
      inv_code_start-=0x1000;
    }
    for(;pg1<=page;pg1++) {
      for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
        u_char *start_h, *end_h;
        u_int start, end;
        get_bounds(head->addr, &start_h, &end_h);
        start = (uintptr_t)start_h - ram_offset;
        end = (uintptr_t)end_h - ram_offset;
        if(start<=addr_main&&addr_main<end) {
          // The write lands inside this block: widen the hit range.
          if(start<addr_min) addr_min=start;
          if(end>addr_max) addr_max=end;
        }
        else if(addr_main<start) {
          // Block lies above the write: clamp the code-free window.
          if(start<inv_code_end)
            inv_code_end=start-1;
        }
        else {
          // Block lies below the write: clamp the code-free window.
          if(end>inv_code_start)
            inv_code_start=end;
        }
      }
    }
    if (addr_min!=~0) {
      inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
      inv_code_start=inv_code_end=~0;
      invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
      return;
    }
    else {
      // No code hit: publish the code-free range (in guest address
      // space, preserving the caller's segment bits).
      inv_code_start=(addr&~mask)|(inv_code_start&mask);
      inv_code_end=(addr&~mask)|(inv_code_end&mask);
      inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
      return;
    }
  }
  invalidate_block(addr>>12);
}
1206
1207 // This is called when loading a save state.
1208 // Anything could have changed, so invalidate everything.
1209 void invalidate_all_pages()
1210 {
1211   u_int page;
1212   for(page=0;page<4096;page++)
1213     invalidate_page(page);
1214   for(page=0;page<1048576;page++)
1215     if(!invalid_code[page]) {
1216       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1217       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1218     }
1219   #ifdef USE_MINI_HT
1220   memset(mini_ht,-1,sizeof(mini_ht));
1221   #endif
1222 }
1223
// Emit the out-of-line invalidation stub for stubs[n]: save the live
// registers, call invalidate_addr() with the written address (stub
// field b holds the register carrying it, 0 meaning it is already in
// the first argument register), restore, and jump back.
static void do_invstub(int n)
{
  literal_pool(20);
  u_int reglist=stubs[n].a;
  set_jump_target(stubs[n].addr, out);
  save_regs(reglist);
  if(stubs[n].b!=0) emit_mov(stubs[n].b,0);
  emit_far_call(invalidate_addr);
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
}
1235
1236 // Add an entry to jump_out after making a link
1237 // src should point to code by emit_extjump2()
// Add an entry to jump_out after making a link
// src should point to code by emit_extjump2()
// Recording the link lets invalidate_page()/ll_kill_pointers() later
// unpatch this jump if the target block is invalidated or expired.
void add_link(u_int vaddr,void *src)
{
  u_int page=get_page(vaddr);
  inv_debug("add_link: %p -> %x (%d)\n",src,vaddr,page);
  check_extjump2(src);
  ll_add(jump_out+page,vaddr,src);
  //void *ptr=get_pointer(src);
  //inv_debug("add_link: Pointer is to %p\n",ptr);
}
1247
1248 // If a code block was found to be unmodified (bit was set in
1249 // restore_candidate) and it remains unmodified (bit is clear
1250 // in invalid_code) then move the entries for that 4K page from
1251 // the dirty list to the clean list.
void clean_blocks(u_int page)
{
  struct ll_entry *head;
  inv_debug("INV: clean_blocks page=%d\n",page);
  head=jump_dirty[page];
  while(head!=NULL) {
    if(!invalid_code[head->vaddr>>12]) {
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr)) {
        if(verify_dirty(head->addr)) {
          u_char *start, *end;
          //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr);
          u_int i;
          u_int inv=0;
          get_bounds(head->addr, &start, &end);
          if (start - rdram < RAM_SIZE) {
            // Block is in RAM: every page it spans must still be valid.
            for (i = (start-rdram+0x80000000)>>12; i <= (end-1-rdram+0x80000000)>>12; i++) {
              inv|=invalid_code[i];
            }
          }
          else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
            // Outside RAM (and not ROM/BIOS range) - don't restore.
            inv=1;
          }
          if(!inv) {
            void *clean_addr = get_clean_addr(head->addr);
            if (doesnt_expire_soon(clean_addr)) {
              u_int ppage=page;
              inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr);
              //printf("page=%x, addr=%x\n",page,head->vaddr);
              //assert(head->vaddr>>12==(page|0x80000));
              // Put the clean entry point back on the jump_in list and
              // refresh any hash table slots pointing at the dirty one.
              ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
              struct ht_entry *ht_bin = hash_table_get(head->vaddr);
              if (ht_bin->vaddr[0] == head->vaddr)
                ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
              if (ht_bin->vaddr[1] == head->vaddr)
                ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
            }
          }
        }
      }
    }
    head=head->next;
  }
}
1296
1297 /* Register allocation */
1298
1299 // Note: registers are allocated clean (unmodified state)
1300 // if you intend to modify the register, you must call dirty_reg().
static void alloc_reg(struct regstat *cur,int i,signed char reg)
{
  // Allocate a host register for guest register 'reg' at instruction i.
  // Search order: preferred register, any free register (preferring
  // ones not touched by the previous instruction), and finally evict a
  // register chosen by the "hopefully soon needed" heuristic (lsn).
  int r,hr;
  int preferred_reg = (reg&7);
  if(reg==CCREG) preferred_reg=HOST_CCREG;
  if(reg==PTEMP||reg==FTEMP) preferred_reg=12;

  // Don't allocate unused registers
  if((cur->u>>reg)&1) return;

  // see if it's already allocated
  for(hr=0;hr<HOST_REGS;hr++)
  {
    if(cur->regmap[hr]==reg) return;
  }

  // Keep the same mapping if the register was already allocated in a loop
  preferred_reg = loop_reg(i,reg,preferred_reg);

  // Try to allocate the preferred register
  if(cur->regmap[preferred_reg]==-1) {
    cur->regmap[preferred_reg]=reg;
    cur->dirty&=~(1<<preferred_reg);
    cur->isconst&=~(1<<preferred_reg);
    return;
  }
  r=cur->regmap[preferred_reg];
  assert(r < 64);
  // If the preferred register currently holds an unneeded value,
  // take it over.
  if((cur->u>>r)&1) {
    cur->regmap[preferred_reg]=reg;
    cur->dirty&=~(1<<preferred_reg);
    cur->isconst&=~(1<<preferred_reg);
    return;
  }

  // Clear any unneeded registers
  // We try to keep the mapping consistent, if possible, because it
  // makes branches easier (especially loops).  So we try to allocate
  // first (see above) before removing old mappings.  If this is not
  // possible then go ahead and clear out the registers that are no
  // longer needed.
  for(hr=0;hr<HOST_REGS;hr++)
  {
    r=cur->regmap[hr];
    if(r>=0) {
      assert(r < 64);
      if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;}
    }
  }
  // Try to allocate any available register, but prefer
  // registers that have not been used recently.
  if(i>0) {
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
        if(regs[i-1].regmap[hr]!=rs1[i-1]&&regs[i-1].regmap[hr]!=rs2[i-1]&&regs[i-1].regmap[hr]!=rt1[i-1]&&regs[i-1].regmap[hr]!=rt2[i-1]) {
          cur->regmap[hr]=reg;
          cur->dirty&=~(1<<hr);
          cur->isconst&=~(1<<hr);
          return;
        }
      }
    }
  }
  // Try to allocate any available register
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
      cur->regmap[hr]=reg;
      cur->dirty&=~(1<<hr);
      cur->isconst&=~(1<<hr);
      return;
    }
  }

  // Ok, now we have to evict someone
  // Pick a register we hopefully won't need soon
  u_char hsn[MAXREG+1];
  memset(hsn,10,sizeof(hsn));
  int j;
  lsn(hsn,i,&preferred_reg);
  //printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",cur->regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]);
  //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
  if(i>0) {
    // Don't evict the cycle count at entry points, otherwise the entry
    // stub will have to write it.
    if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
    if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP)) hsn[CCREG]=2;
    // Evict the register with the largest "not needed soon" score
    // first (j counts down from least-soon-needed).
    for(j=10;j>=3;j--)
    {
      // Alloc preferred register if available
      if(hsn[r=cur->regmap[preferred_reg]&63]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          // Evict both parts of a 64-bit register
          if((cur->regmap[hr]&63)==r) {
            cur->regmap[hr]=-1;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
          }
        }
        cur->regmap[preferred_reg]=reg;
        return;
      }
      for(r=1;r<=MAXREG;r++)
      {
        // Don't evict registers used by the previous instruction.
        if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) {
          for(hr=0;hr<HOST_REGS;hr++) {
            if(hr!=HOST_CCREG||j<hsn[CCREG]) {
              if(cur->regmap[hr]==r) {
                cur->regmap[hr]=reg;
                cur->dirty&=~(1<<hr);
                cur->isconst&=~(1<<hr);
                return;
              }
            }
          }
        }
      }
    }
  }
  // Last resort: evict anything, regardless of recent use.
  for(j=10;j>=0;j--)
  {
    for(r=1;r<=MAXREG;r++)
    {
      if(hsn[r]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(cur->regmap[hr]==r) {
            cur->regmap[hr]=reg;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
            return;
          }
        }
      }
    }
  }
  SysPrintf("This shouldn't happen (alloc_reg)");abort();
}
1437
1438 // Allocate a temporary register.  This is done without regard to
1439 // dirty status or whether the register we request is on the unneeded list
1440 // Note: This will only allocate one register, even if called multiple times
static void alloc_reg_temp(struct regstat *cur,int i,signed char reg)
{
  // Allocate a scratch host register (reg is typically -1).  Searches
  // from the top of the register file downward so temporaries tend not
  // to collide with the low registers preferred by alloc_reg().
  int r,hr;
  int preferred_reg = -1;

  // see if it's already allocated
  for(hr=0;hr<HOST_REGS;hr++)
  {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==reg) return;
  }

  // Try to allocate any available register
  for(hr=HOST_REGS-1;hr>=0;hr--) {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
      cur->regmap[hr]=reg;
      cur->dirty&=~(1<<hr);
      cur->isconst&=~(1<<hr);
      return;
    }
  }

  // Find an unneeded register
  for(hr=HOST_REGS-1;hr>=0;hr--)
  {
    r=cur->regmap[hr];
    if(r>=0) {
      assert(r < 64);
      if((cur->u>>r)&1) {
        // Only take it if it was already unneeded before this
        // instruction as well.
        if(i==0||((unneeded_reg[i-1]>>r)&1)) {
          cur->regmap[hr]=reg;
          cur->dirty&=~(1<<hr);
          cur->isconst&=~(1<<hr);
          return;
        }
      }
    }
  }

  // Ok, now we have to evict someone
  // Pick a register we hopefully won't need soon
  // TODO: we might want to follow unconditional jumps here
  // TODO: get rid of dupe code and make this into a function
  u_char hsn[MAXREG+1];
  memset(hsn,10,sizeof(hsn));
  int j;
  lsn(hsn,i,&preferred_reg);
  //printf("hsn: %d %d %d %d %d %d %d\n",hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
  if(i>0) {
    // Don't evict the cycle count at entry points, otherwise the entry
    // stub will have to write it.
    if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
    if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP)) hsn[CCREG]=2;
    for(j=10;j>=3;j--)
    {
      for(r=1;r<=MAXREG;r++)
      {
        // Don't evict registers used by the previous instruction.
        if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) {
          for(hr=0;hr<HOST_REGS;hr++) {
            if(hr!=HOST_CCREG||hsn[CCREG]>2) {
              if(cur->regmap[hr]==r) {
                cur->regmap[hr]=reg;
                cur->dirty&=~(1<<hr);
                cur->isconst&=~(1<<hr);
                return;
              }
            }
          }
        }
      }
    }
  }
  // Last resort: evict anything, regardless of recent use.
  for(j=10;j>=0;j--)
  {
    for(r=1;r<=MAXREG;r++)
    {
      if(hsn[r]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(cur->regmap[hr]==r) {
            cur->regmap[hr]=reg;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
            return;
          }
        }
      }
    }
  }
  SysPrintf("This shouldn't happen");abort();
}
1530
// Register allocation for a MOV-type instruction (rt = rs).
static void mov_alloc(struct regstat *current,int i)
{
  // Note: Don't need to actually alloc the source registers
  //alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rt1[i]);

  clear_const(current,rs1[i]);
  clear_const(current,rt1[i]);
  dirty_reg(current,rt1[i]);
}
1541
// Register allocation for shift-by-immediate (SLL/SRL/SRA).  Performs
// constant propagation when the source value is known.  The 64-bit
// variants do not exist on the PSX R3000 and are asserted out.
static void shiftimm_alloc(struct regstat *current,int i)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
      else lt1[i]=rs1[i];
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
      if(is_const(current,rs1[i])) {
        int v=get_const(current,rs1[i]);
        // NOTE(review): v<<imm[i] left-shifts a possibly negative int,
        // which is formally UB in C; relies on two's-complement
        // wrap-around behavior of the supported compilers.
        if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
        if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
        if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
      }
      else clear_const(current,rt1[i]);
    }
  }
  else
  {
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }

  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    assert(0);
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    assert(0);
  }
}
1583
// Register allocation for shift-by-register (SLLV/SRLV/SRAV).
static void shift_alloc(struct regstat *current,int i)
{
  if(rt1[i]) {
    if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
    {
      if(rs1[i]) alloc_reg(current,i,rs1[i]);
      if(rs2[i]) alloc_reg(current,i,rs2[i]);
      alloc_reg(current,i,rt1[i]);
      if(rt1[i]==rs2[i]) {
        // Destination aliases the shift-amount register; need a
        // scratch register to hold the amount.
        alloc_reg_temp(current,i,-1);
        minimum_free_regs[i]=1;
      }
    } else { // DSLLV/DSRLV/DSRAV
      // 64-bit shifts don't exist on the PSX R3000.
      assert(0);
    }
    clear_const(current,rs1[i]);
    clear_const(current,rs2[i]);
    clear_const(current,rt1[i]);
    dirty_reg(current,rt1[i]);
  }
}
1605
// Register allocation for three-operand ALU instructions
// (ADD/SUB/SLT/logical ops).  64-bit variants are asserted out.
static void alu_alloc(struct regstat *current,int i)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else {
        // One source is r0: only allocate a source if it is live later.
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
    }
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      alloc_reg(current,i,rs1[i]);
      alloc_reg(current,i,rs2[i]);
      alloc_reg(current,i,rt1[i]);
    }
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else
      {
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    assert(0);
  }
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  clear_const(current,rt1[i]);
  dirty_reg(current,rt1[i]);
}
1650
// Register allocation for immediate-operand instructions
// (ADDI/SLTI/ANDI/ORI/XORI/LUI), with constant propagation where the
// source value is known.
static void imm16_alloc(struct regstat *current,int i)
{
  if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  else lt1[i]=rs1[i];
  if(rt1[i]) alloc_reg(current,i,rt1[i]);
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    assert(0);
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
      if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
      if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
    }
    else clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      set_const(current,rt1[i],v+imm[i]);
    }
    else clear_const(current,rt1[i]);
  }
  else {
    // LUI always produces a known constant.
    set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
  }
  dirty_reg(current,rt1[i]);
}
1684
// Register allocation for load instructions (LB/LH/LW/LWL/LWR etc).
static void load_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
  if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  if(rt1[i]&&!((current->u>>rt1[i])&1)) {
    alloc_reg(current,i,rt1[i]);
    assert(get_reg(current->regmap,rt1[i])>=0);
    if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
    {
      assert(0);
    }
    else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      assert(0);
    }
    dirty_reg(current,rt1[i]);
    // LWL/LWR need a temporary register for the old value
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP);
      alloc_reg_temp(current,i,-1);
      minimum_free_regs[i]=1;
    }
  }
  else
  {
    // Load to r0 or unneeded register (dummy load)
    // but we still need a register to calculate the address
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
    }
    alloc_reg_temp(current,i,-1);
    minimum_free_regs[i]=1;
    if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      assert(0);
    }
  }
}
1727
// Register allocation for store instructions (SB/SH/SW/SWL/SWR).
void store_alloc(struct regstat *current,int i)
{
  clear_const(current,rs2[i]);
  if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rs2[i]);
  if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
    assert(0);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else alloc_reg(current,i,INVCP);
  #endif
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWL/SDL/SDR
    alloc_reg(current,i,FTEMP);
  }
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1748
// Register allocation for COP1 (FPU) load/store (LWC1/SWC1).  The PSX
// has no FPU, so these only need the status register and a data temp.
void c1ls_alloc(struct regstat *current,int i)
{
  //clear_const(current,rs1[i]); // FIXME
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,CSREG); // Status
  alloc_reg(current,i,FTEMP);
  if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
    assert(0);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
}
1767
// Register allocation for COP2 (GTE) load/store (LWC2/SWC2).
void c2ls_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,FTEMP);
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1782
1783 #ifndef multdiv_alloc
// Register allocation for MULT/MULTU/DIV/DIVU: results go to HI/LO.
// (Overridable per-arch via the multdiv_alloc macro guard.)
void multdiv_alloc(struct regstat *current,int i)
{
  //  case 0x18: MULT
  //  case 0x19: MULTU
  //  case 0x1A: DIV
  //  case 0x1B: DIVU
  //  case 0x1C: DMULT
  //  case 0x1D: DMULTU
  //  case 0x1E: DDIV
  //  case 0x1F: DDIVU
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  if(rs1[i]&&rs2[i])
  {
    if((opcode2[i]&4)==0) // 32-bit
    {
      // Force HI/LO to be treated as needed so they get allocated.
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      alloc_reg(current,i,HIREG);
      alloc_reg(current,i,LOREG);
      alloc_reg(current,i,rs1[i]);
      alloc_reg(current,i,rs2[i]);
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
    }
    else // 64-bit
    {
      assert(0);
    }
  }
  else
  {
    // Multiply by zero is zero.
    // MIPS does not have a divide by zero exception.
    // The result is undefined, we return zero.
    alloc_reg(current,i,HIREG);
    alloc_reg(current,i,LOREG);
    dirty_reg(current,HIREG);
    dirty_reg(current,LOREG);
  }
}
1825 #endif
1826
// Register allocation for COP0 instructions (MFC0/MTC0/RFE etc).
// These may take an interrupt path, so all registers are flushed.
void cop0_alloc(struct regstat *current,int i)
{
  if(opcode2[i]==0) // MFC0
  {
    if(rt1[i]) {
      clear_const(current,rt1[i]);
      alloc_all(current,i);
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
      alloc_all(current,i);
    }
    else {
      alloc_all(current,i); // FIXME: Keep r0
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
  }
  else
  {
    // TLBR/TLBWI/TLBWR/TLBP/ERET
    assert(opcode2[i]==0x10);
    alloc_all(current,i);
  }
  // All host registers are consumed by alloc_all above.
  minimum_free_regs[i]=HOST_REGS;
}
1859
// Register allocation for COP1/COP2 register moves (MFCz/CFCz/MTCz/CTCz).
static void cop12_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  if(opcode2[i]<3) // MFC1/CFC1
  {
    if(rt1[i]){
      clear_const(current,rt1[i]);
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
    alloc_reg_temp(current,i,-1);
  }
  else if(opcode2[i]>3) // MTC1/CTC1
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
    }
    else {
      // Source is r0: make r0 allocatable so it can be materialized.
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
    alloc_reg_temp(current,i,-1);
  }
  minimum_free_regs[i]=1;
}
1886
// Register allocation for a GTE command: only needs one scratch register.
void c2op_alloc(struct regstat *current,int i)
{
  alloc_reg_temp(current,i,-1);
}
1891
// Register allocation for SYSCALL/BREAK: exception entry needs the
// cycle count written back and every register flushed.
void syscall_alloc(struct regstat *current,int i)
{
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  alloc_all(current,i);
  minimum_free_regs[i]=HOST_REGS;
  current->isconst=0;
}
1900
// Register allocation for the instruction in a branch delay slot:
// dispatch to the per-type allocator.  A branch in a delay slot is not
// supported; speculative precompilation is disabled when one is seen.
void delayslot_alloc(struct regstat *current,int i)
{
  switch(itype[i]) {
    case UJUMP:
    case CJUMP:
    case SJUMP:
    case RJUMP:
    case SYSCALL:
    case HLECALL:
    case SPAN:
      assem_debug("jump in the delay slot.  this shouldn't happen.\n");//abort();
      SysPrintf("Disabled speculative precompilation\n");
      stop_after_jal=1;
      break;
    case IMM16:
      imm16_alloc(current,i);
      break;
    case LOAD:
    case LOADLR:
      load_alloc(current,i);
      break;
    case STORE:
    case STORELR:
      store_alloc(current,i);
      break;
    case ALU:
      alu_alloc(current,i);
      break;
    case SHIFT:
      shift_alloc(current,i);
      break;
    case MULTDIV:
      multdiv_alloc(current,i);
      break;
    case SHIFTIMM:
      shiftimm_alloc(current,i);
      break;
    case MOV:
      mov_alloc(current,i);
      break;
    case COP0:
      cop0_alloc(current,i);
      break;
    case COP1:
    case COP2:
      cop12_alloc(current,i);
      break;
    case C1LS:
      c1ls_alloc(current,i);
      break;
    case C2LS:
      c2ls_alloc(current,i);
      break;
    case C2OP:
      c2op_alloc(current,i);
      break;
    // Other itypes (e.g. NOP) need no allocation.
  }
}
1959
1960 // Special case where a branch and delay slot span two pages in virtual memory
// Special case where a branch and delay slot span two pages in virtual memory
static void pagespan_alloc(struct regstat *current,int i)
{
  // Everything is flushed since execution may leave the compiled block
  // mid-branch; only the branch's own operands stay allocated.
  current->isconst=0;
  current->wasconst=0;
  regs[i].wasconst=0;
  minimum_free_regs[i]=HOST_REGS;
  alloc_all(current,i);
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  if(opcode[i]==3) // JAL
  {
    // Link register.
    alloc_reg(current,i,31);
    dirty_reg(current,31);
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    alloc_reg(current,i,rs1[i]);
    if (rt1[i]!=0) {
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(rs2[i]) alloc_reg(current,i,rs2[i]);
  }
  else
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
  }
  //else ...
}
1995
1996 static void add_stub(enum stub_type type, void *addr, void *retaddr,
1997   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
1998 {
1999   assert(stubcount < ARRAY_SIZE(stubs));
2000   stubs[stubcount].type = type;
2001   stubs[stubcount].addr = addr;
2002   stubs[stubcount].retaddr = retaddr;
2003   stubs[stubcount].a = a;
2004   stubs[stubcount].b = b;
2005   stubs[stubcount].c = c;
2006   stubs[stubcount].d = d;
2007   stubs[stubcount].e = e;
2008   stubcount++;
2009 }
2010
// Convenience wrapper for stubs tied to instruction i: packs the instruction
// index, address register, register-state pointer, cycle adjustment and
// live-register list into add_stub's generic a..e payload slots.
static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
  int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist)
{
  add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
}
2016
2017 // Write out a single register
2018 static void wb_register(signed char r,signed char regmap[],uint64_t dirty)
2019 {
2020   int hr;
2021   for(hr=0;hr<HOST_REGS;hr++) {
2022     if(hr!=EXCLUDE_REG) {
2023       if((regmap[hr]&63)==r) {
2024         if((dirty>>hr)&1) {
2025           assert(regmap[hr]<64);
2026           emit_storereg(r,hr);
2027         }
2028       }
2029     }
2030   }
2031 }
2032
2033 static void wb_valid(signed char pre[],signed char entry[],u_int dirty_pre,u_int dirty,uint64_t u)
2034 {
2035   //if(dirty_pre==dirty) return;
2036   int hr,reg;
2037   for(hr=0;hr<HOST_REGS;hr++) {
2038     if(hr!=EXCLUDE_REG) {
2039       reg=pre[hr];
2040       if(((~u)>>(reg&63))&1) {
2041         if(reg>0) {
2042           if(((dirty_pre&~dirty)>>hr)&1) {
2043             if(reg>0&&reg<34) {
2044               emit_storereg(reg,hr);
2045             }
2046             else if(reg>=64) {
2047               assert(0);
2048             }
2049           }
2050         }
2051       }
2052     }
2053   }
2054 }
2055
// trashes r2
// Move two values into the first two argument registers (r0, r1).
// a0/a1 give the host registers currently holding each value; negative
// means "not in a register" and that argument is left alone.
static void pass_args(int a0, int a1)
{
  if (a0 == 1 && a1 == 0) {
    // values are crossed: rotate them through r2
    emit_mov(a0, 2);
    emit_mov(a1, 1);
    emit_mov(2, 0);
    return;
  }
  if (a0 != 0 && a1 == 0) {
    // a1 sits in r0, which a0 is about to claim - move a1 out first
    emit_mov(a1, 1);
    if (a0 >= 0)
      emit_mov(a0, 0);
    return;
  }
  // no conflict: place each value if valid and not already in position
  if (a0 > 0)
    emit_mov(a0, 0);
  if (a1 >= 0 && a1 != 1)
    emit_mov(a1, 1);
}
2072
// Assemble register-register ALU operations (SPECIAL encodings:
// ADD/ADDU/SUB/SUBU, SLT/SLTU, AND/OR/XOR/NOR).  i_regs describes the host
// register mapping at instruction i; writes to r0 or to a guest register
// without an allocated host register emit no code.
static void alu_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      signed char s1,s2,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      if(t>=0) {
        s1=get_reg(i_regs->regmap,rs1[i]);
        s2=get_reg(i_regs->regmap,rs2[i]);
        if(rs1[i]&&rs2[i]) {
          assert(s1>=0);
          assert(s2>=0);
          // bit 1 of the function code distinguishes SUB from ADD
          if(opcode2[i]&2) emit_sub(s1,s2,t);
          else emit_add(s1,s2,t);
        }
        else if(rs1[i]) {
          // rs2 is r0: rs1 +/- 0 is just a move
          if(s1>=0) emit_mov(s1,t);
          else emit_loadreg(rs1[i],t);
        }
        else if(rs2[i]) {
          // rs1 is r0: 0-rs2 negates, 0+rs2 moves
          if(s2>=0) {
            if(opcode2[i]&2) emit_neg(s2,t);
            else emit_mov(s2,t);
          }
          else {
            emit_loadreg(rs2[i],t);
            if(opcode2[i]&2) emit_neg(t,t);
          }
        }
        else emit_zeroreg(t); // r0 op r0 -> 0
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    assert(0); // 64-bit ALU ops are never generated here
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      signed char s1l,s2l,t;
      {
        t=get_reg(i_regs->regmap,rt1[i]);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs2[i]==0) // rx<r0
          {
            assert(s1l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_shrimm(s1l,31,t); // sign bit == (rx < 0)
            else // SLTU (unsigned can not be less than zero)
              emit_zeroreg(t);
          }
          else if(rs1[i]==0) // r0<rx
          {
            assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_gz32(s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz32(s2l,t);
          }
          else{
            // general case: both operands live in host registers
            assert(s1l>=0);assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_if_less32(s1l,s2l,t);
            else // SLTU
              emit_set_if_carry32(s1l,s2l,t);
          }
        }
      }
    }
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      signed char s1l,s2l,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      {
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs1[i]&&rs2[i]) {
            assert(s1l>=0);
            assert(s2l>=0);
            if(opcode2[i]==0x24) { // AND
              emit_and(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x25) { // OR
              emit_or(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_not(tl,tl);
            }
          }
          else
          {
            // at least one operand is r0
            if(opcode2[i]==0x24) { // AND
              emit_zeroreg(tl); // x & 0 == 0
            } else
            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
              // x|0 == x^0 == x: copy the non-zero operand
              if(rs1[i]){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
              }
              else emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              // ~(x|0) == ~x
              if(rs1[i]){
                if(s1l>=0) emit_not(s1l,tl);
                else {
                  emit_loadreg(rs1[i],tl);
                  emit_not(tl,tl);
                }
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_not(s2l,tl);
                else {
                  emit_loadreg(rs2[i],tl);
                  emit_not(tl,tl);
                }
              }
              else emit_movimm(-1,tl); // ~(0|0) == all ones
            }
          }
        }
      }
    }
  }
}
2211
// Assemble I-type (16-bit immediate) instructions: LUI, ADDI/ADDIU,
// DADDI/DADDIU, SLTI/SLTIU, ANDI/ORI/XORI.  Targets already satisfied by
// constant propagation (isconst bit set) emit nothing; a source that has
// no host register (s<0) is reloaded into the target register first.
void imm16_assemble(int i,struct regstat *i_regs)
{
  if (opcode[i]==0x0f) { // LUI
    if(rt1[i]) {
      signed char t;
      t=get_reg(i_regs->regmap,rt1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(!((i_regs->isconst>>t)&1))
          emit_movimm(imm[i]<<16,t);
      }
    }
  }
  if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      if(rs1[i]) {
        //assert(t>=0);
        //assert(s>=0);
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1)) {
            if(s<0) {
              // source not in a register: reload into t, then add in place
              if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
              emit_addimm(t,imm[i],t);
            }else{
              if(!((i_regs->wasconst>>s)&1))
                emit_addimm(s,imm[i],t);
              else
                // source was a known constant: fold the add at compile time
                emit_movimm(constmap[i][s]+imm[i],t);
            }
          }
        }
      } else {
        // rs1 is r0: ADDI degenerates to a load-immediate
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1))
            emit_movimm(imm[i],t);
        }
      }
    }
  }
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    if(rt1[i]) {
      signed char sl,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]) {
          assert(sl>=0);
          emit_addimm(sl,imm[i],tl);
        } else {
          emit_movimm(imm[i],tl);
        }
      }
    }
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    if(rt1[i]) {
      //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
      signed char sl,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(rs1[i]>0) {
            if(opcode[i]==0x0a) { // SLTI
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_slti32(t,imm[i],t);
              }else{
                emit_slti32(sl,imm[i],t);
              }
            }
            else { // SLTIU
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_sltiu32(t,imm[i],t);
              }else{
                emit_sltiu32(sl,imm[i],t);
              }
            }
        }else{
          // SLTI(U) with r0 is just stupid,
          // nonetheless examples can be found
          // (note: the first 'else' below binds to the inner if, the second
          //  to the SLTI/SLTIU choice - parsed as intended despite no braces)
          if(opcode[i]==0x0a) // SLTI
            if(0<imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          else // SLTIU
          {
            if(imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          }
        }
      }
    }
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(rt1[i]) {
      signed char sl,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
        if(opcode[i]==0x0c) //ANDI
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
              emit_andimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_andimm(sl,imm[i],tl);
              else
                // fold AND with a known-constant source
                emit_movimm(constmap[i][sl]&imm[i],tl);
            }
          }
          else
            emit_zeroreg(tl); // r0 & imm == 0
        }
        else
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
            }
            if(opcode[i]==0x0d) { // ORI
              if(sl<0) {
                emit_orimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_orimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]|imm[i],tl);
              }
            }
            if(opcode[i]==0x0e) { // XORI
              if(sl<0) {
                emit_xorimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_xorimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]^imm[i],tl);
              }
            }
          }
          else {
            // r0 | imm == r0 ^ imm == imm
            emit_movimm(imm[i],tl);
          }
        }
      }
    }
  }
}
2366
// Assemble shift-by-immediate instructions (SLL/SRL/SRA).  The 64-bit
// variants (DSLL/DSRL/DSRA and the *32 forms) assert: they are never
// generated for this 32-bit target.
void shiftimm_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0&&!((i_regs->isconst>>t)&1)){
        if(rs1[i]==0)
        {
          emit_zeroreg(t); // shifting r0 always yields zero
        }
        else
        {
          // source not in a register: reload it into t and shift in place
          if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
          if(imm[i]) {
            if(opcode2[i]==0) // SLL
            {
              emit_shlimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==2) // SRL
            {
              emit_shrimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==3) // SRA
            {
              emit_sarimm(s<0?t:s,imm[i],t);
            }
          }else{
            // Shift by zero
            if(s>=0 && s!=t) emit_mov(s,t);
          }
        }
      }
      //emit_storereg(rt1[i],t); //DEBUG
    }
  }
  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    assert(0);
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    assert(0);
  }
}
2423
2424 #ifndef shift_assemble
2425 static void shift_assemble(int i,struct regstat *i_regs)
2426 {
2427   signed char s,t,shift;
2428   if (rt1[i] == 0)
2429     return;
2430   assert(opcode2[i]<=0x07); // SLLV/SRLV/SRAV
2431   t = get_reg(i_regs->regmap, rt1[i]);
2432   s = get_reg(i_regs->regmap, rs1[i]);
2433   shift = get_reg(i_regs->regmap, rs2[i]);
2434   if (t < 0)
2435     return;
2436
2437   if(rs1[i]==0)
2438     emit_zeroreg(t);
2439   else if(rs2[i]==0) {
2440     assert(s>=0);
2441     if(s!=t) emit_mov(s,t);
2442   }
2443   else {
2444     host_tempreg_acquire();
2445     emit_andimm(shift,31,HOST_TEMPREG);
2446     switch(opcode2[i]) {
2447     case 4: // SLLV
2448       emit_shl(s,HOST_TEMPREG,t);
2449       break;
2450     case 6: // SRLV
2451       emit_shr(s,HOST_TEMPREG,t);
2452       break;
2453     case 7: // SRAV
2454       emit_sar(s,HOST_TEMPREG,t);
2455       break;
2456     default:
2457       assert(0);
2458     }
2459     host_tempreg_release();
2460   }
2461 }
2462
2463 #endif
2464
// Memory region classes returned by get_ptr_mem_type; the address ranges
// below mirror the comparisons done in that function.
enum {
  MTYPE_8000 = 0, // default: take the normal RAM-range check path
  MTYPE_8020,     // 0x80200000..0x807fffff RAM mirror
  MTYPE_0000,     // 0x00000000..0x001fffff zero-based RAM mirror
  MTYPE_A000,     // 0xa0000000..0xa01fffff KSEG1 RAM mirror
  MTYPE_1F80,     // 0x1f800000..0x1f800fff scratchpad / HW regs
};
2472
2473 static int get_ptr_mem_type(u_int a)
2474 {
2475   if(a < 0x00200000) {
2476     if(a<0x1000&&((start>>20)==0xbfc||(start>>24)==0xa0))
2477       // return wrong, must use memhandler for BIOS self-test to pass
2478       // 007 does similar stuff from a00 mirror, weird stuff
2479       return MTYPE_8000;
2480     return MTYPE_0000;
2481   }
2482   if(0x1f800000 <= a && a < 0x1f801000)
2483     return MTYPE_1F80;
2484   if(0x80200000 <= a && a < 0x80800000)
2485     return MTYPE_8020;
2486   if(0xa0000000 <= a && a < 0xa0200000)
2487     return MTYPE_A000;
2488   return MTYPE_8000;
2489 }
2490
// Emit the fast-path address check for the load/store at instruction i.
// Using the speculated memory region of rs1 (smrv) - or the region the
// current block runs from - the address may be remapped to the canonical
// RAM mirror in HOST_TEMPREG (reported via *addr_reg_override; the temp
// register stays acquired and is released by the caller).  Returns the
// location of the emitted conditional branch to be patched to the
// slow-path stub.
static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override)
{
  void *jaddr = NULL;
  int type=0;
  int mr=rs1[i];
  if(((smrv_strong|smrv_weak)>>mr)&1) {
    // speculation has a value for this register: use its region
    type=get_ptr_mem_type(smrv[mr]);
    //printf("set %08x @%08x r%d %d\n", smrv[mr], start+i*4, mr, type);
  }
  else {
    // use the mirror we are running on
    type=get_ptr_mem_type(start);
    //printf("set nospec   @%08x r%d %d\n", start+i*4, mr, type);
  }

  if(type==MTYPE_8020) { // RAM 80200000+ mirror
    host_tempreg_acquire();
    emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_0000) { // RAM 0 mirror
    host_tempreg_acquire();
    emit_orimm(addr,0x80000000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_A000) { // RAM A mirror
    host_tempreg_acquire();
    emit_andimm(addr,~0x20000000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_1F80) { // scratchpad
    if (psxH == (void *)0x1f800000) {
      // scratchpad is mapped at its guest address: range-check directly
      host_tempreg_acquire();
      emit_xorimm(addr,0x1f800000,HOST_TEMPREG);
      emit_cmpimm(HOST_TEMPREG,0x1000);
      host_tempreg_release();
      jaddr=out;
      emit_jc(0);
    }
    else {
      // do the usual RAM check, jump will go to the right handler
      type=0;
    }
  }

  if(type==0)
  {
    // generic path: branch to the stub when addr is outside RAM
    emit_cmpimm(addr,RAM_SIZE);
    jaddr=out;
    #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
    // Hint to branch predictor that the branch is unlikely to be taken
    if(rs1[i]>=28)
      emit_jno_unlikely(0);
    else
    #endif
      emit_jno(0);
    if(ram_offset!=0) {
      // RAM is not mapped at 0 on the host: bias the address
      host_tempreg_acquire();
      emit_addimm(addr,ram_offset,HOST_TEMPREG);
      addr=*addr_reg_override=HOST_TEMPREG;
    }
  }

  return jaddr;
}
2559
2560 // return memhandler, or get directly accessable address and return 0
2561 static void *get_direct_memhandler(void *table, u_int addr,
2562   enum stub_type type, uintptr_t *addr_host)
2563 {
2564   uintptr_t l1, l2 = 0;
2565   l1 = ((uintptr_t *)table)[addr>>12];
2566   if ((l1 & (1ul << (sizeof(l1)*8-1))) == 0) {
2567     uintptr_t v = l1 << 1;
2568     *addr_host = v + addr;
2569     return NULL;
2570   }
2571   else {
2572     l1 <<= 1;
2573     if (type == LOADB_STUB || type == LOADBU_STUB || type == STOREB_STUB)
2574       l2 = ((uintptr_t *)l1)[0x1000/4 + 0x1000/2 + (addr&0xfff)];
2575     else if (type == LOADH_STUB || type == LOADHU_STUB || type == STOREH_STUB)
2576       l2=((uintptr_t *)l1)[0x1000/4 + (addr&0xfff)/2];
2577     else
2578       l2=((uintptr_t *)l1)[(addr&0xfff)/4];
2579     if ((l2 & (1<<31)) == 0) {
2580       uintptr_t v = l2 << 1;
2581       *addr_host = v + (addr&0xfff);
2582       return NULL;
2583     }
2584     return (void *)(l2 << 1);
2585   }
2586 }
2587
// Assemble a load instruction (LB/LH/LW/LBU/LHU).  Emits the fast in-line
// memory access plus a branch to a slow-path stub for non-RAM addresses;
// when the address is a known constant outside RAM the access is inlined
// as a direct readstub call instead.  Loads to r0 (or whose target has no
// register but might hit I/O) are still performed into a temporary, since
// the read may have side effects (FIFO registers).
static void load_assemble(int i,struct regstat *i_regs)
{
  int s,tl,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rt1[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  offset=imm[i];
  // build the list of live host registers for the stub
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1;
    if (c) {
      // constant address: decide at compile time whether it hits RAM
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  // FIXME: Even if the load is a NOP, we should check for pagefaults...
  if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
    ||rt1[i]==0) {
      // could be FIFO, must perform the read
      // ||dummy read
      assem_debug("(forced read)\n");
      tl=get_reg(i_regs->regmap,-1);
      assert(tl>=0);
  }
  // pick the register holding the effective address
  if(offset||s<0||c) addr=tl;
  else addr=s;
  //if(tl<0) tl=get_reg(i_regs->regmap,-1);
 if(tl>=0) {
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
  reglist&=~(1<<tl);
  if(!c) {
    #ifdef R29_HACK
    // Strmnnrmn's speed hack
    if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
    #endif
    {
      jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
    }
  }
  else if(ram_offset&&memtarget) {
    // constant RAM address but RAM is biased on the host: apply the offset
    host_tempreg_acquire();
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastio_reg_override=HOST_TEMPREG;
  }
  int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
  if (opcode[i]==0x20) { // LB
    if(!c||memtarget) {
      if(!dummy) {
        {
          int x=0,a=tl;
          if(!c) a=addr;
          if(fastio_reg_override>=0) a=fastio_reg_override;

          emit_movsbl_indexed(x,a,tl);
        }
      }
      if(jaddr)
        add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x21) { // LH
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_movswl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x23) { // LW
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_readword_indexed(0,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x24) { // LBU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;

        emit_movzbl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x25) { // LHU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_movzwl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x27) { // LWU
    assert(0); // 64-bit load: never generated for this target
  }
  if (opcode[i]==0x37) { // LD
    assert(0); // 64-bit load: never generated for this target
  }
 }
 // release the temp acquired for the address override, if any
 if (fastio_reg_override == HOST_TEMPREG)
   host_tempreg_release();
}
2726
2727 #ifndef loadlr_assemble
// Assemble unaligned loads LWL/LWR (LDL/LDR assert: 64-bit only).
// temp receives the shift amount (byte offset * 8), temp2 the aligned
// word; the loaded word is shifted and merged into the existing target
// register contents per MIPS LWL/LWR semantics.
static void loadlr_assemble(int i,struct regstat *i_regs)
{
  int s,tl,temp,temp2,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rt1[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,-1);
  temp2=get_reg(i_regs->regmap,FTEMP);
  addr=get_reg(i_regs->regmap,AGEN1+(i&1));
  assert(addr<0); // the AGEN register must not be allocated here
  offset=imm[i];
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  reglist|=1<<temp;
  // NOTE(review): c is still 0 at this point (it is computed just below),
  // so this test only depends on offset/s.  addr is unused on the constant
  // path, so the ordering appears harmless - confirm intended.
  if(offset||s<0||c) addr=temp2;
  else addr=s;
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1;
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  if(!c) {
    // temp = byte-offset*8 (shift amount), temp2 = word-aligned address
    emit_shlimm(addr,3,temp);
    if (opcode[i]==0x22||opcode[i]==0x26) {
      emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR
    }else{
      emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR
    }
    jaddr=emit_fastpath_cmp_jump(i,temp2,&fastio_reg_override);
  }
  else {
    if(ram_offset&&memtarget) {
      host_tempreg_acquire();
      emit_addimm(temp2,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    // constant address: the shift amount is known at compile time
    if (opcode[i]==0x22||opcode[i]==0x26) {
      emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR
    }else{
      emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR
    }
  }
  if (opcode[i]==0x22||opcode[i]==0x26) { // LWL/LWR
    if(!c||memtarget) {
      int a=temp2;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_readword_indexed(0,a,temp2);
      if(fastio_reg_override==HOST_TEMPREG) host_tempreg_release();
      if(jaddr) add_stub_r(LOADW_STUB,jaddr,out,i,temp2,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj[i],reglist);
    if(rt1[i]) {
      assert(tl>=0);
      // merge the loaded word into tl: shift the word, clear the bytes it
      // replaces (bic with a shifted all-ones mask), then OR it in
      emit_andimm(temp,24,temp);
      if (opcode[i]==0x22) // LWL
        emit_xorimm(temp,24,temp);
      host_tempreg_acquire();
      emit_movimm(-1,HOST_TEMPREG);
      if (opcode[i]==0x26) {
        emit_shr(temp2,temp,temp2);
        emit_bic_lsr(tl,HOST_TEMPREG,temp,tl);
      }else{
        emit_shl(temp2,temp,temp2);
        emit_bic_lsl(tl,HOST_TEMPREG,temp,tl);
      }
      host_tempreg_release();
      emit_or(temp2,tl,tl);
    }
    //emit_storereg(rt1[i],tl); // DEBUG
  }
  if (opcode[i]==0x1A||opcode[i]==0x1B) { // LDL/LDR
    assert(0);
  }
}
2809 #endif
2810
// Assemble a store instruction (SB/SH/SW; SD asserts).  Emits the fast
// in-line write plus a slow-path stub for non-RAM addresses, then the
// self-modifying-code (invalid_code) check, and finally handles a store
// into the currently-executing block by flushing state and re-entering
// through get_addr_ht.
void store_assemble(int i,struct regstat *i_regs)
{
  int s,tl;
  int addr,temp;
  int offset;
  void *jaddr=0;
  // NOTE(review): type is only assigned for SB/SH/SW/SD below; other
  // opcodes would leave it uninitialized - callers appear to guarantee
  // this is only reached for store opcodes.
  enum stub_type type;
  int memtarget=0,c=0;
  int agr=AGEN1+(i&1);
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rs2[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1;
    if(c) {
      // constant address: decide at compile time whether it hits RAM
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  assert(temp>=0);
  // live host registers for the stub
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(offset||s<0||c) addr=temp;
  else addr=s;
  if(!c) {
    jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
  }
  else if(ram_offset&&memtarget) {
    // constant RAM address with host-side RAM bias
    host_tempreg_acquire();
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastio_reg_override=HOST_TEMPREG;
  }

  if (opcode[i]==0x28) { // SB
    if(!c||memtarget) {
      int x=0,a=temp;
      if(!c) a=addr;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writebyte_indexed(tl,x,a);
    }
    type=STOREB_STUB;
  }
  if (opcode[i]==0x29) { // SH
    if(!c||memtarget) {
      int x=0,a=temp;
      if(!c) a=addr;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writehword_indexed(tl,x,a);
    }
    type=STOREH_STUB;
  }
  if (opcode[i]==0x2B) { // SW
    if(!c||memtarget) {
      int a=addr;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
    type=STOREW_STUB;
  }
  if (opcode[i]==0x3F) { // SD
    assert(0); // 64-bit store: never generated for this target
    type=STORED_STUB;
  }
  if(fastio_reg_override==HOST_TEMPREG)
    host_tempreg_release();
  if(jaddr) {
    // PCSX store handlers don't check invcode again
    reglist|=1<<addr;
    add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    jaddr=0; // cleared: the second if(jaddr) below can no longer fire
  }
  // self-modifying code detection via the invalid_code bitmap
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
    if(!c||memtarget) {
      #ifdef DESTRUCTIVE_SHIFT
      // The x86 shift operation is 'destructive'; it overwrites the
      // source register, so we need to make a copy first and use that.
      addr=temp;
      #endif
      #if defined(HOST_IMM8)
      int ir=get_reg(i_regs->regmap,INVCP);
      assert(ir>=0);
      emit_cmpmem_indexedsr12_reg(ir,addr,1);
      #else
      emit_cmpmem_indexedsr12_imm(invalid_code,addr,1);
      #endif
      #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
      emit_callne(invalidate_addr_reg[addr]);
      #else
      void *jaddr2 = out;
      emit_jne(0);
      add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
      #endif
    }
  }
  // NOTE(review): constmap[i][s] is read even when !c (s may be -1 here);
  // the value is only used below when c is set - confirm the out-of-range
  // read is benign.
  u_int addr_val=constmap[i][s]+offset;
  if(jaddr) {
    // effectively dead: jaddr was consumed and zeroed above
    add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
  } else if(c&&!memtarget) {
    // constant non-RAM address: call the write handler directly
    inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
  }
  // basic current block modification detection..
  // not looking back as that should be in mips cache already
  // (see Spyro2 title->attract mode)
  if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
    SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
    assert(i_regs->regmap==regs[i].regmap); // not delay slot
    if(i_regs->regmap==regs[i].regmap) {
      // flush everything and restart execution after this store
      load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
      wb_dirtys(regs[i].regmap_entry,regs[i].wasdirty);
      emit_movimm(start+i*4+4,0);
      emit_writeword(0,&pcaddr);
      emit_addimm(HOST_CCREG,2,HOST_CCREG);
      emit_far_call(get_addr_ht);
      emit_jmpreg(0);
    }
  }
}
2934
// Assemble an unaligned store (SWL opcode 0x2a / SWR opcode 0x2e).
// Dispatches on the low two bits of the effective address and emits a
// separate byte/halfword/word store sequence for each of the four
// alignment cases.  SDL/SDR (0x2c/0x2d) are 64-bit and unsupported.
static void storelr_assemble(int i,struct regstat *i_regs)
{
  int s,tl;                    // host regs: base address (rs1), store data (rs2)
  int temp;                    // scratch host reg used for the address
  int offset;
  void *jaddr=0;               // slow-path branch, resolved via STORELR_STUB
  void *case1, *case2, *case3; // jump targets for the alignment cases
  void *done0, *done1, *done2; // jumps over the remaining cases
  int memtarget=0,c=0;         // c: address is constant; memtarget: const addr hits RAM
  int agr=AGEN1+(i&1);
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rs2[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    c=(i_regs->isconst>>s)&1;
    if(c) {
      // Constant base: resolve the RAM range check at assembly time
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  assert(temp>=0);
  if(!c) {
    // Dynamic address: range-check against RAM_SIZE and branch to the
    // stub when the access falls outside RAM
    emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
    if(!offset&&s!=temp) emit_mov(s,temp);
    jaddr=out;
    emit_jno(0);
  }
  else
  {
    // Constant address that misses RAM (or base r0): always take the stub
    if(!memtarget||!rs1[i]) {
      jaddr=out;
      emit_jmp(0);
    }
  }
  if(ram_offset)
    emit_addimm_no_flags(ram_offset,temp);

  if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
    assert(0);
  }

  // NOTE(review): the low address bits are XORed with 3 before the
  // stores below, and the emit_write*_indexed offsets assume this —
  // presumably converting MIPS byte-lane numbering to the host's
  // little-endian layout; verify against the STORELR stub.
  emit_xorimm(temp,3,temp);
  emit_testimm(temp,2);
  case2=out;
  emit_jne(0);
  emit_testimm(temp,1);
  case1=out;
  emit_jne(0);
  // 0
  if (opcode[i]==0x2A) { // SWL
    emit_writeword_indexed(tl,0,temp);
  }
  else if (opcode[i]==0x2E) { // SWR
    emit_writebyte_indexed(tl,3,temp);
  }
  else
    assert(0);
  done0=out;
  emit_jmp(0);
  // 1
  set_jump_target(case1, out);
  if (opcode[i]==0x2A) { // SWL
    // Write 3 msb into three least significant bytes
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writebyte_indexed(tl,1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);  // rotate back; tl must survive unchanged
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  done1=out;
  emit_jmp(0);
  // 2
  set_jump_target(case2, out);
  emit_testimm(temp,1);
  case3=out;
  emit_jne(0);
  if (opcode[i]==0x2A) { // SWL
    // Write two msb into two least significant bytes
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writehword_indexed(tl,-2,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(rs2[i]) emit_rorimm(tl,24,tl);
  }
  done2=out;
  emit_jmp(0);
  // 3
  set_jump_target(case3, out);
  if (opcode[i]==0x2A) { // SWL
    // Write msb into least significant byte
    if(rs2[i]) emit_rorimm(tl,24,tl);
    emit_writebyte_indexed(tl,-3,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  set_jump_target(done0, out);
  set_jump_target(done1, out);
  set_jump_target(done2, out);
  if(!c||!memtarget)
    add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
  // Self-modifying-code check: compare against invalid_code[] and
  // invalidate the written block on a hit (skipped when the SMC hack
  // is enabled or the base register was already checked).
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
    emit_addimm_no_flags(-ram_offset,temp);  // undo the ram_offset bias
    #if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,temp,1);
    #else
    emit_cmpmem_indexedsr12_imm(invalid_code,temp,1);
    #endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[temp]);
    #else
    void *jaddr2 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
    #endif
  }
}
3071
// Assemble a COP0 instruction: MFC0 (opcode2 0), MTC0 (opcode2 4) or
// RFE (opcode2 0x10).  MTC0 goes through the pcsx_mtc0 C helper; the
// cycle counter (Count) is synchronized around the call for the
// registers that depend on it, and writes to Status/Cause may take a
// pending interrupt immediately.
static void cop0_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]==0) // MFC0
  {
    signed char t=get_reg(i_regs->regmap,rt1[i]);
    u_int copr=(source[i]>>11)&0x1f;
    //assert(t>=0); // Why does this happen?  OOT is weird
    if(t>=0&&rt1[i]!=0) {
      emit_readword(&reg_cop0[copr],t);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    signed char s=get_reg(i_regs->regmap,rs1[i]);
    // copr is 0..31, so plain char signedness is harmless here
    char copr=(source[i]>>11)&0x1f;
    assert(s>=0);
    wb_register(rs1[i],i_regs->regmap,i_regs->dirty);
    if(copr==9||copr==11||copr==12||copr==13) {
      // These registers depend on the cycle count:
      // materialize Count = last_count + CCREG + ccadj before the call
      emit_readword(&last_count,HOST_TEMPREG);
      emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc
      emit_add(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
      emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
      emit_writeword(HOST_CCREG,&Count);
    }
    // What a mess.  The status register (12) can enable interrupts,
    // so needs a special case to handle a pending interrupt.
    // The interrupt must be taken immediately, because a subsequent
    // instruction might disable interrupts again.
    if(copr==12||copr==13) {
      if (is_delayslot) {
        // burn cycles to cause cc_interrupt, which will
        // reschedule next_interupt. Relies on CCREG from above.
        assem_debug("MTC0 DS %d\n", copr);
        emit_writeword(HOST_CCREG,&last_count);
        emit_movimm(0,HOST_CCREG);
        emit_storereg(CCREG,HOST_CCREG);
        emit_loadreg(rs1[i],1);
        emit_movimm(copr,0);
        emit_far_call(pcsx_mtc0_ds);
        emit_loadreg(rs1[i],s);
        return;
      }
      emit_movimm(start+i*4+4,HOST_TEMPREG);
      emit_writeword(HOST_TEMPREG,&pcaddr);
      emit_movimm(0,HOST_TEMPREG);
      emit_writeword(HOST_TEMPREG,&pending_exception);
    }
    // Call pcsx_mtc0(copr, value): r0 = reg number, r1 = value
    if(s==HOST_CCREG)
      emit_loadreg(rs1[i],1);
    else if(s!=1)
      emit_mov(s,1);
    emit_movimm(copr,0);
    emit_far_call(pcsx_mtc0);
    if(copr==9||copr==11||copr==12||copr==13) {
      // Resync CCREG and last_count from the possibly-rescheduled Count
      emit_readword(&Count,HOST_CCREG);
      emit_readword(&next_interupt,HOST_TEMPREG);
      emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
      emit_sub(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
      emit_writeword(HOST_TEMPREG,&last_count);
      emit_storereg(CCREG,HOST_CCREG);
    }
    if(copr==12||copr==13) {
      assert(!is_delayslot);
      // If the write raised an exception, look up the handler's
      // translated address and jump to it
      emit_readword(&pending_exception,14);
      emit_test(14,14);
      void *jaddr = out;
      emit_jeq(0);
      emit_readword(&pcaddr, 0);
      emit_addimm(HOST_CCREG,2,HOST_CCREG);
      emit_far_call(get_addr_ht);
      emit_jmpreg(0);
      set_jump_target(jaddr, out);
    }
    emit_loadreg(rs1[i],s);
  }
  else
  {
    assert(opcode2[i]==0x10);
    //if((source[i]&0x3f)==0x10) // RFE
    {
      // Status = (Status & ~0xf) | ((Status & 0x3c) >> 2):
      // pop the interrupt-enable/mode stack
      emit_readword(&Status,0);
      emit_andimm(0,0x3c,1);
      emit_andimm(0,~0xf,0);
      emit_orrshr_imm(1,2,0);
      emit_writeword(0,&Status);
    }
  }
}
3160
// The PSX has no FPU: emit an unconditional jump to an FP_STUB, which
// raises the coprocessor-unusable exception (see do_cop1stub).
static void cop1_unusable(int i,struct regstat *i_regs)
{
  // XXX: should just do the exception inline instead
  //if(!cop1_usable)
  {
    void *jaddr=out;
    emit_jmp(0);
    add_stub_r(FP_STUB,jaddr,out,i,0,i_regs,is_delayslot,0);
  }
}
3171
// COP1 arithmetic/move: always unusable on PSX, raise the exception.
static void cop1_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3176
// COP1 load/store (LWC1/SWC1): always unusable on PSX, raise the exception.
static void c1ls_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3181
// FP_STUB
// Out-of-line slow path for COP1 instructions: write back the current
// register state, load the faulting PC into EAX and jump to the
// C-level coprocessor-unusable exception handler.
static void do_cop1stub(int n)
{
  literal_pool(256);
  assem_debug("do_cop1stub %x\n",start+stubs[n].a*4);
  set_jump_target(stubs[n].addr, out);
  int i=stubs[n].a;              // instruction index within the block
//  int rs=stubs[n].b;
  struct regstat *i_regs=(struct regstat *)stubs[n].c;
  int ds=stubs[n].d;             // nonzero if the fault is in a delay slot
  if(!ds) {
    load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
    //if(i_regs!=&regs[i]) printf("oops: regs[i]=%x i_regs=%x",(int)&regs[i],(int)i_regs);
  }
  //else {printf("fp exception in delay slot\n");}
  wb_dirtys(i_regs->regmap_entry,i_regs->wasdirty);
  if(regs[i].regmap_entry[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
  emit_movimm(start+(i-ds)*4,EAX); // Get PC
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
  emit_far_jump(ds?fp_exception_ds:fp_exception);
}
3203
// Emit a read of GTE (COP2) data register 'copr' into host reg tl.
// Several registers are not kept in their architectural read format
// and are fixed up on read: some are sign-extended 16-bit, some are
// zero-extended 16-bit, SXYP (15) mirrors SXY2 (14), and IRGB/ORGB
// (28/29) are recomputed from the IR registers.  'temp' is a scratch
// host register for the 28/29 case.
static void cop2_get_dreg(u_int copr,signed char tl,signed char temp)
{
  switch (copr) {
    case 1:
    case 3:
    case 5:
    case 8:
    case 9:
    case 10:
    case 11:
      // 16-bit signed registers: sign-extend and write the fixed-up
      // value back to the register file
      emit_readword(&reg_cop2d[copr],tl);
      emit_signextend16(tl,tl);
      emit_writeword(tl,&reg_cop2d[copr]); // hmh
      break;
    case 7:
    case 16:
    case 17:
    case 18:
    case 19:
      // 16-bit unsigned registers: mask to the low halfword
      emit_readword(&reg_cop2d[copr],tl);
      emit_andimm(tl,0xffff,tl);
      emit_writeword(tl,&reg_cop2d[copr]);
      break;
    case 15:
      emit_readword(&reg_cop2d[14],tl); // SXY2
      emit_writeword(tl,&reg_cop2d[copr]);
      break;
    case 28:
    case 29:
      // IRGB/ORGB: recomputed from IR1-IR3 by a helper
      c2op_mfc2_29_assemble(tl,temp);
      break;
    default:
      emit_readword(&reg_cop2d[copr],tl);
      break;
  }
}
3240
// Emit a write of host reg sl to GTE (COP2) data register 'copr',
// applying the side effects some registers have on write: SXYP (15)
// pushes the SXY fifo, IRGB (28) unpacks 5-bit color fields into
// IR1-IR3, LZCS (30) computes the leading-zero count into LZCR (31),
// and LZCR itself is read-only.  'temp' is a scratch host register.
static void cop2_put_dreg(u_int copr,signed char sl,signed char temp)
{
  switch (copr) {
    case 15:
      // Push the screen-XY fifo: SXY0 <- SXY1 <- SXY2 <- new value
      emit_readword(&reg_cop2d[13],temp);  // SXY1
      emit_writeword(sl,&reg_cop2d[copr]);
      emit_writeword(temp,&reg_cop2d[12]); // SXY0
      emit_readword(&reg_cop2d[14],temp);  // SXY2
      emit_writeword(sl,&reg_cop2d[14]);
      emit_writeword(temp,&reg_cop2d[13]); // SXY1
      break;
    case 28:
      // IRGB: unpack three 5-bit fields into IR1/IR2/IR3 (<<7 each)
      emit_andimm(sl,0x001f,temp);
      emit_shlimm(temp,7,temp);
      emit_writeword(temp,&reg_cop2d[9]);
      emit_andimm(sl,0x03e0,temp);
      emit_shlimm(temp,2,temp);
      emit_writeword(temp,&reg_cop2d[10]);
      emit_andimm(sl,0x7c00,temp);
      emit_shrimm(temp,3,temp);
      emit_writeword(temp,&reg_cop2d[11]);
      emit_writeword(sl,&reg_cop2d[28]);
      break;
    case 30:
      // LZCS: count leading bits equal to the sign bit.
      // sl ^ (sl >> 31) turns that into a plain leading-zero count.
      emit_xorsar_imm(sl,sl,31,temp);
#if defined(HAVE_ARMV5) || defined(__aarch64__)
      emit_clz(temp,temp);
#else
      // Software CLZ loop for old 32-bit ARM; the (int)out arithmetic
      // below only runs on those 32-bit targets.
      emit_movs(temp,HOST_TEMPREG);
      emit_movimm(0,temp);
      emit_jeq((int)out+4*4);
      emit_addpl_imm(temp,1,temp);
      emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG);
      emit_jns((int)out-2*4);
#endif
      emit_writeword(sl,&reg_cop2d[30]);
      emit_writeword(temp,&reg_cop2d[31]);
      break;
    case 31:
      // LZCR is read-only
      break;
    default:
      emit_writeword(sl,&reg_cop2d[copr]);
      break;
  }
}
3286
// Assemble a GTE load/store: LWC2 (opcode 0x32) or SWC2 (opcode 0x3a).
// Generates the address, emits the fast in-RAM path with a stub for
// the slow path, performs the SMC invalid_code check for stores, and
// moves the value between memory and the GTE data register via FTEMP.
static void c2ls_assemble(int i,struct regstat *i_regs)
{
  int s,tl;                 // host regs: base address (rs1), data (FTEMP)
  int ar;                   // host reg holding the effective address
  int offset;
  int memtarget=0,c=0;      // c: const address; memtarget: const addr hits RAM
  void *jaddr2=NULL;        // slow-path branch, resolved via LOADW/STOREW stub
  enum stub_type type;
  int agr=AGEN1+(i&1);
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  u_int copr=(source[i]>>16)&0x1f;  // GTE data register number
  s=get_reg(i_regs->regmap,rs1[i]);
  tl=get_reg(i_regs->regmap,FTEMP);
  offset=imm[i];
  assert(rs1[i]>0);
  assert(tl>=0);

  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG)
    reglist&=~(1<<HOST_CCREG);

  // get the address
  if (opcode[i]==0x3a) { // SWC2
    ar=get_reg(i_regs->regmap,agr);
    if(ar<0) ar=get_reg(i_regs->regmap,-1);
    reglist|=1<<ar;
  } else { // LWC2
    ar=tl;
  }
  if(s>=0) c=(i_regs->wasconst>>s)&1;
  memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
  if (!offset&&!c&&s>=0) ar=s;  // base reg already holds the address
  assert(ar>=0);

  if (opcode[i]==0x3a) { // SWC2
    // Fetch the GTE register value into FTEMP before the store
    cop2_get_dreg(copr,tl,-1);
    type=STOREW_STUB;
  }
  else
    type=LOADW_STUB;

  if(c&&!memtarget) {
    // Constant address outside RAM: always go through the stub
    jaddr2=out;
    emit_jmp(0); // inline_readstub/inline_writestub?
  }
  else {
    if(!c) {
      // Dynamic address: emit the RAM range check / fastpath compare
      jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
    }
    else if(ram_offset&&memtarget) {
      host_tempreg_acquire();
      emit_addimm(ar,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    if (opcode[i]==0x32) { // LWC2
      int a=ar;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_readword_indexed(0,a,tl);
    }
    if (opcode[i]==0x3a) { // SWC2
      #ifdef DESTRUCTIVE_SHIFT
      if(!offset&&!c&&s>=0) emit_mov(s,ar);
      #endif
      int a=ar;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
  }
  if(fastio_reg_override==HOST_TEMPREG)
    host_tempreg_release();
  if(jaddr2)
    add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
  // Self-modifying-code check for stores (see storelr_assemble)
  if(opcode[i]==0x3a) // SWC2
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
#if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,ar,1);
#else
    emit_cmpmem_indexedsr12_imm(invalid_code,ar,1);
#endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[ar]);
    #else
    void *jaddr3 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
    #endif
  }
  if (opcode[i]==0x32) { // LWC2
    // Store the loaded value into the GTE register (with write fixups)
    host_tempreg_acquire();
    cop2_put_dreg(copr,tl,HOST_TEMPREG);
    host_tempreg_release();
  }
}
3385
// Assemble a GTE register move: MFC2 (opcode2 0), MTC2 (4),
// CFC2 (2) or CTC2 (6).  Data registers go through the
// cop2_get_dreg/cop2_put_dreg fixups; control registers are plain
// word accesses except for the few sign-extended ones and FLAG (31).
static void cop2_assemble(int i,struct regstat *i_regs)
{
  u_int copr=(source[i]>>11)&0x1f;
  signed char temp=get_reg(i_regs->regmap,-1);
  if (opcode2[i]==0) { // MFC2
    signed char tl=get_reg(i_regs->regmap,rt1[i]);
    if(tl>=0&&rt1[i]!=0)
      cop2_get_dreg(copr,tl,temp);
  }
  else if (opcode2[i]==4) { // MTC2
    signed char sl=get_reg(i_regs->regmap,rs1[i]);
    cop2_put_dreg(copr,sl,temp);
  }
  else if (opcode2[i]==2) // CFC2
  {
    signed char tl=get_reg(i_regs->regmap,rt1[i]);
    if(tl>=0&&rt1[i]!=0)
      emit_readword(&reg_cop2c[copr],tl);
  }
  else if (opcode2[i]==6) // CTC2
  {
    signed char sl=get_reg(i_regs->regmap,rs1[i]);
    switch(copr) {
      case 4:
      case 12:
      case 20:
      case 26:
      case 27:
      case 29:
      case 30:
        // 16-bit signed control registers: sign-extend on write
        emit_signextend16(sl,temp);
        break;
      case 31:
        // FLAG: recompute the error summary bit via a helper
        c2op_ctc2_31_assemble(sl,temp);
        break;
      default:
        temp=sl;  // no fixup needed; write the source value directly
        break;
    }
    emit_writeword(temp,&reg_cop2c[copr]);
    assert(sl>=0);
  }
}
3429
// Out-of-line slow path for SWL/SWR when the address is not in RAM:
// saves live registers, adjusts the cycle count and calls the C write
// handler (jump_handle_swl/swr), then restores and returns to the
// compiled code.  The #else branch is legacy (disabled) code that
// performed the read-modify-write inline via the memory handler tables.
static void do_unalignedwritestub(int n)
{
  assem_debug("do_unalignedwritestub %x\n",start+stubs[n].a*4);
  literal_pool(256);
  set_jump_target(stubs[n].addr, out);

  int i=stubs[n].a;                 // instruction index within the block
  struct regstat *i_regs=(struct regstat *)stubs[n].c;
  int addr=stubs[n].b;              // host reg holding the address
  u_int reglist=stubs[n].e;
  signed char *i_regmap=i_regs->regmap;
  int temp2=get_reg(i_regmap,FTEMP);
  int rt;
  rt=get_reg(i_regmap,rs2[i]);      // host reg holding the store data
  assert(rt>=0);
  assert(addr>=0);
  assert(opcode[i]==0x2a||opcode[i]==0x2e); // SWL/SWR only implemented
  reglist|=(1<<addr);
  reglist&=~(1<<temp2);

#if 1
  // don't bother with it and call write handler
  save_regs(reglist);
  pass_args(addr,rt);
  int cc=get_reg(i_regmap,CCREG);
  if(cc<0)
    emit_loadreg(CCREG,2);
  // Pass the updated cycle count in r2; the handler returns it in r0
  emit_addimm(cc<0?2:cc,CLOCK_ADJUST((int)stubs[n].d+1),2);
  emit_far_call((opcode[i]==0x2a?jump_handle_swl:jump_handle_swr));
  emit_addimm(0,-CLOCK_ADJUST((int)stubs[n].d+1),cc<0?2:cc);
  if(cc<0)
    emit_storereg(CCREG,2);
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
#else
  emit_andimm(addr,0xfffffffc,temp2);
  emit_writeword(temp2,&address);

  save_regs(reglist);
  emit_shrimm(addr,16,1);
  int cc=get_reg(i_regmap,CCREG);
  if(cc<0) {
    emit_loadreg(CCREG,2);
  }
  emit_movimm((u_int)readmem,0);
  emit_addimm(cc<0?2:cc,2*stubs[n].d+2,2);
  emit_call((int)&indirect_jump_indexed);
  restore_regs(reglist);

  emit_readword(&readmem_dword,temp2);
  int temp=addr; //hmh
  emit_shlimm(addr,3,temp);
  emit_andimm(temp,24,temp);
  if (opcode[i]==0x2a) // SWL
    emit_xorimm(temp,24,temp);
  emit_movimm(-1,HOST_TEMPREG);
  if (opcode[i]==0x2a) { // SWL
    emit_bic_lsr(temp2,HOST_TEMPREG,temp,temp2);
    emit_orrshr(rt,temp,temp2);
  }else{
    emit_bic_lsl(temp2,HOST_TEMPREG,temp,temp2);
    emit_orrshl(rt,temp,temp2);
  }
  emit_readword(&address,addr);
  emit_writeword(temp2,&word);
  //save_regs(reglist); // don't need to, no state changes
  emit_shrimm(addr,16,1);
  emit_movimm((u_int)writemem,0);
  //emit_call((int)&indirect_jump_indexed);
  emit_mov(15,14);
  emit_readword_dualindexedx4(0,1,15);
  emit_readword(&Count,HOST_TEMPREG);
  emit_readword(&next_interupt,2);
  emit_addimm(HOST_TEMPREG,-2*stubs[n].d-2,HOST_TEMPREG);
  emit_writeword(2,&last_count);
  emit_sub(HOST_TEMPREG,2,cc<0?HOST_TEMPREG:cc);
  if(cc<0) {
    emit_storereg(CCREG,HOST_TEMPREG);
  }
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
#endif
}
3513
// Fallback when the architecture-specific backend does not provide
// multdiv_assemble: MULT/DIV assembly is mandatory, so fail loudly.
#ifndef multdiv_assemble
void multdiv_assemble(int i,struct regstat *i_regs)
{
  printf("Need multdiv_assemble for this architecture.\n");
  abort();
}
#endif
3521
3522 static void mov_assemble(int i,struct regstat *i_regs)
3523 {
3524   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3525   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3526   if(rt1[i]) {
3527     signed char sl,tl;
3528     tl=get_reg(i_regs->regmap,rt1[i]);
3529     //assert(tl>=0);
3530     if(tl>=0) {
3531       sl=get_reg(i_regs->regmap,rs1[i]);
3532       if(sl>=0) emit_mov(sl,tl);
3533       else emit_loadreg(rs1[i],tl);
3534     }
3535   }
3536 }
3537
// call interpreter, exception handler, things that change pc/regs/cycles ...
// Syncs psxRegs.pc and psxRegs.cycle from the JIT state, far-calls
// 'func' (which may rewrite pc/regs/cycles arbitrarily), then leaves
// through jump_to_new_pc.  Never returns to the compiled block, so it
// must not be used in a delay slot.
static void call_c_cpu_handler(int i, const struct regstat *i_regs, u_int pc, void *func)
{
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  (void)ccreg;

  emit_movimm(pc,3); // Get PC
  emit_readword(&last_count,2);
  emit_writeword(3,&psxRegs.pc);
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
  // cycle = last_count + CCREG (+ ccadj)
  emit_add(2,HOST_CCREG,2);
  emit_writeword(2,&psxRegs.cycle);
  emit_far_call(func);
  emit_far_jump(jump_to_new_pc);
}
3555
// SYSCALL: raise the syscall exception (cause code 0x20) via the
// C-level psxException handler.
static void syscall_assemble(int i,struct regstat *i_regs)
{
  emit_movimm(0x20,0); // cause code
  emit_movimm(0,1);    // not in delay slot
  call_c_cpu_handler(i,i_regs,start+i*4,psxException);
}
3562
// HLE BIOS call: look up the handler for the HLE code embedded in the
// instruction and invoke it through the C cpu-handler path.  An
// out-of-range code falls back to psxNULL.
static void hlecall_assemble(int i,struct regstat *i_regs)
{
  void *hlefunc = psxNULL;
  uint32_t hleCode = source[i] & 0x03ffffff;
  if (hleCode < ARRAY_SIZE(psxHLEt))
    hlefunc = psxHLEt[hleCode];

  // resume at the following instruction after the HLE call
  call_c_cpu_handler(i,i_regs,start+i*4+4,hlefunc);
}
3572
// Hand the instruction to the interpreter (execI) — used for opcodes
// the recompiler does not translate.
static void intcall_assemble(int i,struct regstat *i_regs)
{
  call_c_cpu_handler(i,i_regs,start+i*4,execI);
}
3577
3578 static void speculate_mov(int rs,int rt)
3579 {
3580   if(rt!=0) {
3581     smrv_strong_next|=1<<rt;
3582     smrv[rt]=smrv[rs];
3583   }
3584 }
3585
3586 static void speculate_mov_weak(int rs,int rt)
3587 {
3588   if(rt!=0) {
3589     smrv_weak_next|=1<<rt;
3590     smrv[rt]=smrv[rs];
3591   }
3592 }
3593
// Track speculative ("strong"/"weak" confidence) guesses of the MIPS
// register values across the block, used to predict load/store target
// regions.  smrv[] holds the guessed values; the _next bitmaps become
// smrv_strong/smrv_weak for the following instruction.
static void speculate_register_values(int i)
{
  if(i==0) {
    // Seed from the real register file at block entry
    memcpy(smrv,psxRegs.GPR.r,sizeof(smrv));
    // gp,sp are likely to stay the same throughout the block
    smrv_strong_next=(1<<28)|(1<<29)|(1<<30);
    smrv_weak_next=~smrv_strong_next;
    //printf(" llr %08x\n", smrv[4]);
  }
  smrv_strong=smrv_strong_next;
  smrv_weak=smrv_weak_next;
  switch(itype[i]) {
    case ALU:
      // Propagate the guess from whichever source operand has one
      if     ((smrv_strong>>rs1[i])&1) speculate_mov(rs1[i],rt1[i]);
      else if((smrv_strong>>rs2[i])&1) speculate_mov(rs2[i],rt1[i]);
      else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]);
      else if((smrv_weak>>rs2[i])&1) speculate_mov_weak(rs2[i],rt1[i]);
      else {
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
    case SHIFTIMM:
      smrv_strong_next&=~(1<<rt1[i]);
      smrv_weak_next&=~(1<<rt1[i]);
      // fallthrough
    case IMM16:
      if(rt1[i]&&is_const(&regs[i],rt1[i])) {
        // Constant propagation gives an exact value: strong guess
        int value,hr=get_reg(regs[i].regmap,rt1[i]);
        if(hr>=0) {
          if(get_final_value(hr,i,&value))
               smrv[rt1[i]]=value;
          else smrv[rt1[i]]=constmap[i][hr];
          smrv_strong_next|=1<<rt1[i];
        }
      }
      else {
        if     ((smrv_strong>>rs1[i])&1) speculate_mov(rs1[i],rt1[i]);
        else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]);
      }
      break;
    case LOAD:
      if(start<0x2000&&(rt1[i]==26||(smrv[rt1[i]]>>24)==0xa0)) {
        // special case for BIOS
        smrv[rt1[i]]=0xa0000000;
        smrv_strong_next|=1<<rt1[i];
        break;
      }
      // fallthrough
    case SHIFT:
    case LOADLR:
    case MOV:
      // Result is unpredictable: drop any guess for the destination
      smrv_strong_next&=~(1<<rt1[i]);
      smrv_weak_next&=~(1<<rt1[i]);
      break;
    case COP0:
    case COP2:
      if(opcode2[i]==0||opcode2[i]==2) { // MFC/CFC
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
    case C2LS:
      if (opcode[i]==0x32) { // LWC2
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
  }
#if 0
  int r=4;
  printf("x %08x %08x %d %d c %08x %08x\n",smrv[r],start+i*4,
    ((smrv_strong>>r)&1),(smrv_weak>>r)&1,regs[i].isconst,regs[i].wasconst);
#endif
}
3669
// Assemble the instruction in a branch delay slot.  Sets is_delayslot
// around the dispatch so the per-type assemblers (e.g. cop0_assemble)
// can take their delay-slot paths.  Branches inside a delay slot are
// not supported and only produce a diagnostic.
static void ds_assemble(int i,struct regstat *i_regs)
{
  speculate_register_values(i);
  is_delayslot=1;
  switch(itype[i]) {
    case ALU:
      alu_assemble(i,i_regs);break;
    case IMM16:
      imm16_assemble(i,i_regs);break;
    case SHIFT:
      shift_assemble(i,i_regs);break;
    case SHIFTIMM:
      shiftimm_assemble(i,i_regs);break;
    case LOAD:
      load_assemble(i,i_regs);break;
    case LOADLR:
      loadlr_assemble(i,i_regs);break;
    case STORE:
      store_assemble(i,i_regs);break;
    case STORELR:
      storelr_assemble(i,i_regs);break;
    case COP0:
      cop0_assemble(i,i_regs);break;
    case COP1:
      cop1_assemble(i,i_regs);break;
    case C1LS:
      c1ls_assemble(i,i_regs);break;
    case COP2:
      cop2_assemble(i,i_regs);break;
    case C2LS:
      c2ls_assemble(i,i_regs);break;
    case C2OP:
      c2op_assemble(i,i_regs);break;
    case MULTDIV:
      multdiv_assemble(i,i_regs);break;
    case MOV:
      mov_assemble(i,i_regs);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  is_delayslot=0;
}
3719
3720 // Is the branch target a valid internal jump?
3721 static int internal_branch(int addr)
3722 {
3723   if(addr&1) return 0; // Indirect (register) jump
3724   if(addr>=start && addr<start+slen*4-4)
3725   {
3726     return 1;
3727   }
3728   return 0;
3729 }
3730
// Transition from register map 'pre' to 'entry':
// first write back any dirty value whose MIPS register loses its host
// register (unless the register is unneeded per 'u'), then move values
// that merely change host registers, without touching memory.
static void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t u)
{
  int hr;
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(pre[hr]!=entry[hr]) {
        if(pre[hr]>=0) {
          if((dirty>>hr)&1) {
            if(get_reg(entry,pre[hr])<0) {
              // No host register holds this value anymore: spill it,
              // unless the value is known to be unneeded downstream
              assert(pre[hr]<64);
              if(!((u>>pre[hr])&1))
                emit_storereg(pre[hr],hr);
            }
          }
        }
      }
    }
  }
  // Move from one register to another (no writeback)
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(pre[hr]!=entry[hr]) {
        if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
          int nr;
          if((nr=get_reg(entry,pre[hr]))>=0) {
            emit_mov(hr,nr);
          }
        }
      }
    }
  }
}
3763
3764 // Load the specified registers
3765 // This only loads the registers given as arguments because
3766 // we don't want to load things that will be overwritten
3767 static void load_regs(signed char entry[],signed char regmap[],int rs1,int rs2)
3768 {
3769   int hr;
3770   // Load 32-bit regs
3771   for(hr=0;hr<HOST_REGS;hr++) {
3772     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3773       if(entry[hr]!=regmap[hr]) {
3774         if(regmap[hr]==rs1||regmap[hr]==rs2)
3775         {
3776           if(regmap[hr]==0) {
3777             emit_zeroreg(hr);
3778           }
3779           else
3780           {
3781             emit_loadreg(regmap[hr],hr);
3782           }
3783         }
3784       }
3785     }
3786   }
3787 }
3788
// Load registers prior to the start of a loop
// so that they are not loaded within the loop
static void loop_preload(signed char pre[],signed char entry[])
{
  int hr;
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(pre[hr]!=entry[hr]) {
        if(entry[hr]>=0) {
          // Only preload values not already present in some host reg
          if(get_reg(pre,entry[hr])<0) {
            assem_debug("loop preload:\n");
            //printf("loop preload: %d\n",hr);
            if(entry[hr]==0) {
              emit_zeroreg(hr);
            }
            else if(entry[hr]<TEMPREG)
            {
              emit_loadreg(entry[hr],hr);
            }
            // NOTE(review): this branch duplicates the one above for
            // the 64..64+TEMPREG range (a leftover of the removed
            // 64-bit register halves); behavior is identical
            else if(entry[hr]-64<TEMPREG)
            {
              emit_loadreg(entry[hr],hr);
            }
          }
        }
      }
    }
  }
}
3818
// Generate address for load/store instruction
// goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
// Computes rs1 + imm into the appropriate host register, folding
// constant bases at assembly time and masking off the low bits for
// the unaligned LWL/LWR (and LDL/LDR) forms.  Also precomputes the
// constant address for the next instruction when possible.
void address_generation(int i,struct regstat *i_regs,signed char entry[])
{
  if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
    int ra=-1;                 // host reg that will receive the address
    int agr=AGEN1+(i&1);
    if(itype[i]==LOAD) {
      // Plain loads compute the address in the destination register
      ra=get_reg(i_regs->regmap,rt1[i]);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
      assert(ra>=0);
    }
    if(itype[i]==LOADLR) {
      ra=get_reg(i_regs->regmap,FTEMP);
    }
    if(itype[i]==STORE||itype[i]==STORELR) {
      ra=get_reg(i_regs->regmap,agr);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
    }
    if(itype[i]==C1LS||itype[i]==C2LS) {
      if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
        ra=get_reg(i_regs->regmap,FTEMP);
      else { // SWC1/SDC1/SWC2/SDC2
        ra=get_reg(i_regs->regmap,agr);
        if(ra<0) ra=get_reg(i_regs->regmap,-1);
      }
    }
    int rs=get_reg(i_regs->regmap,rs1[i]);
    if(ra>=0) {
      int offset=imm[i];
      int c=(i_regs->wasconst>>rs)&1;  // base register holds a known constant
      if(rs1[i]==0) {
        // Using r0 as a base address
        if(!entry||entry[ra]!=agr) {
          if (opcode[i]==0x22||opcode[i]==0x26) {
            emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
          }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
            emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
          }else{
            emit_movimm(offset,ra);
          }
        } // else did it in the previous cycle
      }
      else if(rs<0) {
        // Base register not in a host register: load it into ra
        if(!entry||entry[ra]!=rs1[i])
          emit_loadreg(rs1[i],ra);
        //if(!entry||entry[ra]!=rs1[i])
        //  printf("poor load scheduling!\n");
      }
      else if(c) {
        // Constant base: materialize the full effective address now
        if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
          if(!entry||entry[ra]!=agr) {
            if (opcode[i]==0x22||opcode[i]==0x26) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
            }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
            }else{
              emit_movimm(constmap[i][rs]+offset,ra);
              regs[i].loadedconst|=1<<ra;
            }
          } // else did it in the previous cycle
        } // else load_consts already did it
      }
      // Non-constant base with an offset: add it in
      if(offset&&!c&&rs1[i]) {
        if(rs>=0) {
          emit_addimm(rs,offset,ra);
        }else{
          emit_addimm(ra,offset,ra);
        }
      }
    }
  }
  // Preload constants for next instruction
  if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
    int agr,ra;
    // Actual address
    agr=AGEN1+((i+1)&1);
    ra=get_reg(i_regs->regmap,agr);
    if(ra>=0) {
      int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
      int offset=imm[i+1];
      int c=(regs[i+1].wasconst>>rs)&1;
      if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(constmap[i+1][rs]+offset,ra);
          regs[i+1].loadedconst|=1<<ra;
        }
      }
      else if(rs1[i+1]==0) {
        // Using r0 as a base address
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(offset,ra);
        }
      }
    }
  }
}
3924
// Find the last constant value that host register hr will hold while
// it keeps mapping the same MIPS register (so a single emit_movimm can
// cover the whole run).  Stores the value in *value; returns 1 when
// the value should be loaded now, 0 when it is unneeded downstream.
static int get_final_value(int hr, int i, int *value)
{
  int reg=regs[i].regmap[hr];
  // Scan forward while hr keeps the same register, stays constant,
  // and no branch target interrupts the run
  while(i<slen-1) {
    if(regs[i+1].regmap[hr]!=reg) break;
    if(!((regs[i+1].isconst>>hr)&1)) break;
    if(bt[i+1]) break;
    i++;
  }
  if(i<slen-1) {
    if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
      *value=constmap[i][hr];
      return 1;
    }
    if(!bt[i+1]) {
      if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
        // Load in delay slot, out-of-order execution
        if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
        {
          // Precompute load address
          *value=constmap[i][hr]+imm[i+2];
          return 1;
        }
      }
      if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
      {
        // Precompute load address
        *value=constmap[i][hr]+imm[i+1];
        //printf("c=%x imm=%lx\n",(long)constmap[i][hr],imm[i+1]);
        return 1;
      }
    }
  }
  *value=constmap[i][hr];
  //printf("c=%lx\n",(long)constmap[i][hr]);
  if(i==slen-1) return 1;
  assert(reg < 64);
  // Skip the load entirely if the register is unneeded afterwards
  return !((unneeded_reg[i+1]>>reg)&1);
}
3964
// Load registers with known constants
// Emits immediate loads for every host register that is mapped to a
// constant MIPS register and has not already been loaded (tracked via
// regs[].loadedconst, which is propagated from the previous
// instruction when the mapping is unchanged).  When another register
// already holds a similar value, a cheaper reg-relative move is used.
static void load_consts(signed char pre[],signed char regmap[],int i)
{
  int hr,hr2;
  // propagate loaded constant flags
  if(i==0||bt[i])
    regs[i].loadedconst=0;  // block entry / branch target: nothing loaded yet
  else {
    for(hr=0;hr<HOST_REGS;hr++) {
      // Carry the flag over when this host reg kept the same constant
      // mapping across the previous instruction
      if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
         &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
      {
        regs[i].loadedconst|=1<<hr;
      }
    }
  }
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(!((regs[i].loadedconst>>hr)&1)) {
        assert(regmap[hr]<64);
        if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
          int value,similar=0;
          if(get_final_value(hr,i,&value)) {
            // see if some other register has similar value
            for(hr2=0;hr2<HOST_REGS;hr2++) {
              if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
                if(is_similar_value(value,constmap[i][hr2])) {
                  similar=1;
                  break;
                }
              }
            }
            if(similar) {
              // Derive the constant from hr2 instead of a full movimm
              int value2;
              if(get_final_value(hr2,i,&value2)) // is this needed?
                emit_movimm_from(value2,hr2,value,hr);
              else
                emit_movimm(value,hr);
            }
            else if(value==0) {
              emit_zeroreg(hr);
            }
            else {
              emit_movimm(value,hr);
            }
          }
          regs[i].loadedconst|=1<<hr;
        }
      }
    }
  }
}
4019
4020 void load_all_consts(signed char regmap[], u_int dirty, int i)
4021 {
4022   int hr;
4023   // Load 32-bit regs
4024   for(hr=0;hr<HOST_REGS;hr++) {
4025     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4026       assert(regmap[hr] < 64);
4027       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
4028         int value=constmap[i][hr];
4029         if(value==0) {
4030           emit_zeroreg(hr);
4031         }
4032         else {
4033           emit_movimm(value,hr);
4034         }
4035       }
4036     }
4037   }
4038 }
4039
4040 // Write out all dirty registers (except cycle count)
4041 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty)
4042 {
4043   int hr;
4044   for(hr=0;hr<HOST_REGS;hr++) {
4045     if(hr!=EXCLUDE_REG) {
4046       if(i_regmap[hr]>0) {
4047         if(i_regmap[hr]!=CCREG) {
4048           if((i_dirty>>hr)&1) {
4049             assert(i_regmap[hr]<64);
4050             emit_storereg(i_regmap[hr],hr);
4051           }
4052         }
4053       }
4054     }
4055   }
4056 }
4057
4058 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4059 // This writes the registers not written by store_regs_bt
4060 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr)
4061 {
4062   int hr;
4063   int t=(addr-start)>>2;
4064   for(hr=0;hr<HOST_REGS;hr++) {
4065     if(hr!=EXCLUDE_REG) {
4066       if(i_regmap[hr]>0) {
4067         if(i_regmap[hr]!=CCREG) {
4068           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1)) {
4069             if((i_dirty>>hr)&1) {
4070               assert(i_regmap[hr]<64);
4071               emit_storereg(i_regmap[hr],hr);
4072             }
4073           }
4074         }
4075       }
4076     }
4077   }
4078 }
4079
4080 // Load all registers (except cycle count)
4081 void load_all_regs(signed char i_regmap[])
4082 {
4083   int hr;
4084   for(hr=0;hr<HOST_REGS;hr++) {
4085     if(hr!=EXCLUDE_REG) {
4086       if(i_regmap[hr]==0) {
4087         emit_zeroreg(hr);
4088       }
4089       else
4090       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4091       {
4092         emit_loadreg(i_regmap[hr],hr);
4093       }
4094     }
4095   }
4096 }
4097
4098 // Load all current registers also needed by next instruction
4099 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4100 {
4101   int hr;
4102   for(hr=0;hr<HOST_REGS;hr++) {
4103     if(hr!=EXCLUDE_REG) {
4104       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4105         if(i_regmap[hr]==0) {
4106           emit_zeroreg(hr);
4107         }
4108         else
4109         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4110         {
4111           emit_loadreg(i_regmap[hr],hr);
4112         }
4113       }
4114     }
4115   }
4116 }
4117
4118 // Load all regs, storing cycle count if necessary
4119 void load_regs_entry(int t)
4120 {
4121   int hr;
4122   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4123   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4124   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4125     emit_storereg(CCREG,HOST_CCREG);
4126   }
4127   // Load 32-bit regs
4128   for(hr=0;hr<HOST_REGS;hr++) {
4129     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4130       if(regs[t].regmap_entry[hr]==0) {
4131         emit_zeroreg(hr);
4132       }
4133       else if(regs[t].regmap_entry[hr]!=CCREG)
4134       {
4135         emit_loadreg(regs[t].regmap_entry[hr],hr);
4136       }
4137     }
4138   }
4139 }
4140
4141 // Store dirty registers prior to branch
4142 void store_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4143 {
4144   if(internal_branch(addr))
4145   {
4146     int t=(addr-start)>>2;
4147     int hr;
4148     for(hr=0;hr<HOST_REGS;hr++) {
4149       if(hr!=EXCLUDE_REG) {
4150         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4151           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1)) {
4152             if((i_dirty>>hr)&1) {
4153               assert(i_regmap[hr]<64);
4154               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4155                 emit_storereg(i_regmap[hr],hr);
4156             }
4157           }
4158         }
4159       }
4160     }
4161   }
4162   else
4163   {
4164     // Branch out of this block, write out all dirty regs
4165     wb_dirtys(i_regmap,i_dirty);
4166   }
4167 }
4168
4169 // Load all needed registers for branch target
4170 static void load_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4171 {
4172   //if(addr>=start && addr<(start+slen*4))
4173   if(internal_branch(addr))
4174   {
4175     int t=(addr-start)>>2;
4176     int hr;
4177     // Store the cycle count before loading something else
4178     if(i_regmap[HOST_CCREG]!=CCREG) {
4179       assert(i_regmap[HOST_CCREG]==-1);
4180     }
4181     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4182       emit_storereg(CCREG,HOST_CCREG);
4183     }
4184     // Load 32-bit regs
4185     for(hr=0;hr<HOST_REGS;hr++) {
4186       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4187         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4188           if(regs[t].regmap_entry[hr]==0) {
4189             emit_zeroreg(hr);
4190           }
4191           else if(regs[t].regmap_entry[hr]!=CCREG)
4192           {
4193             emit_loadreg(regs[t].regmap_entry[hr],hr);
4194           }
4195         }
4196       }
4197     }
4198   }
4199 }
4200
// Decide whether the register state (map + dirty bits) at a branch source
// already matches what the target at addr expects on entry, so the branch
// can jump there directly without register shuffling.
// Returns 1 on match, 0 if loads/stores would be required first.
static int match_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
{
  if(addr>=start && addr<start+slen*4-4)
  {
    // Target is inside this block (excluding the final instruction)
    int t=(addr-start)>>2;
    int hr;
    if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]!=regs[t].regmap_entry[hr])
        {
          // The |64 form is a leftover from 64-bit register halves; for
          // entries in 0..63 it is equivalent to entry<TEMPREG.
          if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
          {
            return 0;
          }
          else
          if((i_dirty>>hr)&1)
          {
            if(i_regmap[hr]<TEMPREG)
            {
              // Dirty value only blocks the match if the target reads it.
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
                return 0;
            }
            else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
            {
              assert(0);
            }
          }
        }
        else // Same register but is it 32-bit or dirty?
        if(i_regmap[hr]>=0)
        {
          if(!((regs[t].dirty>>hr)&1))
          {
            if((i_dirty>>hr)&1)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
              {
                //printf("%x: dirty no match\n",addr);
                return 0;
              }
            }
          }
        }
      }
    }
    // Delay slots are not valid branch targets
    //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP)) return 0;
    // Delay slots require additional processing, so do not match
    if(is_ds[t]) return 0;
  }
  else
  {
    // Target is outside this block: everything mapped must be clean
    // (except the cycle count in its dedicated host register).
    int hr;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]>=0)
        {
          if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
          {
            if((i_dirty>>hr)&1)
            {
              return 0;
            }
          }
        }
      }
    }
  }
  return 1;
}
4276
#ifdef DRC_DBG
// Debug builds only: emit code that saves all allocated host registers,
// records the guest PC of instruction i in pcaddr, calls do_insn_cmp (which
// compares dynarec state against a reference), then restores the registers.
static void drc_dbg_emit_do_cmp(int i)
{
  extern void do_insn_cmp();
  //extern int cycle;
  u_int hr,reglist=0;

  // Build a bitmask of host registers currently holding guest values.
  for(hr=0;hr<HOST_REGS;hr++)
    if(regs[i].regmap[hr]>=0) reglist|=1<<hr;
  save_regs(reglist);
  emit_movimm(start+i*4,0);
  emit_writeword(0,&pcaddr);
  emit_far_call(do_insn_cmp);
  //emit_readword(&cycle,0);
  //emit_addimm(0,2,0);
  //emit_writeword(0,&cycle);
  (void)get_reg2;  // suppress unused-function warning in this config
  restore_regs(reglist);
}
#else
#define drc_dbg_emit_do_cmp(x)
#endif
4299
// Used when a branch jumps into the delay slot of another branch
// Assembles a dedicated entry point that executes just the delay-slot
// instruction at ba[i] and then branches to ba[i]+4 (asserted to be an
// internal target).
static void ds_assemble_entry(int i)
{
  int t=(ba[i]-start)>>2;
  if (!instr_addr[t])
    instr_addr[t] = out;
  assem_debug("Assemble delay slot at %x\n",ba[i]);
  assem_debug("<->\n");
  drc_dbg_emit_do_cmp(t);
  // Cycle count arrives in its host register but the instruction's map
  // doesn't keep it there: write it back first.
  if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty);
  load_regs(regs[t].regmap_entry,regs[t].regmap,rs1[t],rs2[t]);
  address_generation(t,&regs[t],regs[t].regmap_entry);
  // Stores (incl. coprocessor stores matched by the opcode masks) also
  // need the invalid-code pointer (INVCP) loaded.
  if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
    load_regs(regs[t].regmap_entry,regs[t].regmap,INVCP,INVCP);
  is_delayslot=0;
  // Dispatch to the per-type assembler for this single instruction.
  switch(itype[t]) {
    case ALU:
      alu_assemble(t,&regs[t]);break;
    case IMM16:
      imm16_assemble(t,&regs[t]);break;
    case SHIFT:
      shift_assemble(t,&regs[t]);break;
    case SHIFTIMM:
      shiftimm_assemble(t,&regs[t]);break;
    case LOAD:
      load_assemble(t,&regs[t]);break;
    case LOADLR:
      loadlr_assemble(t,&regs[t]);break;
    case STORE:
      store_assemble(t,&regs[t]);break;
    case STORELR:
      storelr_assemble(t,&regs[t]);break;
    case COP0:
      cop0_assemble(t,&regs[t]);break;
    case COP1:
      cop1_assemble(t,&regs[t]);break;
    case C1LS:
      c1ls_assemble(t,&regs[t]);break;
    case COP2:
      cop2_assemble(t,&regs[t]);break;
    case C2LS:
      c2ls_assemble(t,&regs[t]);break;
    case C2OP:
      c2op_assemble(t,&regs[t]);break;
    case MULTDIV:
      multdiv_assemble(t,&regs[t]);break;
    case MOV:
      mov_assemble(t,&regs[t]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Shuffle register state to match the instruction after the delay slot
  // and jump there (must be an internal target).
  store_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
  load_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
  if(internal_branch(ba[i]+4))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  assert(internal_branch(ba[i]+4));
  add_to_linker(out,ba[i]+4,internal_branch(ba[i]+4));
  emit_jmp(0);
}
4369
// Emit an external (out-of-block) jump at addr to guest address target,
// routed through the standard dynamic linker entry.
static void emit_extjump(void *addr, u_int target)
{
  emit_extjump2(addr, target, dyna_linker);
}
4374
// Delay-slot variant of emit_extjump: routes through dyna_linker_ds so the
// linker knows the target is a branch delay slot.
static void emit_extjump_ds(void *addr, u_int target)
{
  emit_extjump2(addr, target, dyna_linker_ds);
}
4379
// Load 2 immediates optimizing for small code size
// Loads imm1 into rt1 normally, then loads imm2 into rt2 via
// emit_movimm_from, which presumably derives imm2 from the value just
// placed in rt1 when that is cheaper (backend-dependent -- see emitter).
static void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2)
{
  emit_movimm(imm1,rt1);
  emit_movimm_from(imm1,rt1,imm2,rt2);
}
4386
// Emit the cycle-count check for branch instruction i: add the accumulated
// cycles to HOST_CCREG and branch to a CC_STUB when the time slice expires.
// *adj returns the cycle adjustment already accounted for at the target's
// entry point, so the caller can compensate on the taken path.
void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
{
  int count;
  void *jaddr;
  void *idle=NULL;
  int t=0;
  if(itype[i]==RJUMP)
  {
    *adj=0;
  }
  //if(ba[i]>=start && ba[i]<(start+slen*4))
  if(internal_branch(ba[i]))
  {
    t=(ba[i]-start)>>2;
    if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
    else *adj=ccadj[t];
  }
  else
  {
    *adj=0;
  }
  count=ccadj[i];
  // A taken branch to itself with an all-zero (NOP) delay slot is an idle
  // loop: burn the remaining cycles instead of spinning.
  if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
    // Idle loop
    if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG); // NOTE(review): odd-count pre-adjust -- confirm intent
    idle=out;
    //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
    emit_andimm(HOST_CCREG,3,HOST_CCREG);
    jaddr=out;
    emit_jmp(0);
  }
  else if(*adj==0||invert) {
    int cycles=CLOCK_ADJUST(count+2);
    // faster loop HACK
    // For short backward branches, charge a reduced cycle count.
    if (t&&*adj) {
      int rel=t-i;
      if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
        cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
    }
    emit_addimm_and_set_flags(cycles,HOST_CCREG);
    jaddr=out;
    emit_jns(0);
  }
  else
  {
    // Target pre-counts *adj cycles; just compare without modifying CC.
    emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
    jaddr=out;
    emit_jns(0);
  }
  add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
}
4438
4439 static void do_ccstub(int n)
4440 {
4441   literal_pool(256);
4442   assem_debug("do_ccstub %x\n",start+(u_int)stubs[n].b*4);
4443   set_jump_target(stubs[n].addr, out);
4444   int i=stubs[n].b;
4445   if(stubs[n].d==NULLDS) {
4446     // Delay slot instruction is nullified ("likely" branch)
4447     wb_dirtys(regs[i].regmap,regs[i].dirty);
4448   }
4449   else if(stubs[n].d!=TAKEN) {
4450     wb_dirtys(branch_regs[i].regmap,branch_regs[i].dirty);
4451   }
4452   else {
4453     if(internal_branch(ba[i]))
4454       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4455   }
4456   if(stubs[n].c!=-1)
4457   {
4458     // Save PC as return address
4459     emit_movimm(stubs[n].c,EAX);
4460     emit_writeword(EAX,&pcaddr);
4461   }
4462   else
4463   {
4464     // Return address depends on which way the branch goes
4465     if(itype[i]==CJUMP||itype[i]==SJUMP)
4466     {
4467       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4468       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4469       if(rs1[i]==0)
4470       {
4471         s1l=s2l;
4472         s2l=-1;
4473       }
4474       else if(rs2[i]==0)
4475       {
4476         s2l=-1;
4477       }
4478       assert(s1l>=0);
4479       #ifdef DESTRUCTIVE_WRITEBACK
4480       if(rs1[i]) {
4481         if((branch_regs[i].dirty>>s1l)&&1)
4482           emit_loadreg(rs1[i],s1l);
4483       }
4484       else {
4485         if((branch_regs[i].dirty>>s1l)&1)
4486           emit_loadreg(rs2[i],s1l);
4487       }
4488       if(s2l>=0)
4489         if((branch_regs[i].dirty>>s2l)&1)
4490           emit_loadreg(rs2[i],s2l);
4491       #endif
4492       int hr=0;
4493       int addr=-1,alt=-1,ntaddr=-1;
4494       while(hr<HOST_REGS)
4495       {
4496         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4497            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4498            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4499         {
4500           addr=hr++;break;
4501         }
4502         hr++;
4503       }
4504       while(hr<HOST_REGS)
4505       {
4506         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4507            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4508            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4509         {
4510           alt=hr++;break;
4511         }
4512         hr++;
4513       }
4514       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4515       {
4516         while(hr<HOST_REGS)
4517         {
4518           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4519              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4520              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4521           {
4522             ntaddr=hr;break;
4523           }
4524           hr++;
4525         }
4526         assert(hr<HOST_REGS);
4527       }
4528       if((opcode[i]&0x2f)==4) // BEQ
4529       {
4530         #ifdef HAVE_CMOV_IMM
4531         if(s2l>=0) emit_cmp(s1l,s2l);
4532         else emit_test(s1l,s1l);
4533         emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4534         #else
4535         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4536         if(s2l>=0) emit_cmp(s1l,s2l);
4537         else emit_test(s1l,s1l);
4538         emit_cmovne_reg(alt,addr);
4539         #endif
4540       }
4541       if((opcode[i]&0x2f)==5) // BNE
4542       {
4543         #ifdef HAVE_CMOV_IMM
4544         if(s2l>=0) emit_cmp(s1l,s2l);
4545         else emit_test(s1l,s1l);
4546         emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4547         #else
4548         emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4549         if(s2l>=0) emit_cmp(s1l,s2l);
4550         else emit_test(s1l,s1l);
4551         emit_cmovne_reg(alt,addr);
4552         #endif
4553       }
4554       if((opcode[i]&0x2f)==6) // BLEZ
4555       {
4556         //emit_movimm(ba[i],alt);
4557         //emit_movimm(start+i*4+8,addr);
4558         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4559         emit_cmpimm(s1l,1);
4560         emit_cmovl_reg(alt,addr);
4561       }
4562       if((opcode[i]&0x2f)==7) // BGTZ
4563       {
4564         //emit_movimm(ba[i],addr);
4565         //emit_movimm(start+i*4+8,ntaddr);
4566         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4567         emit_cmpimm(s1l,1);
4568         emit_cmovl_reg(ntaddr,addr);
4569       }
4570       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4571       {
4572         //emit_movimm(ba[i],alt);
4573         //emit_movimm(start+i*4+8,addr);
4574         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4575         emit_test(s1l,s1l);
4576         emit_cmovs_reg(alt,addr);
4577       }
4578       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4579       {
4580         //emit_movimm(ba[i],addr);
4581         //emit_movimm(start+i*4+8,alt);
4582         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4583         emit_test(s1l,s1l);
4584         emit_cmovs_reg(alt,addr);
4585       }
4586       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4587         if(source[i]&0x10000) // BC1T
4588         {
4589           //emit_movimm(ba[i],alt);
4590           //emit_movimm(start+i*4+8,addr);
4591           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4592           emit_testimm(s1l,0x800000);
4593           emit_cmovne_reg(alt,addr);
4594         }
4595         else // BC1F
4596         {
4597           //emit_movimm(ba[i],addr);
4598           //emit_movimm(start+i*4+8,alt);
4599           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4600           emit_testimm(s1l,0x800000);
4601           emit_cmovne_reg(alt,addr);
4602         }
4603       }
4604       emit_writeword(addr,&pcaddr);
4605     }
4606     else
4607     if(itype[i]==RJUMP)
4608     {
4609       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4610       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4611         r=get_reg(branch_regs[i].regmap,RTEMP);
4612       }
4613       emit_writeword(r,&pcaddr);
4614     }
4615     else {SysPrintf("Unknown branch type in do_ccstub\n");abort();}
4616   }
4617   // Update cycle count
4618   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4619   if(stubs[n].a) emit_addimm(HOST_CCREG,CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4620   emit_far_call(cc_interrupt);
4621   if(stubs[n].a) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4622   if(stubs[n].d==TAKEN) {
4623     if(internal_branch(ba[i]))
4624       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4625     else if(itype[i]==RJUMP) {
4626       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4627         emit_readword(&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4628       else
4629         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4630     }
4631   }else if(stubs[n].d==NOTTAKEN) {
4632     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4633     else load_all_regs(branch_regs[i].regmap);
4634   }else if(stubs[n].d==NULLDS) {
4635     // Delay slot instruction is nullified ("likely" branch)
4636     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4637     else load_all_regs(regs[i].regmap);
4638   }else{
4639     load_all_regs(branch_regs[i].regmap);
4640   }
4641   if (stubs[n].retaddr)
4642     emit_jmp(stubs[n].retaddr);
4643   else
4644     do_jump_vaddr(stubs[n].e);
4645 }
4646
4647 static void add_to_linker(void *addr, u_int target, int ext)
4648 {
4649   assert(linkcount < ARRAY_SIZE(link_addr));
4650   link_addr[linkcount].addr = addr;
4651   link_addr[linkcount].target = target;
4652   link_addr[linkcount].ext = ext;
4653   linkcount++;
4654 }
4655
// Write the JAL return address (PC+8) into $ra's host register, optionally
// inserting it into the mini hash table for fast return dispatch.
static void ujump_assemble_write_ra(int i)
{
  int rt;
  unsigned int return_address;
  rt=get_reg(branch_regs[i].regmap,31);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  //assert(rt>=0);
  return_address=start+i*4+8;
  if(rt>=0) {
    #ifdef USE_MINI_HT
    // Only insert into the mini HT if the delay slot doesn't write $ra.
    if(internal_branch(return_address)&&rt1[i+1]!=31) {
      int temp=-1; // note: must be ds-safe
      #ifdef HOST_TEMPREG
      temp=HOST_TEMPREG;
      #endif
      if(temp>=0) do_miniht_insert(return_address,rt,temp);
      else emit_movimm(return_address,rt);
    }
    else
    #endif
    {
      #ifdef REG_PREFETCH
      if(temp>=0)
      {
        if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
      }
      #endif
      emit_movimm(return_address,rt); // PC into link register
      #ifdef IMM_PREFETCH
      emit_prefetch(hash_table_get(return_address));
      #endif
    }
  }
}
4690
// Assemble an unconditional jump (J/JAL) at instruction i, including its
// delay slot, cycle-count check, and register shuffling for the target.
static void ujump_assemble(int i,struct regstat *i_regs)
{
  int ra_done=0;
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  int temp=get_reg(branch_regs[i].regmap,PTEMP);
  if(rt1[i]==31&&temp>=0)
  {
    signed char *i_regmap=i_regs->regmap;
    int return_address=start+i*4+8;
    if(get_reg(branch_regs[i].regmap,31)>0)
    if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  // If the delay slot reads $ra (JAL link register), write it before
  // assembling the slot.
  if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    ujump_assemble_write_ra(i); // writeback ra for DS
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<rt1[i]);
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
  if(!ra_done&&rt1[i]==31)
    ujump_assemble_write_ra(i);
  int cc,adj;
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
  // Compensate for cycles already pre-counted at the target's entry.
  if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
  load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  if(internal_branch(ba[i]))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  // A jump into another branch's delay slot needs a dedicated entry stub.
  if(internal_branch(ba[i])&&is_ds[(ba[i]-start)>>2]) {
    ds_assemble_entry(i);
  }
  else {
    add_to_linker(out,ba[i],internal_branch(ba[i]));
    emit_jmp(0);
  }
}
4739
// Write the link register for JALR: return address (PC+8) into rt1[i]'s
// host register.  Asserts the delay slot does not clobber the link reg.
static void rjump_assemble_write_ra(int i)
{
  int rt,return_address;
  assert(rt1[i+1]!=rt1[i]);
  assert(rt2[i+1]!=rt1[i]);
  rt=get_reg(branch_regs[i].regmap,rt1[i]);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  assert(rt>=0);
  return_address=start+i*4+8;
  #ifdef REG_PREFETCH
  if(temp>=0)
  {
    if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  emit_movimm(return_address,rt); // PC into link register
  #ifdef IMM_PREFETCH
  emit_prefetch(hash_table_get(return_address));
  #endif
}
4760
// Assemble a register jump (JR/JALR) at instruction i: assemble the delay
// slot, write the link register if any, count cycles, and dispatch through
// the register's value (optionally via the mini hash table for JR $ra).
static void rjump_assemble(int i,struct regstat *i_regs)
{
  int temp;
  int rs,cc;
  int ra_done=0;
  rs=get_reg(branch_regs[i].regmap,rs1[i]);
  assert(rs>=0);
  if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
    // Delay slot abuse, make a copy of the branch address register
    temp=get_reg(branch_regs[i].regmap,RTEMP);
    assert(temp>=0);
    assert(regs[i].regmap[temp]==RTEMP);
    emit_mov(rs,temp);
    rs=temp;
  }
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  if(rt1[i]==31)
  {
    if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
      signed char *i_regmap=i_regs->regmap;
      int return_address=start+i*4+8;
      if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
    }
  }
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    int rh=get_reg(regs[i].regmap,RHASH);
    if(rh>=0) do_preload_rhash(rh);
  }
  #endif
  // Write the link register early if the delay slot reads it.
  if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    rjump_assemble_write_ra(i);
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<rt1[i]);
  bc_unneeded&=~(1LL<<rs1[i]);
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],CCREG);
  if(!ra_done&&rt1[i]!=0)
    rjump_assemble_write_ra(i);
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  (void)cc;
  #ifdef USE_MINI_HT
  int rh=get_reg(branch_regs[i].regmap,RHASH);
  int ht=get_reg(branch_regs[i].regmap,RHTBL);
  if(rs1[i]==31) {
    if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
    do_preload_rhtbl(ht);
    do_rhash(rs,rh);
  }
  #endif
  // Target address is unknown at compile time; write back everything dirty.
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
  #ifdef DESTRUCTIVE_WRITEBACK
  if((branch_regs[i].dirty>>rs)&1) {
    if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
      emit_loadreg(rs1[i],rs);
    }
  }
  #endif
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_load(ht,rh);
  }
  #endif
  //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
  //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
  //assert(adj==0);
  emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  add_stub(CC_STUB,out,NULL,0,i,-1,TAKEN,rs);
  if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
    // special case for RFE
    emit_jmp(0);
  else
    emit_jns(0);
  //load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_jump(rs,rh,ht);
  }
  else
  #endif
  {
    do_jump_vaddr(rs);
  }
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
  #endif
}
4857
4858 static void cjump_assemble(int i,struct regstat *i_regs)
4859 {
4860   signed char *i_regmap=i_regs->regmap;
4861   int cc;
4862   int match;
4863   match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4864   assem_debug("match=%d\n",match);
4865   int s1l,s2l;
4866   int unconditional=0,nop=0;
4867   int invert=0;
4868   int internal=internal_branch(ba[i]);
4869   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4870   if(!match) invert=1;
4871   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4872   if(i>(ba[i]-start)>>2) invert=1;
4873   #endif
4874   #ifdef __aarch64__
4875   invert=1; // because of near cond. branches
4876   #endif
4877
4878   if(ooo[i]) {
4879     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4880     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4881   }
4882   else {
4883     s1l=get_reg(i_regmap,rs1[i]);
4884     s2l=get_reg(i_regmap,rs2[i]);
4885   }
4886   if(rs1[i]==0&&rs2[i]==0)
4887   {
4888     if(opcode[i]&1) nop=1;
4889     else unconditional=1;
4890     //assert(opcode[i]!=5);
4891     //assert(opcode[i]!=7);
4892     //assert(opcode[i]!=0x15);
4893     //assert(opcode[i]!=0x17);
4894   }
4895   else if(rs1[i]==0)
4896   {
4897     s1l=s2l;
4898     s2l=-1;
4899   }
4900   else if(rs2[i]==0)
4901   {
4902     s2l=-1;
4903   }
4904
4905   if(ooo[i]) {
4906     // Out of order execution (delay slot first)
4907     //printf("OOOE\n");
4908     address_generation(i+1,i_regs,regs[i].regmap_entry);
4909     ds_assemble(i+1,i_regs);
4910     int adj;
4911     uint64_t bc_unneeded=branch_regs[i].u;
4912     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
4913     bc_unneeded|=1;
4914     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
4915     load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],rs2[i]);
4916     load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
4917     cc=get_reg(branch_regs[i].regmap,CCREG);
4918     assert(cc==HOST_CCREG);
4919     if(unconditional)
4920       store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4921     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
4922     //assem_debug("cycle count (adj)\n");
4923     if(unconditional) {
4924       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4925       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
4926         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4927         load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4928         if(internal)
4929           assem_debug("branch: internal\n");
4930         else
4931           assem_debug("branch: external\n");
4932         if(internal&&is_ds[(ba[i]-start)>>2]) {
4933           ds_assemble_entry(i);
4934         }
4935         else {
4936           add_to_linker(out,ba[i],internal);
4937           emit_jmp(0);
4938         }
4939         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4940         if(((u_int)out)&7) emit_addnop(0);
4941         #endif
4942       }
4943     }
4944     else if(nop) {
4945       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
4946       void *jaddr=out;
4947       emit_jns(0);
4948       add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
4949     }
4950     else {
4951       void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
4952       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
4953       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4954
4955       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4956       assert(s1l>=0);
4957       if(opcode[i]==4) // BEQ
4958       {
4959         if(s2l>=0) emit_cmp(s1l,s2l);
4960         else emit_test(s1l,s1l);
4961         if(invert){
4962           nottaken=out;
4963           emit_jne(DJT_1);
4964         }else{
4965           add_to_linker(out,ba[i],internal);
4966           emit_jeq(0);
4967         }
4968       }
4969       if(opcode[i]==5) // BNE
4970       {
4971         if(s2l>=0) emit_cmp(s1l,s2l);
4972         else emit_test(s1l,s1l);
4973         if(invert){
4974           nottaken=out;
4975           emit_jeq(DJT_1);
4976         }else{
4977           add_to_linker(out,ba[i],internal);
4978           emit_jne(0);
4979         }
4980       }
4981       if(opcode[i]==6) // BLEZ
4982       {
4983         emit_cmpimm(s1l,1);
4984         if(invert){
4985           nottaken=out;
4986           emit_jge(DJT_1);
4987         }else{
4988           add_to_linker(out,ba[i],internal);
4989           emit_jl(0);
4990         }
4991       }
4992       if(opcode[i]==7) // BGTZ
4993       {
4994         emit_cmpimm(s1l,1);
4995         if(invert){
4996           nottaken=out;
4997           emit_jl(DJT_1);
4998         }else{
4999           add_to_linker(out,ba[i],internal);
5000           emit_jge(0);
5001         }
5002       }
5003       if(invert) {
5004         if(taken) set_jump_target(taken, out);
5005         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5006         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5007           if(adj) {
5008             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5009             add_to_linker(out,ba[i],internal);
5010           }else{
5011             emit_addnop(13);
5012             add_to_linker(out,ba[i],internal*2);
5013           }
5014           emit_jmp(0);
5015         }else
5016         #endif
5017         {
5018           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5019           store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5020           load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5021           if(internal)
5022             assem_debug("branch: internal\n");
5023           else
5024             assem_debug("branch: external\n");
5025           if(internal&&is_ds[(ba[i]-start)>>2]) {
5026             ds_assemble_entry(i);
5027           }
5028           else {
5029             add_to_linker(out,ba[i],internal);
5030             emit_jmp(0);
5031           }
5032         }
5033         set_jump_target(nottaken, out);
5034       }
5035
5036       if(nottaken1) set_jump_target(nottaken1, out);
5037       if(adj) {
5038         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5039       }
5040     } // (!unconditional)
5041   } // if(ooo)
5042   else
5043   {
5044     // In-order execution (branch first)
5045     //if(likely[i]) printf("IOL\n");
5046     //else
5047     //printf("IOE\n");
5048     void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
5049     if(!unconditional&&!nop) {
5050       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5051       assert(s1l>=0);
5052       if((opcode[i]&0x2f)==4) // BEQ
5053       {
5054         if(s2l>=0) emit_cmp(s1l,s2l);
5055         else emit_test(s1l,s1l);
5056         nottaken=out;
5057         emit_jne(DJT_2);
5058       }
5059       if((opcode[i]&0x2f)==5) // BNE
5060       {
5061         if(s2l>=0) emit_cmp(s1l,s2l);
5062         else emit_test(s1l,s1l);
5063         nottaken=out;
5064         emit_jeq(DJT_2);
5065       }
5066       if((opcode[i]&0x2f)==6) // BLEZ
5067       {
5068         emit_cmpimm(s1l,1);
5069         nottaken=out;
5070         emit_jge(DJT_2);
5071       }
5072       if((opcode[i]&0x2f)==7) // BGTZ
5073       {
5074         emit_cmpimm(s1l,1);
5075         nottaken=out;
5076         emit_jl(DJT_2);
5077       }
5078     } // if(!unconditional)
5079     int adj;
5080     uint64_t ds_unneeded=branch_regs[i].u;
5081     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5082     ds_unneeded|=1;
5083     // branch taken
5084     if(!nop) {
5085       if(taken) set_jump_target(taken, out);
5086       assem_debug("1:\n");
5087       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
5088       // load regs
5089       load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
5090       address_generation(i+1,&branch_regs[i],0);
5091       load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
5092       ds_assemble(i+1,&branch_regs[i]);
5093       cc=get_reg(branch_regs[i].regmap,CCREG);
5094       if(cc==-1) {
5095         emit_loadreg(CCREG,cc=HOST_CCREG);
5096         // CHECK: Is the following instruction (fall thru) allocated ok?
5097       }
5098       assert(cc==HOST_CCREG);
5099       store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5100       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5101       assem_debug("cycle count (adj)\n");
5102       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5103       load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5104       if(internal)
5105         assem_debug("branch: internal\n");
5106       else
5107         assem_debug("branch: external\n");
5108       if(internal&&is_ds[(ba[i]-start)>>2]) {
5109         ds_assemble_entry(i);
5110       }
5111       else {
5112         add_to_linker(out,ba[i],internal);
5113         emit_jmp(0);
5114       }
5115     }
5116     // branch not taken
5117     if(!unconditional) {
5118       if(nottaken1) set_jump_target(nottaken1, out);
5119       set_jump_target(nottaken, out);
5120       assem_debug("2:\n");
5121       if(!likely[i]) {
5122         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
5123         load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
5124         address_generation(i+1,&branch_regs[i],0);
5125         load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
5126         ds_assemble(i+1,&branch_regs[i]);
5127       }
5128       cc=get_reg(branch_regs[i].regmap,CCREG);
5129       if(cc==-1&&!likely[i]) {
5130         // Cycle count isn't in a register, temporarily load it then write it out
5131         emit_loadreg(CCREG,HOST_CCREG);
5132         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5133         void *jaddr=out;
5134         emit_jns(0);
5135         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5136         emit_storereg(CCREG,HOST_CCREG);
5137       }
5138       else{
5139         cc=get_reg(i_regmap,CCREG);
5140         assert(cc==HOST_CCREG);
5141         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5142         void *jaddr=out;
5143         emit_jns(0);
5144         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5145       }
5146     }
5147   }
5148 }
5149
// Assemble a REGIMM branch: BLTZ/BGEZ and the linking BLTZAL/BGEZAL
// variants (selected by opcode2[i]).  These test a single source register
// (rs1) against zero, so only one host source register (s1l) is needed.
// Two emission strategies: out-of-order (ooo[i] set: the delay slot is
// assembled before the branch test) and in-order (branch test first, then
// the delay slot separately on each path).
static void sjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  // Does the register mapping expected at the branch target match ours?
  // If not, invert the branch so the taken path can write back/reload
  // registers before jumping.
  match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  assem_debug("smatch=%d\n",match);
  int s1l;
  int unconditional=0,nevertaken=0;
  int invert=0;
  int internal=internal_branch(ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n"); // branch to itself
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1; // backward branch
  #endif
  #ifdef __aarch64__
  invert=1; // because of near cond. branches
  #endif

  //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
  //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)

  // Pick up the source register from the allocation that is live when the
  // compare executes: after the delay slot (branch_regs) in OOO mode,
  // before it (i_regs) otherwise.
  if(ooo[i]) {
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
  }
  if(rs1[i]==0)
  {
    // Comparing r0 (constant zero) against zero: the BGEZ-type encodings
    // (opcode2 odd) are always taken, the BLTZ-type ones never are.
    if(opcode2[i]&1) unconditional=1;
    else nevertaken=1;
    // These are never taken (r0 is never less than zero)
    //assert(opcode2[i]!=0);
    //assert(opcode2[i]!=2);
    //assert(opcode2[i]!=0x10);
    //assert(opcode2[i]!=0x12);
  }

  if(ooo[i]) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    uint64_t bc_unneeded=branch_regs[i].u;
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i])); // branch sources still needed
    bc_unneeded|=1; // r0 is always unneeded
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
    load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],rs1[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
    if(rt1[i]==31) {
      // Linking variant (BxxZAL): write the return address into $ra.
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        if(!nevertaken) emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    assem_debug("cycle count (adj)\n");
    if(unconditional) {
      // Always taken: cycle-count check, then jump straight to the target.
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) { // skip the jump for true idle loops
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          // Target is a delay slot within this block; emit its entry inline.
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0); // keep 8-byte alignment
        #endif
      }
    }
    else if(nevertaken) {
      // Never taken: only account for cycles and test for a pending event.
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      // Conditional: test the sign of rs1 and branch accordingly.
      void *nottaken = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      {
        assert(s1l>=0);
        if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_jns(DJT_1); // inverted: skip taken path when rs1>=0
          }else{
            add_to_linker(out,ba[i],internal);
            emit_js(0);
          }
        }
        if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_js(DJT_1); // inverted: skip taken path when rs1<0
          }else{
            add_to_linker(out,ba[i],internal);
            emit_jns(0);
          }
        }
      }

      if(invert) {
        // Inverted branch: the taken path is emitted inline below and is
        // jumped over by the conditional recorded in 'nottaken' above.
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken, out);
      }

      if(adj) {
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc); // restore cycle count on fall-thru
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //printf("IOE\n");
    void *nottaken = NULL;
    if(rt1[i]==31) {
      // Linking variant: set $ra before the branch test.
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    if(!unconditional) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
        assert(s1l>=0);
        if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_jns(DJT_1); // skip the taken path when rs1>=0
        }
        if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_js(DJT_1); // skip the taken path when rs1<0
        }
    } // if(!unconditional)
    int adj;
    uint64_t ds_unneeded=branch_regs[i].u;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1])); // delay-slot sources still needed
    ds_unneeded|=1; // r0 is always unneeded
    // branch taken
    if(!nevertaken) {
      //assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        // Target is a delay slot within this block; emit its entry inline.
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    if(!unconditional) {
      set_jump_target(nottaken, out);
      assem_debug("1:\n");
      if(!likely[i]) {
        // Normal branch: the delay slot also executes on the not-taken path.
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
        load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        // "Likely" branches nullify the delay slot on the not-taken path.
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}
5416
// Assemble a branch whose delay slot falls on the next virtual page, so the
// delay slot cannot be compiled as part of this block.  Instead, the branch
// target address is computed into HOST_BTREG (spilled to branch_target) and
// control transfers to a special delay-slot entry of the next page.  That
// entry address is start+i*4+5: the delay slot's address with the low bit
// set, matching the vaddr (start+1) under which pagespan_ds registers the
// delay-slot entry.  Handles J/JAL/JR/JALR, BEQ/BNE(+likely), BLEZ/BGTZ
// and BC1x branches.
static void pagespan_assemble(int i,struct regstat *i_regs)
{
  int s1l=get_reg(i_regs->regmap,rs1[i]);
  int s2l=get_reg(i_regs->regmap,rs2[i]);
  void *taken = NULL;
  void *nottaken = NULL;
  int unconditional=0;
  // If one source is r0 (constant zero), fold it out and keep a single
  // source in s1l so the compare can use emit_test.
  if(rs1[i]==0)
  {
    s1l=s2l;
    s2l=-1;
  }
  else if(rs2[i]==0)
  {
    s2l=-1;
  }
  int hr=0;
  int addr=-1,alt=-1,ntaddr=-1;
  // Find host scratch registers to build the target address in:
  // 'addr' holds the chosen target, 'alt'/'ntaddr' hold alternatives for
  // conditional selection.  Prefer HOST_BTREG for 'addr' if it is free;
  // otherwise pick registers that are not CC and not a branch source.
  if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
  else {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        addr=hr++;break;
      }
      hr++;
    }
  }
  while(hr<HOST_REGS)
  {
    if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
       (i_regs->regmap[hr]&63)!=rs1[i] &&
       (i_regs->regmap[hr]&63)!=rs2[i] )
    {
      alt=hr++;break;
    }
    hr++;
  }
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
  {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        ntaddr=hr;break;
      }
      hr++;
    }
  }
  assert(hr<HOST_REGS); // must have found enough scratch registers
  if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
    load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
  }
  // Charge the cycles for this instruction and its delay slot up front.
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  if(opcode[i]==2) // J
  {
    unconditional=1;
  }
  if(opcode[i]==3) // JAL
  {
    // TODO: mini_ht
    int rt=get_reg(i_regs->regmap,31);
    emit_movimm(start+i*4+8,rt); // return address into $ra
    unconditional=1;
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    emit_mov(s1l,addr); // register-indirect target
    if(opcode2[i]==9) // JALR
    {
      int rt=get_reg(i_regs->regmap,rt1[i]);
      emit_movimm(start+i*4+8,rt); // return address into rd
    }
  }
  // For the conditional branches below, select between the branch target
  // ba[i] and the fall-through address start+i*4+8 using conditional moves.
  if((opcode[i]&0x3f)==4) // BEQ
  {
    if(rs1[i]==rs2[i])
    {
      unconditional=1; // comparing a register with itself: always taken
    }
    else
    #ifdef HAVE_CMOV_IMM
    if(1) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
    }
    else
    #endif
    {
      assert(s1l>=0);
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==5) // BNE
  {
    #ifdef HAVE_CMOV_IMM
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
    #else
    assert(s1l>=0);
    emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    emit_cmovne_reg(alt,addr);
    #endif
  }
  // "Likely" variants branch over the whole taken path instead of using a
  // conditional move, because their delay slot is nullified when not taken.
  if((opcode[i]&0x3f)==0x14) // BEQL
  {
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    if(nottaken) set_jump_target(nottaken, out);
    nottaken=out;
    emit_jne(0);
  }
  if((opcode[i]&0x3f)==0x15) // BNEL
  {
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    nottaken=out;
    emit_jeq(0);
    if(taken) set_jump_target(taken, out);
  }
  if((opcode[i]&0x3f)==6) // BLEZ
  {
    emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
    emit_cmpimm(s1l,1); // rs1<1 means rs1<=0
    emit_cmovl_reg(alt,addr);
  }
  if((opcode[i]&0x3f)==7) // BGTZ
  {
    emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
    emit_cmpimm(s1l,1);
    emit_cmovl_reg(ntaddr,addr);
  }
  if((opcode[i]&0x3f)==0x16) // BLEZL
  {
    assert((opcode[i]&0x3f)!=0x16); // unsupported here
  }
  if((opcode[i]&0x3f)==0x17) // BGTZL
  {
    assert((opcode[i]&0x3f)!=0x17); // unsupported here
  }
  assert(opcode[i]!=1); // BLTZ/BGEZ

  //FIXME: Check CSREG
  // COP1 conditional branches: test the FP condition bit (0x800000).
  if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
    if((source[i]&0x30000)==0) // BC1F
    {
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x10000) // BC1T
    {
      emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x20000) // BC1FL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jne(0);
    }
    if((source[i]&0x30000)==0x30000) // BC1TL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jeq(0);
    }
  }

  assert(i_regs->regmap[HOST_CCREG]==CCREG);
  wb_dirtys(regs[i].regmap,regs[i].dirty); // flush dirty regs before leaving the block
  // Put the resolved branch target into HOST_BTREG for pagespan_ds.
  if(likely[i]||unconditional)
  {
    emit_movimm(ba[i],HOST_BTREG);
  }
  else if(addr!=HOST_BTREG)
  {
    emit_mov(addr,HOST_BTREG);
  }
  // Jump to the delay-slot entry of the next page (low bit marks it).
  void *branch_addr=out;
  emit_jmp(0);
  int target_addr=start+i*4+5;
  void *stub=out;
  void *compiled_target_addr=check_addr(target_addr);
  emit_extjump_ds(branch_addr, target_addr);
  if(compiled_target_addr) {
    // Already compiled: patch the jump directly and register for linking.
    set_jump_target(branch_addr, compiled_target_addr);
    add_link(target_addr,stub);
  }
  else set_jump_target(branch_addr, stub); // go through the extjump stub
  if(likely[i]) {
    // Not-taken path
    // Likely branch not taken: skip the delay slot, exit to start+i*4+8.
    set_jump_target(nottaken, out);
    wb_dirtys(regs[i].regmap,regs[i].dirty);
    void *branch_addr=out;
    emit_jmp(0);
    int target_addr=start+i*4+8;
    void *stub=out;
    void *compiled_target_addr=check_addr(target_addr);
    emit_extjump_ds(branch_addr, target_addr);
    if(compiled_target_addr) {
      set_jump_target(branch_addr, compiled_target_addr);
      add_link(target_addr,stub);
    }
    else set_jump_target(branch_addr, stub);
  }
}
5637
// Assemble the delay slot for the above (pagespan_assemble).  This is the
// special entry at the start of a block whose first instruction is the
// delay slot of a branch on the previous page: it is registered under
// vaddr start+1 (low bit set), executes instruction 0 as a delay slot,
// then jumps to the branch target saved in BTREG/branch_target — unless
// that target is the fall-through address (start+4), in which case
// execution continues into this block.
static void pagespan_ds()
{
  assem_debug("initial delay slot:\n");
  u_int vaddr=start+1; // low bit marks the delay-slot entry
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  // Register this entry point with a dirty-check stub in front of it.
  ll_add(jump_dirty+vpage,vaddr,(void *)out);
  do_dirty_stub_ds();
  ll_add(jump_in+page,vaddr,(void *)out);
  assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty);
  // If instruction 0's allocation doesn't keep BTREG in HOST_BTREG, spill
  // the branch target to memory so it survives the delay slot.
  if(regs[0].regmap[HOST_BTREG]!=BTREG)
    emit_writeword(HOST_BTREG,&branch_target);
  load_regs(regs[0].regmap_entry,regs[0].regmap,rs1[0],rs2[0]);
  address_generation(0,&regs[0],regs[0].regmap_entry);
  if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
    load_regs(regs[0].regmap_entry,regs[0].regmap,INVCP,INVCP); // stores need the invalid-code pointer
  is_delayslot=0;
  // Assemble instruction 0 according to its decoded type.
  switch(itype[0]) {
    case ALU:
      alu_assemble(0,&regs[0]);break;
    case IMM16:
      imm16_assemble(0,&regs[0]);break;
    case SHIFT:
      shift_assemble(0,&regs[0]);break;
    case SHIFTIMM:
      shiftimm_assemble(0,&regs[0]);break;
    case LOAD:
      load_assemble(0,&regs[0]);break;
    case LOADLR:
      loadlr_assemble(0,&regs[0]);break;
    case STORE:
      store_assemble(0,&regs[0]);break;
    case STORELR:
      storelr_assemble(0,&regs[0]);break;
    case COP0:
      cop0_assemble(0,&regs[0]);break;
    case COP1:
      cop1_assemble(0,&regs[0]);break;
    case C1LS:
      c1ls_assemble(0,&regs[0]);break;
    case COP2:
      cop2_assemble(0,&regs[0]);break;
    case C2LS:
      c2ls_assemble(0,&regs[0]);break;
    case C2OP:
      c2op_assemble(0,&regs[0]);break;
    case MULTDIV:
      multdiv_assemble(0,&regs[0]);break;
    case MOV:
      mov_assemble(0,&regs[0]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Recover the branch target: from its register if still allocated,
  // otherwise reload it from the branch_target spill slot.
  int btaddr=get_reg(regs[0].regmap,BTREG);
  if(btaddr<0) {
    btaddr=get_reg(regs[0].regmap,-1); // any free host register
    emit_readword(&branch_target,btaddr);
  }
  assert(btaddr!=HOST_CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
#ifdef HOST_IMM8
  // Host can't encode a 32-bit compare immediate; go through a temp reg.
  host_tempreg_acquire();
  emit_movimm(start+4,HOST_TEMPREG);
  emit_cmp(btaddr,HOST_TEMPREG);
  host_tempreg_release();
#else
  emit_cmpimm(btaddr,start+4);
#endif
  void *branch = out;
  emit_jeq(0); // target == start+4: fall through into this block
  store_regs_bt(regs[0].regmap,regs[0].dirty,-1);
  do_jump_vaddr(btaddr); // otherwise jump to the (possibly uncompiled) target
  set_jump_target(branch, out);
  store_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
  load_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
}
5724
5725 // Basic liveness analysis for MIPS registers
5726 void unneeded_registers(int istart,int iend,int r)
5727 {
5728   int i;
5729   uint64_t u,gte_u,b,gte_b;
5730   uint64_t temp_u,temp_gte_u=0;
5731   uint64_t gte_u_unknown=0;
5732   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
5733     gte_u_unknown=~0ll;
5734   if(iend==slen-1) {
5735     u=1;
5736     gte_u=gte_u_unknown;
5737   }else{
5738     //u=unneeded_reg[iend+1];
5739     u=1;
5740     gte_u=gte_unneeded[iend+1];
5741   }
5742
5743   for (i=iend;i>=istart;i--)
5744   {
5745     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
5746     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
5747     {
5748       // If subroutine call, flag return address as a possible branch target
5749       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
5750
5751       if(ba[i]<start || ba[i]>=(start+slen*4))
5752       {
5753         // Branch out of this block, flush all regs
5754         u=1;
5755         gte_u=gte_u_unknown;
5756         branch_unneeded_reg[i]=u;
5757         // Merge in delay slot
5758         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5759         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5760         u|=1;
5761         gte_u|=gte_rt[i+1];
5762         gte_u&=~gte_rs[i+1];
5763         // If branch is "likely" (and conditional)
5764         // then we skip the delay slot on the fall-thru path
5765         if(likely[i]) {
5766           if(i<slen-1) {
5767             u&=unneeded_reg[i+2];
5768             gte_u&=gte_unneeded[i+2];
5769           }
5770           else
5771           {
5772             u=1;
5773             gte_u=gte_u_unknown;
5774           }
5775         }
5776       }
5777       else
5778       {
5779         // Internal branch, flag target
5780         bt[(ba[i]-start)>>2]=1;
5781         if(ba[i]<=start+i*4) {
5782           // Backward branch
5783           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
5784           {
5785             // Unconditional branch
5786             temp_u=1;
5787             temp_gte_u=0;
5788           } else {
5789             // Conditional branch (not taken case)
5790             temp_u=unneeded_reg[i+2];
5791             temp_gte_u&=gte_unneeded[i+2];
5792           }
5793           // Merge in delay slot
5794           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5795           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5796           temp_u|=1;
5797           temp_gte_u|=gte_rt[i+1];
5798           temp_gte_u&=~gte_rs[i+1];
5799           // If branch is "likely" (and conditional)
5800           // then we skip the delay slot on the fall-thru path
5801           if(likely[i]) {
5802             if(i<slen-1) {
5803               temp_u&=unneeded_reg[i+2];
5804               temp_gte_u&=gte_unneeded[i+2];
5805             }
5806             else
5807             {
5808               temp_u=1;
5809               temp_gte_u=gte_u_unknown;
5810             }
5811           }
5812           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
5813           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5814           temp_u|=1;
5815           temp_gte_u|=gte_rt[i];
5816           temp_gte_u&=~gte_rs[i];
5817           unneeded_reg[i]=temp_u;
5818           gte_unneeded[i]=temp_gte_u;
5819           // Only go three levels deep.  This recursion can take an
5820           // excessive amount of time if there are a lot of nested loops.
5821           if(r<2) {
5822             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
5823           }else{
5824             unneeded_reg[(ba[i]-start)>>2]=1;
5825             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
5826           }
5827         } /*else*/ if(1) {
5828           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
5829           {
5830             // Unconditional branch
5831             u=unneeded_reg[(ba[i]-start)>>2];
5832             gte_u=gte_unneeded[(ba[i]-start)>>2];
5833             branch_unneeded_reg[i]=u;
5834             // Merge in delay slot
5835             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5836             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5837             u|=1;
5838             gte_u|=gte_rt[i+1];
5839             gte_u&=~gte_rs[i+1];
5840           } else {
5841             // Conditional branch
5842             b=unneeded_reg[(ba[i]-start)>>2];
5843             gte_b=gte_unneeded[(ba[i]-start)>>2];
5844             branch_unneeded_reg[i]=b;
5845             // Branch delay slot
5846             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5847             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5848             b|=1;
5849             gte_b|=gte_rt[i+1];
5850             gte_b&=~gte_rs[i+1];
5851             // If branch is "likely" then we skip the
5852             // delay slot on the fall-thru path
5853             if(likely[i]) {
5854               u=b;
5855               gte_u=gte_b;
5856               if(i<slen-1) {
5857                 u&=unneeded_reg[i+2];
5858                 gte_u&=gte_unneeded[i+2];
5859               }
5860             } else {
5861               u&=b;
5862               gte_u&=gte_b;
5863             }
5864             if(i<slen-1) {
5865               branch_unneeded_reg[i]&=unneeded_reg[i+2];
5866             } else {
5867               branch_unneeded_reg[i]=1;
5868             }
5869           }
5870         }
5871       }
5872     }
5873     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
5874     {
5875       // SYSCALL instruction (software interrupt)
5876       u=1;
5877     }
5878     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
5879     {
5880       // ERET instruction (return from interrupt)
5881       u=1;
5882     }
5883     //u=1; // DEBUG
5884     // Written registers are unneeded
5885     u|=1LL<<rt1[i];
5886     u|=1LL<<rt2[i];
5887     gte_u|=gte_rt[i];
5888     // Accessed registers are needed
5889     u&=~(1LL<<rs1[i]);
5890     u&=~(1LL<<rs2[i]);
5891     gte_u&=~gte_rs[i];
5892     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
5893       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
5894     // Source-target dependencies
5895     // R0 is always unneeded
5896     u|=1;
5897     // Save it
5898     unneeded_reg[i]=u;
5899     gte_unneeded[i]=gte_u;
5900     /*
5901     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
5902     printf("U:");
5903     int r;
5904     for(r=1;r<=CCREG;r++) {
5905       if((unneeded_reg[i]>>r)&1) {
5906         if(r==HIREG) printf(" HI");
5907         else if(r==LOREG) printf(" LO");
5908         else printf(" r%d",r);
5909       }
5910     }
5911     printf("\n");
5912     */
5913   }
5914 }
5915
// Write back dirty registers as soon as we will no longer modify them,
// so that we don't end up with lots of writes at the branches.
//
// Backward dataflow pass over instructions [istart,iend].  For every
// instruction it computes two host-register bitmasks:
//   will_dirty - register is guaranteed to be dirtied (again) on every
//                path from here, so an early writeback would be wasted
//   wont_dirty - register will not be dirtied again, so it is safe to
//                write it back now and mark it clean
// istart/iend: instruction index range to process (inclusive)
// wr:          nonzero = merge results into regs[]/branch_regs[] dirty and
//              wasdirty state; zero = only fill will_dirty[]/wont_dirty[]
//              (used by the recursion-limited calls for backward branches)
void clean_registers(int istart,int iend,int wr)
{
  int i;
  int r;
  u_int will_dirty_i,will_dirty_next,temp_will_dirty;
  u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
  // Seed the state from the instruction after iend, or from an empty
  // state when iend is the last instruction of the block.
  if(iend==slen-1) {
    will_dirty_i=will_dirty_next=0;
    wont_dirty_i=wont_dirty_next=0;
  }else{
    will_dirty_i=will_dirty_next=will_dirty[iend+1];
    wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
  }
  // Walk instructions in reverse so liveness flows backward.
  for (i=iend;i>=istart;i--)
  {
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
    {
      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, flush all regs
        if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
        {
          // Unconditional branch
          will_dirty_i=0;
          wont_dirty_i=0;
          // Merge in delay slot (will dirty)
          // NOTE(review): (regmap&63)>33 appears to exclude mapped values
          // beyond the GPR/HI/LO range from writeback -- confirm against
          // the register numbering in the header.
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
              if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
              if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
              if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
              if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
            }
          }
        }
        else
        {
          // Conditional branch
          will_dirty_i=0;
          wont_dirty_i=wont_dirty_next;
          // Merge in delay slot (will dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if(!likely[i]) {
                // Might not dirty if likely branch is not taken
                if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                // NOTE(review): '==0' here where sibling paths use '<=0' --
                // possibly intentional (negative = unmapped?), confirm.
                if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              }
            }
          }
        }
        // Merge in delay slot (wont dirty)
        for(r=0;r<HOST_REGS;r++) {
          if(r!=EXCLUDE_REG) {
            if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
            if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
            if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
          }
        }
        if(wr) {
          #ifndef DESTRUCTIVE_WRITEBACK
          branch_regs[i].dirty&=wont_dirty_i;
          #endif
          branch_regs[i].dirty|=will_dirty_i;
        }
      }
      else
      {
        // Internal branch
        if(ba[i]<=start+i*4) {
          // Backward branch
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            temp_will_dirty=0;
            temp_wont_dirty=0;
            // Merge in delay slot (will dirty)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
              }
            }
          } else {
            // Conditional branch (not taken case)
            temp_will_dirty=will_dirty_next;
            temp_wont_dirty=wont_dirty_next;
            // Merge in delay slot (will dirty)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(!likely[i]) {
                  // Will not dirty if likely branch is not taken
                  if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                  // NOTE(review): '==0' vs '<=0' inconsistency, same as the
                  // out-of-block conditional path above -- confirm.
                  if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
                  if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                  if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                  if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                }
              }
            }
          }
          // Merge in delay slot (wont dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
              if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
              if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
            }
          }
          // Deal with changed mappings
          if(i<iend) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]!=regmap_pre[i][r]) {
                  temp_will_dirty&=~(1<<r);
                  temp_wont_dirty&=~(1<<r);
                  if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
                    temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
                    temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
                  } else {
                    temp_will_dirty|=1<<r;
                    temp_wont_dirty|=1<<r;
                  }
                }
              }
            }
          }
          if(wr) {
            will_dirty[i]=temp_will_dirty;
            wont_dirty[i]=temp_wont_dirty;
            // Re-analyze the loop body with the state at the branch; the
            // inner call uses wr==0 so it only refreshes the arrays.
            clean_registers((ba[i]-start)>>2,i-1,0);
          }else{
            // Limit recursion.  It can take an excessive amount
            // of time if there are a lot of nested loops.
            will_dirty[(ba[i]-start)>>2]=0;
            wont_dirty[(ba[i]-start)>>2]=-1;
          }
        }
        /*else*/ if(1)
        {
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            will_dirty_i=0;
            wont_dirty_i=0;
          //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                if(branch_regs[i].regmap[r]>=0) {
                  will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
                  wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
                }
              }
            }
          //}
            // Merge in delay slot
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              }
            }
          } else {
            // Conditional branch
            will_dirty_i=will_dirty_next;
            wont_dirty_i=wont_dirty_next;
          //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                signed char target_reg=branch_regs[i].regmap[r];
                if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                else if(target_reg>=0) {
                  will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                  wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                }
                // Treat delay slot as part of branch too
                /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                else
                {
                  will_dirty[i+1]&=~(1<<r);
                }*/
              }
            }
          //}
            // Merge in delay slot
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(!likely[i]) {
                  // Might not dirty if likely branch is not taken
                  if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                  if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                  if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                  if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                  if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                }
              }
            }
          }
          // Merge in delay slot (won't dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
              if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
              if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
            }
          }
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            branch_regs[i].dirty&=wont_dirty_i;
            #endif
            branch_regs[i].dirty|=will_dirty_i;
          }
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      will_dirty_i=0;
      wont_dirty_i=0;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      will_dirty_i=0;
      wont_dirty_i=0;
    }
    // Propagate the per-instruction state backward.
    will_dirty_next=will_dirty_i;
    wont_dirty_next=wont_dirty_i;
    // Registers written by this instruction become dirty here.
    for(r=0;r<HOST_REGS;r++) {
      if(r!=EXCLUDE_REG) {
        if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
        if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
        if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
        if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
        if(i>istart) {
          if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP)
          {
            // Don't store a register immediately after writing it,
            // may prevent dual-issue.
            if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
          }
        }
      }
    }
    // Save it
    will_dirty[i]=will_dirty_i;
    wont_dirty[i]=wont_dirty_i;
    // Mark registers that won't be dirtied as not dirty
    if(wr) {
      /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
      for(r=0;r<HOST_REGS;r++) {
        if((will_dirty_i>>r)&1) {
          printf(" r%d",r);
        }
      }
      printf("\n");*/

      //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP)) {
        regs[i].dirty|=will_dirty_i;
        #ifndef DESTRUCTIVE_WRITEBACK
        regs[i].dirty&=wont_dirty_i;
        if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
        {
          // Branch: the fall-through successor is i+2 (skips delay slot).
          if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
                  regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
                }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
              }
            }
          }
        }
        else
        {
          // Non-branch: successor is i+1.
          if(i<iend) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
                  regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
                }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
              }
            }
          }
        }
        #endif
      //}
    }
    // Deal with changed mappings
    temp_will_dirty=will_dirty_i;
    temp_wont_dirty=wont_dirty_i;
    for(r=0;r<HOST_REGS;r++) {
      if(r!=EXCLUDE_REG) {
        int nr;
        if(regs[i].regmap[r]==regmap_pre[i][r]) {
          // Mapping unchanged across this instruction.
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            regs[i].wasdirty&=wont_dirty_i|~(1<<r);
            #endif
            regs[i].wasdirty|=will_dirty_i&(1<<r);
          }
        }
        else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
          // Register moved to a different register
          will_dirty_i&=~(1<<r);
          wont_dirty_i&=~(1<<r);
          will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
          wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            regs[i].wasdirty&=wont_dirty_i|~(1<<r);
            #endif
            regs[i].wasdirty|=will_dirty_i&(1<<r);
          }
        }
        else {
          // Mapping lost; keep only bits for unneeded guest registers.
          will_dirty_i&=~(1<<r);
          wont_dirty_i&=~(1<<r);
          if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
            will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
            wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
          } else {
            wont_dirty_i|=1<<r;
            /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
          }
        }
      }
    }
  }
}
6349
#ifdef DISASM
  /* disassembly */
// Print a human-readable decode of instruction i of the current block to
// stdout.  A leading '*' marks a branch target (bt[i]); the address shown
// is start+i*4.  Operand formatting depends on the decoded itype[].
void disassemble_inst(int i)
{
    if (bt[i]) printf("*"); else printf(" ");
    switch(itype[i]) {
      case UJUMP:
        printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
      case CJUMP:
        // NOTE(review): for i==0 this prints *ba (first branch target)
        // instead of the computed PC-relative target -- confirm intended.
        printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
      case SJUMP:
        printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
      case RJUMP:
        // JALR with a non-$ra link register shows both target and link reg
        if (opcode[i]==0x9&&rt1[i]!=31)
          printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
        else
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        break;
      case SPAN:
        printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
      case IMM16:
        if(opcode[i]==0xf) //LUI
          printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
        else
          printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case LOAD:
      case LOADLR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case STORE:
      case STORELR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
        break;
      case ALU:
      case SHIFT:
        printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
        break;
      case MULTDIV:
        printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
        break;
      case SHIFTIMM:
        printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case MOV:
        // MFHI/MFLO have a destination, MTHI/MTLO have a source
        if((opcode2[i]&0x1d)==0x10)
          printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
        else if((opcode2[i]&0x1d)==0x11)
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        else
          printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP0:
        if(opcode2[i]==0)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
        else if(opcode2[i]==4)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP1:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP2:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case C1LS:
        printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case C2LS:
        printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case INTCALL:
        printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
        break;
      default:
        //printf (" %s %8x\n",insn[i],source[i]);
        printf (" %x: %s\n",start+i*4,insn[i]);
    }
}
#else
// Stub used when the disassembler is compiled out.
static void disassemble_inst(int i) {}
#endif // DISASM
6440
6441 #define DRC_TEST_VAL 0x74657374
6442
6443 static void new_dynarec_test(void)
6444 {
6445   int (*testfunc)(void);
6446   void *beginning;
6447   int ret[2];
6448   size_t i;
6449
6450   // check structure linkage
6451   if ((u_char *)rcnts - (u_char *)&psxRegs != sizeof(psxRegs))
6452   {
6453     SysPrintf("linkage_arm* miscompilation/breakage detected.\n");
6454   }
6455
6456   SysPrintf("testing if we can run recompiled code...\n");
6457   ((volatile u_int *)out)[0]++; // make cache dirty
6458
6459   for (i = 0; i < ARRAY_SIZE(ret); i++) {
6460     out = ndrc->translation_cache;
6461     beginning = start_block();
6462     emit_movimm(DRC_TEST_VAL + i, 0); // test
6463     emit_ret();
6464     literal_pool(0);
6465     end_block(beginning);
6466     testfunc = beginning;
6467     ret[i] = testfunc();
6468   }
6469
6470   if (ret[0] == DRC_TEST_VAL && ret[1] == DRC_TEST_VAL + 1)
6471     SysPrintf("test passed.\n");
6472   else
6473     SysPrintf("test failed, will likely crash soon (r=%08x %08x)\n", ret[0], ret[1]);
6474   out = ndrc->translation_cache;
6475 }
6476
6477 // clear the state completely, instead of just marking
6478 // things invalid like invalidate_all_pages() does
6479 void new_dynarec_clear_full()
6480 {
6481   int n;
6482   out = ndrc->translation_cache;
6483   memset(invalid_code,1,sizeof(invalid_code));
6484   memset(hash_table,0xff,sizeof(hash_table));
6485   memset(mini_ht,-1,sizeof(mini_ht));
6486   memset(restore_candidate,0,sizeof(restore_candidate));
6487   memset(shadow,0,sizeof(shadow));
6488   copy=shadow;
6489   expirep=16384; // Expiry pointer, +2 blocks
6490   pending_exception=0;
6491   literalcount=0;
6492   stop_after_jal=0;
6493   inv_code_start=inv_code_end=~0;
6494   // TLB
6495   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6496   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6497   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6498 }
6499
// One-time initialization of the recompiler: obtain an executable
// translation cache (platform-dependent), reset all state, and run the
// self-test.  Must be called before any code is translated.
void new_dynarec_init()
{
  SysPrintf("Init new dynarec\n");

#ifdef BASE_ADDR_DYNAMIC
  #ifdef VITA
  // Vita: executable memory must come from a kernel-managed VM block.
  // NOTE(review): failures are only logged, not fatal -- ndrc may be
  // left unset here; confirm callers tolerate that.
  sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
  if (sceBlock < 0)
    SysPrintf("sceKernelAllocMemBlockForVM failed\n");
  int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&ndrc);
  if (ret < 0)
    SysPrintf("sceKernelGetMemBlockBase failed\n");
  #else
  // Generic path: mmap an RWX region, preferably just past the image end
  // so that generated code stays within short-branch range of the stubs.
  uintptr_t desired_addr = 0;
  #ifdef __ELF__
  extern char _end;
  desired_addr = ((uintptr_t)&_end + 0xffffff) & ~0xffffffl;
  #endif
  ndrc = mmap((void *)desired_addr, sizeof(*ndrc),
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (ndrc == MAP_FAILED) {
    SysPrintf("mmap() failed: %s\n", strerror(errno));
    abort();
  }
  #endif
#else
  #ifndef NO_WRITE_EXEC
  // not all systems allow execute in data segment by default
  if (mprotect(ndrc, sizeof(ndrc->translation_cache) + sizeof(ndrc->tramp.ops),
               PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    SysPrintf("mprotect() failed: %s\n", strerror(errno));
  #endif
#endif
  out = ndrc->translation_cache;
  cycle_multiplier=200;
  new_dynarec_clear_full();
#ifdef HOST_IMM8
  // Copy this into local area so we don't have to put it in every literal pool
  invc_ptr=invalid_code;
#endif
  arch_init();
  new_dynarec_test();
#ifndef RAM_FIXED
  ram_offset=(uintptr_t)rdram-0x80000000;
#endif
  if (ram_offset!=0)
    SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
}
6549
6550 void new_dynarec_cleanup()
6551 {
6552   int n;
6553 #ifdef BASE_ADDR_DYNAMIC
6554   #ifdef VITA
6555   sceKernelFreeMemBlock(sceBlock);
6556   sceBlock = -1;
6557   #else
6558   if (munmap(ndrc, sizeof(*ndrc)) < 0)
6559     SysPrintf("munmap() failed\n");
6560   #endif
6561 #endif
6562   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6563   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6564   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6565   #ifdef ROM_COPY
6566   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
6567   #endif
6568 }
6569
6570 static u_int *get_source_start(u_int addr, u_int *limit)
6571 {
6572   if (addr < 0x00200000 ||
6573     (0xa0000000 <= addr && addr < 0xa0200000)) {
6574     // used for BIOS calls mostly?
6575     *limit = (addr&0xa0000000)|0x00200000;
6576     return (u_int *)(rdram + (addr&0x1fffff));
6577   }
6578   else if (!Config.HLE && (
6579     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
6580     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
6581     // BIOS
6582     *limit = (addr & 0xfff00000) | 0x80000;
6583     return (u_int *)((u_char *)psxR + (addr&0x7ffff));
6584   }
6585   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
6586     *limit = (addr & 0x80600000) + 0x00200000;
6587     return (u_int *)(rdram + (addr&0x1fffff));
6588   }
6589   return NULL;
6590 }
6591
6592 static u_int scan_for_ret(u_int addr)
6593 {
6594   u_int limit = 0;
6595   u_int *mem;
6596
6597   mem = get_source_start(addr, &limit);
6598   if (mem == NULL)
6599     return addr;
6600
6601   if (limit > addr + 0x1000)
6602     limit = addr + 0x1000;
6603   for (; addr < limit; addr += 4, mem++) {
6604     if (*mem == 0x03e00008) // jr $ra
6605       return addr + 8;
6606   }
6607   return addr;
6608 }
6609
struct savestate_block {
  uint32_t addr;      // block start virtual address
  uint32_t regflags;  // bitmask of GPRs speculated to hold 0x1f80xxxx values
};

/* qsort comparator ordering savestate blocks by ascending address.
 * Note: the previous "p1->addr - p2->addr" subtraction is wrong for
 * uint32_t operands differing by 2^31 or more (e.g. BIOS 0xbfc00000 vs
 * a low RAM address): the unsigned difference converted to int flips
 * sign. Compare explicitly instead. */
static int addr_cmp(const void *p1_, const void *p2_)
{
  const struct savestate_block *p1 = p1_, *p2 = p2_;
  return (p1->addr > p2->addr) - (p1->addr < p2->addr);
}
6620
6621 int new_dynarec_save_blocks(void *save, int size)
6622 {
6623   struct savestate_block *blocks = save;
6624   int maxcount = size / sizeof(blocks[0]);
6625   struct savestate_block tmp_blocks[1024];
6626   struct ll_entry *head;
6627   int p, s, d, o, bcnt;
6628   u_int addr;
6629
6630   o = 0;
6631   for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
6632     bcnt = 0;
6633     for (head = jump_in[p]; head != NULL; head = head->next) {
6634       tmp_blocks[bcnt].addr = head->vaddr;
6635       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
6636       bcnt++;
6637     }
6638     if (bcnt < 1)
6639       continue;
6640     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
6641
6642     addr = tmp_blocks[0].addr;
6643     for (s = d = 0; s < bcnt; s++) {
6644       if (tmp_blocks[s].addr < addr)
6645         continue;
6646       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
6647         tmp_blocks[d++] = tmp_blocks[s];
6648       addr = scan_for_ret(tmp_blocks[s].addr);
6649     }
6650
6651     if (o + d > maxcount)
6652       d = maxcount - o;
6653     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
6654     o += d;
6655   }
6656
6657   return o * sizeof(blocks[0]);
6658 }
6659
6660 void new_dynarec_load_blocks(const void *save, int size)
6661 {
6662   const struct savestate_block *blocks = save;
6663   int count = size / sizeof(blocks[0]);
6664   u_int regs_save[32];
6665   uint32_t f;
6666   int i, b;
6667
6668   get_addr(psxRegs.pc);
6669
6670   // change GPRs for speculation to at least partially work..
6671   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
6672   for (i = 1; i < 32; i++)
6673     psxRegs.GPR.r[i] = 0x80000000;
6674
6675   for (b = 0; b < count; b++) {
6676     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6677       if (f & 1)
6678         psxRegs.GPR.r[i] = 0x1f800000;
6679     }
6680
6681     get_addr(blocks[b].addr);
6682
6683     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6684       if (f & 1)
6685         psxRegs.GPR.r[i] = 0x80000000;
6686     }
6687   }
6688
6689   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
6690 }
6691
6692 int new_recompile_block(u_int addr)
6693 {
6694   u_int pagelimit = 0;
6695   u_int state_rflags = 0;
6696   int i;
6697
6698   assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out);
6699   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
6700   //if(debug)
6701   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
6702
6703   // this is just for speculation
6704   for (i = 1; i < 32; i++) {
6705     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
6706       state_rflags |= 1 << i;
6707   }
6708
6709   start = (u_int)addr&~3;
6710   //assert(((u_int)addr&1)==0); // start-in-delay-slot flag
6711   new_dynarec_did_compile=1;
6712   if (Config.HLE && start == 0x80001000) // hlecall
6713   {
6714     // XXX: is this enough? Maybe check hleSoftCall?
6715     void *beginning=start_block();
6716     u_int page=get_page(start);
6717
6718     invalid_code[start>>12]=0;
6719     emit_movimm(start,0);
6720     emit_writeword(0,&pcaddr);
6721     emit_far_jump(new_dyna_leave);
6722     literal_pool(0);
6723     end_block(beginning);
6724     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
6725     return 0;
6726   }
6727
6728   source = get_source_start(start, &pagelimit);
6729   if (source == NULL) {
6730     SysPrintf("Compile at bogus memory address: %08x\n", addr);
6731     abort();
6732   }
6733
6734   /* Pass 1: disassemble */
6735   /* Pass 2: register dependencies, branch targets */
6736   /* Pass 3: register allocation */
6737   /* Pass 4: branch dependencies */
6738   /* Pass 5: pre-alloc */
6739   /* Pass 6: optimize clean/dirty state */
6740   /* Pass 7: flag 32-bit registers */
6741   /* Pass 8: assembly */
6742   /* Pass 9: linker */
6743   /* Pass 10: garbage collection / free memory */
6744
6745   int j;
6746   int done=0;
6747   unsigned int type,op,op2;
6748
6749   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
6750
6751   /* Pass 1 disassembly */
6752
6753   for(i=0;!done;i++) {
6754     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
6755     minimum_free_regs[i]=0;
6756     opcode[i]=op=source[i]>>26;
6757     switch(op)
6758     {
6759       case 0x00: strcpy(insn[i],"special"); type=NI;
6760         op2=source[i]&0x3f;
6761         switch(op2)
6762         {
6763           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
6764           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
6765           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
6766           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
6767           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
6768           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
6769           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
6770           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
6771           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
6772           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
6773           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
6774           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
6775           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
6776           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
6777           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
6778           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
6779           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
6780           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
6781           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
6782           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
6783           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
6784           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
6785           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
6786           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
6787           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
6788           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
6789           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
6790           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
6791           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
6792           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
6793           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
6794           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
6795           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
6796           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
6797           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
6798 #if 0
6799           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
6800           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
6801           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
6802           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
6803           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
6804           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
6805           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
6806           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
6807           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
6808           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
6809           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
6810           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
6811           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
6812           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
6813           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
6814           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
6815           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
6816 #endif
6817         }
6818         break;
6819       case 0x01: strcpy(insn[i],"regimm"); type=NI;
6820         op2=(source[i]>>16)&0x1f;
6821         switch(op2)
6822         {
6823           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
6824           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
6825           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
6826           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
6827           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
6828           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
6829           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
6830           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
6831           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
6832           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
6833           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
6834           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
6835           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
6836           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
6837         }
6838         break;
6839       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
6840       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
6841       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
6842       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
6843       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
6844       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
6845       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
6846       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
6847       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
6848       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
6849       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
6850       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
6851       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
6852       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
6853       case 0x10: strcpy(insn[i],"cop0"); type=NI;
6854         op2=(source[i]>>21)&0x1f;
6855         switch(op2)
6856         {
6857           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
6858           case 0x02: strcpy(insn[i],"CFC0"); type=COP0; break;
6859           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
6860           case 0x06: strcpy(insn[i],"CTC0"); type=COP0; break;
6861           case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
6862         }
6863         break;
6864       case 0x11: strcpy(insn[i],"cop1"); type=COP1;
6865         op2=(source[i]>>21)&0x1f;
6866         break;
6867 #if 0
6868       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
6869       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
6870       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
6871       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
6872       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
6873       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
6874       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
6875       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
6876 #endif
6877       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
6878       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
6879       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
6880       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
6881       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
6882       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
6883       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
6884 #if 0
6885       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
6886 #endif
6887       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
6888       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
6889       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
6890       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
6891 #if 0
6892       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
6893       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
6894 #endif
6895       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
6896       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
6897       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
6898       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
6899 #if 0
6900       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
6901       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
6902       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
6903 #endif
6904       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
6905       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
6906 #if 0
6907       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
6908       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
6909       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
6910 #endif
6911       case 0x12: strcpy(insn[i],"COP2"); type=NI;
6912         op2=(source[i]>>21)&0x1f;
6913         //if (op2 & 0x10)
6914         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
6915           if (gte_handlers[source[i]&0x3f]!=NULL) {
6916             if (gte_regnames[source[i]&0x3f]!=NULL)
6917               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
6918             else
6919               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
6920             type=C2OP;
6921           }
6922         }
6923         else switch(op2)
6924         {
6925           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
6926           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
6927           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
6928           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
6929         }
6930         break;
6931       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
6932       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
6933       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
6934       default: strcpy(insn[i],"???"); type=NI;
6935         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
6936         break;
6937     }
6938     itype[i]=type;
6939     opcode2[i]=op2;
6940     /* Get registers/immediates */
6941     lt1[i]=0;
6942     dep1[i]=0;
6943     dep2[i]=0;
6944     gte_rs[i]=gte_rt[i]=0;
6945     switch(type) {
6946       case LOAD:
6947         rs1[i]=(source[i]>>21)&0x1f;
6948         rs2[i]=0;
6949         rt1[i]=(source[i]>>16)&0x1f;
6950         rt2[i]=0;
6951         imm[i]=(short)source[i];
6952         break;
6953       case STORE:
6954       case STORELR:
6955         rs1[i]=(source[i]>>21)&0x1f;
6956         rs2[i]=(source[i]>>16)&0x1f;
6957         rt1[i]=0;
6958         rt2[i]=0;
6959         imm[i]=(short)source[i];
6960         break;
6961       case LOADLR:
6962         // LWL/LWR only load part of the register,
6963         // therefore the target register must be treated as a source too
6964         rs1[i]=(source[i]>>21)&0x1f;
6965         rs2[i]=(source[i]>>16)&0x1f;
6966         rt1[i]=(source[i]>>16)&0x1f;
6967         rt2[i]=0;
6968         imm[i]=(short)source[i];
6969         if(op==0x26) dep1[i]=rt1[i]; // LWR
6970         break;
6971       case IMM16:
6972         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
6973         else rs1[i]=(source[i]>>21)&0x1f;
6974         rs2[i]=0;
6975         rt1[i]=(source[i]>>16)&0x1f;
6976         rt2[i]=0;
6977         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
6978           imm[i]=(unsigned short)source[i];
6979         }else{
6980           imm[i]=(short)source[i];
6981         }
6982         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
6983         break;
6984       case UJUMP:
6985         rs1[i]=0;
6986         rs2[i]=0;
6987         rt1[i]=0;
6988         rt2[i]=0;
6989         // The JAL instruction writes to r31.
6990         if (op&1) {
6991           rt1[i]=31;
6992         }
6993         rs2[i]=CCREG;
6994         break;
6995       case RJUMP:
6996         rs1[i]=(source[i]>>21)&0x1f;
6997         rs2[i]=0;
6998         rt1[i]=0;
6999         rt2[i]=0;
7000         // The JALR instruction writes to rd.
7001         if (op2&1) {
7002           rt1[i]=(source[i]>>11)&0x1f;
7003         }
7004         rs2[i]=CCREG;
7005         break;
7006       case CJUMP:
7007         rs1[i]=(source[i]>>21)&0x1f;
7008         rs2[i]=(source[i]>>16)&0x1f;
7009         rt1[i]=0;
7010         rt2[i]=0;
7011         if(op&2) { // BGTZ/BLEZ
7012           rs2[i]=0;
7013         }
7014         likely[i]=op>>4;
7015         break;
7016       case SJUMP:
7017         rs1[i]=(source[i]>>21)&0x1f;
7018         rs2[i]=CCREG;
7019         rt1[i]=0;
7020         rt2[i]=0;
7021         if(op2&0x10) { // BxxAL
7022           rt1[i]=31;
7023           // NOTE: If the branch is not taken, r31 is still overwritten
7024         }
7025         likely[i]=(op2&2)>>1;
7026         break;
7027       case ALU:
7028         rs1[i]=(source[i]>>21)&0x1f; // source
7029         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7030         rt1[i]=(source[i]>>11)&0x1f; // destination
7031         rt2[i]=0;
7032         if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7033           dep1[i]=rs1[i];dep2[i]=rs2[i];
7034         }
7035         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7036           dep1[i]=rs1[i];dep2[i]=rs2[i];
7037         }
7038         break;
7039       case MULTDIV:
7040         rs1[i]=(source[i]>>21)&0x1f; // source
7041         rs2[i]=(source[i]>>16)&0x1f; // divisor
7042         rt1[i]=HIREG;
7043         rt2[i]=LOREG;
7044         break;
7045       case MOV:
7046         rs1[i]=0;
7047         rs2[i]=0;
7048         rt1[i]=0;
7049         rt2[i]=0;
7050         if(op2==0x10) rs1[i]=HIREG; // MFHI
7051         if(op2==0x11) rt1[i]=HIREG; // MTHI
7052         if(op2==0x12) rs1[i]=LOREG; // MFLO
7053         if(op2==0x13) rt1[i]=LOREG; // MTLO
7054         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7055         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7056         dep1[i]=rs1[i];
7057         break;
7058       case SHIFT:
7059         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7060         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7061         rt1[i]=(source[i]>>11)&0x1f; // destination
7062         rt2[i]=0;
7063         break;
7064       case SHIFTIMM:
7065         rs1[i]=(source[i]>>16)&0x1f;
7066         rs2[i]=0;
7067         rt1[i]=(source[i]>>11)&0x1f;
7068         rt2[i]=0;
7069         imm[i]=(source[i]>>6)&0x1f;
7070         // DSxx32 instructions
7071         if(op2>=0x3c) imm[i]|=0x20;
7072         break;
7073       case COP0:
7074         rs1[i]=0;
7075         rs2[i]=0;
7076         rt1[i]=0;
7077         rt2[i]=0;
7078         if(op2==0||op2==2) rt1[i]=(source[i]>>16)&0x1F; // MFC0/CFC0
7079         if(op2==4||op2==6) rs1[i]=(source[i]>>16)&0x1F; // MTC0/CTC0
7080         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7081         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7082         break;
7083       case COP1:
7084         rs1[i]=0;
7085         rs2[i]=0;
7086         rt1[i]=0;
7087         rt2[i]=0;
7088         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7089         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7090         rs2[i]=CSREG;
7091         break;
7092       case COP2:
7093         rs1[i]=0;
7094         rs2[i]=0;
7095         rt1[i]=0;
7096         rt2[i]=0;
7097         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7098         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7099         rs2[i]=CSREG;
7100         int gr=(source[i]>>11)&0x1F;
7101         switch(op2)
7102         {
7103           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7104           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7105           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7106           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7107         }
7108         break;
7109       case C1LS:
7110         rs1[i]=(source[i]>>21)&0x1F;
7111         rs2[i]=CSREG;
7112         rt1[i]=0;
7113         rt2[i]=0;
7114         imm[i]=(short)source[i];
7115         break;
7116       case C2LS:
7117         rs1[i]=(source[i]>>21)&0x1F;
7118         rs2[i]=0;
7119         rt1[i]=0;
7120         rt2[i]=0;
7121         imm[i]=(short)source[i];
7122         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7123         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7124         break;
7125       case C2OP:
7126         rs1[i]=0;
7127         rs2[i]=0;
7128         rt1[i]=0;
7129         rt2[i]=0;
7130         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7131         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7132         gte_rt[i]|=1ll<<63; // every op changes flags
7133         if((source[i]&0x3f)==GTE_MVMVA) {
7134           int v = (source[i] >> 15) & 3;
7135           gte_rs[i]&=~0xe3fll;
7136           if(v==3) gte_rs[i]|=0xe00ll;
7137           else gte_rs[i]|=3ll<<(v*2);
7138         }
7139         break;
7140       case SYSCALL:
7141       case HLECALL:
7142       case INTCALL:
7143         rs1[i]=CCREG;
7144         rs2[i]=0;
7145         rt1[i]=0;
7146         rt2[i]=0;
7147         break;
7148       default:
7149         rs1[i]=0;
7150         rs2[i]=0;
7151         rt1[i]=0;
7152         rt2[i]=0;
7153     }
7154     /* Calculate branch target addresses */
7155     if(type==UJUMP)
7156       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7157     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7158       ba[i]=start+i*4+8; // Ignore never taken branch
7159     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7160       ba[i]=start+i*4+8; // Ignore never taken branch
7161     else if(type==CJUMP||type==SJUMP)
7162       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7163     else ba[i]=-1;
7164     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)) {
7165       int do_in_intrp=0;
7166       // branch in delay slot?
7167       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP) {
7168         // don't handle first branch and call interpreter if it's hit
7169         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7170         do_in_intrp=1;
7171       }
7172       // basic load delay detection
7173       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7174         int t=(ba[i-1]-start)/4;
7175         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7176           // jump target wants DS result - potential load delay effect
7177           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7178           do_in_intrp=1;
7179           bt[t+1]=1; // expected return from interpreter
7180         }
7181         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7182               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7183           // v0 overwrite like this is a sign of trouble, bail out
7184           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7185           do_in_intrp=1;
7186         }
7187       }
7188       if(do_in_intrp) {
7189         rs1[i-1]=CCREG;
7190         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7191         ba[i-1]=-1;
7192         itype[i-1]=INTCALL;
7193         done=2;
7194         i--; // don't compile the DS
7195       }
7196     }
7197     /* Is this the end of the block? */
7198     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7199       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7200         done=2;
7201       }
7202       else {
7203         if(stop_after_jal) done=1;
7204         // Stop on BREAK
7205         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7206       }
7207       // Don't recompile stuff that's already compiled
7208       if(check_addr(start+i*4+4)) done=1;
7209       // Don't get too close to the limit
7210       if(i>MAXBLOCK/2) done=1;
7211     }
7212     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7213     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7214     if(done==2) {
7215       // Does the block continue due to a branch?
7216       for(j=i-1;j>=0;j--)
7217       {
7218         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7219         if(ba[j]==start+i*4+4) done=j=0;
7220         if(ba[j]==start+i*4+8) done=j=0;
7221       }
7222     }
7223     //assert(i<MAXBLOCK-1);
7224     if(start+i*4==pagelimit-4) done=1;
7225     assert(start+i*4<pagelimit);
7226     if (i==MAXBLOCK-1) done=1;
7227     // Stop if we're compiling junk
7228     if(itype[i]==NI&&opcode[i]==0x11) {
7229       done=stop_after_jal=1;
7230       SysPrintf("Disabled speculative precompilation\n");
7231     }
7232   }
7233   slen=i;
7234   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP) {
7235     if(start+i*4==pagelimit) {
7236       itype[i-1]=SPAN;
7237     }
7238   }
7239   assert(slen>0);
7240
7241   /* Pass 2 - Register dependencies and branch targets */
7242
7243   unneeded_registers(0,slen-1,0);
7244
7245   /* Pass 3 - Register allocation */
7246
7247   struct regstat current; // Current register allocations/status
7248   current.dirty=0;
7249   current.u=unneeded_reg[0];
7250   clear_all_regs(current.regmap);
7251   alloc_reg(&current,0,CCREG);
7252   dirty_reg(&current,CCREG);
7253   current.isconst=0;
7254   current.wasconst=0;
7255   current.waswritten=0;
7256   int ds=0;
7257   int cc=0;
7258   int hr=-1;
7259
7260   if((u_int)addr&1) {
7261     // First instruction is delay slot
7262     cc=-1;
7263     bt[1]=1;
7264     ds=1;
7265     unneeded_reg[0]=1;
7266     current.regmap[HOST_BTREG]=BTREG;
7267   }
7268
7269   for(i=0;i<slen;i++)
7270   {
7271     if(bt[i])
7272     {
7273       int hr;
7274       for(hr=0;hr<HOST_REGS;hr++)
7275       {
7276         // Is this really necessary?
7277         if(current.regmap[hr]==0) current.regmap[hr]=-1;
7278       }
7279       current.isconst=0;
7280       current.waswritten=0;
7281     }
7282
7283     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
7284     regs[i].wasconst=current.isconst;
7285     regs[i].wasdirty=current.dirty;
7286     regs[i].loadedconst=0;
7287     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP) {
7288       if(i+1<slen) {
7289         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7290         current.u|=1;
7291       } else {
7292         current.u=1;
7293       }
7294     } else {
7295       if(i+1<slen) {
7296         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
7297         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7298         current.u|=1;
7299       } else { SysPrintf("oops, branch at end of block with no delay slot\n");abort(); }
7300     }
7301     is_ds[i]=ds;
7302     if(ds) {
7303       ds=0; // Skip delay slot, already allocated as part of branch
7304       // ...but we need to alloc it in case something jumps here
7305       if(i+1<slen) {
7306         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
7307       }else{
7308         current.u=branch_unneeded_reg[i-1];
7309       }
7310       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7311       current.u|=1;
7312       struct regstat temp;
7313       memcpy(&temp,&current,sizeof(current));
7314       temp.wasdirty=temp.dirty;
7315       // TODO: Take into account unconditional branches, as below
7316       delayslot_alloc(&temp,i);
7317       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
7318       regs[i].wasdirty=temp.wasdirty;
7319       regs[i].dirty=temp.dirty;
7320       regs[i].isconst=0;
7321       regs[i].wasconst=0;
7322       current.isconst=0;
7323       // Create entry (branch target) regmap
7324       for(hr=0;hr<HOST_REGS;hr++)
7325       {
7326         int r=temp.regmap[hr];
7327         if(r>=0) {
7328           if(r!=regmap_pre[i][hr]) {
7329             regs[i].regmap_entry[hr]=-1;
7330           }
7331           else
7332           {
7333               assert(r < 64);
7334               if((current.u>>r)&1) {
7335                 regs[i].regmap_entry[hr]=-1;
7336                 regs[i].regmap[hr]=-1;
7337                 //Don't clear regs in the delay slot as the branch might need them
7338                 //current.regmap[hr]=-1;
7339               }else
7340                 regs[i].regmap_entry[hr]=r;
7341           }
7342         } else {
7343           // First instruction expects CCREG to be allocated
7344           if(i==0&&hr==HOST_CCREG)
7345             regs[i].regmap_entry[hr]=CCREG;
7346           else
7347             regs[i].regmap_entry[hr]=-1;
7348         }
7349       }
7350     }
    else { // Not delay slot
      // Pass 3 allocation for a normal (non-delay-slot) instruction.
      // Branch/jump types also eagerly allocate their delay slot when safe
      // (ooo[i]=1 marks the delay slot as moved "out of order", i.e. emitted
      // before the branch test); non-branch types delegate to per-itype
      // allocator helpers.
      switch(itype[i]) {
        case UJUMP:
          // Unconditional jump (J/JAL)
          //current.isconst=0; // DEBUG
          //current.wasconst=0; // DEBUG
          //regs[i].wasconst=0; // DEBUG
          clear_const(&current,rt1[i]);
          alloc_cc(&current,i);
          dirty_reg(&current,CCREG);
          if (rt1[i]==31) {
            // JAL: allocate and dirty the link register ($31/$ra)
            alloc_reg(&current,i,31);
            dirty_reg(&current,31);
            //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
            //assert(rt1[i+1]!=rt1[i]);
            #ifdef REG_PREFETCH
            alloc_reg(&current,i,PTEMP);
            #endif
          }
          ooo[i]=1;
          delayslot_alloc(&current,i+1);
          //current.isconst=0; // DEBUG
          ds=1;
          //printf("i=%d, isconst=%x\n",i,current.isconst);
          break;
        case RJUMP:
          // Register-indirect jump (JR/JALR)
          //current.isconst=0;
          //current.wasconst=0;
          //regs[i].wasconst=0;
          clear_const(&current,rs1[i]);
          clear_const(&current,rt1[i]);
          alloc_cc(&current,i);
          dirty_reg(&current,CCREG);
          if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
            // Delay slot does not clobber the jump target register
            alloc_reg(&current,i,rs1[i]);
            if (rt1[i]!=0) {
              // JALR: allocate and dirty the link register
              alloc_reg(&current,i,rt1[i]);
              dirty_reg(&current,rt1[i]);
              assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
              assert(rt1[i+1]!=rt1[i]);
              #ifdef REG_PREFETCH
              alloc_reg(&current,i,PTEMP);
              #endif
            }
            #ifdef USE_MINI_HT
            if(rs1[i]==31) { // JALR
              alloc_reg(&current,i,RHASH);
              alloc_reg(&current,i,RHTBL);
            }
            #endif
            delayslot_alloc(&current,i+1);
          } else {
            // The delay slot overwrites our source register,
            // allocate a temporary register to hold the old value.
            current.isconst=0;
            current.wasconst=0;
            regs[i].wasconst=0;
            delayslot_alloc(&current,i+1);
            current.isconst=0;
            alloc_reg(&current,i,RTEMP);
          }
          //current.isconst=0; // DEBUG
          ooo[i]=1;
          ds=1;
          break;
        case CJUMP:
          // Conditional branch comparing registers (BEQ/BNE/BLEZ/BGTZ
          // and their "likely" variants)
          //current.isconst=0;
          //current.wasconst=0;
          //regs[i].wasconst=0;
          clear_const(&current,rs1[i]);
          clear_const(&current,rs2[i]);
          if((opcode[i]&0x3E)==4) // BEQ/BNE
          {
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            if(rs1[i]) alloc_reg(&current,i,rs1[i]);
            if(rs2[i]) alloc_reg(&current,i,rs2[i]);
            if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
               (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
              // The delay slot overwrites one of our conditions.
              // Allocate the branch condition registers instead.
              current.isconst=0;
              current.wasconst=0;
              regs[i].wasconst=0;
              if(rs1[i]) alloc_reg(&current,i,rs1[i]);
              if(rs2[i]) alloc_reg(&current,i,rs2[i]);
            }
            else
            {
              ooo[i]=1;
              delayslot_alloc(&current,i+1);
            }
          }
          else
          if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
          {
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
            if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
              // The delay slot overwrites one of our conditions.
              // Allocate the branch condition registers instead.
              current.isconst=0;
              current.wasconst=0;
              regs[i].wasconst=0;
              if(rs1[i]) alloc_reg(&current,i,rs1[i]);
            }
            else
            {
              ooo[i]=1;
              delayslot_alloc(&current,i+1);
            }
          }
          else
          // Don't alloc the delay slot yet because we might not execute it
          if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
          {
            current.isconst=0;
            current.wasconst=0;
            regs[i].wasconst=0;
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
            alloc_reg(&current,i,rs2[i]);
          }
          else
          if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
          {
            current.isconst=0;
            current.wasconst=0;
            regs[i].wasconst=0;
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
          }
          ds=1;
          //current.isconst=0;
          break;
        case SJUMP:
          // REGIMM conditional branch (BLTZ/BGEZ, BLTZAL/BGEZAL, likely forms)
          //current.isconst=0;
          //current.wasconst=0;
          //regs[i].wasconst=0;
          clear_const(&current,rs1[i]);
          clear_const(&current,rt1[i]);
          //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
          if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
          {
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
            if (rt1[i]==31) { // BLTZAL/BGEZAL
              alloc_reg(&current,i,31);
              dirty_reg(&current,31);
              //#ifdef REG_PREFETCH
              //alloc_reg(&current,i,PTEMP);
              //#endif
            }
            if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
               ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
              // Allocate the branch condition registers instead.
              current.isconst=0;
              current.wasconst=0;
              regs[i].wasconst=0;
              if(rs1[i]) alloc_reg(&current,i,rs1[i]);
            }
            else
            {
              ooo[i]=1;
              delayslot_alloc(&current,i+1);
            }
          }
          else
          // Don't alloc the delay slot yet because we might not execute it
          if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
          {
            current.isconst=0;
            current.wasconst=0;
            regs[i].wasconst=0;
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
          }
          ds=1;
          //current.isconst=0;
          break;
        // Non-branch instruction types: delegate to the per-type allocators.
        case IMM16:
          imm16_alloc(&current,i);
          break;
        case LOAD:
        case LOADLR:
          load_alloc(&current,i);
          break;
        case STORE:
        case STORELR:
          store_alloc(&current,i);
          break;
        case ALU:
          alu_alloc(&current,i);
          break;
        case SHIFT:
          shift_alloc(&current,i);
          break;
        case MULTDIV:
          multdiv_alloc(&current,i);
          break;
        case SHIFTIMM:
          shiftimm_alloc(&current,i);
          break;
        case MOV:
          mov_alloc(&current,i);
          break;
        case COP0:
          cop0_alloc(&current,i);
          break;
        case COP1:
        case COP2:
          cop12_alloc(&current,i);
          break;
        case C1LS:
          c1ls_alloc(&current,i);
          break;
        case C2LS:
          c2ls_alloc(&current,i);
          break;
        case C2OP:
          c2op_alloc(&current,i);
          break;
        case SYSCALL:
        case HLECALL:
        case INTCALL:
          syscall_alloc(&current,i);
          break;
        case SPAN:
          pagespan_alloc(&current,i);
          break;
      }

      // Create entry (branch target) regmap
      // regs[i].regmap_entry describes the guest->host register mapping that
      // must hold when control *enters* this instruction (e.g. as a branch
      // target), derived from the post-allocation map in current.regmap.
      for(hr=0;hr<HOST_REGS;hr++)
      {
        int r,or;
        r=current.regmap[hr];
        if(r>=0) {
          if(r!=regmap_pre[i][hr]) {
            // This host reg holds a different guest reg than before this
            // instruction, so the value must be (re)loaded here.
            // TODO: delay slot (?)
            or=get_reg(regmap_pre[i],r); // Get old mapping for this register
            if(or<0||(r&63)>=TEMPREG){
              regs[i].regmap_entry[hr]=-1;
            }
            else
            {
              // Just move it to a different register
              regs[i].regmap_entry[hr]=r;
              // If it was dirty before, it's still dirty
              if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
            }
          }
          else
          {
            // Unneeded
            if(r==0){
              // Guest register 0 ($zero) is always trivially available
              regs[i].regmap_entry[hr]=0;
            }
            else
            {
              assert(r<64);
              if((current.u>>r)&1) {
                // Guest reg is in the "unneeded" set: drop the mapping
                regs[i].regmap_entry[hr]=-1;
                //regs[i].regmap[hr]=-1;
                current.regmap[hr]=-1;
              }else
                regs[i].regmap_entry[hr]=r;
            }
          }
        } else {
          // Branches expect CCREG to be allocated at the target
          if(regmap_pre[i][hr]==CCREG)
            regs[i].regmap_entry[hr]=CCREG;
          else
            regs[i].regmap_entry[hr]=-1;
        }
      }
      memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
    }
7634
    // Track guest registers recently used as the base of a store with a
    // small positive immediate offset (<0x800).  The bit is cleared when the
    // register is overwritten, or when the same base is used with a large
    // offset.  NOTE(review): this bitmask appears to feed the store-to-code
    // (self-modifying code) detection elsewhere in this file — confirm
    // against the consumers of regs[].waswritten.
    if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
      current.waswritten|=1<<rs1[i-1];
    current.waswritten&=~(1<<rt1[i]);
    current.waswritten&=~(1<<rt2[i]);
    if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
      current.waswritten&=~(1<<rs1[i]);
7641
    /* Branch post-alloc */
    // If the previous instruction (i-1) was a branch/jump, compute
    // branch_regs[i-1]: the register allocation state in effect when the
    // branch is taken (after its delay slot, instruction i, has executed).
    if(i>0)
    {
      current.wasdirty=current.dirty;
      switch(itype[i-1]) {
        case UJUMP:
          memcpy(&branch_regs[i-1],&current,sizeof(current));
          branch_regs[i-1].isconst=0;
          branch_regs[i-1].wasconst=0;
          // Branch sources remain needed across the branch
          branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
          alloc_cc(&branch_regs[i-1],i-1);
          dirty_reg(&branch_regs[i-1],CCREG);
          if(rt1[i-1]==31) { // JAL
            alloc_reg(&branch_regs[i-1],i-1,31);
            dirty_reg(&branch_regs[i-1],31);
          }
          memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
          break;
        case RJUMP:
          memcpy(&branch_regs[i-1],&current,sizeof(current));
          branch_regs[i-1].isconst=0;
          branch_regs[i-1].wasconst=0;
          branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
          alloc_cc(&branch_regs[i-1],i-1);
          dirty_reg(&branch_regs[i-1],CCREG);
          alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
          if(rt1[i-1]!=0) { // JALR
            alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
            dirty_reg(&branch_regs[i-1],rt1[i-1]);
          }
          #ifdef USE_MINI_HT
          if(rs1[i-1]==31) { // JALR
            alloc_reg(&branch_regs[i-1],i-1,RHASH);
            alloc_reg(&branch_regs[i-1],i-1,RHTBL);
          }
          #endif
          memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
          break;
        case CJUMP:
          if((opcode[i-1]&0x3E)==4) // BEQ/BNE
          {
            alloc_cc(&current,i-1);
            dirty_reg(&current,CCREG);
            if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
               (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
              // The delay slot overwrote one of our conditions
              // Delay slot goes after the test (in order)
              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
              current.u|=1;
              delayslot_alloc(&current,i);
              current.isconst=0;
            }
            else
            {
              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
              // Alloc the branch condition registers
              if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
              if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
            }
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].isconst=0;
            branch_regs[i-1].wasconst=0;
            memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
            memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
          }
          else
          if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
          {
            alloc_cc(&current,i-1);
            dirty_reg(&current,CCREG);
            if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
              // The delay slot overwrote the branch condition
              // Delay slot goes after the test (in order)
              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
              current.u|=1;
              delayslot_alloc(&current,i);
              current.isconst=0;
            }
            else
            {
              current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
              // Alloc the branch condition register
              alloc_reg(&current,i-1,rs1[i-1]);
            }
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].isconst=0;
            branch_regs[i-1].wasconst=0;
            memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
            memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
          }
          else
          // Alloc the delay slot in case the branch is taken
          if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
          {
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
            alloc_cc(&branch_regs[i-1],i);
            dirty_reg(&branch_regs[i-1],CCREG);
            delayslot_alloc(&branch_regs[i-1],i);
            branch_regs[i-1].isconst=0;
            alloc_reg(&current,i,CCREG); // Not taken path
            dirty_reg(&current,CCREG);
            memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          }
          else
          if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
          {
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
            alloc_cc(&branch_regs[i-1],i);
            dirty_reg(&branch_regs[i-1],CCREG);
            delayslot_alloc(&branch_regs[i-1],i);
            branch_regs[i-1].isconst=0;
            alloc_reg(&current,i,CCREG); // Not taken path
            dirty_reg(&current,CCREG);
            memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          }
          break;
        case SJUMP:
          //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
          if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
          {
            alloc_cc(&current,i-1);
            dirty_reg(&current,CCREG);
            if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
              // The delay slot overwrote the branch condition
              // Delay slot goes after the test (in order)
              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
              current.u|=1;
              delayslot_alloc(&current,i);
              current.isconst=0;
            }
            else
            {
              current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
              // Alloc the branch condition register
              alloc_reg(&current,i-1,rs1[i-1]);
            }
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].isconst=0;
            branch_regs[i-1].wasconst=0;
            memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
            memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
          }
          else
          // Alloc the delay slot in case the branch is taken
          if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
          {
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
            alloc_cc(&branch_regs[i-1],i);
            dirty_reg(&branch_regs[i-1],CCREG);
            delayslot_alloc(&branch_regs[i-1],i);
            branch_regs[i-1].isconst=0;
            alloc_reg(&current,i,CCREG); // Not taken path
            dirty_reg(&current,CCREG);
            memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          }
          // FIXME: BLTZAL/BGEZAL
          if(opcode2[i-1]&0x10) { // BxxZAL
            alloc_reg(&branch_regs[i-1],i-1,31);
            dirty_reg(&branch_regs[i-1],31);
          }
          break;
      }

      // After a call or an unconditional control transfer, decide what the
      // allocation state looks like at the fall-through address (this
      // instruction+1 may only be reached as a return point or branch target).
      if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
      {
        if(rt1[i-1]==31) // JAL/JALR
        {
          // Subroutine call will return here, don't alloc any registers
          current.dirty=0;
          clear_all_regs(current.regmap);
          alloc_reg(&current,i,CCREG);
          dirty_reg(&current,CCREG);
        }
        else if(i+1<slen)
        {
          // Internal branch will jump here, match registers to caller
          current.dirty=0;
          clear_all_regs(current.regmap);
          alloc_reg(&current,i,CCREG);
          dirty_reg(&current,CCREG);
          // Start from the most recent preceding branch that targets here...
          for(j=i-1;j>=0;j--)
          {
            if(ba[j]==start+i*4+4) {
              memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
              current.dirty=branch_regs[j].dirty;
              break;
            }
          }
          // ...then keep only the mappings every such branch agrees on
          while(j>=0) {
            if(ba[j]==start+i*4+4) {
              for(hr=0;hr<HOST_REGS;hr++) {
                if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
                  current.regmap[hr]=-1;
                }
                current.dirty&=branch_regs[j].dirty;
              }
            }
            j--;
          }
        }
      }
    }
7849
    // Count cycles in between branches
    // ccadj[i] records the cycle count accumulated since the last branch /
    // syscall; cc is reset after control-flow instructions and advanced by a
    // rough per-instruction cost estimate otherwise.
    ccadj[i]=cc;
    if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
    {
      cc=0;
    }
#if !defined(DRC_DBG)
    else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
    {
      // GTE runs in parallel until accessed, divide by 2 for a rough guess
      cc+=gte_cycletab[source[i]&0x3f]/2;
    }
    else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues
    {
      cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
    }
    else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
    {
      // Penalty for a run of three consecutive stores (write buffer stall)
      cc+=4;
    }
    else if(itype[i]==C2LS)
    {
      cc+=4;
    }
#endif
    else
    {
      cc++;
    }

    // Commit this instruction's final allocation state (delay slots were
    // already committed when their branch was processed).
    if(!is_ds[i]) {
      regs[i].dirty=current.dirty;
      regs[i].isconst=current.isconst;
      memcpy(constmap[i],current_constmap,sizeof(current_constmap));
    }
    // A host reg that changed its guest mapping no longer holds a known const
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
        if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
          regs[i].wasconst&=~(1<<hr);
        }
      }
    }
    // BTREG (branch target register) does not survive across instructions
    if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
    regs[i].waswritten=current.waswritten;
7894   }
7895
  /* Pass 4 - Cull unused host registers */

  // Backward pass over the block: nr is a bitmask over host registers of the
  // mappings still needed after instruction i.  Anything not needed is
  // deallocated from regs[i]/branch_regs[i] so later passes don't preserve it.
  uint64_t nr=0;

  for (i=slen-1;i>=0;i--)
  {
    int hr;
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
    {
      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, don't need anything
        nr=0;
      }
      else
      {
        // Internal branch
        // Need whatever matches the target
        nr=0;
        int t=(ba[i]-start)>>2;
        for(hr=0;hr<HOST_REGS;hr++)
        {
          if(regs[i].regmap_entry[hr]>=0) {
            if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
          }
        }
      }
      // Conditional branch may need registers for following instructions
      if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
      {
        if(i<slen-2) {
          nr|=needed_reg[i+2];
          for(hr=0;hr<HOST_REGS;hr++)
          {
            if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
            //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
          }
        }
      }
      // Don't need stuff which is overwritten
      //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
      //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
      // Merge in delay slot
      // NOTE(review): indexing i+1 relies on every branch having its delay
      // slot inside the block (i<slen-1 for branch types) — this invariant is
      // established by earlier passes, confirm if modifying block splitting.
      for(hr=0;hr<HOST_REGS;hr++)
      {
        if(!likely[i]) {
          // These are overwritten unless the branch is "likely"
          // and the delay slot is nullified if not taken
          if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
          if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
        }
        if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
        if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
        if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
        if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
        if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
          // Delay-slot store needs the invalid code pointer (INVCP)
          if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
          if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      nr=0;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      nr=0;
    }
    else // Non-branch
    {
      if(i<slen-1) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
          if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
          if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
          if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
        }
      }
    }
    for(hr=0;hr<HOST_REGS;hr++)
    {
      // Overwritten registers are not needed
      if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
      if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
      if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
      // Source registers are needed
      if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
      if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
      if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
      if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
      if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
        if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
        if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
      }
      // Don't store a register immediately after writing it,
      // may prevent dual-issue.
      // But do so if this is a branch target, otherwise we
      // might have to load the register before the branch.
      if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
        if((regmap_pre[i][hr]>0&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1))) {
          if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
          if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
        }
        if((regs[i].regmap_entry[hr]>0&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) {
          if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
          if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
        }
      }
    }
    // Cycle count is needed at branches.  Assume it is needed at the target too.
    if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==SPAN) {
      if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
      if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
    }
    // Save it
    needed_reg[i]=nr;

    // Deallocate unneeded registers
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(!((nr>>hr)&1)) {
        if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
        if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
           (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
           (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
        {
          if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
          {
            if(likely[i]) {
              regs[i].regmap[hr]=-1;
              regs[i].isconst&=~(1<<hr);
              if(i<slen-2) {
                regmap_pre[i+2][hr]=-1;
                regs[i+2].wasconst&=~(1<<hr);
              }
            }
          }
        }
        if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
        {
          // Branch: must not free registers the branch itself or its delay
          // slot still references (sources, targets, temporaries, INVCP).
          int map=0,temp=0;
          if(itype[i+1]==STORE || itype[i+1]==STORELR ||
             (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
            map=INVCP;
          }
          if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
             itype[i+1]==C1LS || itype[i+1]==C2LS)
            temp=FTEMP;
          if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
             (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
             (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
             regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
             (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
             regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
             regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
             regs[i].regmap[hr]!=map )
          {
            regs[i].regmap[hr]=-1;
            regs[i].isconst&=~(1<<hr);
            if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
               (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
               (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
               branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
               (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
               branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
               branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
               branch_regs[i].regmap[hr]!=map)
            {
              branch_regs[i].regmap[hr]=-1;
              branch_regs[i].regmap_entry[hr]=-1;
              if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
              {
                if(!likely[i]&&i<slen-2) {
                  regmap_pre[i+2][hr]=-1;
                  regs[i+2].wasconst&=~(1<<hr);
                }
              }
            }
          }
        }
        else
        {
          // Non-branch
          if(i>0)
          {
            int map=-1,temp=-1;
            if(itype[i]==STORE || itype[i]==STORELR ||
                      (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
              map=INVCP;
            }
            if(itype[i]==LOADLR || itype[i]==STORELR ||
               itype[i]==C1LS || itype[i]==C2LS)
              temp=FTEMP;
            if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
               regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
               (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
               (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
            {
              if(i<slen-1&&!is_ds[i]) {
                assert(regs[i].regmap[hr]<64);
                // Sanity check: the next instruction's pre-map must agree
                // with what we are about to free, otherwise pass 3 and
                // pass 4 have diverged.
                if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]>0)
                if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
                {
                  SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
                  assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
                }
                regmap_pre[i+1][hr]=-1;
                if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
                regs[i+1].wasconst&=~(1<<hr);
              }
              regs[i].regmap[hr]=-1;
              regs[i].isconst&=~(1<<hr);
            }
          }
        }
      } // if needed
    } // for hr
  }
8117
  /* Pass 5 - Pre-allocate registers */

  // If a register is allocated during a loop, try to allocate it for the
  // entire loop, if possible.  This avoids loading/storing registers
  // inside of the loop.

  // f_regmap: the preferred ("hoisted") guest register per host register
  // for the current loop candidate; cleared to the unallocated state here.
  signed char f_regmap[HOST_REGS];
  clear_all_regs(f_regmap);
8126   for(i=0;i<slen-1;i++)
8127   {
8128     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
8129     {
8130       if(ba[i]>=start && ba[i]<(start+i*4))
8131       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
8132       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
8133       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
8134       ||itype[i+1]==SHIFT||itype[i+1]==COP1
8135       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
8136       {
8137         int t=(ba[i]-start)>>2;
8138         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP)) // loop_preload can't handle jumps into delay slots
8139         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
8140         for(hr=0;hr<HOST_REGS;hr++)
8141         {
8142           if(regs[i].regmap[hr]>=0) {
8143             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8144               // dealloc old register
8145               int n;
8146               for(n=0;n<HOST_REGS;n++)
8147               {
8148                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8149               }
8150               // and alloc new one
8151               f_regmap[hr]=regs[i].regmap[hr];
8152             }
8153           }
8154           if(branch_regs[i].regmap[hr]>=0) {
8155             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
8156               // dealloc old register
8157               int n;
8158               for(n=0;n<HOST_REGS;n++)
8159               {
8160                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
8161               }
8162               // and alloc new one
8163               f_regmap[hr]=branch_regs[i].regmap[hr];
8164             }
8165           }
8166           if(ooo[i]) {
8167             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
8168               f_regmap[hr]=branch_regs[i].regmap[hr];
8169           }else{
8170             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
8171               f_regmap[hr]=branch_regs[i].regmap[hr];
8172           }
8173           // Avoid dirty->clean transition
8174           #ifdef DESTRUCTIVE_WRITEBACK
8175           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
8176           #endif
8177           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
8178           // case above, however it's always a good idea.  We can't hoist the
8179           // load if the register was already allocated, so there's no point
8180           // wasting time analyzing most of these cases.  It only "succeeds"
8181           // when the mapping was different and the load can be replaced with
8182           // a mov, which is of negligible benefit.  So such cases are
8183           // skipped below.
8184           if(f_regmap[hr]>0) {
8185             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
8186               int r=f_regmap[hr];
8187               for(j=t;j<=i;j++)
8188               {
8189                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
8190                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
8191                 assert(r < 64);
8192                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
8193                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
8194                   int k;
8195                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
8196                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
8197                     if(r>63) {
8198                       if(get_reg(regs[i].regmap,r&63)<0) break;
8199                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
8200                     }
8201                     k=i;
8202                     while(k>1&&regs[k-1].regmap[hr]==-1) {
8203                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
8204                         //printf("no free regs for store %x\n",start+(k-1)*4);
8205                         break;
8206                       }
8207                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
8208                         //printf("no-match due to different register\n");
8209                         break;
8210                       }
8211                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP) {
8212                         //printf("no-match due to branch\n");
8213                         break;
8214                       }
8215                       // call/ret fast path assumes no registers allocated
8216                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
8217                         break;
8218                       }
8219                       assert(r < 64);
8220                       k--;
8221                     }
8222                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
8223                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
8224                       while(k<i) {
8225                         regs[k].regmap_entry[hr]=f_regmap[hr];
8226                         regs[k].regmap[hr]=f_regmap[hr];
8227                         regmap_pre[k+1][hr]=f_regmap[hr];
8228                         regs[k].wasdirty&=~(1<<hr);
8229                         regs[k].dirty&=~(1<<hr);
8230                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
8231                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
8232                         regs[k].wasconst&=~(1<<hr);
8233                         regs[k].isconst&=~(1<<hr);
8234                         k++;
8235                       }
8236                     }
8237                     else {
8238                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
8239                       break;
8240                     }
8241                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
8242                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
8243                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
8244                       regs[i].regmap_entry[hr]=f_regmap[hr];
8245                       regs[i].regmap[hr]=f_regmap[hr];
8246                       regs[i].wasdirty&=~(1<<hr);
8247                       regs[i].dirty&=~(1<<hr);
8248                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
8249                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
8250                       regs[i].wasconst&=~(1<<hr);
8251                       regs[i].isconst&=~(1<<hr);
8252                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
8253                       branch_regs[i].wasdirty&=~(1<<hr);
8254                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
8255                       branch_regs[i].regmap[hr]=f_regmap[hr];
8256                       branch_regs[i].dirty&=~(1<<hr);
8257                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
8258                       branch_regs[i].wasconst&=~(1<<hr);
8259                       branch_regs[i].isconst&=~(1<<hr);
8260                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
8261                         regmap_pre[i+2][hr]=f_regmap[hr];
8262                         regs[i+2].wasdirty&=~(1<<hr);
8263                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
8264                       }
8265                     }
8266                   }
8267                   for(k=t;k<j;k++) {
8268                     // Alloc register clean at beginning of loop,
8269                     // but may dirty it in pass 6
8270                     regs[k].regmap_entry[hr]=f_regmap[hr];
8271                     regs[k].regmap[hr]=f_regmap[hr];
8272                     regs[k].dirty&=~(1<<hr);
8273                     regs[k].wasconst&=~(1<<hr);
8274                     regs[k].isconst&=~(1<<hr);
8275                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP) {
8276                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
8277                       branch_regs[k].regmap[hr]=f_regmap[hr];
8278                       branch_regs[k].dirty&=~(1<<hr);
8279                       branch_regs[k].wasconst&=~(1<<hr);
8280                       branch_regs[k].isconst&=~(1<<hr);
8281                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
8282                         regmap_pre[k+2][hr]=f_regmap[hr];
8283                         regs[k+2].wasdirty&=~(1<<hr);
8284                       }
8285                     }
8286                     else
8287                     {
8288                       regmap_pre[k+1][hr]=f_regmap[hr];
8289                       regs[k+1].wasdirty&=~(1<<hr);
8290                     }
8291                   }
8292                   if(regs[j].regmap[hr]==f_regmap[hr])
8293                     regs[j].regmap_entry[hr]=f_regmap[hr];
8294                   break;
8295                 }
8296                 if(j==i) break;
8297                 if(regs[j].regmap[hr]>=0)
8298                   break;
8299                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
8300                   //printf("no-match due to different register\n");
8301                   break;
8302                 }
8303                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
8304                 {
8305                   // Stop on unconditional branch
8306                   break;
8307                 }
8308                 if(itype[j]==CJUMP||itype[j]==SJUMP)
8309                 {
8310                   if(ooo[j]) {
8311                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
8312                       break;
8313                   }else{
8314                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
8315                       break;
8316                   }
8317                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
8318                     //printf("no-match due to different register (branch)\n");
8319                     break;
8320                   }
8321                 }
8322                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
8323                   //printf("No free regs for store %x\n",start+j*4);
8324                   break;
8325                 }
8326                 assert(f_regmap[hr]<64);
8327               }
8328             }
8329           }
8330         }
8331       }
8332     }else{
8333       // Non branch or undetermined branch target
8334       for(hr=0;hr<HOST_REGS;hr++)
8335       {
8336         if(hr!=EXCLUDE_REG) {
8337           if(regs[i].regmap[hr]>=0) {
8338             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8339               // dealloc old register
8340               int n;
8341               for(n=0;n<HOST_REGS;n++)
8342               {
8343                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8344               }
8345               // and alloc new one
8346               f_regmap[hr]=regs[i].regmap[hr];
8347             }
8348           }
8349         }
8350       }
      // Try to restore cycle count at branch targets
      //
      // At a branch target (bt[i]) the cycle counter (CCREG) must be reloaded
      // into HOST_CCREG unless it is already resident.  This section tries to
      // keep CCREG allocated across the whole region around the target so the
      // reload can be elided.
      if(bt[i]) {
        // Scan forward from the target for the next instruction that already
        // has something in HOST_CCREG, stopping early if register pressure
        // (too few free host regs) would make extension impossible.
        for(j=i;j<slen-1;j++) {
          if(regs[j].regmap[HOST_CCREG]!=-1) break;
          if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
            //printf("no free regs for store %x\n",start+j*4);
            break;
          }
        }
        // If the scan ended on an instruction where HOST_CCREG already holds
        // CCREG, fill the gap [i, j) so CCREG stays live the whole way.
        if(regs[j].regmap[HOST_CCREG]==CCREG) {
          int k=i;
          //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
          while(k<j) {
            regs[k].regmap_entry[HOST_CCREG]=CCREG;
            regs[k].regmap[HOST_CCREG]=CCREG;
            regmap_pre[k+1][HOST_CCREG]=CCREG;
            // CCREG is kept dirty: the in-register count is ahead of the
            // memory copy, so it must be written back if ever evicted.
            regs[k+1].wasdirty|=1<<HOST_CCREG;
            regs[k].dirty|=1<<HOST_CCREG;
            regs[k].wasconst&=~(1<<HOST_CCREG);
            regs[k].isconst&=~(1<<HOST_CCREG);
            k++;
          }
          regs[j].regmap_entry[HOST_CCREG]=CCREG;
        }
        // Work backwards from the branch target
        // (only worthwhile if the forward extension above succeeded, i.e.
        // j advanced past i and f_regmap still tracks CCREG in HOST_CCREG)
        if(j>i&&f_regmap[HOST_CCREG]==CCREG)
        {
          //printf("Extend backwards\n");
          int k;
          k=i;
          // Walk back over instructions where HOST_CCREG is unallocated,
          // again bailing out if there is no spare register capacity.
          while(regs[k-1].regmap[HOST_CCREG]==-1) {
            if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
              //printf("no free regs for store %x\n",start+(k-1)*4);
              break;
            }
            k--;
          }
          // If we reached an instruction that already holds CCREG, fill the
          // backward gap (k..i] the same way as the forward pass.
          if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
            //printf("Extend CC, %x ->\n",start+k*4);
            while(k<=i) {
              regs[k].regmap_entry[HOST_CCREG]=CCREG;
              regs[k].regmap[HOST_CCREG]=CCREG;
              regmap_pre[k+1][HOST_CCREG]=CCREG;
              regs[k+1].wasdirty|=1<<HOST_CCREG;
              regs[k].dirty|=1<<HOST_CCREG;
              regs[k].wasconst&=~(1<<HOST_CCREG);
              regs[k].isconst&=~(1<<HOST_CCREG);
              k++;
            }
          }
          else {
            //printf("Fail Extend CC, %x ->\n",start+k*4);
          }
        }
      }
8406       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
8407          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
8408          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1)
8409       {
8410         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
8411       }
8412     }
8413   }
8414
8415   // This allocates registers (if possible) one instruction prior
8416   // to use, which can avoid a load-use penalty on certain CPUs.
8417   for(i=0;i<slen-1;i++)
8418   {
8419     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP))
8420     {
8421       if(!bt[i+1])
8422       {
8423         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
8424            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
8425         {
8426           if(rs1[i+1]) {
8427             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
8428             {
8429               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8430               {
8431                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
8432                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
8433                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
8434                 regs[i].isconst&=~(1<<hr);
8435                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8436                 constmap[i][hr]=constmap[i+1][hr];
8437                 regs[i+1].wasdirty&=~(1<<hr);
8438                 regs[i].dirty&=~(1<<hr);
8439               }
8440             }
8441           }
8442           if(rs2[i+1]) {
8443             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
8444             {
8445               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8446               {
8447                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
8448                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
8449                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
8450                 regs[i].isconst&=~(1<<hr);
8451                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8452                 constmap[i][hr]=constmap[i+1][hr];
8453                 regs[i+1].wasdirty&=~(1<<hr);
8454                 regs[i].dirty&=~(1<<hr);
8455               }
8456             }
8457           }
8458           // Preload target address for load instruction (non-constant)
8459           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8460             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
8461             {
8462               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8463               {
8464                 regs[i].regmap[hr]=rs1[i+1];
8465                 regmap_pre[i+1][hr]=rs1[i+1];
8466                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8467                 regs[i].isconst&=~(1<<hr);
8468                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8469                 constmap[i][hr]=constmap[i+1][hr];
8470                 regs[i+1].wasdirty&=~(1<<hr);
8471                 regs[i].dirty&=~(1<<hr);
8472               }
8473             }
8474           }
8475           // Load source into target register
8476           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8477             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
8478             {
8479               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8480               {
8481                 regs[i].regmap[hr]=rs1[i+1];
8482                 regmap_pre[i+1][hr]=rs1[i+1];
8483                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8484                 regs[i].isconst&=~(1<<hr);
8485                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8486                 constmap[i][hr]=constmap[i+1][hr];
8487                 regs[i+1].wasdirty&=~(1<<hr);
8488                 regs[i].dirty&=~(1<<hr);
8489               }
8490             }
8491           }
8492           // Address for store instruction (non-constant)
8493           if(itype[i+1]==STORE||itype[i+1]==STORELR
8494              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
8495             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8496               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
8497               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
8498               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
8499               assert(hr>=0);
8500               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8501               {
8502                 regs[i].regmap[hr]=rs1[i+1];
8503                 regmap_pre[i+1][hr]=rs1[i+1];
8504                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8505                 regs[i].isconst&=~(1<<hr);
8506                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8507                 constmap[i][hr]=constmap[i+1][hr];
8508                 regs[i+1].wasdirty&=~(1<<hr);
8509                 regs[i].dirty&=~(1<<hr);
8510               }
8511             }
8512           }
8513           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
8514             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8515               int nr;
8516               hr=get_reg(regs[i+1].regmap,FTEMP);
8517               assert(hr>=0);
8518               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8519               {
8520                 regs[i].regmap[hr]=rs1[i+1];
8521                 regmap_pre[i+1][hr]=rs1[i+1];
8522                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8523                 regs[i].isconst&=~(1<<hr);
8524                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8525                 constmap[i][hr]=constmap[i+1][hr];
8526                 regs[i+1].wasdirty&=~(1<<hr);
8527                 regs[i].dirty&=~(1<<hr);
8528               }
8529               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
8530               {
8531                 // move it to another register
8532                 regs[i+1].regmap[hr]=-1;
8533                 regmap_pre[i+2][hr]=-1;
8534                 regs[i+1].regmap[nr]=FTEMP;
8535                 regmap_pre[i+2][nr]=FTEMP;
8536                 regs[i].regmap[nr]=rs1[i+1];
8537                 regmap_pre[i+1][nr]=rs1[i+1];
8538                 regs[i+1].regmap_entry[nr]=rs1[i+1];
8539                 regs[i].isconst&=~(1<<nr);
8540                 regs[i+1].isconst&=~(1<<nr);
8541                 regs[i].dirty&=~(1<<nr);
8542                 regs[i+1].wasdirty&=~(1<<nr);
8543                 regs[i+1].dirty&=~(1<<nr);
8544                 regs[i+2].wasdirty&=~(1<<nr);
8545               }
8546             }
8547           }
8548           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
8549             if(itype[i+1]==LOAD)
8550               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
8551             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
8552               hr=get_reg(regs[i+1].regmap,FTEMP);
8553             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
8554               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
8555               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
8556             }
8557             if(hr>=0&&regs[i].regmap[hr]<0) {
8558               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
8559               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
8560                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
8561                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
8562                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
8563                 regs[i].isconst&=~(1<<hr);
8564                 regs[i+1].wasdirty&=~(1<<hr);
8565                 regs[i].dirty&=~(1<<hr);
8566               }
8567             }
8568           }
8569         }
8570       }
8571     }
8572   }
8573
  /* Pass 6 - Optimize clean/dirty state */
  // Propagates dirty bits across the block so writebacks happen only where
  // required (see clean_registers for the actual algorithm).
  clean_registers(0,slen-1,1);

  /* Pass 7 - Identify 32-bit registers */
  // NOTE(review): in this PSX fork the 64-bit register analysis appears to
  // have been removed; what remains of "pass 7" only marks extra branch
  // targets — the comment above is historical (from the N64 version).
  for (i=slen-1;i>=0;i--)
  {
    if(itype[i]==CJUMP||itype[i]==SJUMP)
    {
      // Conditional branch
      if((source[i]>>16)!=0x1000&&i<slen-2) {
        // Mark this address as a branch target since it may be called
        // upon return from interrupt
        bt[i+2]=1;
      }
    }
  }

  if(itype[slen-1]==SPAN) {
    bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
  }
8594
8595 #ifdef DISASM
8596   /* Debug/disassembly */
8597   for(i=0;i<slen;i++)
8598   {
8599     printf("U:");
8600     int r;
8601     for(r=1;r<=CCREG;r++) {
8602       if((unneeded_reg[i]>>r)&1) {
8603         if(r==HIREG) printf(" HI");
8604         else if(r==LOREG) printf(" LO");
8605         else printf(" r%d",r);
8606       }
8607     }
8608     printf("\n");
8609     #if defined(__i386__) || defined(__x86_64__)
8610     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
8611     #endif
8612     #ifdef __arm__
8613     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
8614     #endif
8615     #if defined(__i386__) || defined(__x86_64__)
8616     printf("needs: ");
8617     if(needed_reg[i]&1) printf("eax ");
8618     if((needed_reg[i]>>1)&1) printf("ecx ");
8619     if((needed_reg[i]>>2)&1) printf("edx ");
8620     if((needed_reg[i]>>3)&1) printf("ebx ");
8621     if((needed_reg[i]>>5)&1) printf("ebp ");
8622     if((needed_reg[i]>>6)&1) printf("esi ");
8623     if((needed_reg[i]>>7)&1) printf("edi ");
8624     printf("\n");
8625     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
8626     printf("dirty: ");
8627     if(regs[i].wasdirty&1) printf("eax ");
8628     if((regs[i].wasdirty>>1)&1) printf("ecx ");
8629     if((regs[i].wasdirty>>2)&1) printf("edx ");
8630     if((regs[i].wasdirty>>3)&1) printf("ebx ");
8631     if((regs[i].wasdirty>>5)&1) printf("ebp ");
8632     if((regs[i].wasdirty>>6)&1) printf("esi ");
8633     if((regs[i].wasdirty>>7)&1) printf("edi ");
8634     #endif
8635     #ifdef __arm__
8636     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
8637     printf("dirty: ");
8638     if(regs[i].wasdirty&1) printf("r0 ");
8639     if((regs[i].wasdirty>>1)&1) printf("r1 ");
8640     if((regs[i].wasdirty>>2)&1) printf("r2 ");
8641     if((regs[i].wasdirty>>3)&1) printf("r3 ");
8642     if((regs[i].wasdirty>>4)&1) printf("r4 ");
8643     if((regs[i].wasdirty>>5)&1) printf("r5 ");
8644     if((regs[i].wasdirty>>6)&1) printf("r6 ");
8645     if((regs[i].wasdirty>>7)&1) printf("r7 ");
8646     if((regs[i].wasdirty>>8)&1) printf("r8 ");
8647     if((regs[i].wasdirty>>9)&1) printf("r9 ");
8648     if((regs[i].wasdirty>>10)&1) printf("r10 ");
8649     if((regs[i].wasdirty>>12)&1) printf("r12 ");
8650     #endif
8651     printf("\n");
8652     disassemble_inst(i);
8653     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
8654     #if defined(__i386__) || defined(__x86_64__)
8655     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
8656     if(regs[i].dirty&1) printf("eax ");
8657     if((regs[i].dirty>>1)&1) printf("ecx ");
8658     if((regs[i].dirty>>2)&1) printf("edx ");
8659     if((regs[i].dirty>>3)&1) printf("ebx ");
8660     if((regs[i].dirty>>5)&1) printf("ebp ");
8661     if((regs[i].dirty>>6)&1) printf("esi ");
8662     if((regs[i].dirty>>7)&1) printf("edi ");
8663     #endif
8664     #ifdef __arm__
8665     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
8666     if(regs[i].dirty&1) printf("r0 ");
8667     if((regs[i].dirty>>1)&1) printf("r1 ");
8668     if((regs[i].dirty>>2)&1) printf("r2 ");
8669     if((regs[i].dirty>>3)&1) printf("r3 ");
8670     if((regs[i].dirty>>4)&1) printf("r4 ");
8671     if((regs[i].dirty>>5)&1) printf("r5 ");
8672     if((regs[i].dirty>>6)&1) printf("r6 ");
8673     if((regs[i].dirty>>7)&1) printf("r7 ");
8674     if((regs[i].dirty>>8)&1) printf("r8 ");
8675     if((regs[i].dirty>>9)&1) printf("r9 ");
8676     if((regs[i].dirty>>10)&1) printf("r10 ");
8677     if((regs[i].dirty>>12)&1) printf("r12 ");
8678     #endif
8679     printf("\n");
8680     if(regs[i].isconst) {
8681       printf("constants: ");
8682       #if defined(__i386__) || defined(__x86_64__)
8683       if(regs[i].isconst&1) printf("eax=%x ",(u_int)constmap[i][0]);
8684       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(u_int)constmap[i][1]);
8685       if((regs[i].isconst>>2)&1) printf("edx=%x ",(u_int)constmap[i][2]);
8686       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(u_int)constmap[i][3]);
8687       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(u_int)constmap[i][5]);
8688       if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]);
8689       if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]);
8690       #endif
8691       #if defined(__arm__) || defined(__aarch64__)
8692       int r;
8693       for (r = 0; r < ARRAY_SIZE(constmap[i]); r++)
8694         if ((regs[i].isconst >> r) & 1)
8695           printf(" r%d=%x", r, (u_int)constmap[i][r]);
8696       #endif
8697       printf("\n");
8698     }
8699     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
8700       #if defined(__i386__) || defined(__x86_64__)
8701       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
8702       if(branch_regs[i].dirty&1) printf("eax ");
8703       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
8704       if((branch_regs[i].dirty>>2)&1) printf("edx ");
8705       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
8706       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
8707       if((branch_regs[i].dirty>>6)&1) printf("esi ");
8708       if((branch_regs[i].dirty>>7)&1) printf("edi ");
8709       #endif
8710       #ifdef __arm__
8711       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
8712       if(branch_regs[i].dirty&1) printf("r0 ");
8713       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
8714       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
8715       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
8716       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
8717       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
8718       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
8719       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
8720       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
8721       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
8722       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
8723       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
8724       #endif
8725     }
8726   }
8727 #endif // DISASM
8728
  /* Pass 8 - Assembly */
  // Emit native code for the block.  Reset per-block emitter state first.
  linkcount=0;stubcount=0;
  ds=0;is_delayslot=0;
  u_int dirty_pre=0;
  void *beginning=start_block();
  // Odd address flags a block that begins in a branch delay slot
  // (presumably set by the caller — TODO confirm); emit the special
  // page-span delay-slot prologue in that case.
  if((u_int)addr&1) {
    ds=1;
    pagespan_ds();
  }
  void *instr_addr0_override = NULL;

  if (start == 0x80030000) {
    // nasty hack for the fastbios thing
    // override block entry to this code
    instr_addr0_override = out;
    emit_movimm(start,0);
    // abuse io address var as a flag that we
    // have already returned here once
    emit_readword(&address,1);
    emit_writeword(0,&pcaddr);
    emit_writeword(0,&address);
    emit_cmp(0,1);
    // First entry (flag != start): bail out to the interpreter side via
    // new_dyna_leave; second entry falls through into the compiled block.
    #ifdef __aarch64__
    // aarch64 has no direct conditional far branch: skip the 2-insn
    // far-jump sequence when equal instead.
    emit_jeq(out + 4*2);
    emit_far_jump(new_dyna_leave);
    #else
    emit_jne(new_dyna_leave);
    #endif
  }
8758   for(i=0;i<slen;i++)
8759   {
8760     //if(ds) printf("ds: ");
8761     disassemble_inst(i);
8762     if(ds) {
8763       ds=0; // Skip delay slot
8764       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
8765       instr_addr[i] = NULL;
8766     } else {
8767       speculate_register_values(i);
8768       #ifndef DESTRUCTIVE_WRITEBACK
8769       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
8770       {
8771         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,unneeded_reg[i]);
8772       }
8773       if((itype[i]==CJUMP||itype[i]==SJUMP)&&!likely[i]) {
8774         dirty_pre=branch_regs[i].dirty;
8775       }else{
8776         dirty_pre=regs[i].dirty;
8777       }
8778       #endif
8779       // write back
8780       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
8781       {
8782         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,unneeded_reg[i]);
8783         loop_preload(regmap_pre[i],regs[i].regmap_entry);
8784       }
8785       // branch target entry point
8786       instr_addr[i] = out;
8787       assem_debug("<->\n");
8788       drc_dbg_emit_do_cmp(i);
8789
8790       // load regs
8791       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
8792         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty);
8793       load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i],rs2[i]);
8794       address_generation(i,&regs[i],regs[i].regmap_entry);
8795       load_consts(regmap_pre[i],regs[i].regmap,i);
8796       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
8797       {
8798         // Load the delay slot registers if necessary
8799         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
8800           load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1]);
8801         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
8802           load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1]);
8803         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
8804           load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
8805       }
8806       else if(i+1<slen)
8807       {
8808         // Preload registers for following instruction
8809         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
8810           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
8811             load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1]);
8812         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
8813           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
8814             load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1]);
8815       }
8816       // TODO: if(is_ooo(i)) address_generation(i+1);
8817       if(itype[i]==CJUMP)
8818         load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
8819       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
8820         load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
8821       // assemble
8822       switch(itype[i]) {
8823         case ALU:
8824           alu_assemble(i,&regs[i]);break;
8825         case IMM16:
8826           imm16_assemble(i,&regs[i]);break;
8827         case SHIFT:
8828           shift_assemble(i,&regs[i]);break;
8829         case SHIFTIMM:
8830           shiftimm_assemble(i,&regs[i]);break;
8831         case LOAD:
8832           load_assemble(i,&regs[i]);break;
8833         case LOADLR:
8834           loadlr_assemble(i,&regs[i]);break;
8835         case STORE:
8836           store_assemble(i,&regs[i]);break;
8837         case STORELR:
8838           storelr_assemble(i,&regs[i]);break;
8839         case COP0:
8840           cop0_assemble(i,&regs[i]);break;
8841         case COP1:
8842           cop1_assemble(i,&regs[i]);break;
8843         case C1LS:
8844           c1ls_assemble(i,&regs[i]);break;
8845         case COP2:
8846           cop2_assemble(i,&regs[i]);break;
8847         case C2LS:
8848           c2ls_assemble(i,&regs[i]);break;
8849         case C2OP:
8850           c2op_assemble(i,&regs[i]);break;
8851         case MULTDIV:
8852           multdiv_assemble(i,&regs[i]);break;
8853         case MOV:
8854           mov_assemble(i,&regs[i]);break;
8855         case SYSCALL:
8856           syscall_assemble(i,&regs[i]);break;
8857         case HLECALL:
8858           hlecall_assemble(i,&regs[i]);break;
8859         case INTCALL:
8860           intcall_assemble(i,&regs[i]);break;
8861         case UJUMP:
8862           ujump_assemble(i,&regs[i]);ds=1;break;
8863         case RJUMP:
8864           rjump_assemble(i,&regs[i]);ds=1;break;
8865         case CJUMP:
8866           cjump_assemble(i,&regs[i]);ds=1;break;
8867         case SJUMP:
8868           sjump_assemble(i,&regs[i]);ds=1;break;
8869         case SPAN:
8870           pagespan_assemble(i,&regs[i]);break;
8871       }
8872       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
8873         literal_pool(1024);
8874       else
8875         literal_pool_jumpover(256);
8876     }
8877   }
8878   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
8879   // If the block did not end with an unconditional branch,
8880   // add a jump to the next instruction.
8881   if(i>1) {
8882     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
8883       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP);
8884       assert(i==slen);
8885       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP) {
8886         store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
8887         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
8888           emit_loadreg(CCREG,HOST_CCREG);
8889         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
8890       }
8891       else if(!likely[i-2])
8892       {
8893         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].dirty,start+i*4);
8894         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
8895       }
8896       else
8897       {
8898         store_regs_bt(regs[i-2].regmap,regs[i-2].dirty,start+i*4);
8899         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
8900       }
8901       add_to_linker(out,start+i*4,0);
8902       emit_jmp(0);
8903     }
8904   }
8905   else
8906   {
8907     assert(i>0);
8908     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP);
8909     store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
8910     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
8911       emit_loadreg(CCREG,HOST_CCREG);
8912     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
8913     add_to_linker(out,start+i*4,0);
8914     emit_jmp(0);
8915   }
8916
8917   // TODO: delay slot stubs?
8918   // Stubs
8919   for(i=0;i<stubcount;i++)
8920   {
8921     switch(stubs[i].type)
8922     {
8923       case LOADB_STUB:
8924       case LOADH_STUB:
8925       case LOADW_STUB:
8926       case LOADD_STUB:
8927       case LOADBU_STUB:
8928       case LOADHU_STUB:
8929         do_readstub(i);break;
8930       case STOREB_STUB:
8931       case STOREH_STUB:
8932       case STOREW_STUB:
8933       case STORED_STUB:
8934         do_writestub(i);break;
8935       case CC_STUB:
8936         do_ccstub(i);break;
8937       case INVCODE_STUB:
8938         do_invstub(i);break;
8939       case FP_STUB:
8940         do_cop1stub(i);break;
8941       case STORELR_STUB:
8942         do_unalignedwritestub(i);break;
8943     }
8944   }
8945
8946   if (instr_addr0_override)
8947     instr_addr[0] = instr_addr0_override;
8948
8949   /* Pass 9 - Linker */
8950   for(i=0;i<linkcount;i++)
8951   {
8952     assem_debug("%p -> %8x\n",link_addr[i].addr,link_addr[i].target);
8953     literal_pool(64);
8954     if (!link_addr[i].ext)
8955     {
8956       void *stub = out;
8957       void *addr = check_addr(link_addr[i].target);
8958       emit_extjump(link_addr[i].addr, link_addr[i].target);
8959       if (addr) {
8960         set_jump_target(link_addr[i].addr, addr);
8961         add_link(link_addr[i].target,stub);
8962       }
8963       else
8964         set_jump_target(link_addr[i].addr, stub);
8965     }
8966     else
8967     {
8968       // Internal branch
8969       int target=(link_addr[i].target-start)>>2;
8970       assert(target>=0&&target<slen);
8971       assert(instr_addr[target]);
8972       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
8973       //set_jump_target_fillslot(link_addr[i].addr,instr_addr[target],link_addr[i].ext>>1);
8974       //#else
8975       set_jump_target(link_addr[i].addr, instr_addr[target]);
8976       //#endif
8977     }
8978   }
8979   // External Branch Targets (jump_in)
8980   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
8981   for(i=0;i<slen;i++)
8982   {
8983     if(bt[i]||i==0)
8984     {
8985       if(instr_addr[i]) // TODO - delay slots (=null)
8986       {
8987         u_int vaddr=start+i*4;
8988         u_int page=get_page(vaddr);
8989         u_int vpage=get_vpage(vaddr);
8990         literal_pool(256);
8991         {
8992           assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
8993           assem_debug("jump_in: %x\n",start+i*4);
8994           ll_add(jump_dirty+vpage,vaddr,out);
8995           void *entry_point = do_dirty_stub(i);
8996           ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
8997           // If there was an existing entry in the hash table,
8998           // replace it with the new address.
8999           // Don't add new entries.  We'll insert the
9000           // ones that actually get used in check_addr().
9001           struct ht_entry *ht_bin = hash_table_get(vaddr);
9002           if (ht_bin->vaddr[0] == vaddr)
9003             ht_bin->tcaddr[0] = entry_point;
9004           if (ht_bin->vaddr[1] == vaddr)
9005             ht_bin->tcaddr[1] = entry_point;
9006         }
9007       }
9008     }
9009   }
9010   // Write out the literal pool if necessary
9011   literal_pool(0);
9012   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9013   // Align code
9014   if(((u_int)out)&7) emit_addnop(13);
9015   #endif
9016   assert(out - (u_char *)beginning < MAX_OUTPUT_BLOCK_SIZE);
9017   //printf("shadow buffer: %p-%p\n",copy,(u_char *)copy+slen*4);
9018   memcpy(copy,source,slen*4);
9019   copy+=slen*4;
9020
9021   end_block(beginning);
9022
9023   // If we're within 256K of the end of the buffer,
9024   // start over from the beginning. (Is 256K enough?)
9025   if (out > ndrc->translation_cache + sizeof(ndrc->translation_cache) - MAX_OUTPUT_BLOCK_SIZE)
9026     out = ndrc->translation_cache;
9027
9028   // Trap writes to any of the pages we compiled
9029   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
9030     invalid_code[i]=0;
9031   }
9032   inv_code_start=inv_code_end=~0;
9033
9034   // for PCSX we need to mark all mirrors too
9035   if(get_page(start)<(RAM_SIZE>>12))
9036     for(i=start>>12;i<=(start+slen*4)>>12;i++)
9037       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
9038       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
9039       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
9040
9041   /* Pass 10 - Free memory by expiring oldest blocks */
9042
9043   int end=(((out-ndrc->translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535;
9044   while(expirep!=end)
9045   {
9046     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
9047     uintptr_t base=(uintptr_t)ndrc->translation_cache+((expirep>>13)<<shift); // Base address of this block
9048     inv_debug("EXP: Phase %d\n",expirep);
9049     switch((expirep>>11)&3)
9050     {
9051       case 0:
9052         // Clear jump_in and jump_dirty
9053         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
9054         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
9055         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
9056         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
9057         break;
9058       case 1:
9059         // Clear pointers
9060         ll_kill_pointers(jump_out[expirep&2047],base,shift);
9061         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
9062         break;
9063       case 2:
9064         // Clear hash table
9065         for(i=0;i<32;i++) {
9066           struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
9067           if (((uintptr_t)ht_bin->tcaddr[1]>>shift) == (base>>shift) ||
9068              (((uintptr_t)ht_bin->tcaddr[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
9069             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
9070             ht_bin->vaddr[1] = -1;
9071             ht_bin->tcaddr[1] = NULL;
9072           }
9073           if (((uintptr_t)ht_bin->tcaddr[0]>>shift) == (base>>shift) ||
9074              (((uintptr_t)ht_bin->tcaddr[0]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
9075             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
9076             ht_bin->vaddr[0] = ht_bin->vaddr[1];
9077             ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
9078             ht_bin->vaddr[1] = -1;
9079             ht_bin->tcaddr[1] = NULL;
9080           }
9081         }
9082         break;
9083       case 3:
9084         // Clear jump_out
9085         #if defined(__arm__) || defined(__aarch64__)
9086         if((expirep&2047)==0)
9087           do_clear_cache();
9088         #endif
9089         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
9090         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
9091         break;
9092     }
9093     expirep=(expirep+1)&65535;
9094   }
9095   return 0;
9096 }
9097
9098 // vim:shiftwidth=2:expandtab