drc: try to make gte stall handling less bloaty
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h"
39 #include "../psxinterpreter.h"
40 #include "../gte.h"
41 #include "emu_if.h" // emulator interface
42
43 #define noinline __attribute__((noinline,noclone))
44 #ifndef ARRAY_SIZE
45 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
46 #endif
47 #ifndef min
48 #define min(a, b) ((b) < (a) ? (b) : (a))
49 #endif
50
51 //#define DISASM
52 //#define assem_debug printf
53 //#define inv_debug printf
54 #define assem_debug(...)
55 #define inv_debug(...)
56
57 #ifdef __i386__
58 #include "assem_x86.h"
59 #endif
60 #ifdef __x86_64__
61 #include "assem_x64.h"
62 #endif
63 #ifdef __arm__
64 #include "assem_arm.h"
65 #endif
66 #ifdef __aarch64__
67 #include "assem_arm64.h"
68 #endif
69
70 #define RAM_SIZE 0x200000
71 #define MAXBLOCK 4096
72 #define MAX_OUTPUT_BLOCK_SIZE 262144
73
74 struct ndrc_mem
75 {
76   u_char translation_cache[1 << TARGET_SIZE_2];
77   struct
78   {
79     struct tramp_insns ops[2048 / sizeof(struct tramp_insns)];
80     const void *f[2048 / sizeof(void *)];
81   } tramp;
82 };
83
84 #ifdef BASE_ADDR_DYNAMIC
85 static struct ndrc_mem *ndrc;
86 #else
87 static struct ndrc_mem ndrc_ __attribute__((aligned(4096)));
88 static struct ndrc_mem *ndrc = &ndrc_;
89 #endif
90
91 // stubs
92 enum stub_type {
93   CC_STUB = 1,
94   FP_STUB = 2,
95   LOADB_STUB = 3,
96   LOADH_STUB = 4,
97   LOADW_STUB = 5,
98   LOADD_STUB = 6,
99   LOADBU_STUB = 7,
100   LOADHU_STUB = 8,
101   STOREB_STUB = 9,
102   STOREH_STUB = 10,
103   STOREW_STUB = 11,
104   STORED_STUB = 12,
105   STORELR_STUB = 13,
106   INVCODE_STUB = 14,
107 };
108
109 struct regstat
110 {
111   signed char regmap_entry[HOST_REGS];
112   signed char regmap[HOST_REGS];
113   uint64_t wasdirty;
114   uint64_t dirty;
115   uint64_t u;
116   u_int wasconst;
117   u_int isconst;
118   u_int loadedconst;             // host regs that have constants loaded
119   u_int waswritten;              // MIPS regs that were used as store base before
120 };
121
122 // note: asm depends on this layout
123 struct ll_entry
124 {
125   u_int vaddr;
126   u_int reg_sv_flags;
127   void *addr;
128   struct ll_entry *next;
129 };
130
131 struct ht_entry
132 {
133   u_int vaddr[2];
134   void *tcaddr[2];
135 };
136
137 struct code_stub
138 {
139   enum stub_type type;
140   void *addr;
141   void *retaddr;
142   u_int a;
143   uintptr_t b;
144   uintptr_t c;
145   u_int d;
146   u_int e;
147 };
148
149 struct link_entry
150 {
151   void *addr;
152   u_int target;
153   u_int ext;
154 };
155
156   // used by asm:
157   u_char *out;
158   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
159   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
160   struct ll_entry *jump_dirty[4096];
161
162   static struct ll_entry *jump_out[4096];
163   static u_int start;
164   static u_int *source;
165   static char insn[MAXBLOCK][10];
166   static u_char itype[MAXBLOCK];
167   static u_char opcode[MAXBLOCK];
168   static u_char opcode2[MAXBLOCK];
169   static u_char bt[MAXBLOCK];
170   static u_char rs1[MAXBLOCK];
171   static u_char rs2[MAXBLOCK];
172   static u_char rt1[MAXBLOCK];
173   static u_char rt2[MAXBLOCK];
174   static u_char dep1[MAXBLOCK];
175   static u_char dep2[MAXBLOCK];
176   static u_char lt1[MAXBLOCK];
177   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
178   static uint64_t gte_rt[MAXBLOCK];
179   static uint64_t gte_unneeded[MAXBLOCK];
180   static u_int smrv[32]; // speculated MIPS register values
181   static u_int smrv_strong; // mask or regs that are likely to have correct values
182   static u_int smrv_weak; // same, but somewhat less likely
183   static u_int smrv_strong_next; // same, but after current insn executes
184   static u_int smrv_weak_next;
185   static int imm[MAXBLOCK];
186   static u_int ba[MAXBLOCK];
187   static char likely[MAXBLOCK];
188   static char is_ds[MAXBLOCK];
189   static char ooo[MAXBLOCK];
190   static uint64_t unneeded_reg[MAXBLOCK];
191   static uint64_t branch_unneeded_reg[MAXBLOCK];
192   static signed char regmap_pre[MAXBLOCK][HOST_REGS]; // pre-instruction i?
193   // contains 'real' consts at [i] insn, but may differ from what's actually
194   // loaded in host reg as 'final' value is always loaded, see get_final_value()
195   static uint32_t current_constmap[HOST_REGS];
196   static uint32_t constmap[MAXBLOCK][HOST_REGS];
197   static struct regstat regs[MAXBLOCK];
198   static struct regstat branch_regs[MAXBLOCK];
199   static signed char minimum_free_regs[MAXBLOCK];
200   static u_int needed_reg[MAXBLOCK];
201   static u_int wont_dirty[MAXBLOCK];
202   static u_int will_dirty[MAXBLOCK];
203   static int ccadj[MAXBLOCK];
204   static int slen;
205   static void *instr_addr[MAXBLOCK];
206   static struct link_entry link_addr[MAXBLOCK];
207   static int linkcount;
208   static struct code_stub stubs[MAXBLOCK*3];
209   static int stubcount;
210   static u_int literals[1024][2];
211   static int literalcount;
212   static int is_delayslot;
213   static char shadow[1048576]  __attribute__((aligned(16)));
214   static void *copy;
215   static int expirep;
216   static u_int stop_after_jal;
217 #ifndef RAM_FIXED
218   static uintptr_t ram_offset;
219 #else
220   static const uintptr_t ram_offset=0;
221 #endif
222
223   int new_dynarec_hacks;
224   int new_dynarec_hacks_pergame;
225   int new_dynarec_did_compile;
226
227   #define HACK_ENABLED(x) ((new_dynarec_hacks | new_dynarec_hacks_pergame) & (x))
228
229   extern int cycle_count; // ... until end of the timeslice, counts -N -> 0
230   extern int last_count;  // last absolute target, often = next_interupt
231   extern int pcaddr;
232   extern int pending_exception;
233   extern int branch_target;
234   extern uintptr_t mini_ht[32][2];
235   extern u_char restore_candidate[512];
236
237   /* registers that may be allocated */
238   /* 1-31 gpr */
239 #define LOREG 32 // lo
240 #define HIREG 33 // hi
241 //#define FSREG 34 // FPU status (FCSR)
242 #define CSREG 35 // Coprocessor status
243 #define CCREG 36 // Cycle count
244 #define INVCP 37 // Pointer to invalid_code
245 //#define MMREG 38 // Pointer to memory_map
246 //#define ROREG 39 // ram offset (if rdram!=0x80000000)
247 #define TEMPREG 40
248 #define FTEMP 40 // FPU temporary register
249 #define PTEMP 41 // Prefetch temporary register
250 //#define TLREG 42 // TLB mapping offset
251 #define RHASH 43 // Return address hash
252 #define RHTBL 44 // Return address hash table address
253 #define RTEMP 45 // JR/JALR address register
254 #define MAXREG 45
255 #define AGEN1 46 // Address generation temporary register
256 //#define AGEN2 47 // Address generation temporary register
257 //#define MGEN1 48 // Maptable address generation temporary register
258 //#define MGEN2 49 // Maptable address generation temporary register
259 #define BTREG 50 // Branch target temporary register
260
261   /* instruction types */
262 #define NOP 0     // No operation
263 #define LOAD 1    // Load
264 #define STORE 2   // Store
265 #define LOADLR 3  // Unaligned load
266 #define STORELR 4 // Unaligned store
267 #define MOV 5     // Move
268 #define ALU 6     // Arithmetic/logic
269 #define MULTDIV 7 // Multiply/divide
270 #define SHIFT 8   // Shift by register
271 #define SHIFTIMM 9// Shift by immediate
272 #define IMM16 10  // 16-bit immediate
273 #define RJUMP 11  // Unconditional jump to register
274 #define UJUMP 12  // Unconditional jump
275 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
276 #define SJUMP 14  // Conditional branch (regimm format)
277 #define COP0 15   // Coprocessor 0
278 #define COP1 16   // Coprocessor 1
279 #define C1LS 17   // Coprocessor 1 load/store
280 //#define FJUMP 18  // Conditional branch (floating point)
281 //#define FLOAT 19  // Floating point unit
282 //#define FCONV 20  // Convert integer to float
283 //#define FCOMP 21  // Floating point compare (sets FSREG)
284 #define SYSCALL 22// SYSCALL
285 #define OTHER 23  // Other
286 #define SPAN 24   // Branch/delay slot spans 2 pages
287 #define NI 25     // Not implemented
288 #define HLECALL 26// PCSX fake opcodes for HLE
289 #define COP2 27   // Coprocessor 2 move
290 #define C2LS 28   // Coprocessor 2 load/store
291 #define C2OP 29   // Coprocessor 2 operation
292 #define INTCALL 30// Call interpreter to handle rare corner cases
293
294   /* branch codes */
295 #define TAKEN 1
296 #define NOTTAKEN 2
297 #define NULLDS 3
298
299 #define DJT_1 (void *)1l // no function, just a label in assem_debug log
300 #define DJT_2 (void *)2l
301
302 // asm linkage
303 int new_recompile_block(u_int addr);
304 void *get_addr_ht(u_int vaddr);
305 void invalidate_block(u_int block);
306 void invalidate_addr(u_int addr);
307 void remove_hash(int vaddr);
308 void dyna_linker();
309 void dyna_linker_ds();
310 void verify_code();
311 void verify_code_ds();
312 void cc_interrupt();
313 void fp_exception();
314 void fp_exception_ds();
315 void jump_to_new_pc();
316 void call_gteStall();
317 void new_dyna_leave();
318
319 // Needed by assembler
320 static void wb_register(signed char r,signed char regmap[],uint64_t dirty);
321 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty);
322 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr);
323 static void load_all_regs(signed char i_regmap[]);
324 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
325 static void load_regs_entry(int t);
326 static void load_all_consts(signed char regmap[],u_int dirty,int i);
327 static u_int get_host_reglist(const signed char *regmap);
328
329 static int verify_dirty(const u_int *ptr);
330 static int get_final_value(int hr, int i, int *value);
331 static void add_stub(enum stub_type type, void *addr, void *retaddr,
332   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
333 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
334   int i, int addr_reg, const struct regstat *i_regs, int ccadj, u_int reglist);
335 static void add_to_linker(void *addr, u_int target, int ext);
336 static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override);
337 static void *get_direct_memhandler(void *table, u_int addr,
338   enum stub_type type, uintptr_t *addr_host);
339 static void cop2_call_stall_check(u_int op, int i, const struct regstat *i_regs, u_int reglist);
340 static void pass_args(int a0, int a1);
341 static void emit_far_jump(const void *f);
342 static void emit_far_call(const void *f);
343
// Toggle page protection of [start, end) between writable and executable.
// Only does anything on W^X platforms (NO_WRITE_EXEC), where code pages
// cannot be writable and executable at the same time.
static void mprotect_w_x(void *start, void *end, int is_x)
{
#ifdef NO_WRITE_EXEC
  #if defined(VITA)
  // *Open* enables write on all memory that was
  // allocated by sceKernelAllocMemBlockForVM()?
  if (is_x)
    sceKernelCloseVMDomain();
  else
    sceKernelOpenVMDomain();
  #else
  // mprotect requires a page-aligned start address; round down
  u_long mstart = (u_long)start & ~4095ul;
  u_long mend = (u_long)end;
  if (mprotect((void *)mstart, mend - mstart,
               PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
    SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
  #endif
#endif
}
363
// Open [start, end) of the translation cache for writing.
// Must be paired with end_tcache_write() once emission is done.
static void start_tcache_write(void *start, void *end)
{
  mprotect_w_x(start, end, 0 /* writable, not executable */);
}
368
// Finish writing code to [start, end): flush/invalidate the instruction
// cache for that range (platform-specific) and restore execute permission.
static void end_tcache_write(void *start, void *end)
{
#if defined(__arm__) || defined(__aarch64__)
  size_t len = (char *)end - (char *)start;
  #if   defined(__BLACKBERRY_QNX__)
  msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
  #elif defined(__MACH__)
  sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
  #elif defined(VITA)
  sceKernelSyncVMDomain(sceBlock, start, len);
  #elif defined(_3DS)
  ctr_flush_invalidate_cache();
  #elif defined(__aarch64__)
  // as of 2021, __clear_cache() is still broken on arm64
  // so here is a custom one :(
  clear_cache_arm64(start, end);
  #else
  __clear_cache(start, end);
  #endif
  (void)len;  // some branches above don't use it
#endif

  mprotect_w_x(start, end, 1);
}
393
394 static void *start_block(void)
395 {
396   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
397   if (end > ndrc->translation_cache + sizeof(ndrc->translation_cache))
398     end = ndrc->translation_cache + sizeof(ndrc->translation_cache);
399   start_tcache_write(out, end);
400   return out;
401 }
402
403 static void end_block(void *start)
404 {
405   end_tcache_write(start, out);
406 }
407
408 // also takes care of w^x mappings when patching code
409 static u_int needs_clear_cache[1<<(TARGET_SIZE_2-17)];
410
411 static void mark_clear_cache(void *target)
412 {
413   uintptr_t offset = (u_char *)target - ndrc->translation_cache;
414   u_int mask = 1u << ((offset >> 12) & 31);
415   if (!(needs_clear_cache[offset >> 17] & mask)) {
416     char *start = (char *)((uintptr_t)target & ~4095l);
417     start_tcache_write(start, start + 4095);
418     needs_clear_cache[offset >> 17] |= mask;
419   }
420 }
421
422 // Clearing the cache is rather slow on ARM Linux, so mark the areas
423 // that need to be cleared, and then only clear these areas once.
// Clearing the cache is rather slow on ARM Linux, so mark the areas
// that need to be cleared, and then only clear these areas once.
// Walks the needs_clear_cache[] bitmap set by mark_clear_cache().
static void do_clear_cache(void)
{
  int i, j;
  for (i = 0; i < (1<<(TARGET_SIZE_2-17)); i++)
  {
    u_int bitmap = needs_clear_cache[i];
    if (!bitmap)
      continue;
    for (j = 0; j < 32; j++)
    {
      u_char *start, *end;
      if (!(bitmap & (1<<j)))
        continue;

      // each bit covers one 4k page; the inner loop below advances j
      // past consecutive set bits so a run of dirty pages is flushed
      // with a single end_tcache_write() call
      start = ndrc->translation_cache + i*131072 + j*4096;
      end = start + 4095;
      for (j++; j < 32; j++) {
        if (!(bitmap & (1<<j)))
          break;
        end += 4096;
      }
      end_tcache_write(start, end);
    }
    needs_clear_cache[i] = 0;
  }
}
450
451 //#define DEBUG_CYCLE_COUNT 1
452
453 #define NO_CYCLE_PENALTY_THR 12
454
455 int cycle_multiplier; // 100 for 1.0
456 int cycle_multiplier_override;
457
458 static int CLOCK_ADJUST(int x)
459 {
460   int m = cycle_multiplier_override
461         ? cycle_multiplier_override : cycle_multiplier;
462   int s=(x>>31)|1;
463   return (x * m + s * 50) / 100;
464 }
465
466 // is the op an unconditional jump?
467 static int is_ujump(int i)
468 {
469   return itype[i] == UJUMP || itype[i] == RJUMP
470     || (source[i] >> 16) == 0x1000; // beq r0, r0, offset // b offset
471 }
472
473 static int is_jump(int i)
474 {
475   return itype[i] == RJUMP || itype[i] == UJUMP || itype[i] == CJUMP || itype[i] == SJUMP;
476 }
477
478 static u_int get_page(u_int vaddr)
479 {
480   u_int page=vaddr&~0xe0000000;
481   if (page < 0x1000000)
482     page &= ~0x0e00000; // RAM mirrors
483   page>>=12;
484   if(page>2048) page=2048+(page&2047);
485   return page;
486 }
487
488 // no virtual mem in PCSX
489 static u_int get_vpage(u_int vaddr)
490 {
491   return get_page(vaddr);
492 }
493
494 static struct ht_entry *hash_table_get(u_int vaddr)
495 {
496   return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
497 }
498
499 static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
500 {
501   ht_bin->vaddr[1] = ht_bin->vaddr[0];
502   ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
503   ht_bin->vaddr[0] = vaddr;
504   ht_bin->tcaddr[0] = tcaddr;
505 }
506
507 // some messy ari64's code, seems to rely on unsigned 32bit overflow
// some messy ari64's code, seems to rely on unsigned 32bit overflow
// Nonzero when tcaddr is far enough behind the current output pointer
// that the expiry sweep won't evict it soon.
static int doesnt_expire_soon(void *tcaddr)
{
  u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
  return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
}
513
514 // Get address from virtual address
515 // This is called from the recompiled JR/JALR instructions
// Get address from virtual address
// This is called from the recompiled JR/JALR instructions
void noinline *get_addr(u_int vaddr)
{
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  struct ll_entry *head;
  // 1) look for an already-compiled, clean block on this page
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      // found: promote into the hash table for faster future lookups
      hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
      return head->addr;
    }
    head=head->next;
  }
  // 2) look for a dirty block whose source may still be intact
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr))
      if (verify_dirty(head->addr)) {
        // source matches the compiled code: revalidate it
        invalid_code[vaddr>>12]=0;
        inv_code_start=inv_code_end=~0;
        if(vpage<2048) {
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        struct ht_entry *ht_bin = hash_table_get(vaddr);
        if (ht_bin->vaddr[0] == vaddr)
          ht_bin->tcaddr[0] = head->addr; // Replace existing entry
        else
          hash_table_add(ht_bin, vaddr, head->addr);

        return head->addr;
      }
    }
    head=head->next;
  }
  // 3) nothing usable: compile the block now and retry the lookup
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault execption
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}
568 // Look up address in hash table first
569 void *get_addr_ht(u_int vaddr)
570 {
571   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
572   const struct ht_entry *ht_bin = hash_table_get(vaddr);
573   if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
574   if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
575   return get_addr(vaddr);
576 }
577
578 void clear_all_regs(signed char regmap[])
579 {
580   int hr;
581   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
582 }
583
584 static signed char get_reg(const signed char regmap[],int r)
585 {
586   int hr;
587   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
588   return -1;
589 }
590
591 // Find a register that is available for two consecutive cycles
592 static signed char get_reg2(signed char regmap1[], const signed char regmap2[], int r)
593 {
594   int hr;
595   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
596   return -1;
597 }
598
599 int count_free_regs(signed char regmap[])
600 {
601   int count=0;
602   int hr;
603   for(hr=0;hr<HOST_REGS;hr++)
604   {
605     if(hr!=EXCLUDE_REG) {
606       if(regmap[hr]<0) count++;
607     }
608   }
609   return count;
610 }
611
612 void dirty_reg(struct regstat *cur,signed char reg)
613 {
614   int hr;
615   if(!reg) return;
616   for (hr=0;hr<HOST_REGS;hr++) {
617     if((cur->regmap[hr]&63)==reg) {
618       cur->dirty|=1<<hr;
619     }
620   }
621 }
622
623 static void set_const(struct regstat *cur, signed char reg, uint32_t value)
624 {
625   int hr;
626   if(!reg) return;
627   for (hr=0;hr<HOST_REGS;hr++) {
628     if(cur->regmap[hr]==reg) {
629       cur->isconst|=1<<hr;
630       current_constmap[hr]=value;
631     }
632   }
633 }
634
635 static void clear_const(struct regstat *cur, signed char reg)
636 {
637   int hr;
638   if(!reg) return;
639   for (hr=0;hr<HOST_REGS;hr++) {
640     if((cur->regmap[hr]&63)==reg) {
641       cur->isconst&=~(1<<hr);
642     }
643   }
644 }
645
646 static int is_const(struct regstat *cur, signed char reg)
647 {
648   int hr;
649   if(reg<0) return 0;
650   if(!reg) return 1;
651   for (hr=0;hr<HOST_REGS;hr++) {
652     if((cur->regmap[hr]&63)==reg) {
653       return (cur->isconst>>hr)&1;
654     }
655   }
656   return 0;
657 }
658
659 static uint32_t get_const(struct regstat *cur, signed char reg)
660 {
661   int hr;
662   if(!reg) return 0;
663   for (hr=0;hr<HOST_REGS;hr++) {
664     if(cur->regmap[hr]==reg) {
665       return current_constmap[hr];
666     }
667   }
668   SysPrintf("Unknown constant in r%d\n",reg);
669   abort();
670 }
671
672 // Least soon needed registers
673 // Look at the next ten instructions and see which registers
674 // will be used.  Try not to reallocate these.
// Least soon needed registers
// Look at the next ten instructions and see which registers
// will be used.  Try not to reallocate these.
// hsn[reg] is set to the distance (in insns) until reg is next used;
// smaller = needed sooner.  Temporaries required by insn i itself get 0.
void lsn(u_char hsn[], int i, int *preferred_reg)
{
  int j;
  int b=-1;
  // find how far ahead we can safely scan (stop at block end or
  // an unconditional jump)
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if (is_ujump(i+j))
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
  }
  // walk backwards so the nearest use wins (overwrites farther ones)
  for(;j>=0;j--)
  {
    if(rs1[i+j]) hsn[rs1[i+j]]=j;
    if(rs2[i+j]) hsn[rs2[i+j]]=j;
    if(rt1[i+j]) hsn[rt1[i+j]]=j;
    if(rt2[i+j]) hsn[rt2[i+j]]=j;
    if(itype[i+j]==STORE || itype[i+j]==STORELR) {
      // Stores can allocate zero
      hsn[rs1[i+j]]=j;
      hsn[rs2[i+j]]=j;
    }
    // On some architectures stores need invc_ptr
    #if defined(HOST_IMM8)
    if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
      hsn[INVCP]=j;
    }
    #endif
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
    {
      hsn[CCREG]=j;
      b=j; // remember position of the nearest branch
    }
  }
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        // +2 accounts for the branch and its delay slot
        if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
        if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
        //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
        //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
      }
    }
    // TODO: preferred register based on backward branch
  }
  // Delay slot should preferably not overwrite branch conditions or cycle count
  if (i > 0 && is_jump(i-1)) {
    if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
    if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
    hsn[CCREG]=1;
    // ...or hash tables
    hsn[RHASH]=1;
    hsn[RHTBL]=1;
  }
  // Coprocessor load/store needs FTEMP, even if not declared
  if(itype[i]==C1LS||itype[i]==C2LS) {
    hsn[FTEMP]=0;
  }
  // Load L/R also uses FTEMP as a temporary register
  if(itype[i]==LOADLR) {
    hsn[FTEMP]=0;
  }
  // Also SWL/SWR/SDL/SDR
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
    hsn[FTEMP]=0;
  }
  // Don't remove the miniht registers
  if(itype[i]==UJUMP||itype[i]==RJUMP)
  {
    hsn[RHASH]=0;
    hsn[RHTBL]=0;
  }
}
760
761 // We only want to allocate registers if we're going to use them again soon
// We only want to allocate registers if we're going to use them again soon
// Returns 1 if guest register r is read within roughly the next 9 insns
// (and not marked unneeded before that), 0 otherwise.
int needed_again(int r, int i)
{
  int j;
  int b=-1;
  int rn=10; // distance to next use; 10 means "not seen"

  if (i > 0 && is_ujump(i-1))
  {
    if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
      return 0; // Don't need any registers if exiting the block
  }
  // limit the lookahead window: block end, unconditional jump, or
  // anything that transfers control to the interpreter/handler
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if (is_ujump(i+j))
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
    if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
    {
      break;
    }
  }
  // scan backwards so the nearest use wins
  for(;j>=1;j--)
  {
    if(rs1[i+j]==r) rn=j;
    if(rs2[i+j]==r) rn=j;
    if((unneeded_reg[i+j]>>r)&1) rn=10; // becomes unneeded before use
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
    {
      b=j;
    }
  }
  /*
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int o=rn;
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(!((unneeded_reg[t+j]>>r)&1)) {
          if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
          if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
        }
        else rn=o;
      }
    }
  }*/
  if(rn<10) return 1;
  (void)b; // only used by the disabled branch-following code above
  return 0;
}
823
824 // Try to match register allocations at the end of a loop with those
825 // at the beginning
// Try to match register allocations at the end of a loop with those
// at the beginning
// If a backward branch within the next few insns targets a block entry
// where r is already allocated, return that host register; otherwise
// return the caller's proposed hr unchanged.
int loop_reg(int i, int r, int hr)
{
  int j,k;
  // find the lookahead limit (block end or unconditional jump)
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if (is_ujump(i+j))
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
  }
  k=0;
  if(i>0){
    // include the preceding insn if it was a branch (we may be in
    // its delay slot)
    if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)
      k--;
  }
  for(;k<j;k++)
  {
    assert(r < 64);
    if((unneeded_reg[i+k]>>r)&1) return hr; // r dies first; keep proposal
    if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP))
    {
      // only backward branches (loops) are interesting
      if(ba[i+k]>=start && ba[i+k]<(start+i*4))
      {
        int t=(ba[i+k]-start)>>2;
        int reg=get_reg(regs[t].regmap_entry,r);
        if(reg>=0) return reg;
        //reg=get_reg(regs[t+1].regmap_entry,r);
        //if(reg>=0) return reg;
      }
    }
  }
  return hr;
}
865
866
867 // Allocate every register, preserving source/target regs
868 void alloc_all(struct regstat *cur,int i)
869 {
870   int hr;
871
872   for(hr=0;hr<HOST_REGS;hr++) {
873     if(hr!=EXCLUDE_REG) {
874       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
875          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
876       {
877         cur->regmap[hr]=-1;
878         cur->dirty&=~(1<<hr);
879       }
880       // Don't need zeros
881       if((cur->regmap[hr]&63)==0)
882       {
883         cur->regmap[hr]=-1;
884         cur->dirty&=~(1<<hr);
885       }
886     }
887   }
888 }
889
#ifndef NDEBUG
// Debug-build bookkeeping for the assembler's scratch register:
// acquire/release must be strictly paired; the assert catches
// accidental nested use.
static int host_tempreg_in_use;

static void host_tempreg_acquire(void)
{
  assert(!host_tempreg_in_use);
  host_tempreg_in_use = 1;
}

static void host_tempreg_release(void)
{
  host_tempreg_in_use = 0;
}
#else
// Release builds: the tracking compiles away entirely.
static void host_tempreg_acquire(void) {}
static void host_tempreg_release(void) {}
#endif
907
#ifdef DRC_DBG
extern void gen_interupt();
extern void do_insn_cmp();
// Table mapping well-known helper entry points to printable names,
// used only for debug/trace output.
#define FUNCNAME(f) { f, " " #f }
static const struct {
  void *addr;
  const char *name;
} function_names[] = {
  FUNCNAME(cc_interrupt),
  FUNCNAME(gen_interupt),
  FUNCNAME(get_addr_ht),
  FUNCNAME(get_addr),
  FUNCNAME(jump_handler_read8),
  FUNCNAME(jump_handler_read16),
  FUNCNAME(jump_handler_read32),
  FUNCNAME(jump_handler_write8),
  FUNCNAME(jump_handler_write16),
  FUNCNAME(jump_handler_write32),
  FUNCNAME(invalidate_addr),
  FUNCNAME(jump_to_new_pc),
  FUNCNAME(call_gteStall),
  FUNCNAME(new_dyna_leave),
  FUNCNAME(pcsx_mtc0),
  FUNCNAME(pcsx_mtc0_ds),
  FUNCNAME(do_insn_cmp),
#ifdef __arm__
  FUNCNAME(verify_code),
#endif
};

// Look up a helper's name by address; returns "" when unknown.
static const char *func_name(const void *a)
{
  int i;
  for (i = 0; i < sizeof(function_names)/sizeof(function_names[0]); i++)
    if (function_names[i].addr == a)
      return function_names[i].name;
  return "";
}
#else
#define func_name(x) ""
#endif
949
950 #ifdef __i386__
951 #include "assem_x86.c"
952 #endif
953 #ifdef __x86_64__
954 #include "assem_x64.c"
955 #endif
956 #ifdef __arm__
957 #include "assem_arm.c"
958 #endif
959 #ifdef __aarch64__
960 #include "assem_arm64.c"
961 #endif
962
963 static void *get_trampoline(const void *f)
964 {
965   size_t i;
966
967   for (i = 0; i < ARRAY_SIZE(ndrc->tramp.f); i++) {
968     if (ndrc->tramp.f[i] == f || ndrc->tramp.f[i] == NULL)
969       break;
970   }
971   if (i == ARRAY_SIZE(ndrc->tramp.f)) {
972     SysPrintf("trampoline table is full, last func %p\n", f);
973     abort();
974   }
975   if (ndrc->tramp.f[i] == NULL) {
976     start_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
977     ndrc->tramp.f[i] = f;
978     end_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
979   }
980   return &ndrc->tramp.ops[i];
981 }
982
// Emit a jump to f, routing through a trampoline when f is outside
// the architecture's direct branch range.
static void emit_far_jump(const void *f)
{
  if (!can_jump_or_call(f))
    f = get_trampoline(f);
  emit_jmp(f);
}
993
// Emit a call to f, routing through a trampoline when f is outside
// the architecture's direct call range.
static void emit_far_call(const void *f)
{
  if (!can_jump_or_call(f))
    f = get_trampoline(f);
  emit_call(f);
}
1004
1005 // Add virtual address mapping to linked list
1006 void ll_add(struct ll_entry **head,int vaddr,void *addr)
1007 {
1008   struct ll_entry *new_entry;
1009   new_entry=malloc(sizeof(struct ll_entry));
1010   assert(new_entry!=NULL);
1011   new_entry->vaddr=vaddr;
1012   new_entry->reg_sv_flags=0;
1013   new_entry->addr=addr;
1014   new_entry->next=*head;
1015   *head=new_entry;
1016 }
1017
1018 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
1019 {
1020   ll_add(head,vaddr,addr);
1021   (*head)->reg_sv_flags=reg_sv_flags;
1022 }
1023
1024 // Check if an address is already compiled
1025 // but don't return addresses which are about to expire from the cache
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
void *check_addr(u_int vaddr)
{
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  size_t i;
  // fast path: hash table hit that is neither expiring nor dirty
  for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
    if (ht_bin->vaddr[i] == vaddr)
      if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
        if (isclean(ht_bin->tcaddr[i]))
          return ht_bin->tcaddr[i];
  }
  // slow path: walk the per-page list of compiled blocks
  u_int page=get_page(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while (head != NULL) {
    if (head->vaddr == vaddr) {
      if (doesnt_expire_soon(head->addr)) {
        // Update existing entry with current address
        if (ht_bin->vaddr[0] == vaddr) {
          ht_bin->tcaddr[0] = head->addr;
          return head->addr;
        }
        if (ht_bin->vaddr[1] == vaddr) {
          ht_bin->tcaddr[1] = head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        if (ht_bin->vaddr[0] == -1) {
          ht_bin->vaddr[0] = vaddr;
          ht_bin->tcaddr[0] = head->addr;
        }
        else if (ht_bin->vaddr[1] == -1) {
          ht_bin->vaddr[1] = vaddr;
          ht_bin->tcaddr[1] = head->addr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  return 0; // not compiled (or about to expire)
}
1069
// Remove vaddr from its 2-way hash bin, if present.
// Slot 1 is cleared first so that when slot 0 is the one removed,
// slot 1's (possibly just-cleared) contents can be promoted into slot 0.
void remove_hash(int vaddr)
{
  //printf("remove hash: %x\n",vaddr);
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  if (ht_bin->vaddr[1] == vaddr) {
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
  if (ht_bin->vaddr[0] == vaddr) {
    // promote slot 1 into slot 0, then free slot 1
    ht_bin->vaddr[0] = ht_bin->vaddr[1];
    ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
}
1085
// Remove (and free) all list entries whose translation-cache address lies
// in the (1<<shift)-sized region containing addr.  The second comparison
// also matches entries starting up to MAX_OUTPUT_BLOCK_SIZE before the
// region, since such a block may extend into it.  Hash entries for the
// removed virtual addresses are dropped too.
void ll_remove_matching_addrs(struct ll_entry **head,uintptr_t addr,int shift)
{
  struct ll_entry *next;
  while(*head) {
    if(((uintptr_t)((*head)->addr)>>shift)==(addr>>shift) ||
       ((uintptr_t)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
    {
      inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr);
      remove_hash((*head)->vaddr);
      // unlink in place via the pointer-to-pointer, then free
      next=(*head)->next;
      free(*head);
      *head=next;
    }
    else
    {
      head=&((*head)->next);
    }
  }
}
1105
1106 // Remove all entries from linked list
1107 void ll_clear(struct ll_entry **head)
1108 {
1109   struct ll_entry *cur;
1110   struct ll_entry *next;
1111   if((cur=*head)) {
1112     *head=0;
1113     while(cur) {
1114       next=cur->next;
1115       free(cur);
1116       cur=next;
1117     }
1118   }
1119 }
1120
// Dereference the pointers and remove if it matches
// head is a list of emitted jump sites; for each one whose current
// jump target lies in the expiring (1<<shift)-sized region containing
// addr (or starts up to MAX_OUTPUT_BLOCK_SIZE before it), re-point the
// jump back at its external-jump stub so it gets re-resolved later.
static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift)
{
  while(head) {
    // where does this jump currently point?
    uintptr_t ptr = (uintptr_t)get_pointer(head->addr);
    inv_debug("EXP: Lookup pointer to %lx at %p (%x)\n",(long)ptr,head->addr,head->vaddr);
    if(((ptr>>shift)==(addr>>shift)) ||
       (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
    {
      inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr);
      void *host_addr=find_extjump_insn(head->addr);
      mark_clear_cache(host_addr);
      set_jump_target(host_addr, head->addr);
    }
    head=head->next;
  }
}
1138
1139 // This is called when we write to a compiled block (see do_invstub)
1140 static void invalidate_page(u_int page)
1141 {
1142   struct ll_entry *head;
1143   struct ll_entry *next;
1144   head=jump_in[page];
1145   jump_in[page]=0;
1146   while(head!=NULL) {
1147     inv_debug("INVALIDATE: %x\n",head->vaddr);
1148     remove_hash(head->vaddr);
1149     next=head->next;
1150     free(head);
1151     head=next;
1152   }
1153   head=jump_out[page];
1154   jump_out[page]=0;
1155   while(head!=NULL) {
1156     inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr);
1157     void *host_addr=find_extjump_insn(head->addr);
1158     mark_clear_cache(host_addr);
1159     set_jump_target(host_addr, head->addr);
1160     next=head->next;
1161     free(head);
1162     head=next;
1163   }
1164 }
1165
// Invalidate the pages spanned by the block at 'block' (a 4K page index);
// [first, last] is the page range computed by the caller from the block's
// bounds in the translation cache.
static void invalidate_block_range(u_int block, u_int first, u_int last)
{
  u_int page=get_page(block<<12);
  //printf("first=%d last=%d\n",first,last);
  invalidate_page(page);
  assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
  assert(last<page+5);
  // Invalidate the adjacent pages if a block crosses a 4K boundary
  while(first<page) {
    invalidate_page(first);
    first++;
  }
  // NOTE(review): this loop stops before 'last', so page 'last' itself is
  // only invalidated when last==page — confirm this is intended.
  for(first=page+1;first<last;first++) {
    invalidate_page(first);
  }
  do_clear_cache();

  // Don't trap writes
  invalid_code[block]=1;

  #ifdef USE_MINI_HT
  memset(mini_ht,-1,sizeof(mini_ht));
  #endif
}
1190
// Invalidate the block containing the given 4K page index.  Scans the
// jump_dirty list to widen [first, last] to cover every dirty block that
// overlaps this page, then delegates to invalidate_block_range().
void invalidate_block(u_int block)
{
  u_int page=get_page(block<<12);
  u_int vpage=get_vpage(block<<12);
  inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
  //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
  u_int first,last;
  first=last=page;
  struct ll_entry *head;
  head=jump_dirty[vpage];
  //printf("page=%d vpage=%d\n",page,vpage);
  while(head!=NULL) {
    if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
      u_char *start, *end;
      get_bounds(head->addr, &start, &end);
      //printf("start: %p end: %p\n", start, end);
      // if this RAM-resident block overlaps our page, grow [first, last]
      // to cover all pages it spans
      if (page < 2048 && start >= rdram && end < rdram+RAM_SIZE) {
        if (((start-rdram)>>12) <= page && ((end-1-rdram)>>12) >= page) {
          if ((((start-rdram)>>12)&2047) < first) first = ((start-rdram)>>12)&2047;
          if ((((end-1-rdram)>>12)&2047) > last)  last = ((end-1-rdram)>>12)&2047;
        }
      }
    }
    head=head->next;
  }
  invalidate_block_range(block,first,last);
}
1218
// Invalidate compiled code after a write to 'addr'.  For RAM writes this
// also maintains the global [inv_code_start, inv_code_end] range of
// addresses known to hold no compiled code, so the caller can skip
// future writes that land inside it.
void invalidate_addr(u_int addr)
{
  //static int rhits;
  // this check is done by the caller
  //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
  u_int page=get_vpage(addr);
  if(page<2048) { // RAM
    struct ll_entry *head;
    u_int addr_min=~0, addr_max=0;
    u_int mask=RAM_SIZE-1;
    u_int addr_main=0x80000000|(addr&mask);
    int pg1;
    // start with the whole surrounding 4K page as the candidate
    // code-free range, then shrink it below
    inv_code_start=addr_main&~0xfff;
    inv_code_end=addr_main|0xfff;
    pg1=page;
    if (pg1>0) {
      // must check previous page too because of spans..
      pg1--;
      inv_code_start-=0x1000;
    }
    for(;pg1<=page;pg1++) {
      for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
        u_char *start_h, *end_h;
        u_int start, end;
        get_bounds(head->addr, &start_h, &end_h);
        start = (uintptr_t)start_h - ram_offset;
        end = (uintptr_t)end_h - ram_offset;
        if(start<=addr_main&&addr_main<end) {
          // the write hits this block: widen the range to invalidate
          if(start<addr_min) addr_min=start;
          if(end>addr_max) addr_max=end;
        }
        else if(addr_main<start) {
          // block lies above the write: shrink the code-free range from above
          if(start<inv_code_end)
            inv_code_end=start-1;
        }
        else {
          // block lies below the write: shrink the code-free range from below
          if(end>inv_code_start)
            inv_code_start=end;
        }
      }
    }
    if (addr_min!=~0) {
      // hit compiled code - invalidate it and reset the code-free cache
      inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
      inv_code_start=inv_code_end=~0;
      invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
      return;
    }
    else {
      // miss - publish the code-free range in the write's mirror region
      inv_code_start=(addr&~mask)|(inv_code_start&mask);
      inv_code_end=(addr&~mask)|(inv_code_end&mask);
      inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
      return;
    }
  }
  // non-RAM pages take the simple path
  invalidate_block(addr>>12);
}
1275
1276 // This is called when loading a save state.
1277 // Anything could have changed, so invalidate everything.
1278 void invalidate_all_pages(void)
1279 {
1280   u_int page;
1281   for(page=0;page<4096;page++)
1282     invalidate_page(page);
1283   for(page=0;page<1048576;page++)
1284     if(!invalid_code[page]) {
1285       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1286       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1287     }
1288   #ifdef USE_MINI_HT
1289   memset(mini_ht,-1,sizeof(mini_ht));
1290   #endif
1291   do_clear_cache();
1292 }
1293
// Emit an invalidation stub: saves the live registers, moves the written
// address into argument register 0 (unless it is already there), calls
// invalidate_addr(), restores registers and jumps back to compiled code.
static void do_invstub(int n)
{
  literal_pool(20);
  u_int reglist=stubs[n].a;
  set_jump_target(stubs[n].addr, out);
  save_regs(reglist);
  // stubs[n].b is the host register holding the written address
  if(stubs[n].b!=0) emit_mov(stubs[n].b,0);
  emit_far_call(invalidate_addr);
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
}
1305
1306 // Add an entry to jump_out after making a link
1307 // src should point to code by emit_extjump2()
1308 void add_link(u_int vaddr,void *src)
1309 {
1310   u_int page=get_page(vaddr);
1311   inv_debug("add_link: %p -> %x (%d)\n",src,vaddr,page);
1312   check_extjump2(src);
1313   ll_add(jump_out+page,vaddr,src);
1314   //void *ptr=get_pointer(src);
1315   //inv_debug("add_link: Pointer is to %p\n",ptr);
1316 }
1317
// If a code block was found to be unmodified (bit was set in
// restore_candidate) and it remains unmodified (bit is clear
// in invalid_code) then move the entries for that 4K page from
// the dirty list to the clean list.
void clean_blocks(u_int page)
{
  struct ll_entry *head;
  inv_debug("INV: clean_blocks page=%d\n",page);
  head=jump_dirty[page];
  while(head!=NULL) {
    if(!invalid_code[head->vaddr>>12]) {
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr)) {
        if(verify_dirty(head->addr)) {
          u_char *start, *end;
          //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr);
          u_int i;
          u_int inv=0;
          get_bounds(head->addr, &start, &end);
          // every RAM page the block spans must still hold valid code
          if (start - rdram < RAM_SIZE) {
            for (i = (start-rdram+0x80000000)>>12; i <= (end-1-rdram+0x80000000)>>12; i++) {
              inv|=invalid_code[i];
            }
          }
          else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
            // vaddr beyond the RAM range - never restore
            inv=1;
          }
          if(!inv) {
            void *clean_addr = get_clean_addr(head->addr);
            if (doesnt_expire_soon(clean_addr)) {
              u_int ppage=page;
              inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr);
              //printf("page=%x, addr=%x\n",page,head->vaddr);
              //assert(head->vaddr>>12==(page|0x80000));
              // re-register the clean entry point and refresh the hash bin
              ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
              struct ht_entry *ht_bin = hash_table_get(head->vaddr);
              if (ht_bin->vaddr[0] == head->vaddr)
                ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
              if (ht_bin->vaddr[1] == head->vaddr)
                ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
            }
          }
        }
      }
    }
    head=head->next;
  }
}
1366
1367 /* Register allocation */
1368
// Note: registers are allocated clean (unmodified state)
// if you intend to modify the register, you must call dirty_reg().
// Maps guest register 'reg' to a host register in cur->regmap for
// instruction i, evicting another mapping if necessary.
static void alloc_reg(struct regstat *cur,int i,signed char reg)
{
  int r,hr;
  // default preference: low 3 bits of the guest register number;
  // special registers get fixed preferences
  int preferred_reg = (reg&7);
  if(reg==CCREG) preferred_reg=HOST_CCREG;
  if(reg==PTEMP||reg==FTEMP) preferred_reg=12;

  // Don't allocate unused registers
  if((cur->u>>reg)&1) return;

  // see if it's already allocated
  for(hr=0;hr<HOST_REGS;hr++)
  {
    if(cur->regmap[hr]==reg) return;
  }

  // Keep the same mapping if the register was already allocated in a loop
  preferred_reg = loop_reg(i,reg,preferred_reg);

  // Try to allocate the preferred register
  if(cur->regmap[preferred_reg]==-1) {
    cur->regmap[preferred_reg]=reg;
    cur->dirty&=~(1<<preferred_reg);
    cur->isconst&=~(1<<preferred_reg);
    return;
  }
  // Preferred register is taken; steal it if its current occupant is unneeded
  r=cur->regmap[preferred_reg];
  assert(r < 64);
  if((cur->u>>r)&1) {
    cur->regmap[preferred_reg]=reg;
    cur->dirty&=~(1<<preferred_reg);
    cur->isconst&=~(1<<preferred_reg);
    return;
  }

  // Clear any unneeded registers
  // We try to keep the mapping consistent, if possible, because it
  // makes branches easier (especially loops).  So we try to allocate
  // first (see above) before removing old mappings.  If this is not
  // possible then go ahead and clear out the registers that are no
  // longer needed.
  for(hr=0;hr<HOST_REGS;hr++)
  {
    r=cur->regmap[hr];
    if(r>=0) {
      assert(r < 64);
      if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;}
    }
  }
  // Try to allocate any available register, but prefer
  // registers that have not been used recently.
  if(i>0) {
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
        if(regs[i-1].regmap[hr]!=rs1[i-1]&&regs[i-1].regmap[hr]!=rs2[i-1]&&regs[i-1].regmap[hr]!=rt1[i-1]&&regs[i-1].regmap[hr]!=rt2[i-1]) {
          cur->regmap[hr]=reg;
          cur->dirty&=~(1<<hr);
          cur->isconst&=~(1<<hr);
          return;
        }
      }
    }
  }
  // Try to allocate any available register
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
      cur->regmap[hr]=reg;
      cur->dirty&=~(1<<hr);
      cur->isconst&=~(1<<hr);
      return;
    }
  }

  // Ok, now we have to evict someone
  // Pick a register we hopefully won't need soon
  // hsn = "how soon needed": lower value means needed sooner
  u_char hsn[MAXREG+1];
  memset(hsn,10,sizeof(hsn));
  int j;
  lsn(hsn,i,&preferred_reg);
  //printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",cur->regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]);
  //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
  if(i>0) {
    // Don't evict the cycle count at entry points, otherwise the entry
    // stub will have to write it.
    if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
    if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP)) hsn[CCREG]=2;
    // Evict in order of decreasing "not needed soon" score
    for(j=10;j>=3;j--)
    {
      // Alloc preferred register if available
      if(hsn[r=cur->regmap[preferred_reg]&63]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          // Evict both parts of a 64-bit register
          if((cur->regmap[hr]&63)==r) {
            cur->regmap[hr]=-1;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
          }
        }
        cur->regmap[preferred_reg]=reg;
        return;
      }
      for(r=1;r<=MAXREG;r++)
      {
        // don't evict registers used by the previous instruction
        if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) {
          for(hr=0;hr<HOST_REGS;hr++) {
            if(hr!=HOST_CCREG||j<hsn[CCREG]) {
              if(cur->regmap[hr]==r) {
                cur->regmap[hr]=reg;
                cur->dirty&=~(1<<hr);
                cur->isconst&=~(1<<hr);
                return;
              }
            }
          }
        }
      }
    }
  }
  // Last resort: evict anything, regardless of neighboring instructions
  for(j=10;j>=0;j--)
  {
    for(r=1;r<=MAXREG;r++)
    {
      if(hsn[r]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(cur->regmap[hr]==r) {
            cur->regmap[hr]=reg;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
            return;
          }
        }
      }
    }
  }
  SysPrintf("This shouldn't happen (alloc_reg)");abort();
}
1507
// Allocate a temporary register.  This is done without regard to
// dirty status or whether the register we request is on the unneeded list
// Note: This will only allocate one register, even if called multiple times
static void alloc_reg_temp(struct regstat *cur,int i,signed char reg)
{
  int r,hr;
  int preferred_reg = -1; // temporaries have no preferred host register

  // see if it's already allocated
  for(hr=0;hr<HOST_REGS;hr++)
  {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==reg) return;
  }

  // Try to allocate any available register
  // (scanned top-down, unlike alloc_reg, to reduce collisions)
  for(hr=HOST_REGS-1;hr>=0;hr--) {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
      cur->regmap[hr]=reg;
      cur->dirty&=~(1<<hr);
      cur->isconst&=~(1<<hr);
      return;
    }
  }

  // Find an unneeded register
  for(hr=HOST_REGS-1;hr>=0;hr--)
  {
    r=cur->regmap[hr];
    if(r>=0) {
      assert(r < 64);
      if((cur->u>>r)&1) {
        if(i==0||((unneeded_reg[i-1]>>r)&1)) {
          cur->regmap[hr]=reg;
          cur->dirty&=~(1<<hr);
          cur->isconst&=~(1<<hr);
          return;
        }
      }
    }
  }

  // Ok, now we have to evict someone
  // Pick a register we hopefully won't need soon
  // TODO: we might want to follow unconditional jumps here
  // TODO: get rid of dupe code and make this into a function
  u_char hsn[MAXREG+1];
  memset(hsn,10,sizeof(hsn));
  int j;
  lsn(hsn,i,&preferred_reg);
  //printf("hsn: %d %d %d %d %d %d %d\n",hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
  if(i>0) {
    // Don't evict the cycle count at entry points, otherwise the entry
    // stub will have to write it.
    if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
    if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP)) hsn[CCREG]=2;
    // Evict in order of decreasing "not needed soon" score,
    // avoiding registers used by the previous instruction
    for(j=10;j>=3;j--)
    {
      for(r=1;r<=MAXREG;r++)
      {
        if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) {
          for(hr=0;hr<HOST_REGS;hr++) {
            if(hr!=HOST_CCREG||hsn[CCREG]>2) {
              if(cur->regmap[hr]==r) {
                cur->regmap[hr]=reg;
                cur->dirty&=~(1<<hr);
                cur->isconst&=~(1<<hr);
                return;
              }
            }
          }
        }
      }
    }
  }
  // Last resort: evict anything
  for(j=10;j>=0;j--)
  {
    for(r=1;r<=MAXREG;r++)
    {
      if(hsn[r]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(cur->regmap[hr]==r) {
            cur->regmap[hr]=reg;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
            return;
          }
        }
      }
    }
  }
  SysPrintf("This shouldn't happen");abort();
}
1600
// Register allocation for a MOV-style instruction: only the destination
// needs a host register; the source can be loaded directly.
static void mov_alloc(struct regstat *current,int i)
{
  // Note: Don't need to actually alloc the source registers
  //alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rt1[i]);

  clear_const(current,rs1[i]);
  clear_const(current,rt1[i]);
  dirty_reg(current,rt1[i]);
}
1611
1612 static void shiftimm_alloc(struct regstat *current,int i)
1613 {
1614   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1615   {
1616     if(rt1[i]) {
1617       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1618       else lt1[i]=rs1[i];
1619       alloc_reg(current,i,rt1[i]);
1620       dirty_reg(current,rt1[i]);
1621       if(is_const(current,rs1[i])) {
1622         int v=get_const(current,rs1[i]);
1623         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1624         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1625         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1626       }
1627       else clear_const(current,rt1[i]);
1628     }
1629   }
1630   else
1631   {
1632     clear_const(current,rs1[i]);
1633     clear_const(current,rt1[i]);
1634   }
1635
1636   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1637   {
1638     assert(0);
1639   }
1640   if(opcode2[i]==0x3c) // DSLL32
1641   {
1642     assert(0);
1643   }
1644   if(opcode2[i]==0x3e) // DSRL32
1645   {
1646     assert(0);
1647   }
1648   if(opcode2[i]==0x3f) // DSRA32
1649   {
1650     assert(0);
1651   }
1652 }
1653
1654 static void shift_alloc(struct regstat *current,int i)
1655 {
1656   if(rt1[i]) {
1657     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1658     {
1659       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1660       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1661       alloc_reg(current,i,rt1[i]);
1662       if(rt1[i]==rs2[i]) {
1663         alloc_reg_temp(current,i,-1);
1664         minimum_free_regs[i]=1;
1665       }
1666     } else { // DSLLV/DSRLV/DSRAV
1667       assert(0);
1668     }
1669     clear_const(current,rs1[i]);
1670     clear_const(current,rs2[i]);
1671     clear_const(current,rt1[i]);
1672     dirty_reg(current,rt1[i]);
1673   }
1674 }
1675
// Register allocation for three-operand ALU instructions.
static void alu_alloc(struct regstat *current,int i)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else {
        // one operand is r0 - only allocate sources still needed later
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
    }
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      alloc_reg(current,i,rs1[i]);
      alloc_reg(current,i,rs2[i]);
      alloc_reg(current,i,rt1[i]);
    }
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else
      {
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    assert(0); // 64-bit arithmetic does not exist on this CPU
  }
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  clear_const(current,rt1[i]);
  dirty_reg(current,rt1[i]);
}
1720
// Register allocation for immediate-operand instructions,
// with compile-time constant propagation where the source is known.
static void imm16_alloc(struct regstat *current,int i)
{
  if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  else lt1[i]=rs1[i];
  if(rt1[i]) alloc_reg(current,i,rt1[i]);
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    assert(0); // 64-bit immediates do not exist on this CPU
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(is_const(current,rs1[i])) {
      // fold the logical op on the known constant
      int v=get_const(current,rs1[i]);
      if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
      if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
      if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
    }
    else clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      set_const(current,rt1[i],v+imm[i]);
    }
    else clear_const(current,rt1[i]);
  }
  else {
    // LUI always produces a known constant
    set_const(current,rt1[i],imm[i]<<16); // LUI
  }
  dirty_reg(current,rt1[i]);
}
1754
// Register allocation for load instructions, including the dummy-load
// case (destination r0 or unneeded) where only the address is computed.
static void load_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
  if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  if(rt1[i]&&!((current->u>>rt1[i])&1)) {
    alloc_reg(current,i,rt1[i]);
    assert(get_reg(current->regmap,rt1[i])>=0);
    if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
    {
      assert(0); // 64-bit loads do not exist on this CPU
    }
    else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      assert(0);
    }
    dirty_reg(current,rt1[i]);
    // LWL/LWR need a temporary register for the old value
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP);
      alloc_reg_temp(current,i,-1);
      minimum_free_regs[i]=1;
    }
  }
  else
  {
    // Load to r0 or unneeded register (dummy load)
    // but we still need a register to calculate the address
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
    }
    alloc_reg_temp(current,i,-1);
    minimum_free_regs[i]=1;
    if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      assert(0);
    }
  }
}
1797
// Register allocation for store instructions: address base, store value,
// and a temporary for address generation.
void store_alloc(struct regstat *current,int i)
{
  clear_const(current,rs2[i]);
  if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rs2[i]);
  if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
    assert(0); // 64-bit stores do not exist on this CPU
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else alloc_reg(current,i,INVCP);
  #endif
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
    alloc_reg(current,i,FTEMP);
  }
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1818
// Register allocation for COP1 loads/stores (LWC1/SWC1).
// NOTE(review): unlike c2ls_alloc, this does not set minimum_free_regs[i]
// after allocating the temp — confirm whether that is intentional.
void c1ls_alloc(struct regstat *current,int i)
{
  //clear_const(current,rs1[i]); // FIXME
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,CSREG); // Status
  alloc_reg(current,i,FTEMP);
  if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
    assert(0);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
}
1837
// Register allocation for GTE (COP2) loads/stores (LWC2/SWC2).
void c2ls_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,FTEMP);
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1852
#ifndef multdiv_alloc
// Register allocation for multiply/divide: results go to HI/LO.
// May be overridden by an arch-specific macro of the same name.
void multdiv_alloc(struct regstat *current,int i)
{
  //  case 0x18: MULT
  //  case 0x19: MULTU
  //  case 0x1A: DIV
  //  case 0x1B: DIVU
  //  case 0x1C: DMULT
  //  case 0x1D: DMULTU
  //  case 0x1E: DDIV
  //  case 0x1F: DDIVU
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  if(rs1[i]&&rs2[i])
  {
    if((opcode2[i]&4)==0) // 32-bit
    {
      // HI/LO are written, so force them to be considered needed
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      alloc_reg(current,i,HIREG);
      alloc_reg(current,i,LOREG);
      alloc_reg(current,i,rs1[i]);
      alloc_reg(current,i,rs2[i]);
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
    }
    else // 64-bit
    {
      assert(0); // DMULT/DDIV do not exist on this CPU
    }
  }
  else
  {
    // Multiply by zero is zero.
    // MIPS does not have a divide by zero exception.
    // The result is undefined, we return zero.
    alloc_reg(current,i,HIREG);
    alloc_reg(current,i,LOREG);
    dirty_reg(current,HIREG);
    dirty_reg(current,LOREG);
  }
}
#endif
1896
// Register allocation for COP0 instructions (MFC0/MTC0/RFE).
// These are handled out of line, so all host registers are flushed.
void cop0_alloc(struct regstat *current,int i)
{
  if(opcode2[i]==0) // MFC0
  {
    if(rt1[i]) {
      clear_const(current,rt1[i]);
      alloc_all(current,i);
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
      alloc_all(current,i);
    }
    else {
      alloc_all(current,i); // FIXME: Keep r0
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
  }
  else
  {
    // TLBR/TLBWI/TLBWR/TLBP/ERET
    assert(opcode2[i]==0x10);
    alloc_all(current,i);
  }
  // everything was flushed, so no registers are guaranteed free
  minimum_free_regs[i]=HOST_REGS;
}
1929
// Register allocation for GTE (COP2) register moves (MFC2/CFC2/MTC2/CTC2).
static void cop2_alloc(struct regstat *current,int i)
{
  if (opcode2[i] < 3) // MFC2/CFC2
  {
    // reads from the GTE may stall, so the cycle count must be live
    alloc_cc(current,i); // for stalls
    dirty_reg(current,CCREG);
    if(rt1[i]){
      clear_const(current,rt1[i]);
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  else if (opcode2[i] > 3) // MTC2/CTC2
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
    }
    else {
      // writing r0: make sure r0 is allocatable
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
  }
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1956
// Register allocation for GTE command instructions: cycle count (for
// stall accounting) plus one scratch register.
void c2op_alloc(struct regstat *current,int i)
{
  alloc_cc(current,i); // for stalls
  dirty_reg(current,CCREG);
  alloc_reg_temp(current,i,-1);
}
1963
// Register allocation for SYSCALL/BREAK: flush everything, since the
// exception handler is entered with no registers cached.
void syscall_alloc(struct regstat *current,int i)
{
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  alloc_all(current,i);
  minimum_free_regs[i]=HOST_REGS;
  current->isconst=0;
}
1972
// Register allocation for the instruction in a branch delay slot,
// dispatching to the per-type allocator.  A branch in the delay slot
// is invalid; it disables speculative precompilation instead of dying.
void delayslot_alloc(struct regstat *current,int i)
{
  switch(itype[i]) {
    case UJUMP:
    case CJUMP:
    case SJUMP:
    case RJUMP:
    case SYSCALL:
    case HLECALL:
    case SPAN:
      assem_debug("jump in the delay slot.  this shouldn't happen.\n");//abort();
      SysPrintf("Disabled speculative precompilation\n");
      stop_after_jal=1;
      break;
    case IMM16:
      imm16_alloc(current,i);
      break;
    case LOAD:
    case LOADLR:
      load_alloc(current,i);
      break;
    case STORE:
    case STORELR:
      store_alloc(current,i);
      break;
    case ALU:
      alu_alloc(current,i);
      break;
    case SHIFT:
      shift_alloc(current,i);
      break;
    case MULTDIV:
      multdiv_alloc(current,i);
      break;
    case SHIFTIMM:
      shiftimm_alloc(current,i);
      break;
    case MOV:
      mov_alloc(current,i);
      break;
    case COP0:
      cop0_alloc(current,i);
      break;
    case COP1:
      // COP1 ops other than loads/stores need no registers here
      break;
    case COP2:
      cop2_alloc(current,i);
      break;
    case C1LS:
      c1ls_alloc(current,i);
      break;
    case C2LS:
      c2ls_alloc(current,i);
      break;
    case C2OP:
      c2op_alloc(current,i);
      break;
  }
}
2032
// Special case where a branch and delay slot span two pages in virtual memory
// All registers are claimed and the branch operands are kept live so the
// split can be stitched together at runtime.
static void pagespan_alloc(struct regstat *current,int i)
{
  current->isconst=0;
  current->wasconst=0;
  regs[i].wasconst=0;
  minimum_free_regs[i]=HOST_REGS;
  alloc_all(current,i);
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  if(opcode[i]==3) // JAL
  {
    // link register receives the return address
    alloc_reg(current,i,31);
    dirty_reg(current,31);
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    alloc_reg(current,i,rs1[i]);
    if (rt1[i]!=0) {
      // JALR with a non-zero link target
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(rs2[i]) alloc_reg(current,i,rs2[i]);
  }
  else
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
  }
  //else ...
}
2068
2069 static void add_stub(enum stub_type type, void *addr, void *retaddr,
2070   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
2071 {
2072   assert(stubcount < ARRAY_SIZE(stubs));
2073   stubs[stubcount].type = type;
2074   stubs[stubcount].addr = addr;
2075   stubs[stubcount].retaddr = retaddr;
2076   stubs[stubcount].a = a;
2077   stubs[stubcount].b = b;
2078   stubs[stubcount].c = c;
2079   stubs[stubcount].d = d;
2080   stubs[stubcount].e = e;
2081   stubcount++;
2082 }
2083
// Convenience wrapper for add_stub() that packs the common "instruction
// index + address register + register state + cycle adjust + reglist"
// payload used by the memory access stubs.
static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
  int i, int addr_reg, const struct regstat *i_regs, int ccadj, u_int reglist)
{
  add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
}
2089
2090 // Write out a single register
2091 static void wb_register(signed char r,signed char regmap[],uint64_t dirty)
2092 {
2093   int hr;
2094   for(hr=0;hr<HOST_REGS;hr++) {
2095     if(hr!=EXCLUDE_REG) {
2096       if((regmap[hr]&63)==r) {
2097         if((dirty>>hr)&1) {
2098           assert(regmap[hr]<64);
2099           emit_storereg(r,hr);
2100         }
2101       }
2102     }
2103   }
2104 }
2105
2106 static void wb_valid(signed char pre[],signed char entry[],u_int dirty_pre,u_int dirty,uint64_t u)
2107 {
2108   //if(dirty_pre==dirty) return;
2109   int hr,reg;
2110   for(hr=0;hr<HOST_REGS;hr++) {
2111     if(hr!=EXCLUDE_REG) {
2112       reg=pre[hr];
2113       if(((~u)>>(reg&63))&1) {
2114         if(reg>0) {
2115           if(((dirty_pre&~dirty)>>hr)&1) {
2116             if(reg>0&&reg<34) {
2117               emit_storereg(reg,hr);
2118             }
2119             else if(reg>=64) {
2120               assert(0);
2121             }
2122           }
2123         }
2124       }
2125     }
2126   }
2127 }
2128
// Move two values into the first two argument registers (host r0/r1),
// handling the case where the sources occupy each other's destination.
// A source of -1 means "already in place / not needed".  Trashes r2.
static void pass_args(int src0, int src1)
{
  if (src0 == 1 && src1 == 0) {
    // the two values would overwrite each other - rotate through r2
    emit_mov(src0, 2);
    emit_mov(src1, 1);
    emit_mov(2, 0);
  }
  else if (src0 != 0 && src1 == 0) {
    // second value sits in r0: move it out first, then fill r0
    emit_mov(src1, 1);
    if (src0 >= 0)
      emit_mov(src0, 0);
  }
  else {
    // no interference - plain moves, skipping values already in place
    if (src0 > 0)
      emit_mov(src0, 0);
    if (src1 >= 0 && src1 != 1)
      emit_mov(src1, 1);
  }
}
2145
// Generate host code for the MIPS R-type ALU group:
// ADD/ADDU/SUB/SUBU, SLT/SLTU, AND/OR/XOR/NOR.  64-bit variants
// (DADD... etc.) assert, as they cannot occur on this 32-bit target.
// When an operand is r0 the operation is reduced to a move/negate/zero.
static void alu_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      signed char s1,s2,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      if(t>=0) {
        s1=get_reg(i_regs->regmap,rs1[i]);
        s2=get_reg(i_regs->regmap,rs2[i]);
        if(rs1[i]&&rs2[i]) {
          assert(s1>=0);
          assert(s2>=0);
          // bit 1 of opcode2 distinguishes SUB(U) from ADD(U)
          if(opcode2[i]&2) emit_sub(s1,s2,t);
          else emit_add(s1,s2,t);
        }
        else if(rs1[i]) {
          // rs2 is r0: rt = rs1 (+/- 0)
          if(s1>=0) emit_mov(s1,t);
          else emit_loadreg(rs1[i],t);
        }
        else if(rs2[i]) {
          // rs1 is r0: rt = 0 - rs2 for SUB, rt = rs2 for ADD
          if(s2>=0) {
            if(opcode2[i]&2) emit_neg(s2,t);
            else emit_mov(s2,t);
          }
          else {
            emit_loadreg(rs2[i],t);
            if(opcode2[i]&2) emit_neg(t,t);
          }
        }
        else emit_zeroreg(t); // both operands r0
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    assert(0);
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      signed char s1l,s2l,t;
      {
        t=get_reg(i_regs->regmap,rt1[i]);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs2[i]==0) // rx<r0
          {
            if(opcode2[i]==0x2a&&rs1[i]!=0) { // SLT
              assert(s1l>=0);
              // signed x < 0 is just the sign bit
              emit_shrimm(s1l,31,t);
            }
            else // SLTU (unsigned can not be less than zero, 0<0)
              emit_zeroreg(t);
          }
          else if(rs1[i]==0) // r0<rx
          {
            assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_gz32(s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz32(s2l,t);
          }
          else{
            assert(s1l>=0);assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_if_less32(s1l,s2l,t);
            else // SLTU
              emit_set_if_carry32(s1l,s2l,t);
          }
        }
      }
    }
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      signed char s1l,s2l,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      {
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs1[i]&&rs2[i]) {
            assert(s1l>=0);
            assert(s2l>=0);
            if(opcode2[i]==0x24) { // AND
              emit_and(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x25) { // OR
              emit_or(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_not(tl,tl);
            }
          }
          else
          {
            // at least one operand is r0 - fold to mov/not/const
            if(opcode2[i]==0x24) { // AND
              emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
              if(rs1[i]){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
              }
              else emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              if(rs1[i]){
                if(s1l>=0) emit_not(s1l,tl);
                else {
                  emit_loadreg(rs1[i],tl);
                  emit_not(tl,tl);
                }
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_not(s2l,tl);
                else {
                  emit_loadreg(rs2[i],tl);
                  emit_not(tl,tl);
                }
              }
              else emit_movimm(-1,tl); // NOR r0,r0 = ~0
            }
          }
        }
      }
    }
  }
}
2285
// Generate host code for the I-type immediate group:
// LUI, ADDI/ADDIU, DADDI/DADDIU, SLTI/SLTIU, ANDI/ORI/XORI.
// Known-constant sources (wasconst) are folded at compile time, and
// results that are themselves known constants (isconst) are skipped.
void imm16_assemble(int i,struct regstat *i_regs)
{
  if (opcode[i]==0x0f) { // LUI
    if(rt1[i]) {
      signed char t;
      t=get_reg(i_regs->regmap,rt1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(!((i_regs->isconst>>t)&1))
          emit_movimm(imm[i]<<16,t);
      }
    }
  }
  if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      if(rs1[i]) {
        //assert(t>=0);
        //assert(s>=0);
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1)) {
            if(s<0) {
              // source not mapped: reload into the target reg first
              if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
              emit_addimm(t,imm[i],t);
            }else{
              if(!((i_regs->wasconst>>s)&1))
                emit_addimm(s,imm[i],t);
              else
                // constant source: fold the addition at compile time
                emit_movimm(constmap[i][s]+imm[i],t);
            }
          }
        }
      } else {
        // rs1 is r0: rt = sign-extended immediate
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1))
            emit_movimm(imm[i],t);
        }
      }
    }
  }
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    if(rt1[i]) {
      signed char sl,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]) {
          assert(sl>=0);
          emit_addimm(sl,imm[i],tl);
        } else {
          emit_movimm(imm[i],tl);
        }
      }
    }
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    if(rt1[i]) {
      //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
      signed char sl,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(rs1[i]>0) {
            if(opcode[i]==0x0a) { // SLTI
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_slti32(t,imm[i],t);
              }else{
                emit_slti32(sl,imm[i],t);
              }
            }
            else { // SLTIU
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_sltiu32(t,imm[i],t);
              }else{
                emit_sltiu32(sl,imm[i],t);
              }
            }
        }else{
          // SLTI(U) with r0 is just stupid,
          // nonetheless examples can be found
          // (brace-less nesting: the first 'else' pairs with the inner
          //  'if', the second with the outer - result is 0<imm for SLTI)
          if(opcode[i]==0x0a) // SLTI
            if(0<imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          else // SLTIU
          {
            if(imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          }
        }
      }
    }
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(rt1[i]) {
      signed char sl,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
        if(opcode[i]==0x0c) //ANDI
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
              emit_andimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_andimm(sl,imm[i],tl);
              else
                emit_movimm(constmap[i][sl]&imm[i],tl);
            }
          }
          else
            emit_zeroreg(tl); // r0 & imm = 0
        }
        else
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
            }
            if(opcode[i]==0x0d) { // ORI
              if(sl<0) {
                emit_orimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_orimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]|imm[i],tl);
              }
            }
            if(opcode[i]==0x0e) { // XORI
              if(sl<0) {
                emit_xorimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_xorimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]^imm[i],tl);
              }
            }
          }
          else {
            // r0 | imm and r0 ^ imm both reduce to the immediate
            emit_movimm(imm[i],tl);
          }
        }
      }
    }
  }
}
2440
// Generate host code for shift-by-immediate: SLL/SRL/SRA.
// 64-bit variants (DSLL... etc.) assert, as they cannot occur on this
// 32-bit target.  A shift amount of zero becomes a plain move.
void shiftimm_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0&&!((i_regs->isconst>>t)&1)){
        if(rs1[i]==0)
        {
          emit_zeroreg(t); // shifting r0 always yields 0
        }
        else
        {
          // source not mapped: reload it into the target register
          if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
          if(imm[i]) {
            if(opcode2[i]==0) // SLL
            {
              emit_shlimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==2) // SRL
            {
              emit_shrimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==3) // SRA
            {
              emit_sarimm(s<0?t:s,imm[i],t);
            }
          }else{
            // Shift by zero
            if(s>=0 && s!=t) emit_mov(s,t);
          }
        }
      }
      //emit_storereg(rt1[i],t); //DEBUG
    }
  }
  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    assert(0);
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    assert(0);
  }
}
2497
2498 #ifndef shift_assemble
// Generate host code for shift-by-register: SLLV/SRLV/SRAV.
// MIPS only uses the low 5 bits of the shift amount, so it is masked
// with 31 in a temp before the host shift is emitted.
static void shift_assemble(int i,struct regstat *i_regs)
{
  signed char s,t,shift;
  if (rt1[i] == 0)
    return; // result discarded
  assert(opcode2[i]<=0x07); // SLLV/SRLV/SRAV
  t = get_reg(i_regs->regmap, rt1[i]);
  s = get_reg(i_regs->regmap, rs1[i]);
  shift = get_reg(i_regs->regmap, rs2[i]);
  if (t < 0)
    return; // target not allocated

  if(rs1[i]==0)
    emit_zeroreg(t); // shifting r0 always yields 0
  else if(rs2[i]==0) {
    // shift amount is r0: plain move
    assert(s>=0);
    if(s!=t) emit_mov(s,t);
  }
  else {
    host_tempreg_acquire();
    emit_andimm(shift,31,HOST_TEMPREG); // MIPS ignores upper shift bits
    switch(opcode2[i]) {
    case 4: // SLLV
      emit_shl(s,HOST_TEMPREG,t);
      break;
    case 6: // SRLV
      emit_shr(s,HOST_TEMPREG,t);
      break;
    case 7: // SRAV
      emit_sar(s,HOST_TEMPREG,t);
      break;
    default:
      assert(0);
    }
    host_tempreg_release();
  }
}
2536
2537 #endif
2538
// Coarse classification of guest addresses, produced by
// get_ptr_mem_type() and consumed by emit_fastpath_cmp_jump().
enum {
  MTYPE_8000 = 0, // default 0x80000000 RAM region / needs full check
  MTYPE_8020,     // RAM mirror at 0x80200000+
  MTYPE_0000,     // RAM mirror at 0x00000000
  MTYPE_A000,     // RAM mirror at 0xa0000000
  MTYPE_1F80,     // scratchpad at 0x1f800000
};
2546
2547 static int get_ptr_mem_type(u_int a)
2548 {
2549   if(a < 0x00200000) {
2550     if(a<0x1000&&((start>>20)==0xbfc||(start>>24)==0xa0))
2551       // return wrong, must use memhandler for BIOS self-test to pass
2552       // 007 does similar stuff from a00 mirror, weird stuff
2553       return MTYPE_8000;
2554     return MTYPE_0000;
2555   }
2556   if(0x1f800000 <= a && a < 0x1f801000)
2557     return MTYPE_1F80;
2558   if(0x80200000 <= a && a < 0x80800000)
2559     return MTYPE_8020;
2560   if(0xa0000000 <= a && a < 0xa0200000)
2561     return MTYPE_A000;
2562   return MTYPE_8000;
2563 }
2564
// Emit the address check for a memory access fast path.  Uses the
// speculated value of the base register (smrv) when available to pick
// a mirror-specific address rewrite; otherwise falls back to a generic
// RAM_SIZE bounds check.  Returns the slow-path branch location to be
// patched by the stub, or NULL when no branch was emitted.  May leave
// HOST_TEMPREG holding a rewritten address via *addr_reg_override.
static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override)
{
  void *jaddr = NULL;
  int type=0;
  int mr=rs1[i];
  if(((smrv_strong|smrv_weak)>>mr)&1) {
    // we have a speculated value for the base register
    type=get_ptr_mem_type(smrv[mr]);
    //printf("set %08x @%08x r%d %d\n", smrv[mr], start+i*4, mr, type);
  }
  else {
    // use the mirror we are running on
    type=get_ptr_mem_type(start);
    //printf("set nospec   @%08x r%d %d\n", start+i*4, mr, type);
  }

  // For the RAM mirrors, rewrite the address into the canonical region
  // and fall through to the ordinary RAM check (type reset to 0).
  if(type==MTYPE_8020) { // RAM 80200000+ mirror
    host_tempreg_acquire();
    emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_0000) { // RAM 0 mirror
    host_tempreg_acquire();
    emit_orimm(addr,0x80000000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_A000) { // RAM A mirror
    host_tempreg_acquire();
    emit_andimm(addr,~0x20000000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_1F80) { // scratchpad
    if (psxH == (void *)0x1f800000) {
      // scratchpad is mapped at its guest address: range-check it directly
      host_tempreg_acquire();
      emit_xorimm(addr,0x1f800000,HOST_TEMPREG);
      emit_cmpimm(HOST_TEMPREG,0x1000);
      host_tempreg_release();
      jaddr=out;
      emit_jc(0);
    }
    else {
      // do the usual RAM check, jump will go to the right handler
      type=0;
    }
  }

  if(type==0)
  {
    emit_cmpimm(addr,RAM_SIZE);
    jaddr=out;
    #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
    // Hint to branch predictor that the branch is unlikely to be taken
    if(rs1[i]>=28)
      emit_jno_unlikely(0);
    else
    #endif
      emit_jno(0);
    if(ram_offset!=0) {
      // RAM is not mapped at guest address 0: add the host offset
      host_tempreg_acquire();
      emit_addimm(addr,ram_offset,HOST_TEMPREG);
      addr=*addr_reg_override=HOST_TEMPREG;
    }
  }

  return jaddr;
}
2633
2634 // return memhandler, or get directly accessable address and return 0
2635 static void *get_direct_memhandler(void *table, u_int addr,
2636   enum stub_type type, uintptr_t *addr_host)
2637 {
2638   uintptr_t l1, l2 = 0;
2639   l1 = ((uintptr_t *)table)[addr>>12];
2640   if ((l1 & (1ul << (sizeof(l1)*8-1))) == 0) {
2641     uintptr_t v = l1 << 1;
2642     *addr_host = v + addr;
2643     return NULL;
2644   }
2645   else {
2646     l1 <<= 1;
2647     if (type == LOADB_STUB || type == LOADBU_STUB || type == STOREB_STUB)
2648       l2 = ((uintptr_t *)l1)[0x1000/4 + 0x1000/2 + (addr&0xfff)];
2649     else if (type == LOADH_STUB || type == LOADHU_STUB || type == STOREH_STUB)
2650       l2=((uintptr_t *)l1)[0x1000/4 + (addr&0xfff)/2];
2651     else
2652       l2=((uintptr_t *)l1)[(addr&0xfff)/4];
2653     if ((l2 & (1<<31)) == 0) {
2654       uintptr_t v = l2 << 1;
2655       *addr_host = v + (addr&0xfff);
2656       return NULL;
2657     }
2658     return (void *)(l2 << 1);
2659   }
2660 }
2661
2662 static u_int get_host_reglist(const signed char *regmap)
2663 {
2664   u_int reglist = 0, hr;
2665   for (hr = 0; hr < HOST_REGS; hr++) {
2666     if (hr != EXCLUDE_REG && regmap[hr] >= 0)
2667       reglist |= 1 << hr;
2668   }
2669   return reglist;
2670 }
2671
// Return reglist with registers r1 and r2 removed; a negative register
// number means "nothing to remove".
static u_int reglist_exclude(u_int reglist, int r1, int r2)
{
  u_int drop = 0;
  if (r1 >= 0)
    drop |= 1u << r1;
  if (r2 >= 0)
    drop |= 1u << r2;
  return reglist & ~drop;
}
2680
2681 // find a temp caller-saved register not in reglist (so assumed to be free)
2682 static int reglist_find_free(u_int reglist)
2683 {
2684   u_int free_regs = ~reglist & CALLER_SAVE_REGS;
2685   if (free_regs == 0)
2686     return -1;
2687   return __builtin_ctz(free_regs);
2688 }
2689
// Generate host code for LB/LH/LW/LBU/LHU (LWU/LD assert on this
// 32-bit target).  Emits an inline RAM fast path plus a slow-path stub;
// a known-constant address outside RAM goes through inline_readstub
// instead.  Loads to r0 (or with no allocated target) are still
// performed into a temp because the read may have side effects (FIFO).
static void load_assemble(int i, const struct regstat *i_regs)
{
  int s,tl,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastio_reg_override=-1;
  u_int reglist=get_host_reglist(i_regs->regmap);
  tl=get_reg(i_regs->regmap,rt1[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  offset=imm[i];
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(s>=0) {
    // c: address is a known constant; memtarget: it points into RAM
    c=(i_regs->wasconst>>s)&1;
    if (c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  // FIXME: Even if the load is a NOP, we should check for pagefaults...
  if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
    ||rt1[i]==0) {
      // could be FIFO, must perform the read
      // ||dummy read
      assem_debug("(forced read)\n");
      tl=get_reg(i_regs->regmap,-1);
      assert(tl>=0);
  }
  if(offset||s<0||c) addr=tl;
  else addr=s;
  //if(tl<0) tl=get_reg(i_regs->regmap,-1);
 if(tl>=0) {
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
  reglist&=~(1<<tl);
  if(!c) {
    #ifdef R29_HACK
    // Strmnnrmn's speed hack
    if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
    #endif
    {
      // unknown address: emit the bounds check / mirror rewrite
      jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
    }
  }
  else if(ram_offset&&memtarget) {
    // constant RAM address but RAM not mapped at 0: apply host offset
    host_tempreg_acquire();
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastio_reg_override=HOST_TEMPREG;
  }
  int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
  if (opcode[i]==0x20) { // LB
    if(!c||memtarget) {
      if(!dummy) {
        {
          int x=0,a=tl;
          if(!c) a=addr;
          if(fastio_reg_override>=0) a=fastio_reg_override;

          emit_movsbl_indexed(x,a,tl);
        }
      }
      if(jaddr)
        add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x21) { // LH
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_movswl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x23) { // LW
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_readword_indexed(0,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x24) { // LBU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;

        emit_movzbl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x25) { // LHU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_movzwl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x27) { // LWU
    assert(0);
  }
  if (opcode[i]==0x37) { // LD
    assert(0);
  }
 }
 if (fastio_reg_override == HOST_TEMPREG)
   host_tempreg_release();
}
2825
2826 #ifndef loadlr_assemble
// Generate host code for the unaligned loads LWL/LWR (LDL/LDR assert).
// Reads the aligned word containing the target byte, then shifts and
// merges it into the destination register by the sub-word offset kept
// in 'temp' (offset-in-bits, computed as addr<<3).
static void loadlr_assemble(int i, const struct regstat *i_regs)
{
  int s,tl,temp,temp2,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastio_reg_override=-1;
  u_int reglist=get_host_reglist(i_regs->regmap);
  tl=get_reg(i_regs->regmap,rt1[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,-1);
  temp2=get_reg(i_regs->regmap,FTEMP);
  addr=get_reg(i_regs->regmap,AGEN1+(i&1));
  assert(addr<0);
  offset=imm[i];
  reglist|=1<<temp;
  // NOTE(review): c is still 0 here - it is only assigned below, so this
  // condition effectively tests offset||s<0.  Verify against upstream.
  if(offset||s<0||c) addr=temp2;
  else addr=s;
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1;
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  if(!c) {
    // temp = bit offset within the word, temp2 = aligned address
    emit_shlimm(addr,3,temp);
    if (opcode[i]==0x22||opcode[i]==0x26) {
      emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR
    }else{
      emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR
    }
    jaddr=emit_fastpath_cmp_jump(i,temp2,&fastio_reg_override);
  }
  else {
    if(ram_offset&&memtarget) {
      host_tempreg_acquire();
      emit_addimm(temp2,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    // constant address: the bit offset is known at compile time
    if (opcode[i]==0x22||opcode[i]==0x26) {
      emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR
    }else{
      emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR
    }
  }
  if (opcode[i]==0x22||opcode[i]==0x26) { // LWL/LWR
    if(!c||memtarget) {
      int a=temp2;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_readword_indexed(0,a,temp2);
      if(fastio_reg_override==HOST_TEMPREG) host_tempreg_release();
      if(jaddr) add_stub_r(LOADW_STUB,jaddr,out,i,temp2,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj[i],reglist);
    if(rt1[i]) {
      assert(tl>=0);
      // merge: shift loaded word into place, clear the covered part of
      // the old value, then OR the pieces together
      emit_andimm(temp,24,temp);
      if (opcode[i]==0x22) // LWL
        emit_xorimm(temp,24,temp);
      host_tempreg_acquire();
      emit_movimm(-1,HOST_TEMPREG);
      if (opcode[i]==0x26) {
        emit_shr(temp2,temp,temp2);
        emit_bic_lsr(tl,HOST_TEMPREG,temp,tl);
      }else{
        emit_shl(temp2,temp,temp2);
        emit_bic_lsl(tl,HOST_TEMPREG,temp,tl);
      }
      host_tempreg_release();
      emit_or(temp2,tl,tl);
    }
    //emit_storereg(rt1[i],tl); // DEBUG
  }
  if (opcode[i]==0x1A||opcode[i]==0x1B) { // LDL/LDR
    assert(0);
  }
}
2905 #endif
2906
// Generate host code for SB/SH/SW (SD asserts).  Emits a RAM fast path
// plus a slow-path stub, an SMC check against invalid_code for stores
// into RAM, and - for constant addresses - detection of stores that hit
// the block currently being compiled.
void store_assemble(int i, const struct regstat *i_regs)
{
  int s,tl;
  int addr,temp;
  int offset;
  void *jaddr=0;
  enum stub_type type;
  int memtarget=0,c=0;
  int agr=AGEN1+(i&1);
  int fastio_reg_override=-1;
  u_int reglist=get_host_reglist(i_regs->regmap);
  tl=get_reg(i_regs->regmap,rs2[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    // c: address is a known constant; memtarget: it points into RAM
    c=(i_regs->wasconst>>s)&1;
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  assert(temp>=0);
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(offset||s<0||c) addr=temp;
  else addr=s;
  if(!c) {
    jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
  }
  else if(ram_offset&&memtarget) {
    // constant RAM address but RAM not mapped at 0: apply host offset
    host_tempreg_acquire();
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastio_reg_override=HOST_TEMPREG;
  }

  if (opcode[i]==0x28) { // SB
    if(!c||memtarget) {
      int x=0,a=temp;
      if(!c) a=addr;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writebyte_indexed(tl,x,a);
    }
    type=STOREB_STUB;
  }
  if (opcode[i]==0x29) { // SH
    if(!c||memtarget) {
      int x=0,a=temp;
      if(!c) a=addr;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writehword_indexed(tl,x,a);
    }
    type=STOREH_STUB;
  }
  if (opcode[i]==0x2B) { // SW
    if(!c||memtarget) {
      int a=addr;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
    type=STOREW_STUB;
  }
  if (opcode[i]==0x3F) { // SD
    assert(0);
    type=STORED_STUB;
  }
  if(fastio_reg_override==HOST_TEMPREG)
    host_tempreg_release();
  if(jaddr) {
    // PCSX store handlers don't check invcode again
    reglist|=1<<addr;
    add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    jaddr=0;
  }
  // SMC check: if the written page holds compiled code, call the
  // invalidation routine
  if(!(i_regs->waswritten&(1<<rs1[i])) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
    if(!c||memtarget) {
      #ifdef DESTRUCTIVE_SHIFT
      // The x86 shift operation is 'destructive'; it overwrites the
      // source register, so we need to make a copy first and use that.
      addr=temp;
      #endif
      #if defined(HOST_IMM8)
      int ir=get_reg(i_regs->regmap,INVCP);
      assert(ir>=0);
      emit_cmpmem_indexedsr12_reg(ir,addr,1);
      #else
      emit_cmpmem_indexedsr12_imm(invalid_code,addr,1);
      #endif
      #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
      emit_callne(invalidate_addr_reg[addr]);
      #else
      void *jaddr2 = out;
      emit_jne(0);
      add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
      #endif
    }
  }
  // NOTE(review): when s<0 this reads constmap[i][-1]; the value is only
  // used under 'c' (which implies s>=0) - confirm this is intentional
  u_int addr_val=constmap[i][s]+offset;
  // NOTE(review): jaddr was cleared above, so this add_stub_r appears
  // unreachable; only the inline_writestub branch can fire here
  if(jaddr) {
    add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
  } else if(c&&!memtarget) {
    inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
  }
  // basic current block modification detection..
  // not looking back as that should be in mips cache already
  // (see Spyro2 title->attract mode)
  if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
    SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
    assert(i_regs->regmap==regs[i].regmap); // not delay slot
    if(i_regs->regmap==regs[i].regmap) {
      // flush state and restart execution after this store
      load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
      wb_dirtys(regs[i].regmap_entry,regs[i].wasdirty);
      emit_movimm(start+i*4+4,0);
      emit_writeword(0,&pcaddr);
      emit_addimm(HOST_CCREG,2,HOST_CCREG);
      emit_far_call(get_addr_ht);
      emit_jmpreg(0);
    }
  }
}
3027
// Assemble SWL/SWR (unaligned store word left/right).
// The value to store is in host reg tl, the address in temp (or s).
// After xoring the low address bits with 3, the code branches into one
// of four alignment cases, each writing only the byte/halfword/word
// slice the alignment requires; out-of-RAM targets go to STORELR_STUB.
static void storelr_assemble(int i, const struct regstat *i_regs)
{
  int s,tl;
  int temp;
  int offset;
  void *jaddr=0;
  void *case1, *case2, *case3;
  void *done0, *done1, *done2;
  int memtarget=0,c=0;
  int agr=AGEN1+(i&1);
  u_int reglist=get_host_reglist(i_regs->regmap);
  tl=get_reg(i_regs->regmap,rs2[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    c=(i_regs->isconst>>s)&1;
    if(c) {
      // constant address: decide at assembly time whether it hits RAM
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  assert(temp>=0);
  if(!c) {
    // runtime range check; jno falls through to the stub on miss
    emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
    if(!offset&&s!=temp) emit_mov(s,temp);
    jaddr=out;
    emit_jno(0);
  }
  else
  {
    if(!memtarget||!rs1[i]) {
      // known miss (or base reg $zero): always take the stub path
      jaddr=out;
      emit_jmp(0);
    }
  }
  if(ram_offset)
    emit_addimm_no_flags(ram_offset,temp);

  if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
    assert(0);
  }

  // dispatch on the two low address bits (xored with 3)
  emit_xorimm(temp,3,temp);
  emit_testimm(temp,2);
  case2=out;
  emit_jne(0);
  emit_testimm(temp,1);
  case1=out;
  emit_jne(0);
  // 0
  if (opcode[i]==0x2A) { // SWL
    emit_writeword_indexed(tl,0,temp);
  }
  else if (opcode[i]==0x2E) { // SWR
    emit_writebyte_indexed(tl,3,temp);
  }
  else
    assert(0);
  done0=out;
  emit_jmp(0);
  // 1
  set_jump_target(case1, out);
  if (opcode[i]==0x2A) { // SWL
    // Write 3 msb into three least significant bytes
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writebyte_indexed(tl,1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  done1=out;
  emit_jmp(0);
  // 2
  set_jump_target(case2, out);
  emit_testimm(temp,1);
  case3=out;
  emit_jne(0);
  if (opcode[i]==0x2A) { // SWL
    // Write two msb into two least significant bytes
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writehword_indexed(tl,-2,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(rs2[i]) emit_rorimm(tl,24,tl);
  }
  done2=out;
  emit_jmp(0);
  // 3
  set_jump_target(case3, out);
  if (opcode[i]==0x2A) { // SWL
    // Write msb into least significant byte
    if(rs2[i]) emit_rorimm(tl,24,tl);
    emit_writebyte_indexed(tl,-3,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  set_jump_target(done0, out);
  set_jump_target(done1, out);
  set_jump_target(done2, out);
  if(!c||!memtarget)
    add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
  // self-modifying-code check: compare against the invalid_code bitmap
  // and invalidate the written-to block if it held translated code
  if(!(i_regs->waswritten&(1<<rs1[i])) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
    emit_addimm_no_flags(-ram_offset,temp);
    #if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,temp,1);
    #else
    emit_cmpmem_indexedsr12_imm(invalid_code,temp,1);
    #endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[temp]);
    #else
    void *jaddr2 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
    #endif
  }
}
3161
// Assemble COP0 instructions: MFC0 (opcode2==0), MTC0 (opcode2==4),
// RFE (opcode2==0x10).  MTC0 to timing/interrupt-related registers
// syncs the cycle counter to psxRegs and calls out to the C handler.
static void cop0_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]==0) // MFC0
  {
    signed char t=get_reg(i_regs->regmap,rt1[i]);
    u_int copr=(source[i]>>11)&0x1f;
    //assert(t>=0); // Why does this happen?  OOT is weird
    if(t>=0&&rt1[i]!=0) {
      emit_readword(&reg_cop0[copr],t);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    signed char s=get_reg(i_regs->regmap,rs1[i]);
    char copr=(source[i]>>11)&0x1f;
    assert(s>=0);
    wb_register(rs1[i],i_regs->regmap,i_regs->dirty);
    if(copr==9||copr==11||copr==12||copr==13) {
      // sync Count = last_count + cc + ccadj before the C call so the
      // handler sees an up-to-date cycle counter
      emit_readword(&last_count,HOST_TEMPREG);
      emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc
      emit_add(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
      emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
      emit_writeword(HOST_CCREG,&Count);
    }
    // What a mess.  The status register (12) can enable interrupts,
    // so needs a special case to handle a pending interrupt.
    // The interrupt must be taken immediately, because a subsequent
    // instruction might disable interrupts again.
    if(copr==12||copr==13) {
      if (is_delayslot) {
        // burn cycles to cause cc_interrupt, which will
        // reschedule next_interupt. Relies on CCREG from above.
        assem_debug("MTC0 DS %d\n", copr);
        emit_writeword(HOST_CCREG,&last_count);
        emit_movimm(0,HOST_CCREG);
        emit_storereg(CCREG,HOST_CCREG);
        emit_loadreg(rs1[i],1);
        emit_movimm(copr,0);
        emit_far_call(pcsx_mtc0_ds);
        emit_loadreg(rs1[i],s);
        return;
      }
      // record resume pc and clear pending_exception so the handler
      // can flag a new one
      emit_movimm(start+i*4+4,HOST_TEMPREG);
      emit_writeword(HOST_TEMPREG,&pcaddr);
      emit_movimm(0,HOST_TEMPREG);
      emit_writeword(HOST_TEMPREG,&pending_exception);
    }
    // pcsx_mtc0(reg_num_in_r0, value_in_r1)
    if(s==HOST_CCREG)
      emit_loadreg(rs1[i],1);
    else if(s!=1)
      emit_mov(s,1);
    emit_movimm(copr,0);
    emit_far_call(pcsx_mtc0);
    if(copr==9||copr==11||copr==12||copr==13) {
      // reload cc = Count - next_interupt - ccadj after the call
      emit_readword(&Count,HOST_CCREG);
      emit_readword(&next_interupt,HOST_TEMPREG);
      emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
      emit_sub(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
      emit_writeword(HOST_TEMPREG,&last_count);
      emit_storereg(CCREG,HOST_CCREG);
    }
    if(copr==12||copr==13) {
      // if the write raised an exception, leave the block via pcaddr
      assert(!is_delayslot);
      emit_readword(&pending_exception,14);
      emit_test(14,14);
      void *jaddr = out;
      emit_jeq(0);
      emit_readword(&pcaddr, 0);
      emit_addimm(HOST_CCREG,2,HOST_CCREG);
      emit_far_call(get_addr_ht);
      emit_jmpreg(0);
      set_jump_target(jaddr, out);
    }
    emit_loadreg(rs1[i],s);
  }
  else
  {
    assert(opcode2[i]==0x10);
    //if((source[i]&0x3f)==0x10) // RFE
    {
      // Status = (Status & ~0xf) | ((Status & 0x3c) >> 2)
      // i.e. pop the interrupt/mode bit stack
      emit_readword(&Status,0);
      emit_andimm(0,0x3c,1);
      emit_andimm(0,~0xf,0);
      emit_orrshr_imm(1,2,0);
      emit_writeword(0,&Status);
    }
  }
}
3250
3251 static void cop1_unusable(int i,struct regstat *i_regs)
3252 {
3253   // XXX: should just just do the exception instead
3254   //if(!cop1_usable)
3255   {
3256     void *jaddr=out;
3257     emit_jmp(0);
3258     add_stub_r(FP_STUB,jaddr,out,i,0,i_regs,is_delayslot,0);
3259   }
3260 }
3261
// COP1 ops are handled as unusable; defer to cop1_unusable().
static void cop1_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3266
// COP1 loads/stores (LWC1/SWC1) are handled as unusable too.
static void c1ls_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3271
// FP_STUB
// Out-of-line path for cop1_unusable: write back the JIT register
// state, put the faulting PC in EAX, and jump to the C exception
// handler (fp_exception_ds when the op was in a delay slot).
static void do_cop1stub(int n)
{
  literal_pool(256);
  assem_debug("do_cop1stub %x\n",start+stubs[n].a*4);
  set_jump_target(stubs[n].addr, out);
  int i=stubs[n].a;
//  int rs=stubs[n].b;
  struct regstat *i_regs=(struct regstat *)stubs[n].c;
  int ds=stubs[n].d;
  if(!ds) {
    load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
    //if(i_regs!=&regs[i]) printf("oops: regs[i]=%x i_regs=%x",(int)&regs[i],(int)i_regs);
  }
  //else {printf("fp exception in delay slot\n");}
  wb_dirtys(i_regs->regmap_entry,i_regs->wasdirty);
  if(regs[i].regmap_entry[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
  emit_movimm(start+(i-ds)*4,EAX); // Get PC
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
  emit_far_jump(ds?fp_exception_ds:fp_exception);
}
3293
3294 static int cop2_is_stalling_op(int i, int *cycles)
3295 {
3296   if (opcode[i] == 0x3a) { // SWC2
3297     *cycles = 0;
3298     return 1;
3299   }
3300   if (itype[i] == COP2 && (opcode2[i] == 0 || opcode2[i] == 2)) { // MFC2/CFC2
3301     *cycles = 0;
3302     return 1;
3303   }
3304   if (itype[i] == C2OP) {
3305     *cycles = gte_cycletab[source[i] & 0x3f];
3306     return 1;
3307   }
3308   // ... what about MTC2/CTC2/LWC2?
3309   return 0;
3310 }
3311
#if 0
// Debug-only (disabled): runtime tracing of GTE stall cycles.
static void log_gte_stall(int stall, u_int cycle)
{
  if ((u_int)stall <= 44)
    printf("x    stall %2d %u\n", stall, cycle + last_count);
 if (cycle + last_count > 1215348544) exit(1);
}

// Emits a call to log_gte_stall; passes the compile-time stall when
// known (> 0), otherwise the runtime value left in HOST_TEMPREG.
static void emit_log_gte_stall(int i, int stall, u_int reglist)
{
  save_regs(reglist);
  if (stall > 0)
    emit_movimm(stall, 0);
  else
    emit_mov(HOST_TEMPREG, 0);
  emit_addimm(HOST_CCREG, CLOCK_ADJUST(ccadj[i]), 1);
  emit_far_call(log_gte_stall);
  restore_regs(reglist);
}
#endif
3332
3333 static void cop2_call_stall_check(u_int op, int i, const struct regstat *i_regs, u_int reglist)
3334 {
3335   int j = i, other_gte_op_cycles = -1, stall = -MAXBLOCK, cycles_passed;
3336   int rtmp = reglist_find_free(reglist);
3337
3338   if (HACK_ENABLED(NDHACK_GTE_NO_STALL))
3339     return;
3340   //assert(get_reg(i_regs->regmap, CCREG) == HOST_CCREG);
3341   if (get_reg(i_regs->regmap, CCREG) != HOST_CCREG) {
3342     // happens occasionally... cc evicted? Don't bother then
3343     //printf("no cc %08x\n", start + i*4);
3344     return;
3345   }
3346   if (!bt[i]) {
3347     for (j = i - 1; j >= 0; j--) {
3348       //if (is_ds[j]) break;
3349       if (cop2_is_stalling_op(j, &other_gte_op_cycles) || bt[j])
3350         break;
3351     }
3352   }
3353   cycles_passed = CLOCK_ADJUST(ccadj[i] - ccadj[j]);
3354   if (other_gte_op_cycles >= 0)
3355     stall = other_gte_op_cycles - cycles_passed;
3356   else if (cycles_passed >= 44)
3357     stall = 0; // can't stall
3358   if (stall == -MAXBLOCK && rtmp >= 0) {
3359     // unknown stall, do the expensive runtime check
3360     assem_debug("; cop2_call_stall_check\n");
3361 #if 0 // too slow
3362     save_regs(reglist);
3363     emit_movimm(gte_cycletab[op], 0);
3364     emit_addimm(HOST_CCREG, CLOCK_ADJUST(ccadj[i]), 1);
3365     emit_far_call(call_gteStall);
3366     restore_regs(reglist);
3367 #else
3368     host_tempreg_acquire();
3369     emit_readword(&psxRegs.gteBusyCycle, rtmp);
3370     emit_addimm(rtmp, -CLOCK_ADJUST(ccadj[i]), rtmp);
3371     emit_sub(rtmp, HOST_CCREG, HOST_TEMPREG);
3372     emit_cmpimm(HOST_TEMPREG, 44);
3373     emit_cmovb_reg(rtmp, HOST_CCREG);
3374     //emit_log_gte_stall(i, 0, reglist);
3375     host_tempreg_release();
3376 #endif
3377   }
3378   else if (stall > 0) {
3379     //emit_log_gte_stall(i, stall, reglist);
3380     emit_addimm(HOST_CCREG, stall, HOST_CCREG);
3381   }
3382
3383   // save gteBusyCycle, if needed
3384   if (gte_cycletab[op] == 0)
3385     return;
3386   other_gte_op_cycles = -1;
3387   for (j = i + 1; j < slen; j++) {
3388     if (cop2_is_stalling_op(j, &other_gte_op_cycles))
3389       break;
3390     if (is_jump(j)) {
3391       // check ds
3392       if (j + 1 < slen && cop2_is_stalling_op(j + 1, &other_gte_op_cycles))
3393         j++;
3394       break;
3395     }
3396   }
3397   if (other_gte_op_cycles >= 0)
3398     // will handle stall when assembling that op
3399     return;
3400   cycles_passed = CLOCK_ADJUST(ccadj[min(j, slen -1)] - ccadj[i]);
3401   if (cycles_passed >= 44)
3402     return;
3403   assem_debug("; save gteBusyCycle\n");
3404   host_tempreg_acquire();
3405 #if 0
3406   emit_readword(&last_count, HOST_TEMPREG);
3407   emit_add(HOST_TEMPREG, HOST_CCREG, HOST_TEMPREG);
3408   emit_addimm(HOST_TEMPREG, CLOCK_ADJUST(ccadj[i]), HOST_TEMPREG);
3409   emit_addimm(HOST_TEMPREG, gte_cycletab[op]), HOST_TEMPREG);
3410   emit_writeword(HOST_TEMPREG, &psxRegs.gteBusyCycle);
3411 #else
3412   emit_addimm(HOST_CCREG, CLOCK_ADJUST(ccadj[i]) + gte_cycletab[op], HOST_TEMPREG);
3413   emit_writeword(HOST_TEMPREG, &psxRegs.gteBusyCycle);
3414 #endif
3415   host_tempreg_release();
3416 }
3417
// Emit code to read GTE data register 'copr' into host reg tl.
// Several registers are not stored in their architectural read form
// and are fixed up on access; temp is a scratch host reg for the
// helper cases.
static void cop2_get_dreg(u_int copr,signed char tl,signed char temp)
{
  switch (copr) {
    case 1:
    case 3:
    case 5:
    case 8:
    case 9:
    case 10:
    case 11:
      // these read back sign-extended from 16 bits; the normalized
      // value is also written back to storage
      emit_readword(&reg_cop2d[copr],tl);
      emit_signextend16(tl,tl);
      emit_writeword(tl,&reg_cop2d[copr]); // hmh
      break;
    case 7:
    case 16:
    case 17:
    case 18:
    case 19:
      // these read back zero-extended from 16 bits
      emit_readword(&reg_cop2d[copr],tl);
      emit_andimm(tl,0xffff,tl);
      emit_writeword(tl,&reg_cop2d[copr]);
      break;
    case 15:
      // SXYP mirrors SXY2
      emit_readword(&reg_cop2d[14],tl); // SXY2
      emit_writeword(tl,&reg_cop2d[copr]);
      break;
    case 28:
    case 29:
      // value is recomputed by the helper (IRGB/ORGB handling)
      c2op_mfc2_29_assemble(tl,temp);
      break;
    default:
      emit_readword(&reg_cop2d[copr],tl);
      break;
  }
}
3454
// Emit code to write host reg sl into GTE data register 'copr',
// applying the side effects some registers have on write; temp is a
// scratch host reg.
static void cop2_put_dreg(u_int copr,signed char sl,signed char temp)
{
  switch (copr) {
    case 15:
      // writing SXYP pushes the SXY FIFO: SXY0<-SXY1, SXY1<-SXY2,
      // SXY2 (and 15) <- new value
      emit_readword(&reg_cop2d[13],temp);  // SXY1
      emit_writeword(sl,&reg_cop2d[copr]);
      emit_writeword(temp,&reg_cop2d[12]); // SXY0
      emit_readword(&reg_cop2d[14],temp);  // SXY2
      emit_writeword(sl,&reg_cop2d[14]);
      emit_writeword(temp,&reg_cop2d[13]); // SXY1
      break;
    case 28:
      // unpack the three 5-bit colour fields into regs 9/10/11
      emit_andimm(sl,0x001f,temp);
      emit_shlimm(temp,7,temp);
      emit_writeword(temp,&reg_cop2d[9]);
      emit_andimm(sl,0x03e0,temp);
      emit_shlimm(temp,2,temp);
      emit_writeword(temp,&reg_cop2d[10]);
      emit_andimm(sl,0x7c00,temp);
      emit_shrimm(temp,3,temp);
      emit_writeword(temp,&reg_cop2d[11]);
      emit_writeword(sl,&reg_cop2d[28]);
      break;
    case 30:
      // writing LZCS computes the leading-zero/one count into reg 31:
      // xor with the sign bit folds the negative case into clz
      emit_xorsar_imm(sl,sl,31,temp);
#if defined(HAVE_ARMV5) || defined(__aarch64__)
      emit_clz(temp,temp);
#else
      // manual count-leading-zeros loop for hosts without clz
      emit_movs(temp,HOST_TEMPREG);
      emit_movimm(0,temp);
      emit_jeq((int)out+4*4);
      emit_addpl_imm(temp,1,temp);
      emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG);
      emit_jns((int)out-2*4);
#endif
      emit_writeword(sl,&reg_cop2d[30]);
      emit_writeword(temp,&reg_cop2d[31]);
      break;
    case 31:
      // read-only; writes are ignored
      break;
    default:
      emit_writeword(sl,&reg_cop2d[copr]);
      break;
  }
}
3500
// Assemble LWC2 (0x32) / SWC2 (0x3a): word transfers between memory
// and GTE data registers.  FTEMP carries the value; misses go through
// LOADW_STUB/STOREW_STUB, and SWC2 also gets the SMC invalidation
// check like other stores.
static void c2ls_assemble(int i, const struct regstat *i_regs)
{
  int s,tl;
  int ar;
  int offset;
  int memtarget=0,c=0;
  void *jaddr2=NULL;
  enum stub_type type;
  int agr=AGEN1+(i&1);
  int fastio_reg_override=-1;
  u_int reglist=get_host_reglist(i_regs->regmap);
  u_int copr=(source[i]>>16)&0x1f;
  s=get_reg(i_regs->regmap,rs1[i]);
  tl=get_reg(i_regs->regmap,FTEMP);
  offset=imm[i];
  assert(rs1[i]>0);
  assert(tl>=0);

  if(i_regs->regmap[HOST_CCREG]==CCREG)
    reglist&=~(1<<HOST_CCREG);

  // get the address
  if (opcode[i]==0x3a) { // SWC2
    ar=get_reg(i_regs->regmap,agr);
    if(ar<0) ar=get_reg(i_regs->regmap,-1);
    reglist|=1<<ar;
  } else { // LWC2
    ar=tl;
  }
  if(s>=0) c=(i_regs->wasconst>>s)&1;
  memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
  if (!offset&&!c&&s>=0) ar=s;
  assert(ar>=0);

  if (opcode[i]==0x3a) { // SWC2
    // SWC2 reads the GTE, so it may have to wait for a busy GTE first
    cop2_call_stall_check(0, i, i_regs, reglist_exclude(reglist, tl, -1));
    cop2_get_dreg(copr,tl,-1);
    type=STOREW_STUB;
  }
  else
    type=LOADW_STUB;

  if(c&&!memtarget) {
    // known miss: always take the stub
    jaddr2=out;
    emit_jmp(0); // inline_readstub/inline_writestub?
  }
  else {
    if(!c) {
      jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
    }
    else if(ram_offset&&memtarget) {
      host_tempreg_acquire();
      emit_addimm(ar,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    if (opcode[i]==0x32) { // LWC2
      int a=ar;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_readword_indexed(0,a,tl);
    }
    if (opcode[i]==0x3a) { // SWC2
      #ifdef DESTRUCTIVE_SHIFT
      if(!offset&&!c&&s>=0) emit_mov(s,ar);
      #endif
      int a=ar;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
  }
  if(fastio_reg_override==HOST_TEMPREG)
    host_tempreg_release();
  if(jaddr2)
    add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
  // SMC check (stores only): invalidate a translated block if written
  if(opcode[i]==0x3a) // SWC2
  if(!(i_regs->waswritten&(1<<rs1[i])) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
#if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,ar,1);
#else
    emit_cmpmem_indexedsr12_imm(invalid_code,ar,1);
#endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[ar]);
    #else
    void *jaddr3 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
    #endif
  }
  if (opcode[i]==0x32) { // LWC2
    host_tempreg_acquire();
    cop2_put_dreg(copr,tl,HOST_TEMPREG);
    host_tempreg_release();
  }
}
3597
3598 static void cop2_assemble(int i, const struct regstat *i_regs)
3599 {
3600   u_int copr = (source[i]>>11) & 0x1f;
3601   signed char temp = get_reg(i_regs->regmap, -1);
3602
3603   if (opcode2[i] == 0 || opcode2[i] == 2) { // MFC2/CFC2
3604     if (!HACK_ENABLED(NDHACK_GTE_NO_STALL)) {
3605       signed char tl = get_reg(i_regs->regmap, rt1[i]);
3606       u_int reglist = reglist_exclude(get_host_reglist(i_regs->regmap), tl, temp);
3607       cop2_call_stall_check(0, i, i_regs, reglist);
3608     }
3609   }
3610   if (opcode2[i]==0) { // MFC2
3611     signed char tl=get_reg(i_regs->regmap,rt1[i]);
3612     if(tl>=0&&rt1[i]!=0)
3613       cop2_get_dreg(copr,tl,temp);
3614   }
3615   else if (opcode2[i]==4) { // MTC2
3616     signed char sl=get_reg(i_regs->regmap,rs1[i]);
3617     cop2_put_dreg(copr,sl,temp);
3618   }
3619   else if (opcode2[i]==2) // CFC2
3620   {
3621     signed char tl=get_reg(i_regs->regmap,rt1[i]);
3622     if(tl>=0&&rt1[i]!=0)
3623       emit_readword(&reg_cop2c[copr],tl);
3624   }
3625   else if (opcode2[i]==6) // CTC2
3626   {
3627     signed char sl=get_reg(i_regs->regmap,rs1[i]);
3628     switch(copr) {
3629       case 4:
3630       case 12:
3631       case 20:
3632       case 26:
3633       case 27:
3634       case 29:
3635       case 30:
3636         emit_signextend16(sl,temp);
3637         break;
3638       case 31:
3639         c2op_ctc2_31_assemble(sl,temp);
3640         break;
3641       default:
3642         temp=sl;
3643         break;
3644     }
3645     emit_writeword(temp,&reg_cop2c[copr]);
3646     assert(sl>=0);
3647   }
3648 }
3649
// Out-of-line path for SWL/SWR that missed the fast path: save live
// regs and call the C write handlers (jump_handle_swl/swr), adjusting
// the cycle count around the call.  The #else branch is an old
// inline read-modify-write implementation kept for reference.
static void do_unalignedwritestub(int n)
{
  assem_debug("do_unalignedwritestub %x\n",start+stubs[n].a*4);
  literal_pool(256);
  set_jump_target(stubs[n].addr, out);

  int i=stubs[n].a;
  struct regstat *i_regs=(struct regstat *)stubs[n].c;
  int addr=stubs[n].b;
  u_int reglist=stubs[n].e;
  signed char *i_regmap=i_regs->regmap;
  int temp2=get_reg(i_regmap,FTEMP);
  int rt;
  rt=get_reg(i_regmap,rs2[i]);
  assert(rt>=0);
  assert(addr>=0);
  assert(opcode[i]==0x2a||opcode[i]==0x2e); // SWL/SWR only implemented
  reglist|=(1<<addr);
  reglist&=~(1<<temp2);

#if 1
  // don't bother with it and call write handler
  save_regs(reglist);
  pass_args(addr,rt);
  int cc=get_reg(i_regmap,CCREG);
  if(cc<0)
    emit_loadreg(CCREG,2);
  // advance cc for the handler, then undo the adjustment afterwards
  emit_addimm(cc<0?2:cc,CLOCK_ADJUST((int)stubs[n].d+1),2);
  emit_far_call((opcode[i]==0x2a?jump_handle_swl:jump_handle_swr));
  emit_addimm(0,-CLOCK_ADJUST((int)stubs[n].d+1),cc<0?2:cc);
  if(cc<0)
    emit_storereg(CCREG,2);
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
#else
  emit_andimm(addr,0xfffffffc,temp2);
  emit_writeword(temp2,&address);

  save_regs(reglist);
  emit_shrimm(addr,16,1);
  int cc=get_reg(i_regmap,CCREG);
  if(cc<0) {
    emit_loadreg(CCREG,2);
  }
  emit_movimm((u_int)readmem,0);
  emit_addimm(cc<0?2:cc,2*stubs[n].d+2,2);
  emit_call((int)&indirect_jump_indexed);
  restore_regs(reglist);

  emit_readword(&readmem_dword,temp2);
  int temp=addr; //hmh
  emit_shlimm(addr,3,temp);
  emit_andimm(temp,24,temp);
  if (opcode[i]==0x2a) // SWL
    emit_xorimm(temp,24,temp);
  emit_movimm(-1,HOST_TEMPREG);
  if (opcode[i]==0x2a) { // SWL
    emit_bic_lsr(temp2,HOST_TEMPREG,temp,temp2);
    emit_orrshr(rt,temp,temp2);
  }else{
    emit_bic_lsl(temp2,HOST_TEMPREG,temp,temp2);
    emit_orrshl(rt,temp,temp2);
  }
  emit_readword(&address,addr);
  emit_writeword(temp2,&word);
  //save_regs(reglist); // don't need to, no state changes
  emit_shrimm(addr,16,1);
  emit_movimm((u_int)writemem,0);
  //emit_call((int)&indirect_jump_indexed);
  emit_mov(15,14);
  emit_readword_dualindexedx4(0,1,15);
  emit_readword(&Count,HOST_TEMPREG);
  emit_readword(&next_interupt,2);
  emit_addimm(HOST_TEMPREG,-2*stubs[n].d-2,HOST_TEMPREG);
  emit_writeword(2,&last_count);
  emit_sub(HOST_TEMPREG,2,cc<0?HOST_TEMPREG:cc);
  if(cc<0) {
    emit_storereg(CCREG,HOST_TEMPREG);
  }
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
#endif
}
3733
#ifndef multdiv_assemble
// Fallback when the target arch provides no MULT/DIV assembler;
// reaching this at runtime is a porting/build-configuration error.
void multdiv_assemble(int i,struct regstat *i_regs)
{
  printf("Need multdiv_assemble for this architecture.\n");
  abort();
}
#endif
3741
3742 static void mov_assemble(int i,struct regstat *i_regs)
3743 {
3744   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3745   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3746   if(rt1[i]) {
3747     signed char sl,tl;
3748     tl=get_reg(i_regs->regmap,rt1[i]);
3749     //assert(tl>=0);
3750     if(tl>=0) {
3751       sl=get_reg(i_regs->regmap,rs1[i]);
3752       if(sl>=0) emit_mov(sl,tl);
3753       else emit_loadreg(rs1[i],tl);
3754     }
3755   }
3756 }
3757
// call interpreter, exception handler, things that change pc/regs/cycles ...
// Syncs psxRegs.pc and psxRegs.cycle (= last_count + cc + ccadj) from
// the JIT state, calls func, then leaves the block via jump_to_new_pc.
static void call_c_cpu_handler(int i, const struct regstat *i_regs, u_int pc, void *func)
{
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  (void)ccreg;

  emit_movimm(pc,3); // Get PC
  emit_readword(&last_count,2);
  emit_writeword(3,&psxRegs.pc);
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
  emit_add(2,HOST_CCREG,2);
  emit_writeword(2,&psxRegs.cycle);
  emit_far_call(func);
  emit_far_jump(jump_to_new_pc);
}
3775
// Assemble SYSCALL: raise the exception via psxException with cause
// code 0x20, flagged as not in a delay slot.
static void syscall_assemble(int i,struct regstat *i_regs)
{
  emit_movimm(0x20,0); // cause code
  emit_movimm(0,1);    // not in delay slot
  call_c_cpu_handler(i,i_regs,start+i*4,psxException);
}
3782
3783 static void hlecall_assemble(int i,struct regstat *i_regs)
3784 {
3785   void *hlefunc = psxNULL;
3786   uint32_t hleCode = source[i] & 0x03ffffff;
3787   if (hleCode < ARRAY_SIZE(psxHLEt))
3788     hlefunc = psxHLEt[hleCode];
3789
3790   call_c_cpu_handler(i,i_regs,start+i*4+4,hlefunc);
3791 }
3792
// Hand this instruction to the interpreter (execI).
static void intcall_assemble(int i,struct regstat *i_regs)
{
  call_c_cpu_handler(i,i_regs,start+i*4,execI);
}
3797
3798 static void speculate_mov(int rs,int rt)
3799 {
3800   if(rt!=0) {
3801     smrv_strong_next|=1<<rt;
3802     smrv[rt]=smrv[rs];
3803   }
3804 }
3805
3806 static void speculate_mov_weak(int rs,int rt)
3807 {
3808   if(rt!=0) {
3809     smrv_weak_next|=1<<rt;
3810     smrv[rt]=smrv[rs];
3811   }
3812 }
3813
// Update the speculative register value tracking (smrv) for
// instruction i.  Speculated values are used elsewhere to guess load/
// store targets; "strong" means derived from constants, "weak" means
// carried over from block entry.
static void speculate_register_values(int i)
{
  if(i==0) {
    // block entry: seed from the real register file
    memcpy(smrv,psxRegs.GPR.r,sizeof(smrv));
    // gp,sp are likely to stay the same throughout the block
    smrv_strong_next=(1<<28)|(1<<29)|(1<<30);
    smrv_weak_next=~smrv_strong_next;
    //printf(" llr %08x\n", smrv[4]);
  }
  smrv_strong=smrv_strong_next;
  smrv_weak=smrv_weak_next;
  switch(itype[i]) {
    case ALU:
      // result speculated from whichever source has a known value
      if     ((smrv_strong>>rs1[i])&1) speculate_mov(rs1[i],rt1[i]);
      else if((smrv_strong>>rs2[i])&1) speculate_mov(rs2[i],rt1[i]);
      else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]);
      else if((smrv_weak>>rs2[i])&1) speculate_mov_weak(rs2[i],rt1[i]);
      else {
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
    case SHIFTIMM:
      smrv_strong_next&=~(1<<rt1[i]);
      smrv_weak_next&=~(1<<rt1[i]);
      // fallthrough
    case IMM16:
      if(rt1[i]&&is_const(&regs[i],rt1[i])) {
        // constant propagation gives an exact value
        int value,hr=get_reg(regs[i].regmap,rt1[i]);
        if(hr>=0) {
          if(get_final_value(hr,i,&value))
               smrv[rt1[i]]=value;
          else smrv[rt1[i]]=constmap[i][hr];
          smrv_strong_next|=1<<rt1[i];
        }
      }
      else {
        if     ((smrv_strong>>rs1[i])&1) speculate_mov(rs1[i],rt1[i]);
        else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]);
      }
      break;
    case LOAD:
      if(start<0x2000&&(rt1[i]==26||(smrv[rt1[i]]>>24)==0xa0)) {
        // special case for BIOS
        smrv[rt1[i]]=0xa0000000;
        smrv_strong_next|=1<<rt1[i];
        break;
      }
      // fallthrough
    case SHIFT:
    case LOADLR:
    case MOV:
      // result unpredictable: forget anything known about the dest
      smrv_strong_next&=~(1<<rt1[i]);
      smrv_weak_next&=~(1<<rt1[i]);
      break;
    case COP0:
    case COP2:
      if(opcode2[i]==0||opcode2[i]==2) { // MFC/CFC
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
    case C2LS:
      if (opcode[i]==0x32) { // LWC2
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
  }
#if 0
  int r=4;
  printf("x %08x %08x %d %d c %08x %08x\n",smrv[r],start+i*4,
    ((smrv_strong>>r)&1),(smrv_weak>>r)&1,regs[i].isconst,regs[i].wasconst);
#endif
}
3889
// Assemble the instruction sitting in a branch delay slot.
// Sets is_delayslot around the dispatch so the per-type assemblers
// can special-case delay-slot behavior; branch types are invalid here.
static void ds_assemble(int i,struct regstat *i_regs)
{
  speculate_register_values(i);
  is_delayslot=1;
  switch(itype[i]) {
    case ALU:
      alu_assemble(i,i_regs);break;
    case IMM16:
      imm16_assemble(i,i_regs);break;
    case SHIFT:
      shift_assemble(i,i_regs);break;
    case SHIFTIMM:
      shiftimm_assemble(i,i_regs);break;
    case LOAD:
      load_assemble(i,i_regs);break;
    case LOADLR:
      loadlr_assemble(i,i_regs);break;
    case STORE:
      store_assemble(i,i_regs);break;
    case STORELR:
      storelr_assemble(i,i_regs);break;
    case COP0:
      cop0_assemble(i,i_regs);break;
    case COP1:
      cop1_assemble(i,i_regs);break;
    case C1LS:
      c1ls_assemble(i,i_regs);break;
    case COP2:
      cop2_assemble(i,i_regs);break;
    case C2LS:
      c2ls_assemble(i,i_regs);break;
    case C2OP:
      c2op_assemble(i,i_regs);break;
    case MULTDIV:
      multdiv_assemble(i,i_regs);break;
    case MOV:
      mov_assemble(i,i_regs);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  is_delayslot=0;
}
3939
3940 // Is the branch target a valid internal jump?
3941 static int internal_branch(int addr)
3942 {
3943   if(addr&1) return 0; // Indirect (register) jump
3944   if(addr>=start && addr<start+slen*4-4)
3945   {
3946     return 1;
3947   }
3948   return 0;
3949 }
3950
3951 static void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t u)
3952 {
3953   int hr;
3954   for(hr=0;hr<HOST_REGS;hr++) {
3955     if(hr!=EXCLUDE_REG) {
3956       if(pre[hr]!=entry[hr]) {
3957         if(pre[hr]>=0) {
3958           if((dirty>>hr)&1) {
3959             if(get_reg(entry,pre[hr])<0) {
3960               assert(pre[hr]<64);
3961               if(!((u>>pre[hr])&1))
3962                 emit_storereg(pre[hr],hr);
3963             }
3964           }
3965         }
3966       }
3967     }
3968   }
3969   // Move from one register to another (no writeback)
3970   for(hr=0;hr<HOST_REGS;hr++) {
3971     if(hr!=EXCLUDE_REG) {
3972       if(pre[hr]!=entry[hr]) {
3973         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3974           int nr;
3975           if((nr=get_reg(entry,pre[hr]))>=0) {
3976             emit_mov(hr,nr);
3977           }
3978         }
3979       }
3980     }
3981   }
3982 }
3983
3984 // Load the specified registers
3985 // This only loads the registers given as arguments because
3986 // we don't want to load things that will be overwritten
3987 static void load_regs(signed char entry[],signed char regmap[],int rs1,int rs2)
3988 {
3989   int hr;
3990   // Load 32-bit regs
3991   for(hr=0;hr<HOST_REGS;hr++) {
3992     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3993       if(entry[hr]!=regmap[hr]) {
3994         if(regmap[hr]==rs1||regmap[hr]==rs2)
3995         {
3996           if(regmap[hr]==0) {
3997             emit_zeroreg(hr);
3998           }
3999           else
4000           {
4001             emit_loadreg(regmap[hr],hr);
4002           }
4003         }
4004       }
4005     }
4006   }
4007 }
4008
4009 // Load registers prior to the start of a loop
4010 // so that they are not loaded within the loop
4011 static void loop_preload(signed char pre[],signed char entry[])
4012 {
4013   int hr;
4014   for(hr=0;hr<HOST_REGS;hr++) {
4015     if(hr!=EXCLUDE_REG) {
4016       if(pre[hr]!=entry[hr]) {
4017         if(entry[hr]>=0) {
4018           if(get_reg(pre,entry[hr])<0) {
4019             assem_debug("loop preload:\n");
4020             //printf("loop preload: %d\n",hr);
4021             if(entry[hr]==0) {
4022               emit_zeroreg(hr);
4023             }
4024             else if(entry[hr]<TEMPREG)
4025             {
4026               emit_loadreg(entry[hr],hr);
4027             }
4028             else if(entry[hr]-64<TEMPREG)
4029             {
4030               emit_loadreg(entry[hr],hr);
4031             }
4032           }
4033         }
4034       }
4035     }
4036   }
4037 }
4038
// Generate address for load/store instruction
// goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
// For instruction i, compute the effective address (base + imm) into the
// appropriate host register ahead of time, and preload constant addresses
// for instruction i+1.  'entry' is the register map on entry (may be NULL),
// used to skip work already done in the previous cycle.
void address_generation(int i,struct regstat *i_regs,signed char entry[])
{
  if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
    int ra=-1;
    int agr=AGEN1+(i&1);  // address-generation regs alternate by instruction parity
    if(itype[i]==LOAD) {
      // Plain loads generate the address directly into the destination reg
      ra=get_reg(i_regs->regmap,rt1[i]);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
      assert(ra>=0);
    }
    if(itype[i]==LOADLR) {
      ra=get_reg(i_regs->regmap,FTEMP);
    }
    if(itype[i]==STORE||itype[i]==STORELR) {
      ra=get_reg(i_regs->regmap,agr);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
    }
    if(itype[i]==C1LS||itype[i]==C2LS) {
      if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
        ra=get_reg(i_regs->regmap,FTEMP);
      else { // SWC1/SDC1/SWC2/SDC2
        ra=get_reg(i_regs->regmap,agr);
        if(ra<0) ra=get_reg(i_regs->regmap,-1);
      }
    }
    int rs=get_reg(i_regs->regmap,rs1[i]);
    if(ra>=0) {
      int offset=imm[i];
      int c=(i_regs->wasconst>>rs)&1;  // base register holds a known constant
      if(rs1[i]==0) {
        // Using r0 as a base address: address is just the immediate.
        // LWL/LWR and LDL/LDR access the aligned word/doubleword, so the
        // low address bits are masked off here.
        if(!entry||entry[ra]!=agr) {
          if (opcode[i]==0x22||opcode[i]==0x26) {
            emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
          }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
            emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
          }else{
            emit_movimm(offset,ra);
          }
        } // else did it in the previous cycle
      }
      else if(rs<0) {
        // Base register not allocated: load it from memory into ra
        if(!entry||entry[ra]!=rs1[i])
          emit_loadreg(rs1[i],ra);
        //if(!entry||entry[ra]!=rs1[i])
        //  printf("poor load scheduling!\n");
      }
      else if(c) {
        // Constant base: materialize base+offset as an immediate
        if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
          if(!entry||entry[ra]!=agr) {
            if (opcode[i]==0x22||opcode[i]==0x26) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
            }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
            }else{
              emit_movimm(constmap[i][rs]+offset,ra);
              regs[i].loadedconst|=1<<ra;
            }
          } // else did it in the previous cycle
        } // else load_consts already did it
      }
      // Non-constant base with nonzero offset: fold the offset with an add
      if(offset&&!c&&rs1[i]) {
        if(rs>=0) {
          emit_addimm(rs,offset,ra);
        }else{
          emit_addimm(ra,offset,ra);
        }
      }
    }
  }
  // Preload constants for next instruction
  if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
    int agr,ra;
    // Actual address
    agr=AGEN1+((i+1)&1);
    ra=get_reg(i_regs->regmap,agr);
    if(ra>=0) {
      int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
      int offset=imm[i+1];
      int c=(regs[i+1].wasconst>>rs)&1;
      if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
        // Next instruction has a constant base: compute its address now
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(constmap[i+1][rs]+offset,ra);
          regs[i+1].loadedconst|=1<<ra;
        }
      }
      else if(rs1[i+1]==0) {
        // Using r0 as a base address
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(offset,ra);
        }
      }
    }
  }
}
4144
// Look ahead from instruction i to find the last constant value that host
// register hr will hold while mapped to the same guest register, so that a
// single immediate load can cover the whole run.  Stores the value in
// *value; returns nonzero if the constant should actually be loaded
// (i.e. it is still needed), zero if it can be skipped.
static int get_final_value(int hr, int i, int *value)
{
  int reg=regs[i].regmap[hr];
  // Advance while the mapping stays the same, the value stays constant,
  // and we don't cross a branch target
  while(i<slen-1) {
    if(regs[i+1].regmap[hr]!=reg) break;
    if(!((regs[i+1].isconst>>hr)&1)) break;
    if(bt[i+1]) break;
    i++;
  }
  if(i<slen-1) {
    if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
      *value=constmap[i][hr];
      return 1;
    }
    if(!bt[i+1]) {
      if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
        // Load in delay slot, out-of-order execution
        if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
        {
          // Precompute load address
          *value=constmap[i][hr]+imm[i+2];
          return 1;
        }
      }
      if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
      {
        // Precompute load address
        *value=constmap[i][hr]+imm[i+1];
        //printf("c=%x imm=%lx\n",(long)constmap[i][hr],imm[i+1]);
        return 1;
      }
    }
  }
  *value=constmap[i][hr];
  //printf("c=%lx\n",(long)constmap[i][hr]);
  if(i==slen-1) return 1;
  assert(reg < 64);
  // Skip the load entirely if the register is unneeded afterwards
  return !((unneeded_reg[i+1]>>reg)&1);
}
4184
// Load registers with known constants
// Emits immediate loads for host registers that hold known-constant guest
// registers at instruction i, reusing values already materialized in the
// previous instruction (loadedconst) and deriving from similar constants
// in other host registers where cheaper.
static void load_consts(signed char pre[],signed char regmap[],int i)
{
  int hr,hr2;
  // propagate loaded constant flags from the previous instruction,
  // except at block starts and branch targets where nothing is known
  if(i==0||bt[i])
    regs[i].loadedconst=0;
  else {
    for(hr=0;hr<HOST_REGS;hr++) {
      // Carry the flag only if the same guest reg with the same constant
      // stayed in the same host reg across the instruction boundary
      if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
         &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
      {
        regs[i].loadedconst|=1<<hr;
      }
    }
  }
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(!((regs[i].loadedconst>>hr)&1)) {
        assert(regmap[hr]<64);
        if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
          int value,similar=0;
          if(get_final_value(hr,i,&value)) {
            // see if some other register has similar value
            for(hr2=0;hr2<HOST_REGS;hr2++) {
              if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
                if(is_similar_value(value,constmap[i][hr2])) {
                  similar=1;
                  break;
                }
              }
            }
            if(similar) {
              // Derive the constant from the similar one (cheaper than a
              // full immediate load on some hosts)
              int value2;
              if(get_final_value(hr2,i,&value2)) // is this needed?
                emit_movimm_from(value2,hr2,value,hr);
              else
                emit_movimm(value,hr);
            }
            else if(value==0) {
              emit_zeroreg(hr);
            }
            else {
              emit_movimm(value,hr);
            }
          }
          regs[i].loadedconst|=1<<hr;
        }
      }
    }
  }
}
4239
4240 void load_all_consts(signed char regmap[], u_int dirty, int i)
4241 {
4242   int hr;
4243   // Load 32-bit regs
4244   for(hr=0;hr<HOST_REGS;hr++) {
4245     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4246       assert(regmap[hr] < 64);
4247       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
4248         int value=constmap[i][hr];
4249         if(value==0) {
4250           emit_zeroreg(hr);
4251         }
4252         else {
4253           emit_movimm(value,hr);
4254         }
4255       }
4256     }
4257   }
4258 }
4259
4260 // Write out all dirty registers (except cycle count)
4261 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty)
4262 {
4263   int hr;
4264   for(hr=0;hr<HOST_REGS;hr++) {
4265     if(hr!=EXCLUDE_REG) {
4266       if(i_regmap[hr]>0) {
4267         if(i_regmap[hr]!=CCREG) {
4268           if((i_dirty>>hr)&1) {
4269             assert(i_regmap[hr]<64);
4270             emit_storereg(i_regmap[hr],hr);
4271           }
4272         }
4273       }
4274     }
4275   }
4276 }
4277
4278 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4279 // This writes the registers not written by store_regs_bt
4280 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr)
4281 {
4282   int hr;
4283   int t=(addr-start)>>2;
4284   for(hr=0;hr<HOST_REGS;hr++) {
4285     if(hr!=EXCLUDE_REG) {
4286       if(i_regmap[hr]>0) {
4287         if(i_regmap[hr]!=CCREG) {
4288           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1)) {
4289             if((i_dirty>>hr)&1) {
4290               assert(i_regmap[hr]<64);
4291               emit_storereg(i_regmap[hr],hr);
4292             }
4293           }
4294         }
4295       }
4296     }
4297   }
4298 }
4299
4300 // Load all registers (except cycle count)
4301 void load_all_regs(signed char i_regmap[])
4302 {
4303   int hr;
4304   for(hr=0;hr<HOST_REGS;hr++) {
4305     if(hr!=EXCLUDE_REG) {
4306       if(i_regmap[hr]==0) {
4307         emit_zeroreg(hr);
4308       }
4309       else
4310       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4311       {
4312         emit_loadreg(i_regmap[hr],hr);
4313       }
4314     }
4315   }
4316 }
4317
4318 // Load all current registers also needed by next instruction
4319 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4320 {
4321   int hr;
4322   for(hr=0;hr<HOST_REGS;hr++) {
4323     if(hr!=EXCLUDE_REG) {
4324       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4325         if(i_regmap[hr]==0) {
4326           emit_zeroreg(hr);
4327         }
4328         else
4329         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4330         {
4331           emit_loadreg(i_regmap[hr],hr);
4332         }
4333       }
4334     }
4335   }
4336 }
4337
4338 // Load all regs, storing cycle count if necessary
4339 void load_regs_entry(int t)
4340 {
4341   int hr;
4342   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4343   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4344   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4345     emit_storereg(CCREG,HOST_CCREG);
4346   }
4347   // Load 32-bit regs
4348   for(hr=0;hr<HOST_REGS;hr++) {
4349     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4350       if(regs[t].regmap_entry[hr]==0) {
4351         emit_zeroreg(hr);
4352       }
4353       else if(regs[t].regmap_entry[hr]!=CCREG)
4354       {
4355         emit_loadreg(regs[t].regmap_entry[hr],hr);
4356       }
4357     }
4358   }
4359 }
4360
4361 // Store dirty registers prior to branch
4362 void store_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4363 {
4364   if(internal_branch(addr))
4365   {
4366     int t=(addr-start)>>2;
4367     int hr;
4368     for(hr=0;hr<HOST_REGS;hr++) {
4369       if(hr!=EXCLUDE_REG) {
4370         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4371           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1)) {
4372             if((i_dirty>>hr)&1) {
4373               assert(i_regmap[hr]<64);
4374               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4375                 emit_storereg(i_regmap[hr],hr);
4376             }
4377           }
4378         }
4379       }
4380     }
4381   }
4382   else
4383   {
4384     // Branch out of this block, write out all dirty regs
4385     wb_dirtys(i_regmap,i_dirty);
4386   }
4387 }
4388
4389 // Load all needed registers for branch target
4390 static void load_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4391 {
4392   //if(addr>=start && addr<(start+slen*4))
4393   if(internal_branch(addr))
4394   {
4395     int t=(addr-start)>>2;
4396     int hr;
4397     // Store the cycle count before loading something else
4398     if(i_regmap[HOST_CCREG]!=CCREG) {
4399       assert(i_regmap[HOST_CCREG]==-1);
4400     }
4401     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4402       emit_storereg(CCREG,HOST_CCREG);
4403     }
4404     // Load 32-bit regs
4405     for(hr=0;hr<HOST_REGS;hr++) {
4406       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4407         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4408           if(regs[t].regmap_entry[hr]==0) {
4409             emit_zeroreg(hr);
4410           }
4411           else if(regs[t].regmap_entry[hr]!=CCREG)
4412           {
4413             emit_loadreg(regs[t].regmap_entry[hr],hr);
4414           }
4415         }
4416       }
4417     }
4418   }
4419 }
4420
// Decide whether the current register state (map + dirty bits) is
// compatible with the branch target at 'addr', i.e. whether we can jump
// there directly without any writeback/reload fixup code.  Returns 1 on
// match, 0 if fixup would be required.
static int match_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
{
  if(addr>=start && addr<start+slen*4-4)
  {
    // Internal target: compare against the target's expected entry state
    int t=(addr-start)>>2;
    int hr;
    // The cycle count must be resident at the target
    if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]!=regs[t].regmap_entry[hr])
        {
          // Target expects a real guest register here that we don't have
          // (the |64 form also covers the old 64-bit upper-half mappings)
          if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
          {
            return 0;
          }
          else
          if((i_dirty>>hr)&1)
          {
            // We hold a dirty value the target doesn't expect; only OK if
            // the target никогда reads it (unneeded)
            if(i_regmap[hr]<TEMPREG)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
                return 0;
            }
            else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
            {
              assert(0);
            }
          }
        }
        else // Same register but is it 32-bit or dirty?
        if(i_regmap[hr]>=0)
        {
          if(!((regs[t].dirty>>hr)&1))
          {
            // Target expects a clean value; a dirty one only matches if
            // the register is unneeded there
            if((i_dirty>>hr)&1)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
              {
                //printf("%x: dirty no match\n",addr);
                return 0;
              }
            }
          }
        }
      }
    }
    // Delay slots are not valid branch targets
    //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP)) return 0;
    // Delay slots require additional processing, so do not match
    if(is_ds[t]) return 0;
  }
  else
  {
    // External target: everything must be written back, so any dirty
    // register (other than the resident cycle count) is a mismatch
    int hr;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]>=0)
        {
          if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
          {
            if((i_dirty>>hr)&1)
            {
              return 0;
            }
          }
        }
      }
    }
  }
  return 1;
}
4496
4497 #ifdef DRC_DBG
// Debug-only (DRC_DBG): emit a call to do_insn_cmp before instruction i so
// the recompiled state can be compared against the interpreter.
static void drc_dbg_emit_do_cmp(int i)
{
  extern void do_insn_cmp();
  //extern int cycle;
  u_int hr, reglist = get_host_reglist(regs[i].regmap);

  assem_debug("//do_insn_cmp %08x\n", start+i*4);
  save_regs(reglist);
  // write out changed consts to match the interpreter
  if (i > 0 && !bt[i]) {
    for (hr = 0; hr < HOST_REGS; hr++) {
      int reg = regs[i-1].regmap[hr];
      if (hr == EXCLUDE_REG || reg < 0)
        continue;
      if (!((regs[i-1].isconst >> hr) & 1))
        continue;
      // skip constants unchanged since the instruction before
      if (i > 1 && reg == regs[i-2].regmap[hr] && constmap[i-1][hr] == constmap[i-2][hr])
        continue;
      emit_movimm(constmap[i-1][hr],0);
      emit_storereg(reg, 0);
    }
  }
  // record the PC being checked, then call into the comparator
  emit_movimm(start+i*4,0);
  emit_writeword(0,&pcaddr);
  emit_far_call(do_insn_cmp);
  //emit_readword(&cycle,0);
  //emit_addimm(0,2,0);
  //emit_writeword(0,&cycle);
  (void)get_reg2;
  restore_regs(reglist);
  assem_debug("\\\\do_insn_cmp\n");
}
4530 #else
4531 #define drc_dbg_emit_do_cmp(x)
4532 #endif
4533
// Used when a branch jumps into the delay slot of another branch
// Assembles the delay-slot instruction at ba[i] as a standalone entry
// point, then branches to the instruction after it (which must be
// internal to this block).
static void ds_assemble_entry(int i)
{
  int t=(ba[i]-start)>>2;
  if (!instr_addr[t])
    instr_addr[t] = out;
  assem_debug("Assemble delay slot at %x\n",ba[i]);
  assem_debug("<->\n");
  drc_dbg_emit_do_cmp(t);
  // Spill the cycle count if this entry expects it resident but the
  // instruction itself does not keep it
  if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty);
  load_regs(regs[t].regmap_entry,regs[t].regmap,rs1[t],rs2[t]);
  address_generation(t,&regs[t],regs[t].regmap_entry);
  // Stores also need INVCP for self-modifying-code invalidation checks
  if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
    load_regs(regs[t].regmap_entry,regs[t].regmap,INVCP,INVCP);
  is_delayslot=0;
  switch(itype[t]) {
    case ALU:
      alu_assemble(t,&regs[t]);break;
    case IMM16:
      imm16_assemble(t,&regs[t]);break;
    case SHIFT:
      shift_assemble(t,&regs[t]);break;
    case SHIFTIMM:
      shiftimm_assemble(t,&regs[t]);break;
    case LOAD:
      load_assemble(t,&regs[t]);break;
    case LOADLR:
      loadlr_assemble(t,&regs[t]);break;
    case STORE:
      store_assemble(t,&regs[t]);break;
    case STORELR:
      storelr_assemble(t,&regs[t]);break;
    case COP0:
      cop0_assemble(t,&regs[t]);break;
    case COP1:
      cop1_assemble(t,&regs[t]);break;
    case C1LS:
      c1ls_assemble(t,&regs[t]);break;
    case COP2:
      cop2_assemble(t,&regs[t]);break;
    case C2LS:
      c2ls_assemble(t,&regs[t]);break;
    case C2OP:
      c2op_assemble(t,&regs[t]);break;
    case MULTDIV:
      multdiv_assemble(t,&regs[t]);break;
    case MOV:
      mov_assemble(t,&regs[t]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Fix up register state and continue at the instruction after the slot
  store_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
  load_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
  if(internal_branch(ba[i]+4))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  assert(internal_branch(ba[i]+4));
  add_to_linker(out,ba[i]+4,internal_branch(ba[i]+4));
  emit_jmp(0);
}
4603
// Patch 'addr' to jump out of the block through the normal dynamic linker.
static void emit_extjump(void *addr, u_int target)
{
  emit_extjump2(addr, target, dyna_linker);
}
4608
// Same as emit_extjump, but routed through the delay-slot dynamic linker.
static void emit_extjump_ds(void *addr, u_int target)
{
  emit_extjump2(addr, target, dyna_linker_ds);
}
4613
// Load 2 immediates optimizing for small code size
// The second load may be derived from the first (e.g. an add of the
// difference) when that is shorter than a full immediate load.
static void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2)
{
  emit_movimm(imm1,rt1);
  emit_movimm_from(imm1,rt1,imm2,rt2);
}
4620
// Emit the cycle-count check for branch i: add the accumulated cycles to
// HOST_CCREG and branch to a CC_STUB when an interrupt is due.  *adj
// returns the cycle adjustment already applied at the target (so the
// caller can compensate).  'taken'/'invert' describe the branch direction
// being assembled.
void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
{
  int count;
  void *jaddr;
  void *idle=NULL;
  int t=0;
  if(itype[i]==RJUMP)
  {
    *adj=0;
  }
  //if(ba[i]>=start && ba[i]<(start+slen*4))
  if(internal_branch(ba[i]))
  {
    t=(ba[i]-start)>>2;
    if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
    else *adj=ccadj[t];
  }
  else
  {
    *adj=0;
  }
  count=ccadj[i];
  if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
    // Idle loop: branch-to-self with a nop delay slot.  Burn cycles
    // quickly instead of looping at full cost.
    if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
    idle=out;
    //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
    emit_andimm(HOST_CCREG,3,HOST_CCREG);
    jaddr=out;
    emit_jmp(0);
  }
  else if(*adj==0||invert) {
    // Apply the full cycle cost here and test the sign flag
    int cycles=CLOCK_ADJUST(count+2);
    // faster loop HACK
#if 0
    if (t&&*adj) {
      int rel=t-i;
      if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
        cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
    }
#endif
    emit_addimm_and_set_flags(cycles,HOST_CCREG);
    jaddr=out;
    emit_jns(0);
  }
  else
  {
    // Cost is applied at the target; just compare here
    emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
    jaddr=out;
    emit_jns(0);
  }
  add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
}
4674
4675 static void do_ccstub(int n)
4676 {
4677   literal_pool(256);
4678   assem_debug("do_ccstub %x\n",start+(u_int)stubs[n].b*4);
4679   set_jump_target(stubs[n].addr, out);
4680   int i=stubs[n].b;
4681   if(stubs[n].d==NULLDS) {
4682     // Delay slot instruction is nullified ("likely" branch)
4683     wb_dirtys(regs[i].regmap,regs[i].dirty);
4684   }
4685   else if(stubs[n].d!=TAKEN) {
4686     wb_dirtys(branch_regs[i].regmap,branch_regs[i].dirty);
4687   }
4688   else {
4689     if(internal_branch(ba[i]))
4690       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4691   }
4692   if(stubs[n].c!=-1)
4693   {
4694     // Save PC as return address
4695     emit_movimm(stubs[n].c,EAX);
4696     emit_writeword(EAX,&pcaddr);
4697   }
4698   else
4699   {
4700     // Return address depends on which way the branch goes
4701     if(itype[i]==CJUMP||itype[i]==SJUMP)
4702     {
4703       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4704       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4705       if(rs1[i]==0)
4706       {
4707         s1l=s2l;
4708         s2l=-1;
4709       }
4710       else if(rs2[i]==0)
4711       {
4712         s2l=-1;
4713       }
4714       assert(s1l>=0);
4715       #ifdef DESTRUCTIVE_WRITEBACK
4716       if(rs1[i]) {
4717         if((branch_regs[i].dirty>>s1l)&&1)
4718           emit_loadreg(rs1[i],s1l);
4719       }
4720       else {
4721         if((branch_regs[i].dirty>>s1l)&1)
4722           emit_loadreg(rs2[i],s1l);
4723       }
4724       if(s2l>=0)
4725         if((branch_regs[i].dirty>>s2l)&1)
4726           emit_loadreg(rs2[i],s2l);
4727       #endif
4728       int hr=0;
4729       int addr=-1,alt=-1,ntaddr=-1;
4730       while(hr<HOST_REGS)
4731       {
4732         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4733            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4734            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4735         {
4736           addr=hr++;break;
4737         }
4738         hr++;
4739       }
4740       while(hr<HOST_REGS)
4741       {
4742         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4743            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4744            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4745         {
4746           alt=hr++;break;
4747         }
4748         hr++;
4749       }
4750       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4751       {
4752         while(hr<HOST_REGS)
4753         {
4754           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4755              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4756              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4757           {
4758             ntaddr=hr;break;
4759           }
4760           hr++;
4761         }
4762         assert(hr<HOST_REGS);
4763       }
4764       if((opcode[i]&0x2f)==4) // BEQ
4765       {
4766         #ifdef HAVE_CMOV_IMM
4767         if(s2l>=0) emit_cmp(s1l,s2l);
4768         else emit_test(s1l,s1l);
4769         emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4770         #else
4771         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4772         if(s2l>=0) emit_cmp(s1l,s2l);
4773         else emit_test(s1l,s1l);
4774         emit_cmovne_reg(alt,addr);
4775         #endif
4776       }
4777       if((opcode[i]&0x2f)==5) // BNE
4778       {
4779         #ifdef HAVE_CMOV_IMM
4780         if(s2l>=0) emit_cmp(s1l,s2l);
4781         else emit_test(s1l,s1l);
4782         emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4783         #else
4784         emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4785         if(s2l>=0) emit_cmp(s1l,s2l);
4786         else emit_test(s1l,s1l);
4787         emit_cmovne_reg(alt,addr);
4788         #endif
4789       }
4790       if((opcode[i]&0x2f)==6) // BLEZ
4791       {
4792         //emit_movimm(ba[i],alt);
4793         //emit_movimm(start+i*4+8,addr);
4794         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4795         emit_cmpimm(s1l,1);
4796         emit_cmovl_reg(alt,addr);
4797       }
4798       if((opcode[i]&0x2f)==7) // BGTZ
4799       {
4800         //emit_movimm(ba[i],addr);
4801         //emit_movimm(start+i*4+8,ntaddr);
4802         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4803         emit_cmpimm(s1l,1);
4804         emit_cmovl_reg(ntaddr,addr);
4805       }
4806       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4807       {
4808         //emit_movimm(ba[i],alt);
4809         //emit_movimm(start+i*4+8,addr);
4810         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4811         emit_test(s1l,s1l);
4812         emit_cmovs_reg(alt,addr);
4813       }
4814       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4815       {
4816         //emit_movimm(ba[i],addr);
4817         //emit_movimm(start+i*4+8,alt);
4818         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4819         emit_test(s1l,s1l);
4820         emit_cmovs_reg(alt,addr);
4821       }
4822       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4823         if(source[i]&0x10000) // BC1T
4824         {
4825           //emit_movimm(ba[i],alt);
4826           //emit_movimm(start+i*4+8,addr);
4827           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4828           emit_testimm(s1l,0x800000);
4829           emit_cmovne_reg(alt,addr);
4830         }
4831         else // BC1F
4832         {
4833           //emit_movimm(ba[i],addr);
4834           //emit_movimm(start+i*4+8,alt);
4835           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4836           emit_testimm(s1l,0x800000);
4837           emit_cmovne_reg(alt,addr);
4838         }
4839       }
4840       emit_writeword(addr,&pcaddr);
4841     }
4842     else
4843     if(itype[i]==RJUMP)
4844     {
4845       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4846       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4847         r=get_reg(branch_regs[i].regmap,RTEMP);
4848       }
4849       emit_writeword(r,&pcaddr);
4850     }
4851     else {SysPrintf("Unknown branch type in do_ccstub\n");abort();}
4852   }
4853   // Update cycle count
4854   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4855   if(stubs[n].a) emit_addimm(HOST_CCREG,CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4856   emit_far_call(cc_interrupt);
4857   if(stubs[n].a) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4858   if(stubs[n].d==TAKEN) {
4859     if(internal_branch(ba[i]))
4860       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4861     else if(itype[i]==RJUMP) {
4862       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4863         emit_readword(&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4864       else
4865         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4866     }
4867   }else if(stubs[n].d==NOTTAKEN) {
4868     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4869     else load_all_regs(branch_regs[i].regmap);
4870   }else if(stubs[n].d==NULLDS) {
4871     // Delay slot instruction is nullified ("likely" branch)
4872     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4873     else load_all_regs(regs[i].regmap);
4874   }else{
4875     load_all_regs(branch_regs[i].regmap);
4876   }
4877   if (stubs[n].retaddr)
4878     emit_jmp(stubs[n].retaddr);
4879   else
4880     do_jump_vaddr(stubs[n].e);
4881 }
4882
4883 static void add_to_linker(void *addr, u_int target, int ext)
4884 {
4885   assert(linkcount < ARRAY_SIZE(link_addr));
4886   link_addr[linkcount].addr = addr;
4887   link_addr[linkcount].target = target;
4888   link_addr[linkcount].ext = ext;
4889   linkcount++;
4890 }
4891
// Write the return address (PC of the instruction after the delay slot)
// into the link register ($31) for a JAL at instruction i, if $31 is
// allocated in the post-branch register map.
static void ujump_assemble_write_ra(int i)
{
  int rt;
  unsigned int return_address;
  rt=get_reg(branch_regs[i].regmap,31);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  //assert(rt>=0);
  return_address=start+i*4+8;
  if(rt>=0) {
    #ifdef USE_MINI_HT
    // Insert the return address into the mini hash table so the matching
    // JR $31 can be dispatched quickly
    if(internal_branch(return_address)&&rt1[i+1]!=31) {
      int temp=-1; // note: must be ds-safe
      #ifdef HOST_TEMPREG
      temp=HOST_TEMPREG;
      #endif
      if(temp>=0) do_miniht_insert(return_address,rt,temp);
      else emit_movimm(return_address,rt);
    }
    else
    #endif
    {
      #ifdef REG_PREFETCH
      if(temp>=0)
      {
        if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
      }
      #endif
      emit_movimm(return_address,rt); // PC into link register
      #ifdef IMM_PREFETCH
      emit_prefetch(hash_table_get(return_address));
      #endif
    }
  }
}
4926
// Assemble an unconditional jump (J/JAL) at instruction i: assemble the
// delay slot, write the link register if needed, fix up the register
// state for the target, apply the cycle count, and emit the jump.
static void ujump_assemble(int i,struct regstat *i_regs)
{
  int ra_done=0;
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  int temp=get_reg(branch_regs[i].regmap,PTEMP);
  if(rt1[i]==31&&temp>=0)
  {
    signed char *i_regmap=i_regs->regmap;
    int return_address=start+i*4+8;
    if(get_reg(branch_regs[i].regmap,31)>0)
    if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  // If the delay slot reads $31, the return address must be written
  // before the slot executes
  if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    ujump_assemble_write_ra(i); // writeback ra for DS
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<rt1[i]);  // r0 and the link reg need no writeback
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
  if(!ra_done&&rt1[i]==31)
    ujump_assemble_write_ra(i);
  int cc,adj;
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
  // Compensate for cycles already accounted for at the target
  if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
  load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  if(internal_branch(ba[i]))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  // A branch into a delay slot needs the slot assembled as its own entry
  if(internal_branch(ba[i])&&is_ds[(ba[i]-start)>>2]) {
    ds_assemble_entry(i);
  }
  else {
    add_to_linker(out,ba[i],internal_branch(ba[i]));
    emit_jmp(0);
  }
}
4975
4976 static void rjump_assemble_write_ra(int i)
4977 {
4978   int rt,return_address;
4979   assert(rt1[i+1]!=rt1[i]);
4980   assert(rt2[i+1]!=rt1[i]);
4981   rt=get_reg(branch_regs[i].regmap,rt1[i]);
4982   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4983   assert(rt>=0);
4984   return_address=start+i*4+8;
4985   #ifdef REG_PREFETCH
4986   if(temp>=0)
4987   {
4988     if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
4989   }
4990   #endif
4991   emit_movimm(return_address,rt); // PC into link register
4992   #ifdef IMM_PREFETCH
4993   emit_prefetch(hash_table_get(return_address));
4994   #endif
4995 }
4996
// Assemble a register jump (JR/JALR).  The delay slot runs first; the
// target address comes from the host register holding rs1[i] (copied to
// RTEMP beforehand if the delay slot clobbers it).  Dispatch goes either
// through the mini hash table (USE_MINI_HT, returns via $31 only) or
// through do_jump_vaddr.
static void rjump_assemble(int i,struct regstat *i_regs)
{
  int temp;
  int rs,cc;
  int ra_done=0;
  rs=get_reg(branch_regs[i].regmap,rs1[i]);
  assert(rs>=0);
  if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
    // Delay slot abuse, make a copy of the branch address register
    temp=get_reg(branch_regs[i].regmap,RTEMP);
    assert(temp>=0);
    assert(regs[i].regmap[temp]==RTEMP);
    emit_mov(rs,temp);
    rs=temp;
  }
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  if(rt1[i]==31)
  {
    if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
      signed char *i_regmap=i_regs->regmap;
      int return_address=start+i*4+8;
      if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
    }
  }
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    int rh=get_reg(regs[i].regmap,RHASH);
    if(rh>=0) do_preload_rhash(rh);
  }
  #endif
  // Write the link register early if the delay slot reads it
  if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    rjump_assemble_write_ra(i);
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<rt1[i]);
  bc_unneeded&=~(1LL<<rs1[i]); // the jump target register is still needed
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],CCREG);
  if(!ra_done&&rt1[i]!=0)
    rjump_assemble_write_ra(i);
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  (void)cc; // only used by the assert above
  #ifdef USE_MINI_HT
  int rh=get_reg(branch_regs[i].regmap,RHASH);
  int ht=get_reg(branch_regs[i].regmap,RHTBL);
  if(rs1[i]==31) {
    if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
    do_preload_rhtbl(ht);
    do_rhash(rs,rh);
  }
  #endif
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
  #ifdef DESTRUCTIVE_WRITEBACK
  // The writeback above may have destroyed the target-address register;
  // reload it unless we already copied it to RTEMP.
  if((branch_regs[i].dirty>>rs)&1) {
    if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
      emit_loadreg(rs1[i],rs);
    }
  }
  #endif
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_load(ht,rh);
  }
  #endif
  //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
  //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
  //assert(adj==0);
  emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  add_stub(CC_STUB,out,NULL,0,i,-1,TAKEN,rs);
  if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
    // special case for RFE
    emit_jmp(0);
  else
    emit_jns(0);
  //load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_jump(rs,rh,ht);
  }
  else
  #endif
  {
    do_jump_vaddr(rs);
  }
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  // Pad with a nop (mov r13,r13) to keep the return point 8-byte aligned
  if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
  #endif
}
5093
// Assemble a conditional branch comparing two GPRs (BEQ/BNE/BLEZ/BGTZ
// and their "likely" variants).  Two code shapes are produced:
//  - out of order (ooo[i]): the delay slot is emitted before the compare,
//    then a single conditional jump (possibly inverted so the fallthrough
//    is the taken path);
//  - in order: the compare comes first and the delay slot is emitted on
//    both the taken and (unless "likely") the not-taken path.
static void cjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  // match: the target's expected register mapping equals ours, so we can
  // branch there directly without inverting the condition
  match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  assem_debug("match=%d\n",match);
  int s1l,s2l;
  int unconditional=0,nop=0;
  int invert=0;
  int internal=internal_branch(ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;
  #endif
  #ifdef __aarch64__
  invert=1; // because of near cond. branches
  #endif

  // Host registers holding the two source operands
  if(ooo[i]) {
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
    s2l=get_reg(branch_regs[i].regmap,rs2[i]);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
    s2l=get_reg(i_regmap,rs2[i]);
  }
  if(rs1[i]==0&&rs2[i]==0)
  {
    // r0 vs r0: odd opcodes (BNE/BGTZ/...) can never be taken,
    // even ones (BEQ/BLEZ/...) are always taken
    if(opcode[i]&1) nop=1;
    else unconditional=1;
    //assert(opcode[i]!=5);
    //assert(opcode[i]!=7);
    //assert(opcode[i]!=0x15);
    //assert(opcode[i]!=0x17);
  }
  else if(rs1[i]==0)
  {
    // Only rs2 is live: compare it against zero
    s1l=s2l;
    s2l=-1;
  }
  else if(rs2[i]==0)
  {
    s2l=-1;
  }

  if(ooo[i]) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    uint64_t bc_unneeded=branch_regs[i].u;
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i])); // operands still needed for the compare
    bc_unneeded|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
    load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],rs2[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    //assem_debug("cycle count (adj)\n");
    if(unconditional) {
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      // Skip emitting the jump for a pure idle loop (branch to self, nop DS)
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nop) {
      // Branch never taken: only charge cycles and test for interrupt
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      // NOTE: taken/nottaken1 are never assigned in this path; the
      // guards below are effectively dead (leftover from the 64-bit code)
      void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);

      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      assert(s1l>=0);
      if(opcode[i]==4) // BEQ
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        if(invert){
          nottaken=out;
          emit_jne(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jeq(0);
        }
      }
      if(opcode[i]==5) // BNE
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        if(invert){
          nottaken=out;
          emit_jeq(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jne(0);
        }
      }
      if(opcode[i]==6) // BLEZ
      {
        emit_cmpimm(s1l,1);
        if(invert){
          nottaken=out;
          emit_jge(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jl(0);
        }
      }
      if(opcode[i]==7) // BGTZ
      {
        emit_cmpimm(s1l,1);
        if(invert){
          nottaken=out;
          emit_jl(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jge(0);
        }
      }
      if(invert) {
        if(taken) set_jump_target(taken, out);
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          // Taken path emitted inline; undo the speculative cycle adjust
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken, out);
      }

      if(nottaken1) set_jump_target(nottaken1, out);
      if(adj) {
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //if(likely[i]) printf("IOL\n");
    //else
    //printf("IOE\n");
    // NOTE: taken/nottaken1 are never assigned in this path either
    void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
    if(!unconditional&&!nop) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      assert(s1l>=0);
      if((opcode[i]&0x2f)==4) // BEQ
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        nottaken=out;
        emit_jne(DJT_2);
      }
      if((opcode[i]&0x2f)==5) // BNE
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        nottaken=out;
        emit_jeq(DJT_2);
      }
      if((opcode[i]&0x2f)==6) // BLEZ
      {
        emit_cmpimm(s1l,1);
        nottaken=out;
        emit_jge(DJT_2);
      }
      if((opcode[i]&0x2f)==7) // BGTZ
      {
        emit_cmpimm(s1l,1);
        nottaken=out;
        emit_jl(DJT_2);
      }
    } // if(!unconditional)
    int adj;
    uint64_t ds_unneeded=branch_regs[i].u;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded|=1;
    // branch taken
    if(!nop) {
      if(taken) set_jump_target(taken, out);
      assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    if(!unconditional) {
      if(nottaken1) set_jump_target(nottaken1, out);
      set_jump_target(nottaken, out);
      assem_debug("2:\n");
      if(!likely[i]) {
        // "Likely" branches skip the delay slot when not taken
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
        load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}
5385
// Assemble a REGIMM branch comparing one GPR against zero
// (BLTZ/BGEZ/BLTZAL/BGEZAL and "likely" variants).  The -AL forms also
// write the return address into $31, even when the branch is not taken.
// Same two code shapes as cjump_assemble: out of order (delay slot
// first) and in order (compare first, delay slot on both paths).
static void sjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  assem_debug("smatch=%d\n",match);
  int s1l;
  int unconditional=0,nevertaken=0;
  int invert=0;
  int internal=internal_branch(ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;
  #endif
  #ifdef __aarch64__
  invert=1; // because of near cond. branches
  #endif

  //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
  //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)

  if(ooo[i]) {
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
  }
  if(rs1[i]==0)
  {
    // r0 is never negative: BGEZ-type (odd opcode2) always taken,
    // BLTZ-type never taken
    if(opcode2[i]&1) unconditional=1;
    else nevertaken=1;
    // These are never taken (r0 is never less than zero)
    //assert(opcode2[i]!=0);
    //assert(opcode2[i]!=2);
    //assert(opcode2[i]!=0x10);
    //assert(opcode2[i]!=0x12);
  }

  if(ooo[i]) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    uint64_t bc_unneeded=branch_regs[i].u;
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
    load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],rs1[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
    if(rt1[i]==31) {
      // BLTZAL/BGEZAL: link register is written unconditionally
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        if(!nevertaken) emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    assem_debug("cycle count (adj)\n");
    if(unconditional) {
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      // Skip emitting the jump for a pure idle loop (branch to self, nop DS)
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nevertaken) {
      // Only charge cycles and test for pending interrupt
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      void *nottaken = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      {
        assert(s1l>=0);
        if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_jns(DJT_1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_js(0);
          }
        }
        if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_js(DJT_1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_jns(0);
          }
        }
      }

      if(invert) {
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          // Taken path emitted inline; undo the speculative cycle adjust
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken, out);
      }

      if(adj) {
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //printf("IOE\n");
    void *nottaken = NULL;
    if(rt1[i]==31) {
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    if(!unconditional) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
        assert(s1l>=0);
        if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_jns(DJT_1);
        }
        if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_js(DJT_1);
        }
    } // if(!unconditional)
    int adj;
    uint64_t ds_unneeded=branch_regs[i].u;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded|=1;
    // branch taken
    if(!nevertaken) {
      //assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    if(!unconditional) {
      set_jump_target(nottaken, out);
      assem_debug("1:\n");
      if(!likely[i]) {
        // "Likely" branches skip the delay slot when not taken
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
        load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}
5652
// Assemble a branch whose delay slot falls on the next virtual page,
// i.e. the block ends on the branch itself.  Instead of emitting the
// delay slot inline, the computed next-PC is placed in HOST_BTREG and
// control transfers to a separately-compiled delay-slot entry
// (see pagespan_ds).  Conditional targets are selected with cmov-style
// sequences rather than conditional branches where possible.
static void pagespan_assemble(int i,struct regstat *i_regs)
{
  int s1l=get_reg(i_regs->regmap,rs1[i]);
  int s2l=get_reg(i_regs->regmap,rs2[i]);
  void *taken = NULL;
  void *nottaken = NULL;
  int unconditional=0;
  if(rs1[i]==0)
  {
    // Compare the remaining operand against zero
    s1l=s2l;
    s2l=-1;
  }
  else if(rs2[i]==0)
  {
    s2l=-1;
  }
  // Pick scratch host registers (addr, alt, and ntaddr for BLEZ/BGTZ)
  // that don't hold the comparison operands
  int hr=0;
  int addr=-1,alt=-1,ntaddr=-1;
  if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
  else {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        addr=hr++;break;
      }
      hr++;
    }
  }
  while(hr<HOST_REGS)
  {
    if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
       (i_regs->regmap[hr]&63)!=rs1[i] &&
       (i_regs->regmap[hr]&63)!=rs2[i] )
    {
      alt=hr++;break;
    }
    hr++;
  }
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
  {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        ntaddr=hr;break;
      }
      hr++;
    }
  }
  assert(hr<HOST_REGS);
  if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
    load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
  }
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  if(opcode[i]==2) // J
  {
    unconditional=1;
  }
  if(opcode[i]==3) // JAL
  {
    // TODO: mini_ht
    int rt=get_reg(i_regs->regmap,31);
    emit_movimm(start+i*4+8,rt);
    unconditional=1;
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    emit_mov(s1l,addr);
    if(opcode2[i]==9) // JALR
    {
      int rt=get_reg(i_regs->regmap,rt1[i]);
      emit_movimm(start+i*4+8,rt);
    }
  }
  if((opcode[i]&0x3f)==4) // BEQ
  {
    if(rs1[i]==rs2[i])
    {
      unconditional=1;
    }
    else
    #ifdef HAVE_CMOV_IMM
    if(1) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
    }
    else
    #endif
    {
      assert(s1l>=0);
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==5) // BNE
  {
    #ifdef HAVE_CMOV_IMM
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
    #else
    assert(s1l>=0);
    emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    emit_cmovne_reg(alt,addr);
    #endif
  }
  if((opcode[i]&0x3f)==0x14) // BEQL
  {
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    if(nottaken) set_jump_target(nottaken, out);
    nottaken=out;
    emit_jne(0);
  }
  if((opcode[i]&0x3f)==0x15) // BNEL
  {
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    nottaken=out;
    emit_jeq(0);
    if(taken) set_jump_target(taken, out);
  }
  if((opcode[i]&0x3f)==6) // BLEZ
  {
    emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
    emit_cmpimm(s1l,1);
    emit_cmovl_reg(alt,addr);
  }
  if((opcode[i]&0x3f)==7) // BGTZ
  {
    emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
    emit_cmpimm(s1l,1);
    emit_cmovl_reg(ntaddr,addr);
  }
  if((opcode[i]&0x3f)==0x16) // BLEZL
  {
    assert((opcode[i]&0x3f)!=0x16);
  }
  if((opcode[i]&0x3f)==0x17) // BGTZL
  {
    assert((opcode[i]&0x3f)!=0x17);
  }
  assert(opcode[i]!=1); // BLTZ/BGEZ

  //FIXME: Check CSREG
  if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
    if((source[i]&0x30000)==0) // BC1F
    {
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x10000) // BC1T
    {
      emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x20000) // BC1FL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jne(0);
    }
    if((source[i]&0x30000)==0x30000) // BC1TL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jeq(0);
    }
  }

  assert(i_regs->regmap[HOST_CCREG]==CCREG);
  wb_dirtys(regs[i].regmap,regs[i].dirty);
  if(likely[i]||unconditional)
  {
    emit_movimm(ba[i],HOST_BTREG);
  }
  else if(addr!=HOST_BTREG)
  {
    emit_mov(addr,HOST_BTREG);
  }
  void *branch_addr=out;
  emit_jmp(0);
  // +4 reaches the delay slot on the next page; the extra +1 appears to
  // tag the address as a delay-slot entry (cf. pagespan_ds using
  // start+1) — NOTE(review): confirm against get_page/check_addr
  int target_addr=start+i*4+5;
  void *stub=out;
  void *compiled_target_addr=check_addr(target_addr);
  emit_extjump_ds(branch_addr, target_addr);
  if(compiled_target_addr) {
    set_jump_target(branch_addr, compiled_target_addr);
    add_link(target_addr,stub);
  }
  else set_jump_target(branch_addr, stub);
  if(likely[i]) {
    // Not-taken path
    set_jump_target(nottaken, out);
    wb_dirtys(regs[i].regmap,regs[i].dirty);
    void *branch_addr=out;
    emit_jmp(0);
    int target_addr=start+i*4+8;
    void *stub=out;
    void *compiled_target_addr=check_addr(target_addr);
    emit_extjump_ds(branch_addr, target_addr);
    if(compiled_target_addr) {
      set_jump_target(branch_addr, compiled_target_addr);
      add_link(target_addr,stub);
    }
    else set_jump_target(branch_addr, stub);
  }
}
5873
// Assemble the delay slot for the above (a block entered at a branch
// whose target spans a page boundary, so the delay slot at 'start' must
// be compiled as the block's first instruction).
// NOTE(review): vaddr=start+1 (odd address) apparently tags this entry
// as a delay-slot entry point in the jump_in/jump_dirty lists — confirm
// against the lookup side.
static void pagespan_ds()
{
  assem_debug("initial delay slot:\n");
  u_int vaddr=start+1;
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  // Register this entry point and emit the dirty-check stub for it
  ll_add(jump_dirty+vpage,vaddr,(void *)out);
  do_dirty_stub_ds();
  ll_add(jump_in+page,vaddr,(void *)out);
  assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
  // Spill cycle count / branch target if they don't stay in their
  // dedicated host registers across this instruction
  if(regs[0].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty);
  if(regs[0].regmap[HOST_BTREG]!=BTREG)
    emit_writeword(HOST_BTREG,&branch_target);
  load_regs(regs[0].regmap_entry,regs[0].regmap,rs1[0],rs2[0]);
  address_generation(0,&regs[0],regs[0].regmap_entry);
  // Stores (incl. SWC1/SWC2 via the opcode mask) also need the
  // invalid-code pointer loaded for self-modifying-code checks
  if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
    load_regs(regs[0].regmap_entry,regs[0].regmap,INVCP,INVCP);
  is_delayslot=0;
  // Assemble the single delay-slot instruction by type
  switch(itype[0]) {
    case ALU:
      alu_assemble(0,&regs[0]);break;
    case IMM16:
      imm16_assemble(0,&regs[0]);break;
    case SHIFT:
      shift_assemble(0,&regs[0]);break;
    case SHIFTIMM:
      shiftimm_assemble(0,&regs[0]);break;
    case LOAD:
      load_assemble(0,&regs[0]);break;
    case LOADLR:
      loadlr_assemble(0,&regs[0]);break;
    case STORE:
      store_assemble(0,&regs[0]);break;
    case STORELR:
      storelr_assemble(0,&regs[0]);break;
    case COP0:
      cop0_assemble(0,&regs[0]);break;
    case COP1:
      cop1_assemble(0,&regs[0]);break;
    case C1LS:
      c1ls_assemble(0,&regs[0]);break;
    case COP2:
      cop2_assemble(0,&regs[0]);break;
    case C2LS:
      c2ls_assemble(0,&regs[0]);break;
    case C2OP:
      c2op_assemble(0,&regs[0]);break;
    case MULTDIV:
      multdiv_assemble(0,&regs[0]);break;
    case MOV:
      mov_assemble(0,&regs[0]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Fetch the branch target: from a host register if mapped, otherwise
  // reload it into any free register from the branch_target variable
  int btaddr=get_reg(regs[0].regmap,BTREG);
  if(btaddr<0) {
    btaddr=get_reg(regs[0].regmap,-1);
    emit_readword(&branch_target,btaddr);
  }
  assert(btaddr!=HOST_CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
  // Compare the branch target against the fall-through address start+4
#ifdef HOST_IMM8
  // Host immediates are limited to 8 bits; build the constant in a temp
  host_tempreg_acquire();
  emit_movimm(start+4,HOST_TEMPREG);
  emit_cmp(btaddr,HOST_TEMPREG);
  host_tempreg_release();
#else
  emit_cmpimm(btaddr,start+4);
#endif
  void *branch = out;
  emit_jeq(0);
  // Target differs from fall-through: flush regs and do an indirect jump
  store_regs_bt(regs[0].regmap,regs[0].dirty,-1);
  do_jump_vaddr(btaddr);
  // Target equals start+4: fall through into the next instruction
  set_jump_target(branch, out);
  store_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
  load_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
}
5960
// Basic liveness analysis for MIPS registers
// Backwards scan over [istart,iend] computing unneeded_reg[i] (one bit
// per guest register; a set bit means the register is overwritten
// before its next read, so its value need not be preserved at i) and
// gte_unneeded[i] (same idea for GTE/COP2 registers).  Bit 0 (r0) is
// always unneeded.  'r' is the recursion depth used to bound re-analysis
// of backward-branch targets.
void unneeded_registers(int istart,int iend,int r)
{
  int i;
  uint64_t u,gte_u,b,gte_b;
  uint64_t temp_u,temp_gte_u=0;
  uint64_t gte_u_unknown=0;
  // With the GTE-unneeded hack, treat all GTE regs as dead at unknown
  // exits instead of conservatively live
  if (HACK_ENABLED(NDHACK_GTE_UNNEEDED))
    gte_u_unknown=~0ll;
  if(iend==slen-1) {
    // End of block: nothing after it, only r0 is known-unneeded
    u=1;
    gte_u=gte_u_unknown;
  }else{
    //u=unneeded_reg[iend+1];
    u=1;
    gte_u=gte_unneeded[iend+1];
  }

  for (i=iend;i>=istart;i--)
  {
    //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
    {
      // If subroutine call, flag return address as a possible branch target
      if(rt1[i]==31 && i<slen-2) bt[i+2]=1;

      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, flush all regs
        u=1;
        gte_u=gte_u_unknown;
        branch_unneeded_reg[i]=u;
        // Merge in delay slot
        u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
        u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
        u|=1;
        gte_u|=gte_rt[i+1];
        gte_u&=~gte_rs[i+1];
        // If branch is "likely" (and conditional)
        // then we skip the delay slot on the fall-thru path
        if(likely[i]) {
          if(i<slen-1) {
            u&=unneeded_reg[i+2];
            gte_u&=gte_unneeded[i+2];
          }
          else
          {
            u=1;
            gte_u=gte_u_unknown;
          }
        }
      }
      else
      {
        // Internal branch, flag target
        bt[(ba[i]-start)>>2]=1;
        if(ba[i]<=start+i*4) {
          // Backward branch
          if(is_ujump(i))
          {
            // Unconditional branch
            temp_u=1;
            temp_gte_u=0;
          } else {
            // Conditional branch (not taken case)
            temp_u=unneeded_reg[i+2];
            temp_gte_u&=gte_unneeded[i+2];
          }
          // Merge in delay slot
          temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
          temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
          temp_u|=1;
          temp_gte_u|=gte_rt[i+1];
          temp_gte_u&=~gte_rs[i+1];
          // If branch is "likely" (and conditional)
          // then we skip the delay slot on the fall-thru path
          if(likely[i]) {
            if(i<slen-1) {
              temp_u&=unneeded_reg[i+2];
              temp_gte_u&=gte_unneeded[i+2];
            }
            else
            {
              temp_u=1;
              temp_gte_u=gte_u_unknown;
            }
          }
          // Merge in the branch instruction itself
          temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
          temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
          temp_u|=1;
          temp_gte_u|=gte_rt[i];
          temp_gte_u&=~gte_rs[i];
          unneeded_reg[i]=temp_u;
          gte_unneeded[i]=temp_gte_u;
          // Only go three levels deep.  This recursion can take an
          // excessive amount of time if there are a lot of nested loops.
          if(r<2) {
            unneeded_registers((ba[i]-start)>>2,i-1,r+1);
          }else{
            // Depth limit hit: be conservative at the loop head
            unneeded_reg[(ba[i]-start)>>2]=1;
            gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
          }
        } /*else*/ if(1) {
          if (is_ujump(i))
          {
            // Unconditional branch
            u=unneeded_reg[(ba[i]-start)>>2];
            gte_u=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=u;
            // Merge in delay slot
            u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
            u|=1;
            gte_u|=gte_rt[i+1];
            gte_u&=~gte_rs[i+1];
          } else {
            // Conditional branch
            b=unneeded_reg[(ba[i]-start)>>2];
            gte_b=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=b;
            // Branch delay slot
            b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
            b|=1;
            gte_b|=gte_rt[i+1];
            gte_b&=~gte_rs[i+1];
            // If branch is "likely" then we skip the
            // delay slot on the fall-thru path
            if(likely[i]) {
              u=b;
              gte_u=gte_b;
              if(i<slen-1) {
                u&=unneeded_reg[i+2];
                gte_u&=gte_unneeded[i+2];
              }
            } else {
              // Unneeded only if unneeded on both taken and not-taken paths
              u&=b;
              gte_u&=gte_b;
            }
            if(i<slen-1) {
              branch_unneeded_reg[i]&=unneeded_reg[i+2];
            } else {
              branch_unneeded_reg[i]=1;
            }
          }
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      // Leaves the block via the interpreter; all regs must be flushed
      u=1;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      u=1;
    }
    //u=1; // DEBUG
    // Written registers are unneeded
    u|=1LL<<rt1[i];
    u|=1LL<<rt2[i];
    gte_u|=gte_rt[i];
    // Accessed registers are needed
    u&=~(1LL<<rs1[i]);
    u&=~(1LL<<rs2[i]);
    gte_u&=~gte_rs[i];
    if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
      gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
    // Source-target dependencies
    // R0 is always unneeded
    u|=1;
    // Save it
    unneeded_reg[i]=u;
    gte_unneeded[i]=gte_u;
    /*
    printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
    printf("U:");
    int r;
    for(r=1;r<=CCREG;r++) {
      if((unneeded_reg[i]>>r)&1) {
        if(r==HIREG) printf(" HI");
        else if(r==LOREG) printf(" LO");
        else printf(" r%d",r);
      }
    }
    printf("\n");
    */
  }
}
6151
// Write back dirty registers as soon as we will no longer modify them,
// so that we don't end up with lots of writes at the branches.
// Backwards scan over [istart,iend] computing, per instruction, two
// host-register bitmaps:
//   will_dirty[i] - host regs that are certain to be dirtied again later
//   wont_dirty[i] - host regs that will not be dirtied again
// When 'wr' is nonzero the results are folded back into the
// regs[]/branch_regs[] dirty/wasdirty state; wr==0 is an analysis-only
// pass (used to limit recursion on backward branches).
// NOTE(review): the (regmap&63)>33 tests appear to exclude special
// mappings above the normal guest-register range — confirm against the
// regmap encoding.
void clean_registers(int istart,int iend,int wr)
{
  int i;
  int r;
  u_int will_dirty_i,will_dirty_next,temp_will_dirty;
  u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
  if(iend==slen-1) {
    // End of block: everything gets flushed, nothing dirtied afterwards
    will_dirty_i=will_dirty_next=0;
    wont_dirty_i=wont_dirty_next=0;
  }else{
    will_dirty_i=will_dirty_next=will_dirty[iend+1];
    wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
  }
  for (i=iend;i>=istart;i--)
  {
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
    {
      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, flush all regs
        if (is_ujump(i))
        {
          // Unconditional branch
          will_dirty_i=0;
          wont_dirty_i=0;
          // Merge in delay slot (will dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
              if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
              if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
              if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
              if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
            }
          }
        }
        else
        {
          // Conditional branch
          will_dirty_i=0;
          wont_dirty_i=wont_dirty_next;
          // Merge in delay slot (will dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if(!likely[i]) {
                // Might not dirty if likely branch is not taken
                if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              }
            }
          }
        }
        // Merge in delay slot (wont dirty)
        for(r=0;r<HOST_REGS;r++) {
          if(r!=EXCLUDE_REG) {
            if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
            if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
            if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
          }
        }
        if(wr) {
          #ifndef DESTRUCTIVE_WRITEBACK
          branch_regs[i].dirty&=wont_dirty_i;
          #endif
          branch_regs[i].dirty|=will_dirty_i;
        }
      }
      else
      {
        // Internal branch
        if(ba[i]<=start+i*4) {
          // Backward branch
          if (is_ujump(i))
          {
            // Unconditional branch
            temp_will_dirty=0;
            temp_wont_dirty=0;
            // Merge in delay slot (will dirty)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
              }
            }
          } else {
            // Conditional branch (not taken case)
            temp_will_dirty=will_dirty_next;
            temp_wont_dirty=wont_dirty_next;
            // Merge in delay slot (will dirty)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(!likely[i]) {
                  // Will not dirty if likely branch is not taken
                  if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                  if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
                  if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                  if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                  if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                }
              }
            }
          }
          // Merge in delay slot (wont dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
              if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
              if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
            }
          }
          // Deal with changed mappings
          if(i<iend) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]!=regmap_pre[i][r]) {
                  temp_will_dirty&=~(1<<r);
                  temp_wont_dirty&=~(1<<r);
                  if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
                    temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
                    temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
                  } else {
                    temp_will_dirty|=1<<r;
                    temp_wont_dirty|=1<<r;
                  }
                }
              }
            }
          }
          if(wr) {
            will_dirty[i]=temp_will_dirty;
            wont_dirty[i]=temp_wont_dirty;
            // Re-analyze the loop body up to the target (analysis-only)
            clean_registers((ba[i]-start)>>2,i-1,0);
          }else{
            // Limit recursion.  It can take an excessive amount
            // of time if there are a lot of nested loops.
            will_dirty[(ba[i]-start)>>2]=0;
            wont_dirty[(ba[i]-start)>>2]=-1;
          }
        }
        /*else*/ if(1)
        {
          if (is_ujump(i))
          {
            // Unconditional branch
            will_dirty_i=0;
            wont_dirty_i=0;
          //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
            // Inherit state from the branch target where mappings match
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                if(branch_regs[i].regmap[r]>=0) {
                  will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
                  wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
                }
              }
            }
          //}
            // Merge in delay slot
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              }
            }
          } else {
            // Conditional branch
            will_dirty_i=will_dirty_next;
            wont_dirty_i=wont_dirty_next;
          //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                signed char target_reg=branch_regs[i].regmap[r];
                if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  // Must be dirty on both paths; won't-dirty merges in
                  will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                else if(target_reg>=0) {
                  will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                  wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                }
                // Treat delay slot as part of branch too
                /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                else
                {
                  will_dirty[i+1]&=~(1<<r);
                }*/
              }
            }
          //}
            // Merge in delay slot
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(!likely[i]) {
                  // Might not dirty if likely branch is not taken
                  if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                  if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                  if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                  if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                  if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                }
              }
            }
          }
          // Merge in delay slot (won't dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
              if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
              if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
            }
          }
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            branch_regs[i].dirty&=wont_dirty_i;
            #endif
            branch_regs[i].dirty|=will_dirty_i;
          }
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      // Block exit via interpreter: no carried dirty state
      will_dirty_i=0;
      wont_dirty_i=0;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      will_dirty_i=0;
      wont_dirty_i=0;
    }
    will_dirty_next=will_dirty_i;
    wont_dirty_next=wont_dirty_i;
    // Fold in this instruction's own writes
    for(r=0;r<HOST_REGS;r++) {
      if(r!=EXCLUDE_REG) {
        if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
        if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
        if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
        if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
        if(i>istart) {
          if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP)
          {
            // Don't store a register immediately after writing it,
            // may prevent dual-issue.
            if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
          }
        }
      }
    }
    // Save it
    will_dirty[i]=will_dirty_i;
    wont_dirty[i]=wont_dirty_i;
    // Mark registers that won't be dirtied as not dirty
    if(wr) {
      /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
      for(r=0;r<HOST_REGS;r++) {
        if((will_dirty_i>>r)&1) {
          printf(" r%d",r);
        }
      }
      printf("\n");*/

      //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP)) {
        regs[i].dirty|=will_dirty_i;
        #ifndef DESTRUCTIVE_WRITEBACK
        regs[i].dirty&=wont_dirty_i;
        // Propagate wasdirty to the successor instruction (skipping the
        // delay slot after a conditional branch)
        if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
        {
          if (i < iend-1 && !is_ujump(i)) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
                  regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
                }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
              }
            }
          }
        }
        else
        {
          if(i<iend) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
                  regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
                }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
              }
            }
          }
        }
        #endif
      //}
    }
    // Deal with changed mappings
    temp_will_dirty=will_dirty_i;
    temp_wont_dirty=wont_dirty_i;
    for(r=0;r<HOST_REGS;r++) {
      if(r!=EXCLUDE_REG) {
        int nr;
        if(regs[i].regmap[r]==regmap_pre[i][r]) {
          // Mapping unchanged across this instruction
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            regs[i].wasdirty&=wont_dirty_i|~(1<<r);
            #endif
            regs[i].wasdirty|=will_dirty_i&(1<<r);
          }
        }
        else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
          // Register moved to a different register
          will_dirty_i&=~(1<<r);
          wont_dirty_i&=~(1<<r);
          will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
          wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            regs[i].wasdirty&=wont_dirty_i|~(1<<r);
            #endif
            regs[i].wasdirty|=will_dirty_i&(1<<r);
          }
        }
        else {
          // Mapping dropped; only unneeded guest regs stay flagged
          will_dirty_i&=~(1<<r);
          wont_dirty_i&=~(1<<r);
          if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
            will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
            wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
          } else {
            wont_dirty_i|=1<<r;
            /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
          }
        }
      }
    }
  }
}
6585
6586 #ifdef DISASM
6587   /* disassembly */
/* Print one decoded instruction (slot i of the block being compiled) in a
 * MIPS-assembly-like form.  Output is driven by the pass-1 decode arrays
 * (itype/opcode/opcode2, insn, rs1/rs2/rt1, imm, ba) rather than by
 * re-decoding source[i] from scratch, so it shows what the recompiler
 * actually decoded.  A leading '*' marks a branch target (bt[i] set). */
void disassemble_inst(int i)
{
    if (bt[i]) printf("*"); else printf(" ");
    switch(itype[i]) {
      case UJUMP:
        printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
      case CJUMP:
        // branch target re-derived from the 16-bit offset in source[i];
        // for i==0 the precomputed *ba is printed instead
        printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
      case SJUMP:
        printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
      case RJUMP:
        // JALR with a non-r31 destination prints both rd and rs
        if (opcode[i]==0x9&&rt1[i]!=31)
          printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
        else
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        break;
      case SPAN:
        printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
      case IMM16:
        if(opcode[i]==0xf) //LUI
          printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
        else
          printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case LOAD:
      case LOADLR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case STORE:
      case STORELR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
        break;
      case ALU:
      case SHIFT:
        printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
        break;
      case MULTDIV:
        printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
        break;
      case SHIFTIMM:
        printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case MOV:
        // (op2&0x1d)==0x10 -> MFHI/MFLO (writes rd), ==0x11 -> MTHI/MTLO (reads rs)
        if((opcode2[i]&0x1d)==0x10)
          printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
        else if((opcode2[i]&0x1d)==0x11)
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        else
          printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP0:
        if(opcode2[i]==0)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
        else if(opcode2[i]==4)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP1:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP2:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case C1LS:
        printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case C2LS:
        printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case INTCALL:
        printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
        break;
      default:
        //printf (" %s %8x\n",insn[i],source[i]);
        printf (" %x: %s\n",start+i*4,insn[i]);
    }
}
6673 #else
6674 static void disassemble_inst(int i) {}
6675 #endif // DISASM
6676
6677 #define DRC_TEST_VAL 0x74657374
6678
6679 static void new_dynarec_test(void)
6680 {
6681   int (*testfunc)(void);
6682   void *beginning;
6683   int ret[2];
6684   size_t i;
6685
6686   // check structure linkage
6687   if ((u_char *)rcnts - (u_char *)&psxRegs != sizeof(psxRegs))
6688   {
6689     SysPrintf("linkage_arm* miscompilation/breakage detected.\n");
6690   }
6691
6692   SysPrintf("testing if we can run recompiled code...\n");
6693   ((volatile u_int *)out)[0]++; // make cache dirty
6694
6695   for (i = 0; i < ARRAY_SIZE(ret); i++) {
6696     out = ndrc->translation_cache;
6697     beginning = start_block();
6698     emit_movimm(DRC_TEST_VAL + i, 0); // test
6699     emit_ret();
6700     literal_pool(0);
6701     end_block(beginning);
6702     testfunc = beginning;
6703     ret[i] = testfunc();
6704   }
6705
6706   if (ret[0] == DRC_TEST_VAL && ret[1] == DRC_TEST_VAL + 1)
6707     SysPrintf("test passed.\n");
6708   else
6709     SysPrintf("test failed, will likely crash soon (r=%08x %08x)\n", ret[0], ret[1]);
6710   out = ndrc->translation_cache;
6711 }
6712
6713 // clear the state completely, instead of just marking
6714 // things invalid like invalidate_all_pages() does
6715 void new_dynarec_clear_full(void)
6716 {
6717   int n;
6718   out = ndrc->translation_cache;
6719   memset(invalid_code,1,sizeof(invalid_code));
6720   memset(hash_table,0xff,sizeof(hash_table));
6721   memset(mini_ht,-1,sizeof(mini_ht));
6722   memset(restore_candidate,0,sizeof(restore_candidate));
6723   memset(shadow,0,sizeof(shadow));
6724   copy=shadow;
6725   expirep=16384; // Expiry pointer, +2 blocks
6726   pending_exception=0;
6727   literalcount=0;
6728   stop_after_jal=0;
6729   inv_code_start=inv_code_end=~0;
6730   // TLB
6731   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6732   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6733   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6734 }
6735
/* One-time dynarec setup: obtain an executable translation cache (platform
 * dependent), reset all recompiler state and run the emit self-test. */
void new_dynarec_init(void)
{
  SysPrintf("Init new dynarec\n");

#ifdef BASE_ADDR_DYNAMIC
  #ifdef VITA
  // Vita: no RWX mmap; allocate an executable block through the kernel
  sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
  if (sceBlock < 0)
    SysPrintf("sceKernelAllocMemBlockForVM failed\n");
  int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&ndrc);
  if (ret < 0)
    SysPrintf("sceKernelGetMemBlockBase failed\n");
  #else
  uintptr_t desired_addr = 0;
  #ifdef __ELF__
  // hint: place the cache just past the loaded image, 16MB-aligned
  // (presumably to keep it within direct-branch range of the code —
  // NOTE(review): confirm against the arch backends)
  extern char _end;
  desired_addr = ((uintptr_t)&_end + 0xffffff) & ~0xffffffl;
  #endif
  ndrc = mmap((void *)desired_addr, sizeof(*ndrc),
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (ndrc == MAP_FAILED) {
    SysPrintf("mmap() failed: %s\n", strerror(errno));
    abort();
  }
  #endif
#else
  // ndrc lives in the data segment at a fixed address
  #ifndef NO_WRITE_EXEC
  // not all systems allow execute in data segment by default
  if (mprotect(ndrc, sizeof(ndrc->translation_cache) + sizeof(ndrc->tramp.ops),
               PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    SysPrintf("mprotect() failed: %s\n", strerror(errno));
  #endif
#endif
  out = ndrc->translation_cache;
  // default cycle scale; BIOS code overrides this to the same value in
  // get_source_start() — see there for units
  cycle_multiplier=200;
  new_dynarec_clear_full();
#ifdef HOST_IMM8
  // Copy this into local area so we don't have to put it in every literal pool
  invc_ptr=invalid_code;
#endif
  arch_init();
  new_dynarec_test();
#ifndef RAM_FIXED
  ram_offset=(uintptr_t)rdram-0x80000000;
#endif
  if (ram_offset!=0)
    SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
}
6785
/* Tear down what new_dynarec_init() created: release the translation
 * cache mapping (platform dependent) and free all per-page block lists. */
void new_dynarec_cleanup(void)
{
  int n;
#ifdef BASE_ADDR_DYNAMIC
  #ifdef VITA
  sceKernelFreeMemBlock(sceBlock);
  sceBlock = -1;
  #else
  if (munmap(ndrc, sizeof(*ndrc)) < 0)
    SysPrintf("munmap() failed\n");
  #endif
#endif
  // free all linked-list entries (one list head per 4K page)
  for(n=0;n<4096;n++) ll_clear(jump_in+n);
  for(n=0;n<4096;n++) ll_clear(jump_out+n);
  for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
  #ifdef ROM_COPY
  if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
  #endif
}
6805
6806 static u_int *get_source_start(u_int addr, u_int *limit)
6807 {
6808   if (!HACK_ENABLED(NDHACK_OVERRIDE_CYCLE_M))
6809     cycle_multiplier_override = 0;
6810
6811   if (addr < 0x00200000 ||
6812     (0xa0000000 <= addr && addr < 0xa0200000))
6813   {
6814     // used for BIOS calls mostly?
6815     *limit = (addr&0xa0000000)|0x00200000;
6816     return (u_int *)(rdram + (addr&0x1fffff));
6817   }
6818   else if (!Config.HLE && (
6819     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
6820     (0xbfc00000 <= addr && addr < 0xbfc80000)))
6821   {
6822     // BIOS. The multiplier should be much higher as it's uncached 8bit mem,
6823     // but timings in PCSX are too tied to the interpreter's BIAS
6824     if (!HACK_ENABLED(NDHACK_OVERRIDE_CYCLE_M))
6825       cycle_multiplier_override = 200;
6826
6827     *limit = (addr & 0xfff00000) | 0x80000;
6828     return (u_int *)((u_char *)psxR + (addr&0x7ffff));
6829   }
6830   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
6831     *limit = (addr & 0x80600000) + 0x00200000;
6832     return (u_int *)(rdram + (addr&0x1fffff));
6833   }
6834   return NULL;
6835 }
6836
6837 static u_int scan_for_ret(u_int addr)
6838 {
6839   u_int limit = 0;
6840   u_int *mem;
6841
6842   mem = get_source_start(addr, &limit);
6843   if (mem == NULL)
6844     return addr;
6845
6846   if (limit > addr + 0x1000)
6847     limit = addr + 0x1000;
6848   for (; addr < limit; addr += 4, mem++) {
6849     if (*mem == 0x03e00008) // jr $ra
6850       return addr + 8;
6851   }
6852   return addr;
6853 }
6854
// One compiled-block record stored in a savestate: the block's start
// address plus a bitmask of GPRs that held scratchpad-looking values
// (0x1f80xxxx) when it was compiled, used to re-seed speculation on load.
struct savestate_block {
  uint32_t addr;
  uint32_t regflags;
};

// qsort comparator ordering savestate blocks by start address.
// Uses explicit comparisons rather than `p1->addr - p2->addr`: that u32
// difference truncated to int flips sign once the addresses differ by
// 2^31 or more (e.g. 0x9xxxxxxx vs 0x1xxxxxxx), producing a broken order.
static int addr_cmp(const void *p1_, const void *p2_)
{
  const struct savestate_block *p1 = p1_, *p2 = p2_;
  return (p1->addr > p2->addr) - (p1->addr < p2->addr);
}
6865
6866 int new_dynarec_save_blocks(void *save, int size)
6867 {
6868   struct savestate_block *blocks = save;
6869   int maxcount = size / sizeof(blocks[0]);
6870   struct savestate_block tmp_blocks[1024];
6871   struct ll_entry *head;
6872   int p, s, d, o, bcnt;
6873   u_int addr;
6874
6875   o = 0;
6876   for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
6877     bcnt = 0;
6878     for (head = jump_in[p]; head != NULL; head = head->next) {
6879       tmp_blocks[bcnt].addr = head->vaddr;
6880       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
6881       bcnt++;
6882     }
6883     if (bcnt < 1)
6884       continue;
6885     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
6886
6887     addr = tmp_blocks[0].addr;
6888     for (s = d = 0; s < bcnt; s++) {
6889       if (tmp_blocks[s].addr < addr)
6890         continue;
6891       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
6892         tmp_blocks[d++] = tmp_blocks[s];
6893       addr = scan_for_ret(tmp_blocks[s].addr);
6894     }
6895
6896     if (o + d > maxcount)
6897       d = maxcount - o;
6898     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
6899     o += d;
6900   }
6901
6902   return o * sizeof(blocks[0]);
6903 }
6904
6905 void new_dynarec_load_blocks(const void *save, int size)
6906 {
6907   const struct savestate_block *blocks = save;
6908   int count = size / sizeof(blocks[0]);
6909   u_int regs_save[32];
6910   uint32_t f;
6911   int i, b;
6912
6913   get_addr(psxRegs.pc);
6914
6915   // change GPRs for speculation to at least partially work..
6916   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
6917   for (i = 1; i < 32; i++)
6918     psxRegs.GPR.r[i] = 0x80000000;
6919
6920   for (b = 0; b < count; b++) {
6921     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6922       if (f & 1)
6923         psxRegs.GPR.r[i] = 0x1f800000;
6924     }
6925
6926     get_addr(blocks[b].addr);
6927
6928     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6929       if (f & 1)
6930         psxRegs.GPR.r[i] = 0x80000000;
6931     }
6932   }
6933
6934   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
6935 }
6936
6937 int new_recompile_block(u_int addr)
6938 {
6939   u_int pagelimit = 0;
6940   u_int state_rflags = 0;
6941   int i;
6942
6943   assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out);
6944   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
6945   //if(debug)
6946   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
6947
6948   // this is just for speculation
6949   for (i = 1; i < 32; i++) {
6950     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
6951       state_rflags |= 1 << i;
6952   }
6953
6954   start = (u_int)addr&~3;
6955   //assert(((u_int)addr&1)==0); // start-in-delay-slot flag
6956   new_dynarec_did_compile=1;
6957   if (Config.HLE && start == 0x80001000) // hlecall
6958   {
6959     // XXX: is this enough? Maybe check hleSoftCall?
6960     void *beginning=start_block();
6961     u_int page=get_page(start);
6962
6963     invalid_code[start>>12]=0;
6964     emit_movimm(start,0);
6965     emit_writeword(0,&pcaddr);
6966     emit_far_jump(new_dyna_leave);
6967     literal_pool(0);
6968     end_block(beginning);
6969     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
6970     return 0;
6971   }
6972
6973   source = get_source_start(start, &pagelimit);
6974   if (source == NULL) {
6975     SysPrintf("Compile at bogus memory address: %08x\n", addr);
6976     abort();
6977   }
6978
6979   /* Pass 1: disassemble */
6980   /* Pass 2: register dependencies, branch targets */
6981   /* Pass 3: register allocation */
6982   /* Pass 4: branch dependencies */
6983   /* Pass 5: pre-alloc */
6984   /* Pass 6: optimize clean/dirty state */
6985   /* Pass 7: flag 32-bit registers */
6986   /* Pass 8: assembly */
6987   /* Pass 9: linker */
6988   /* Pass 10: garbage collection / free memory */
6989
6990   int j;
6991   int done=0;
6992   unsigned int type,op,op2;
6993
6994   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
6995
6996   /* Pass 1 disassembly */
6997
6998   for(i=0;!done;i++) {
6999     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7000     minimum_free_regs[i]=0;
7001     opcode[i]=op=source[i]>>26;
7002     switch(op)
7003     {
7004       case 0x00: strcpy(insn[i],"special"); type=NI;
7005         op2=source[i]&0x3f;
7006         switch(op2)
7007         {
7008           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7009           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7010           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7011           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7012           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7013           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7014           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7015           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7016           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7017           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7018           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7019           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7020           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7021           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7022           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7023           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7024           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7025           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7026           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7027           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7028           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7029           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7030           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7031           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7032           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7033           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7034           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7035           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7036           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7037           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7038           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7039           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7040           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7041           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7042           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7043 #if 0
7044           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7045           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7046           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7047           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7048           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7049           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7050           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7051           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7052           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7053           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7054           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7055           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7056           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7057           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7058           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7059           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7060           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7061 #endif
7062         }
7063         break;
7064       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7065         op2=(source[i]>>16)&0x1f;
7066         switch(op2)
7067         {
7068           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7069           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7070           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7071           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7072           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7073           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7074           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7075           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7076           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7077           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7078           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7079           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7080           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7081           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7082         }
7083         break;
7084       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7085       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7086       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7087       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7088       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7089       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7090       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7091       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7092       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7093       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7094       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7095       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7096       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7097       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7098       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7099         op2=(source[i]>>21)&0x1f;
7100         switch(op2)
7101         {
7102           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7103           case 0x02: strcpy(insn[i],"CFC0"); type=COP0; break;
7104           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7105           case 0x06: strcpy(insn[i],"CTC0"); type=COP0; break;
7106           case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7107         }
7108         break;
7109       case 0x11: strcpy(insn[i],"cop1"); type=COP1;
7110         op2=(source[i]>>21)&0x1f;
7111         break;
7112 #if 0
7113       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7114       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7115       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7116       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7117       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7118       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7119       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7120       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7121 #endif
7122       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7123       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7124       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7125       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7126       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7127       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7128       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7129 #if 0
7130       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7131 #endif
7132       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7133       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7134       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7135       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7136 #if 0
7137       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7138       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7139 #endif
7140       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7141       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7142       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7143       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7144 #if 0
7145       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7146       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7147       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7148 #endif
7149       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7150       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7151 #if 0
7152       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7153       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7154       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7155 #endif
7156       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7157         op2=(source[i]>>21)&0x1f;
7158         //if (op2 & 0x10)
7159         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7160           if (gte_handlers[source[i]&0x3f]!=NULL) {
7161             if (gte_regnames[source[i]&0x3f]!=NULL)
7162               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7163             else
7164               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7165             type=C2OP;
7166           }
7167         }
7168         else switch(op2)
7169         {
7170           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7171           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7172           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7173           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7174         }
7175         break;
7176       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7177       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7178       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7179       default: strcpy(insn[i],"???"); type=NI;
7180         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7181         break;
7182     }
7183     itype[i]=type;
7184     opcode2[i]=op2;
7185     /* Get registers/immediates */
7186     lt1[i]=0;
7187     dep1[i]=0;
7188     dep2[i]=0;
7189     gte_rs[i]=gte_rt[i]=0;
7190     switch(type) {
7191       case LOAD:
7192         rs1[i]=(source[i]>>21)&0x1f;
7193         rs2[i]=0;
7194         rt1[i]=(source[i]>>16)&0x1f;
7195         rt2[i]=0;
7196         imm[i]=(short)source[i];
7197         break;
7198       case STORE:
7199       case STORELR:
7200         rs1[i]=(source[i]>>21)&0x1f;
7201         rs2[i]=(source[i]>>16)&0x1f;
7202         rt1[i]=0;
7203         rt2[i]=0;
7204         imm[i]=(short)source[i];
7205         break;
7206       case LOADLR:
7207         // LWL/LWR only load part of the register,
7208         // therefore the target register must be treated as a source too
7209         rs1[i]=(source[i]>>21)&0x1f;
7210         rs2[i]=(source[i]>>16)&0x1f;
7211         rt1[i]=(source[i]>>16)&0x1f;
7212         rt2[i]=0;
7213         imm[i]=(short)source[i];
7214         if(op==0x26) dep1[i]=rt1[i]; // LWR
7215         break;
7216       case IMM16:
7217         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7218         else rs1[i]=(source[i]>>21)&0x1f;
7219         rs2[i]=0;
7220         rt1[i]=(source[i]>>16)&0x1f;
7221         rt2[i]=0;
7222         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7223           imm[i]=(unsigned short)source[i];
7224         }else{
7225           imm[i]=(short)source[i];
7226         }
7227         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7228         break;
7229       case UJUMP:
7230         rs1[i]=0;
7231         rs2[i]=0;
7232         rt1[i]=0;
7233         rt2[i]=0;
7234         // The JAL instruction writes to r31.
7235         if (op&1) {
7236           rt1[i]=31;
7237         }
7238         rs2[i]=CCREG;
7239         break;
7240       case RJUMP:
7241         rs1[i]=(source[i]>>21)&0x1f;
7242         rs2[i]=0;
7243         rt1[i]=0;
7244         rt2[i]=0;
7245         // The JALR instruction writes to rd.
7246         if (op2&1) {
7247           rt1[i]=(source[i]>>11)&0x1f;
7248         }
7249         rs2[i]=CCREG;
7250         break;
7251       case CJUMP:
7252         rs1[i]=(source[i]>>21)&0x1f;
7253         rs2[i]=(source[i]>>16)&0x1f;
7254         rt1[i]=0;
7255         rt2[i]=0;
7256         if(op&2) { // BGTZ/BLEZ
7257           rs2[i]=0;
7258         }
7259         likely[i]=op>>4;
7260         break;
7261       case SJUMP:
7262         rs1[i]=(source[i]>>21)&0x1f;
7263         rs2[i]=CCREG;
7264         rt1[i]=0;
7265         rt2[i]=0;
7266         if(op2&0x10) { // BxxAL
7267           rt1[i]=31;
7268           // NOTE: If the branch is not taken, r31 is still overwritten
7269         }
7270         likely[i]=(op2&2)>>1;
7271         break;
7272       case ALU:
7273         rs1[i]=(source[i]>>21)&0x1f; // source
7274         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7275         rt1[i]=(source[i]>>11)&0x1f; // destination
7276         rt2[i]=0;
7277         if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7278           dep1[i]=rs1[i];dep2[i]=rs2[i];
7279         }
7280         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7281           dep1[i]=rs1[i];dep2[i]=rs2[i];
7282         }
7283         break;
7284       case MULTDIV:
7285         rs1[i]=(source[i]>>21)&0x1f; // source
7286         rs2[i]=(source[i]>>16)&0x1f; // divisor
7287         rt1[i]=HIREG;
7288         rt2[i]=LOREG;
7289         break;
7290       case MOV:
7291         rs1[i]=0;
7292         rs2[i]=0;
7293         rt1[i]=0;
7294         rt2[i]=0;
7295         if(op2==0x10) rs1[i]=HIREG; // MFHI
7296         if(op2==0x11) rt1[i]=HIREG; // MTHI
7297         if(op2==0x12) rs1[i]=LOREG; // MFLO
7298         if(op2==0x13) rt1[i]=LOREG; // MTLO
7299         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7300         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7301         dep1[i]=rs1[i];
7302         break;
7303       case SHIFT:
7304         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7305         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7306         rt1[i]=(source[i]>>11)&0x1f; // destination
7307         rt2[i]=0;
7308         break;
7309       case SHIFTIMM:
7310         rs1[i]=(source[i]>>16)&0x1f;
7311         rs2[i]=0;
7312         rt1[i]=(source[i]>>11)&0x1f;
7313         rt2[i]=0;
7314         imm[i]=(source[i]>>6)&0x1f;
7315         // DSxx32 instructions
7316         if(op2>=0x3c) imm[i]|=0x20;
7317         break;
7318       case COP0:
7319         rs1[i]=0;
7320         rs2[i]=0;
7321         rt1[i]=0;
7322         rt2[i]=0;
7323         if(op2==0||op2==2) rt1[i]=(source[i]>>16)&0x1F; // MFC0/CFC0
7324         if(op2==4||op2==6) rs1[i]=(source[i]>>16)&0x1F; // MTC0/CTC0
7325         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7326         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7327         break;
7328       case COP1:
7329         rs1[i]=0;
7330         rs2[i]=0;
7331         rt1[i]=0;
7332         rt2[i]=0;
7333         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7334         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7335         rs2[i]=CSREG;
7336         break;
7337       case COP2:
7338         rs1[i]=0;
7339         rs2[i]=0;
7340         rt1[i]=0;
7341         rt2[i]=0;
7342         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7343         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7344         rs2[i]=CSREG;
7345         int gr=(source[i]>>11)&0x1F;
7346         switch(op2)
7347         {
7348           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7349           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7350           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7351           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7352         }
7353         break;
7354       case C1LS:
7355         rs1[i]=(source[i]>>21)&0x1F;
7356         rs2[i]=CSREG;
7357         rt1[i]=0;
7358         rt2[i]=0;
7359         imm[i]=(short)source[i];
7360         break;
7361       case C2LS:
7362         rs1[i]=(source[i]>>21)&0x1F;
7363         rs2[i]=0;
7364         rt1[i]=0;
7365         rt2[i]=0;
7366         imm[i]=(short)source[i];
7367         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7368         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7369         break;
7370       case C2OP:
7371         rs1[i]=0;
7372         rs2[i]=0;
7373         rt1[i]=0;
7374         rt2[i]=0;
7375         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7376         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7377         gte_rt[i]|=1ll<<63; // every op changes flags
7378         if((source[i]&0x3f)==GTE_MVMVA) {
7379           int v = (source[i] >> 15) & 3;
7380           gte_rs[i]&=~0xe3fll;
7381           if(v==3) gte_rs[i]|=0xe00ll;
7382           else gte_rs[i]|=3ll<<(v*2);
7383         }
7384         break;
7385       case SYSCALL:
7386       case HLECALL:
7387       case INTCALL:
7388         rs1[i]=CCREG;
7389         rs2[i]=0;
7390         rt1[i]=0;
7391         rt2[i]=0;
7392         break;
7393       default:
7394         rs1[i]=0;
7395         rs2[i]=0;
7396         rt1[i]=0;
7397         rt2[i]=0;
7398     }
7399     /* Calculate branch target addresses */
7400     if(type==UJUMP)
7401       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7402     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7403       ba[i]=start+i*4+8; // Ignore never taken branch
7404     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7405       ba[i]=start+i*4+8; // Ignore never taken branch
7406     else if(type==CJUMP||type==SJUMP)
7407       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7408     else ba[i]=-1;
7409     if (i > 0 && is_jump(i-1)) {
7410       int do_in_intrp=0;
7411       // branch in delay slot?
7412       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP) {
7413         // don't handle first branch and call interpreter if it's hit
7414         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7415         do_in_intrp=1;
7416       }
7417       // basic load delay detection
7418       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7419         int t=(ba[i-1]-start)/4;
7420         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7421           // jump target wants DS result - potential load delay effect
7422           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7423           do_in_intrp=1;
7424           bt[t+1]=1; // expected return from interpreter
7425         }
7426         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7427               !(i>=3&&is_jump(i-3))) {
7428           // v0 overwrite like this is a sign of trouble, bail out
7429           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7430           do_in_intrp=1;
7431         }
7432       }
7433       if(do_in_intrp) {
7434         rs1[i-1]=CCREG;
7435         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7436         ba[i-1]=-1;
7437         itype[i-1]=INTCALL;
7438         done=2;
7439         i--; // don't compile the DS
7440       }
7441     }
7442     /* Is this the end of the block? */
7443     if (i > 0 && is_ujump(i-1)) {
7444       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7445         done=2;
7446       }
7447       else {
7448         if(stop_after_jal) done=1;
7449         // Stop on BREAK
7450         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7451       }
7452       // Don't recompile stuff that's already compiled
7453       if(check_addr(start+i*4+4)) done=1;
7454       // Don't get too close to the limit
7455       if(i>MAXBLOCK/2) done=1;
7456     }
7457     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7458     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7459     if(done==2) {
7460       // Does the block continue due to a branch?
7461       for(j=i-1;j>=0;j--)
7462       {
7463         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7464         if(ba[j]==start+i*4+4) done=j=0;
7465         if(ba[j]==start+i*4+8) done=j=0;
7466       }
7467     }
7468     //assert(i<MAXBLOCK-1);
7469     if(start+i*4==pagelimit-4) done=1;
7470     assert(start+i*4<pagelimit);
7471     if (i==MAXBLOCK-1) done=1;
7472     // Stop if we're compiling junk
7473     if(itype[i]==NI&&opcode[i]==0x11) {
7474       done=stop_after_jal=1;
7475       SysPrintf("Disabled speculative precompilation\n");
7476     }
7477   }
7478   slen=i;
7479   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP) {
7480     if(start+i*4==pagelimit) {
7481       itype[i-1]=SPAN;
7482     }
7483   }
7484   assert(slen>0);
7485
7486   /* Pass 2 - Register dependencies and branch targets */
7487
7488   unneeded_registers(0,slen-1,0);
7489
7490   /* Pass 3 - Register allocation */
7491
7492   struct regstat current; // Current register allocations/status
7493   current.dirty=0;
7494   current.u=unneeded_reg[0];
7495   clear_all_regs(current.regmap);
7496   alloc_reg(&current,0,CCREG);
7497   dirty_reg(&current,CCREG);
7498   current.isconst=0;
7499   current.wasconst=0;
7500   current.waswritten=0;
7501   int ds=0;
7502   int cc=0;
7503   int hr=-1;
7504
7505   if((u_int)addr&1) {
7506     // First instruction is delay slot
7507     cc=-1;
7508     bt[1]=1;
7509     ds=1;
7510     unneeded_reg[0]=1;
7511     current.regmap[HOST_BTREG]=BTREG;
7512   }
7513
7514   for(i=0;i<slen;i++)
7515   {
7516     if(bt[i])
7517     {
7518       int hr;
7519       for(hr=0;hr<HOST_REGS;hr++)
7520       {
7521         // Is this really necessary?
7522         if(current.regmap[hr]==0) current.regmap[hr]=-1;
7523       }
7524       current.isconst=0;
7525       current.waswritten=0;
7526     }
7527
7528     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
7529     regs[i].wasconst=current.isconst;
7530     regs[i].wasdirty=current.dirty;
7531     regs[i].loadedconst=0;
7532     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP) {
7533       if(i+1<slen) {
7534         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7535         current.u|=1;
7536       } else {
7537         current.u=1;
7538       }
7539     } else {
7540       if(i+1<slen) {
7541         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
7542         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7543         current.u|=1;
7544       } else { SysPrintf("oops, branch at end of block with no delay slot\n");abort(); }
7545     }
7546     is_ds[i]=ds;
7547     if(ds) {
7548       ds=0; // Skip delay slot, already allocated as part of branch
7549       // ...but we need to alloc it in case something jumps here
7550       if(i+1<slen) {
7551         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
7552       }else{
7553         current.u=branch_unneeded_reg[i-1];
7554       }
7555       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7556       current.u|=1;
7557       struct regstat temp;
7558       memcpy(&temp,&current,sizeof(current));
7559       temp.wasdirty=temp.dirty;
7560       // TODO: Take into account unconditional branches, as below
7561       delayslot_alloc(&temp,i);
7562       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
7563       regs[i].wasdirty=temp.wasdirty;
7564       regs[i].dirty=temp.dirty;
7565       regs[i].isconst=0;
7566       regs[i].wasconst=0;
7567       current.isconst=0;
7568       // Create entry (branch target) regmap
7569       for(hr=0;hr<HOST_REGS;hr++)
7570       {
7571         int r=temp.regmap[hr];
7572         if(r>=0) {
7573           if(r!=regmap_pre[i][hr]) {
7574             regs[i].regmap_entry[hr]=-1;
7575           }
7576           else
7577           {
7578               assert(r < 64);
7579               if((current.u>>r)&1) {
7580                 regs[i].regmap_entry[hr]=-1;
7581                 regs[i].regmap[hr]=-1;
7582                 //Don't clear regs in the delay slot as the branch might need them
7583                 //current.regmap[hr]=-1;
7584               }else
7585                 regs[i].regmap_entry[hr]=r;
7586           }
7587         } else {
7588           // First instruction expects CCREG to be allocated
7589           if(i==0&&hr==HOST_CCREG)
7590             regs[i].regmap_entry[hr]=CCREG;
7591           else
7592             regs[i].regmap_entry[hr]=-1;
7593         }
7594       }
7595     }
7596     else { // Not delay slot
7597       switch(itype[i]) {
7598         case UJUMP:
7599           //current.isconst=0; // DEBUG
7600           //current.wasconst=0; // DEBUG
7601           //regs[i].wasconst=0; // DEBUG
7602           clear_const(&current,rt1[i]);
7603           alloc_cc(&current,i);
7604           dirty_reg(&current,CCREG);
7605           if (rt1[i]==31) {
7606             alloc_reg(&current,i,31);
7607             dirty_reg(&current,31);
7608             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
7609             //assert(rt1[i+1]!=rt1[i]);
7610             #ifdef REG_PREFETCH
7611             alloc_reg(&current,i,PTEMP);
7612             #endif
7613           }
7614           ooo[i]=1;
7615           delayslot_alloc(&current,i+1);
7616           //current.isconst=0; // DEBUG
7617           ds=1;
7618           //printf("i=%d, isconst=%x\n",i,current.isconst);
7619           break;
7620         case RJUMP:
7621           //current.isconst=0;
7622           //current.wasconst=0;
7623           //regs[i].wasconst=0;
7624           clear_const(&current,rs1[i]);
7625           clear_const(&current,rt1[i]);
7626           alloc_cc(&current,i);
7627           dirty_reg(&current,CCREG);
7628           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
7629             alloc_reg(&current,i,rs1[i]);
7630             if (rt1[i]!=0) {
7631               alloc_reg(&current,i,rt1[i]);
7632               dirty_reg(&current,rt1[i]);
7633               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
7634               assert(rt1[i+1]!=rt1[i]);
7635               #ifdef REG_PREFETCH
7636               alloc_reg(&current,i,PTEMP);
7637               #endif
7638             }
7639             #ifdef USE_MINI_HT
7640             if(rs1[i]==31) { // JALR
7641               alloc_reg(&current,i,RHASH);
7642               alloc_reg(&current,i,RHTBL);
7643             }
7644             #endif
7645             delayslot_alloc(&current,i+1);
7646           } else {
7647             // The delay slot overwrites our source register,
7648             // allocate a temporary register to hold the old value.
7649             current.isconst=0;
7650             current.wasconst=0;
7651             regs[i].wasconst=0;
7652             delayslot_alloc(&current,i+1);
7653             current.isconst=0;
7654             alloc_reg(&current,i,RTEMP);
7655           }
7656           //current.isconst=0; // DEBUG
7657           ooo[i]=1;
7658           ds=1;
7659           break;
7660         case CJUMP:
7661           //current.isconst=0;
7662           //current.wasconst=0;
7663           //regs[i].wasconst=0;
7664           clear_const(&current,rs1[i]);
7665           clear_const(&current,rs2[i]);
7666           if((opcode[i]&0x3E)==4) // BEQ/BNE
7667           {
7668             alloc_cc(&current,i);
7669             dirty_reg(&current,CCREG);
7670             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7671             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
7672             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
7673                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
7674               // The delay slot overwrites one of our conditions.
7675               // Allocate the branch condition registers instead.
7676               current.isconst=0;
7677               current.wasconst=0;
7678               regs[i].wasconst=0;
7679               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7680               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
7681             }
7682             else
7683             {
7684               ooo[i]=1;
7685               delayslot_alloc(&current,i+1);
7686             }
7687           }
7688           else
7689           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
7690           {
7691             alloc_cc(&current,i);
7692             dirty_reg(&current,CCREG);
7693             alloc_reg(&current,i,rs1[i]);
7694             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
7695               // The delay slot overwrites one of our conditions.
7696               // Allocate the branch condition registers instead.
7697               current.isconst=0;
7698               current.wasconst=0;
7699               regs[i].wasconst=0;
7700               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7701             }
7702             else
7703             {
7704               ooo[i]=1;
7705               delayslot_alloc(&current,i+1);
7706             }
7707           }
7708           else
7709           // Don't alloc the delay slot yet because we might not execute it
7710           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
7711           {
7712             current.isconst=0;
7713             current.wasconst=0;
7714             regs[i].wasconst=0;
7715             alloc_cc(&current,i);
7716             dirty_reg(&current,CCREG);
7717             alloc_reg(&current,i,rs1[i]);
7718             alloc_reg(&current,i,rs2[i]);
7719           }
7720           else
7721           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
7722           {
7723             current.isconst=0;
7724             current.wasconst=0;
7725             regs[i].wasconst=0;
7726             alloc_cc(&current,i);
7727             dirty_reg(&current,CCREG);
7728             alloc_reg(&current,i,rs1[i]);
7729           }
7730           ds=1;
7731           //current.isconst=0;
7732           break;
7733         case SJUMP:
7734           //current.isconst=0;
7735           //current.wasconst=0;
7736           //regs[i].wasconst=0;
7737           clear_const(&current,rs1[i]);
7738           clear_const(&current,rt1[i]);
7739           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
7740           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
7741           {
7742             alloc_cc(&current,i);
7743             dirty_reg(&current,CCREG);
7744             alloc_reg(&current,i,rs1[i]);
7745             if (rt1[i]==31) { // BLTZAL/BGEZAL
7746               alloc_reg(&current,i,31);
7747               dirty_reg(&current,31);
7748               //#ifdef REG_PREFETCH
7749               //alloc_reg(&current,i,PTEMP);
7750               //#endif
7751             }
7752             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
7753                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
7754               // Allocate the branch condition registers instead.
7755               current.isconst=0;
7756               current.wasconst=0;
7757               regs[i].wasconst=0;
7758               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7759             }
7760             else
7761             {
7762               ooo[i]=1;
7763               delayslot_alloc(&current,i+1);
7764             }
7765           }
7766           else
7767           // Don't alloc the delay slot yet because we might not execute it
7768           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
7769           {
7770             current.isconst=0;
7771             current.wasconst=0;
7772             regs[i].wasconst=0;
7773             alloc_cc(&current,i);
7774             dirty_reg(&current,CCREG);
7775             alloc_reg(&current,i,rs1[i]);
7776           }
7777           ds=1;
7778           //current.isconst=0;
7779           break;
7780         case IMM16:
7781           imm16_alloc(&current,i);
7782           break;
7783         case LOAD:
7784         case LOADLR:
7785           load_alloc(&current,i);
7786           break;
7787         case STORE:
7788         case STORELR:
7789           store_alloc(&current,i);
7790           break;
7791         case ALU:
7792           alu_alloc(&current,i);
7793           break;
7794         case SHIFT:
7795           shift_alloc(&current,i);
7796           break;
7797         case MULTDIV:
7798           multdiv_alloc(&current,i);
7799           break;
7800         case SHIFTIMM:
7801           shiftimm_alloc(&current,i);
7802           break;
7803         case MOV:
7804           mov_alloc(&current,i);
7805           break;
7806         case COP0:
7807           cop0_alloc(&current,i);
7808           break;
7809         case COP1:
7810           break;
7811         case COP2:
7812           cop2_alloc(&current,i);
7813           break;
7814         case C1LS:
7815           c1ls_alloc(&current,i);
7816           break;
7817         case C2LS:
7818           c2ls_alloc(&current,i);
7819           break;
7820         case C2OP:
7821           c2op_alloc(&current,i);
7822           break;
7823         case SYSCALL:
7824         case HLECALL:
7825         case INTCALL:
7826           syscall_alloc(&current,i);
7827           break;
7828         case SPAN:
7829           pagespan_alloc(&current,i);
7830           break;
7831       }
7832
7833       // Create entry (branch target) regmap
7834       for(hr=0;hr<HOST_REGS;hr++)
7835       {
7836         int r,or;
7837         r=current.regmap[hr];
7838         if(r>=0) {
7839           if(r!=regmap_pre[i][hr]) {
7840             // TODO: delay slot (?)
7841             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
7842             if(or<0||(r&63)>=TEMPREG){
7843               regs[i].regmap_entry[hr]=-1;
7844             }
7845             else
7846             {
7847               // Just move it to a different register
7848               regs[i].regmap_entry[hr]=r;
7849               // If it was dirty before, it's still dirty
7850               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
7851             }
7852           }
7853           else
7854           {
7855             // Unneeded
7856             if(r==0){
7857               regs[i].regmap_entry[hr]=0;
7858             }
7859             else
7860             {
7861               assert(r<64);
7862               if((current.u>>r)&1) {
7863                 regs[i].regmap_entry[hr]=-1;
7864                 //regs[i].regmap[hr]=-1;
7865                 current.regmap[hr]=-1;
7866               }else
7867                 regs[i].regmap_entry[hr]=r;
7868             }
7869           }
7870         } else {
7871           // Branches expect CCREG to be allocated at the target
7872           if(regmap_pre[i][hr]==CCREG)
7873             regs[i].regmap_entry[hr]=CCREG;
7874           else
7875             regs[i].regmap_entry[hr]=-1;
7876         }
7877       }
7878       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
7879     }
7880
7881     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
7882       current.waswritten|=1<<rs1[i-1];
7883     current.waswritten&=~(1<<rt1[i]);
7884     current.waswritten&=~(1<<rt2[i]);
7885     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
7886       current.waswritten&=~(1<<rs1[i]);
7887
7888     /* Branch post-alloc */
7889     if(i>0)
7890     {
7891       current.wasdirty=current.dirty;
7892       switch(itype[i-1]) {
7893         case UJUMP:
7894           memcpy(&branch_regs[i-1],&current,sizeof(current));
7895           branch_regs[i-1].isconst=0;
7896           branch_regs[i-1].wasconst=0;
7897           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
7898           alloc_cc(&branch_regs[i-1],i-1);
7899           dirty_reg(&branch_regs[i-1],CCREG);
7900           if(rt1[i-1]==31) { // JAL
7901             alloc_reg(&branch_regs[i-1],i-1,31);
7902             dirty_reg(&branch_regs[i-1],31);
7903           }
7904           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7905           memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
7906           break;
7907         case RJUMP:
7908           memcpy(&branch_regs[i-1],&current,sizeof(current));
7909           branch_regs[i-1].isconst=0;
7910           branch_regs[i-1].wasconst=0;
7911           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
7912           alloc_cc(&branch_regs[i-1],i-1);
7913           dirty_reg(&branch_regs[i-1],CCREG);
7914           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
7915           if(rt1[i-1]!=0) { // JALR
7916             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
7917             dirty_reg(&branch_regs[i-1],rt1[i-1]);
7918           }
7919           #ifdef USE_MINI_HT
7920           if(rs1[i-1]==31) { // JALR
7921             alloc_reg(&branch_regs[i-1],i-1,RHASH);
7922             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
7923           }
7924           #endif
7925           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7926           memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
7927           break;
7928         case CJUMP:
7929           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
7930           {
7931             alloc_cc(&current,i-1);
7932             dirty_reg(&current,CCREG);
7933             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
7934                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
7935               // The delay slot overwrote one of our conditions
7936               // Delay slot goes after the test (in order)
7937               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7938               current.u|=1;
7939               delayslot_alloc(&current,i);
7940               current.isconst=0;
7941             }
7942             else
7943             {
7944               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
7945               // Alloc the branch condition registers
7946               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
7947               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
7948             }
7949             memcpy(&branch_regs[i-1],&current,sizeof(current));
7950             branch_regs[i-1].isconst=0;
7951             branch_regs[i-1].wasconst=0;
7952             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
7953             memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
7954           }
7955           else
7956           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
7957           {
7958             alloc_cc(&current,i-1);
7959             dirty_reg(&current,CCREG);
7960             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
7961               // The delay slot overwrote the branch condition
7962               // Delay slot goes after the test (in order)
7963               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7964               current.u|=1;
7965               delayslot_alloc(&current,i);
7966               current.isconst=0;
7967             }
7968             else
7969             {
7970               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
7971               // Alloc the branch condition register
7972               alloc_reg(&current,i-1,rs1[i-1]);
7973             }
7974             memcpy(&branch_regs[i-1],&current,sizeof(current));
7975             branch_regs[i-1].isconst=0;
7976             branch_regs[i-1].wasconst=0;
7977             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
7978             memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
7979           }
7980           else
7981           // Alloc the delay slot in case the branch is taken
7982           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
7983           {
7984             memcpy(&branch_regs[i-1],&current,sizeof(current));
7985             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
7986             alloc_cc(&branch_regs[i-1],i);
7987             dirty_reg(&branch_regs[i-1],CCREG);
7988             delayslot_alloc(&branch_regs[i-1],i);
7989             branch_regs[i-1].isconst=0;
7990             alloc_reg(&current,i,CCREG); // Not taken path
7991             dirty_reg(&current,CCREG);
7992             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7993           }
7994           else
7995           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
7996           {
7997             memcpy(&branch_regs[i-1],&current,sizeof(current));
7998             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
7999             alloc_cc(&branch_regs[i-1],i);
8000             dirty_reg(&branch_regs[i-1],CCREG);
8001             delayslot_alloc(&branch_regs[i-1],i);
8002             branch_regs[i-1].isconst=0;
8003             alloc_reg(&current,i,CCREG); // Not taken path
8004             dirty_reg(&current,CCREG);
8005             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8006           }
8007           break;
8008         case SJUMP:
8009           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8010           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8011           {
8012             alloc_cc(&current,i-1);
8013             dirty_reg(&current,CCREG);
8014             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8015               // The delay slot overwrote the branch condition
8016               // Delay slot goes after the test (in order)
8017               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8018               current.u|=1;
8019               delayslot_alloc(&current,i);
8020               current.isconst=0;
8021             }
8022             else
8023             {
8024               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8025               // Alloc the branch condition register
8026               alloc_reg(&current,i-1,rs1[i-1]);
8027             }
8028             memcpy(&branch_regs[i-1],&current,sizeof(current));
8029             branch_regs[i-1].isconst=0;
8030             branch_regs[i-1].wasconst=0;
8031             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8032             memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
8033           }
8034           else
8035           // Alloc the delay slot in case the branch is taken
8036           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8037           {
8038             memcpy(&branch_regs[i-1],&current,sizeof(current));
8039             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8040             alloc_cc(&branch_regs[i-1],i);
8041             dirty_reg(&branch_regs[i-1],CCREG);
8042             delayslot_alloc(&branch_regs[i-1],i);
8043             branch_regs[i-1].isconst=0;
8044             alloc_reg(&current,i,CCREG); // Not taken path
8045             dirty_reg(&current,CCREG);
8046             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8047           }
8048           // FIXME: BLTZAL/BGEZAL
8049           if(opcode2[i-1]&0x10) { // BxxZAL
8050             alloc_reg(&branch_regs[i-1],i-1,31);
8051             dirty_reg(&branch_regs[i-1],31);
8052           }
8053           break;
8054       }
8055
8056       if (is_ujump(i-1))
8057       {
8058         if(rt1[i-1]==31) // JAL/JALR
8059         {
8060           // Subroutine call will return here, don't alloc any registers
8061           current.dirty=0;
8062           clear_all_regs(current.regmap);
8063           alloc_reg(&current,i,CCREG);
8064           dirty_reg(&current,CCREG);
8065         }
8066         else if(i+1<slen)
8067         {
8068           // Internal branch will jump here, match registers to caller
8069           current.dirty=0;
8070           clear_all_regs(current.regmap);
8071           alloc_reg(&current,i,CCREG);
8072           dirty_reg(&current,CCREG);
8073           for(j=i-1;j>=0;j--)
8074           {
8075             if(ba[j]==start+i*4+4) {
8076               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8077               current.dirty=branch_regs[j].dirty;
8078               break;
8079             }
8080           }
8081           while(j>=0) {
8082             if(ba[j]==start+i*4+4) {
8083               for(hr=0;hr<HOST_REGS;hr++) {
8084                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8085                   current.regmap[hr]=-1;
8086                 }
8087                 current.dirty&=branch_regs[j].dirty;
8088               }
8089             }
8090             j--;
8091           }
8092         }
8093       }
8094     }
8095
8096     // Count cycles in between branches
8097     ccadj[i]=cc;
8098     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8099     {
8100       cc=0;
8101     }
8102 #if !defined(DRC_DBG)
8103     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8104     {
8105       // this should really be removed since the real stalls have been implemented,
8106       // but doing so causes sizeable perf regression against the older version
8107       u_int gtec = gte_cycletab[source[i] & 0x3f];
8108       cc += HACK_ENABLED(NDHACK_GTE_NO_STALL) ? gtec/2 : 2;
8109     }
8110     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8111     {
8112       cc+=4;
8113     }
8114     else if(itype[i]==C2LS)
8115     {
8116       // same as with C2OP
8117       cc += HACK_ENABLED(NDHACK_GTE_NO_STALL) ? 4 : 2;
8118     }
8119 #endif
8120     else
8121     {
8122       cc++;
8123     }
8124
8125     if(!is_ds[i]) {
8126       regs[i].dirty=current.dirty;
8127       regs[i].isconst=current.isconst;
8128       memcpy(constmap[i],current_constmap,sizeof(constmap[i]));
8129     }
8130     for(hr=0;hr<HOST_REGS;hr++) {
8131       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8132         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8133           regs[i].wasconst&=~(1<<hr);
8134         }
8135       }
8136     }
8137     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8138     regs[i].waswritten=current.waswritten;
8139   }
8140
8141   /* Pass 4 - Cull unused host registers */
8142
8143   uint64_t nr=0;
8144
8145   for (i=slen-1;i>=0;i--)
8146   {
8147     int hr;
8148     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
8149     {
8150       if(ba[i]<start || ba[i]>=(start+slen*4))
8151       {
8152         // Branch out of this block, don't need anything
8153         nr=0;
8154       }
8155       else
8156       {
8157         // Internal branch
8158         // Need whatever matches the target
8159         nr=0;
8160         int t=(ba[i]-start)>>2;
8161         for(hr=0;hr<HOST_REGS;hr++)
8162         {
8163           if(regs[i].regmap_entry[hr]>=0) {
8164             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8165           }
8166         }
8167       }
8168       // Conditional branch may need registers for following instructions
8169       if (!is_ujump(i))
8170       {
8171         if(i<slen-2) {
8172           nr|=needed_reg[i+2];
8173           for(hr=0;hr<HOST_REGS;hr++)
8174           {
8175             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8176             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8177           }
8178         }
8179       }
8180       // Don't need stuff which is overwritten
8181       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8182       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8183       // Merge in delay slot
8184       for(hr=0;hr<HOST_REGS;hr++)
8185       {
8186         if(!likely[i]) {
8187           // These are overwritten unless the branch is "likely"
8188           // and the delay slot is nullified if not taken
8189           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8190           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8191         }
8192         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8193         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8194         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8195         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8196         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8197           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8198           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8199         }
8200       }
8201     }
8202     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8203     {
8204       // SYSCALL instruction (software interrupt)
8205       nr=0;
8206     }
8207     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8208     {
8209       // ERET instruction (return from interrupt)
8210       nr=0;
8211     }
8212     else // Non-branch
8213     {
8214       if(i<slen-1) {
8215         for(hr=0;hr<HOST_REGS;hr++) {
8216           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
8217           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
8218           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8219           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8220         }
8221       }
8222     }
8223     for(hr=0;hr<HOST_REGS;hr++)
8224     {
8225       // Overwritten registers are not needed
8226       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8227       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8228       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8229       // Source registers are needed
8230       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
8231       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
8232       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8233       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8234       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
8235         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8236         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8237       }
8238       // Don't store a register immediately after writing it,
8239       // may prevent dual-issue.
8240       // But do so if this is a branch target, otherwise we
8241       // might have to load the register before the branch.
8242       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
8243         if((regmap_pre[i][hr]>0&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1))) {
8244           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8245           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8246         }
8247         if((regs[i].regmap_entry[hr]>0&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) {
8248           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8249           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8250         }
8251       }
8252     }
8253     // Cycle count is needed at branches.  Assume it is needed at the target too.
8254     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==SPAN) {
8255       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8256       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8257     }
8258     // Save it
8259     needed_reg[i]=nr;
8260
8261     // Deallocate unneeded registers
8262     for(hr=0;hr<HOST_REGS;hr++)
8263     {
8264       if(!((nr>>hr)&1)) {
8265         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
8266         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8267            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8268            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
8269         {
8270           if (!is_ujump(i))
8271           {
8272             if(likely[i]) {
8273               regs[i].regmap[hr]=-1;
8274               regs[i].isconst&=~(1<<hr);
8275               if(i<slen-2) {
8276                 regmap_pre[i+2][hr]=-1;
8277                 regs[i+2].wasconst&=~(1<<hr);
8278               }
8279             }
8280           }
8281         }
8282         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
8283         {
8284           int map=0,temp=0;
8285           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
8286              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8287             map=INVCP;
8288           }
8289           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
8290              itype[i+1]==C1LS || itype[i+1]==C2LS)
8291             temp=FTEMP;
8292           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8293              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8294              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
8295              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
8296              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
8297              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
8298              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
8299              regs[i].regmap[hr]!=map )
8300           {
8301             regs[i].regmap[hr]=-1;
8302             regs[i].isconst&=~(1<<hr);
8303             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
8304                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
8305                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
8306                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
8307                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
8308                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
8309                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
8310                branch_regs[i].regmap[hr]!=map)
8311             {
8312               branch_regs[i].regmap[hr]=-1;
8313               branch_regs[i].regmap_entry[hr]=-1;
8314               if (!is_ujump(i))
8315               {
8316                 if(!likely[i]&&i<slen-2) {
8317                   regmap_pre[i+2][hr]=-1;
8318                   regs[i+2].wasconst&=~(1<<hr);
8319                 }
8320               }
8321             }
8322           }
8323         }
8324         else
8325         {
8326           // Non-branch
8327           if(i>0)
8328           {
8329             int map=-1,temp=-1;
8330             if(itype[i]==STORE || itype[i]==STORELR ||
8331                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8332               map=INVCP;
8333             }
8334             if(itype[i]==LOADLR || itype[i]==STORELR ||
8335                itype[i]==C1LS || itype[i]==C2LS)
8336               temp=FTEMP;
8337             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8338                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
8339                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
8340                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
8341             {
8342               if(i<slen-1&&!is_ds[i]) {
8343                 assert(regs[i].regmap[hr]<64);
8344                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]>0)
8345                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
8346                 {
8347                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
8348                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
8349                 }
8350                 regmap_pre[i+1][hr]=-1;
8351                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
8352                 regs[i+1].wasconst&=~(1<<hr);
8353               }
8354               regs[i].regmap[hr]=-1;
8355               regs[i].isconst&=~(1<<hr);
8356             }
8357           }
8358         }
8359       } // if needed
8360     } // for hr
8361   }
8362
8363   /* Pass 5 - Pre-allocate registers */
8364
8365   // If a register is allocated during a loop, try to allocate it for the
8366   // entire loop, if possible.  This avoids loading/storing registers
8367   // inside of the loop.
8368
8369   signed char f_regmap[HOST_REGS];
8370   clear_all_regs(f_regmap);
8371   for(i=0;i<slen-1;i++)
8372   {
8373     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
8374     {
8375       if(ba[i]>=start && ba[i]<(start+i*4))
8376       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
8377       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
8378       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
8379       ||itype[i+1]==SHIFT||itype[i+1]==COP1
8380       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
8381       {
8382         int t=(ba[i]-start)>>2;
8383         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP)) // loop_preload can't handle jumps into delay slots
8384         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
8385         for(hr=0;hr<HOST_REGS;hr++)
8386         {
8387           if(regs[i].regmap[hr]>=0) {
8388             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8389               // dealloc old register
8390               int n;
8391               for(n=0;n<HOST_REGS;n++)
8392               {
8393                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8394               }
8395               // and alloc new one
8396               f_regmap[hr]=regs[i].regmap[hr];
8397             }
8398           }
8399           if(branch_regs[i].regmap[hr]>=0) {
8400             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
8401               // dealloc old register
8402               int n;
8403               for(n=0;n<HOST_REGS;n++)
8404               {
8405                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
8406               }
8407               // and alloc new one
8408               f_regmap[hr]=branch_regs[i].regmap[hr];
8409             }
8410           }
8411           if(ooo[i]) {
8412             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
8413               f_regmap[hr]=branch_regs[i].regmap[hr];
8414           }else{
8415             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
8416               f_regmap[hr]=branch_regs[i].regmap[hr];
8417           }
8418           // Avoid dirty->clean transition
8419           #ifdef DESTRUCTIVE_WRITEBACK
8420           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
8421           #endif
8422           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
8423           // case above, however it's always a good idea.  We can't hoist the
8424           // load if the register was already allocated, so there's no point
8425           // wasting time analyzing most of these cases.  It only "succeeds"
8426           // when the mapping was different and the load can be replaced with
8427           // a mov, which is of negligible benefit.  So such cases are
8428           // skipped below.
8429           if(f_regmap[hr]>0) {
8430             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
8431               int r=f_regmap[hr];
8432               for(j=t;j<=i;j++)
8433               {
8434                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
8435                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
8436                 assert(r < 64);
8437                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
8438                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
8439                   int k;
8440                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
8441                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
8442                     if(r>63) {
8443                       if(get_reg(regs[i].regmap,r&63)<0) break;
8444                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
8445                     }
8446                     k=i;
8447                     while(k>1&&regs[k-1].regmap[hr]==-1) {
8448                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
8449                         //printf("no free regs for store %x\n",start+(k-1)*4);
8450                         break;
8451                       }
8452                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
8453                         //printf("no-match due to different register\n");
8454                         break;
8455                       }
8456                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP) {
8457                         //printf("no-match due to branch\n");
8458                         break;
8459                       }
8460                       // call/ret fast path assumes no registers allocated
8461                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
8462                         break;
8463                       }
8464                       assert(r < 64);
8465                       k--;
8466                     }
8467                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
8468                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
8469                       while(k<i) {
8470                         regs[k].regmap_entry[hr]=f_regmap[hr];
8471                         regs[k].regmap[hr]=f_regmap[hr];
8472                         regmap_pre[k+1][hr]=f_regmap[hr];
8473                         regs[k].wasdirty&=~(1<<hr);
8474                         regs[k].dirty&=~(1<<hr);
8475                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
8476                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
8477                         regs[k].wasconst&=~(1<<hr);
8478                         regs[k].isconst&=~(1<<hr);
8479                         k++;
8480                       }
8481                     }
8482                     else {
8483                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
8484                       break;
8485                     }
8486                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
8487                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
8488                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
8489                       regs[i].regmap_entry[hr]=f_regmap[hr];
8490                       regs[i].regmap[hr]=f_regmap[hr];
8491                       regs[i].wasdirty&=~(1<<hr);
8492                       regs[i].dirty&=~(1<<hr);
8493                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
8494                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
8495                       regs[i].wasconst&=~(1<<hr);
8496                       regs[i].isconst&=~(1<<hr);
8497                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
8498                       branch_regs[i].wasdirty&=~(1<<hr);
8499                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
8500                       branch_regs[i].regmap[hr]=f_regmap[hr];
8501                       branch_regs[i].dirty&=~(1<<hr);
8502                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
8503                       branch_regs[i].wasconst&=~(1<<hr);
8504                       branch_regs[i].isconst&=~(1<<hr);
8505                       if (!is_ujump(i)) {
8506                         regmap_pre[i+2][hr]=f_regmap[hr];
8507                         regs[i+2].wasdirty&=~(1<<hr);
8508                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
8509                       }
8510                     }
8511                   }
8512                   for(k=t;k<j;k++) {
8513                     // Alloc register clean at beginning of loop,
8514                     // but may dirty it in pass 6
8515                     regs[k].regmap_entry[hr]=f_regmap[hr];
8516                     regs[k].regmap[hr]=f_regmap[hr];
8517                     regs[k].dirty&=~(1<<hr);
8518                     regs[k].wasconst&=~(1<<hr);
8519                     regs[k].isconst&=~(1<<hr);
8520                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP) {
8521                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
8522                       branch_regs[k].regmap[hr]=f_regmap[hr];
8523                       branch_regs[k].dirty&=~(1<<hr);
8524                       branch_regs[k].wasconst&=~(1<<hr);
8525                       branch_regs[k].isconst&=~(1<<hr);
8526                       if (!is_ujump(k)) {
8527                         regmap_pre[k+2][hr]=f_regmap[hr];
8528                         regs[k+2].wasdirty&=~(1<<hr);
8529                       }
8530                     }
8531                     else
8532                     {
8533                       regmap_pre[k+1][hr]=f_regmap[hr];
8534                       regs[k+1].wasdirty&=~(1<<hr);
8535                     }
8536                   }
8537                   if(regs[j].regmap[hr]==f_regmap[hr])
8538                     regs[j].regmap_entry[hr]=f_regmap[hr];
8539                   break;
8540                 }
8541                 if(j==i) break;
8542                 if(regs[j].regmap[hr]>=0)
8543                   break;
8544                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
8545                   //printf("no-match due to different register\n");
8546                   break;
8547                 }
8548                 if (is_ujump(j))
8549                 {
8550                   // Stop on unconditional branch
8551                   break;
8552                 }
8553                 if(itype[j]==CJUMP||itype[j]==SJUMP)
8554                 {
8555                   if(ooo[j]) {
8556                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
8557                       break;
8558                   }else{
8559                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
8560                       break;
8561                   }
8562                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
8563                     //printf("no-match due to different register (branch)\n");
8564                     break;
8565                   }
8566                 }
8567                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
8568                   //printf("No free regs for store %x\n",start+j*4);
8569                   break;
8570                 }
8571                 assert(f_regmap[hr]<64);
8572               }
8573             }
8574           }
8575         }
8576       }
8577     }else{
8578       // Non branch or undetermined branch target
8579       for(hr=0;hr<HOST_REGS;hr++)
8580       {
8581         if(hr!=EXCLUDE_REG) {
8582           if(regs[i].regmap[hr]>=0) {
8583             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8584               // dealloc old register
8585               int n;
8586               for(n=0;n<HOST_REGS;n++)
8587               {
8588                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8589               }
8590               // and alloc new one
8591               f_regmap[hr]=regs[i].regmap[hr];
8592             }
8593           }
8594         }
8595       }
8596       // Try to restore cycle count at branch targets
8597       if(bt[i]) {
8598         for(j=i;j<slen-1;j++) {
8599           if(regs[j].regmap[HOST_CCREG]!=-1) break;
8600           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
8601             //printf("no free regs for store %x\n",start+j*4);
8602             break;
8603           }
8604         }
8605         if(regs[j].regmap[HOST_CCREG]==CCREG) {
8606           int k=i;
8607           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
8608           while(k<j) {
8609             regs[k].regmap_entry[HOST_CCREG]=CCREG;
8610             regs[k].regmap[HOST_CCREG]=CCREG;
8611             regmap_pre[k+1][HOST_CCREG]=CCREG;
8612             regs[k+1].wasdirty|=1<<HOST_CCREG;
8613             regs[k].dirty|=1<<HOST_CCREG;
8614             regs[k].wasconst&=~(1<<HOST_CCREG);
8615             regs[k].isconst&=~(1<<HOST_CCREG);
8616             k++;
8617           }
8618           regs[j].regmap_entry[HOST_CCREG]=CCREG;
8619         }
8620         // Work backwards from the branch target
8621         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
8622         {
8623           //printf("Extend backwards\n");
8624           int k;
8625           k=i;
8626           while(regs[k-1].regmap[HOST_CCREG]==-1) {
8627             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
8628               //printf("no free regs for store %x\n",start+(k-1)*4);
8629               break;
8630             }
8631             k--;
8632           }
8633           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
8634             //printf("Extend CC, %x ->\n",start+k*4);
8635             while(k<=i) {
8636               regs[k].regmap_entry[HOST_CCREG]=CCREG;
8637               regs[k].regmap[HOST_CCREG]=CCREG;
8638               regmap_pre[k+1][HOST_CCREG]=CCREG;
8639               regs[k+1].wasdirty|=1<<HOST_CCREG;
8640               regs[k].dirty|=1<<HOST_CCREG;
8641               regs[k].wasconst&=~(1<<HOST_CCREG);
8642               regs[k].isconst&=~(1<<HOST_CCREG);
8643               k++;
8644             }
8645           }
8646           else {
8647             //printf("Fail Extend CC, %x ->\n",start+k*4);
8648           }
8649         }
8650       }
8651       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
8652          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
8653          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1)
8654       {
8655         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
8656       }
8657     }
8658   }
8659
8660   // This allocates registers (if possible) one instruction prior
8661   // to use, which can avoid a load-use penalty on certain CPUs.
8662   for(i=0;i<slen-1;i++)
8663   {
8664     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP))
8665     {
8666       if(!bt[i+1])
8667       {
8668         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
8669            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
8670         {
8671           if(rs1[i+1]) {
8672             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
8673             {
8674               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8675               {
8676                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
8677                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
8678                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
8679                 regs[i].isconst&=~(1<<hr);
8680                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8681                 constmap[i][hr]=constmap[i+1][hr];
8682                 regs[i+1].wasdirty&=~(1<<hr);
8683                 regs[i].dirty&=~(1<<hr);
8684               }
8685             }
8686           }
8687           if(rs2[i+1]) {
8688             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
8689             {
8690               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8691               {
8692                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
8693                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
8694                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
8695                 regs[i].isconst&=~(1<<hr);
8696                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8697                 constmap[i][hr]=constmap[i+1][hr];
8698                 regs[i+1].wasdirty&=~(1<<hr);
8699                 regs[i].dirty&=~(1<<hr);
8700               }
8701             }
8702           }
8703           // Preload target address for load instruction (non-constant)
8704           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8705             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
8706             {
8707               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8708               {
8709                 regs[i].regmap[hr]=rs1[i+1];
8710                 regmap_pre[i+1][hr]=rs1[i+1];
8711                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8712                 regs[i].isconst&=~(1<<hr);
8713                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8714                 constmap[i][hr]=constmap[i+1][hr];
8715                 regs[i+1].wasdirty&=~(1<<hr);
8716                 regs[i].dirty&=~(1<<hr);
8717               }
8718             }
8719           }
8720           // Load source into target register
8721           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8722             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
8723             {
8724               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8725               {
8726                 regs[i].regmap[hr]=rs1[i+1];
8727                 regmap_pre[i+1][hr]=rs1[i+1];
8728                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8729                 regs[i].isconst&=~(1<<hr);
8730                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8731                 constmap[i][hr]=constmap[i+1][hr];
8732                 regs[i+1].wasdirty&=~(1<<hr);
8733                 regs[i].dirty&=~(1<<hr);
8734               }
8735             }
8736           }
8737           // Address for store instruction (non-constant)
8738           if(itype[i+1]==STORE||itype[i+1]==STORELR
8739              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
8740             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8741               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
8742               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
8743               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
8744               assert(hr>=0);
8745               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8746               {
8747                 regs[i].regmap[hr]=rs1[i+1];
8748                 regmap_pre[i+1][hr]=rs1[i+1];
8749                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8750                 regs[i].isconst&=~(1<<hr);
8751                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8752                 constmap[i][hr]=constmap[i+1][hr];
8753                 regs[i+1].wasdirty&=~(1<<hr);
8754                 regs[i].dirty&=~(1<<hr);
8755               }
8756             }
8757           }
8758           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
8759             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8760               int nr;
8761               hr=get_reg(regs[i+1].regmap,FTEMP);
8762               assert(hr>=0);
8763               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8764               {
8765                 regs[i].regmap[hr]=rs1[i+1];
8766                 regmap_pre[i+1][hr]=rs1[i+1];
8767                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8768                 regs[i].isconst&=~(1<<hr);
8769                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8770                 constmap[i][hr]=constmap[i+1][hr];
8771                 regs[i+1].wasdirty&=~(1<<hr);
8772                 regs[i].dirty&=~(1<<hr);
8773               }
8774               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
8775               {
8776                 // move it to another register
8777                 regs[i+1].regmap[hr]=-1;
8778                 regmap_pre[i+2][hr]=-1;
8779                 regs[i+1].regmap[nr]=FTEMP;
8780                 regmap_pre[i+2][nr]=FTEMP;
8781                 regs[i].regmap[nr]=rs1[i+1];
8782                 regmap_pre[i+1][nr]=rs1[i+1];
8783                 regs[i+1].regmap_entry[nr]=rs1[i+1];
8784                 regs[i].isconst&=~(1<<nr);
8785                 regs[i+1].isconst&=~(1<<nr);
8786                 regs[i].dirty&=~(1<<nr);
8787                 regs[i+1].wasdirty&=~(1<<nr);
8788                 regs[i+1].dirty&=~(1<<nr);
8789                 regs[i+2].wasdirty&=~(1<<nr);
8790               }
8791             }
8792           }
8793           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
8794             if(itype[i+1]==LOAD)
8795               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
8796             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
8797               hr=get_reg(regs[i+1].regmap,FTEMP);
8798             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
8799               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
8800               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
8801             }
8802             if(hr>=0&&regs[i].regmap[hr]<0) {
8803               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
8804               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
8805                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
8806                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
8807                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
8808                 regs[i].isconst&=~(1<<hr);
8809                 regs[i+1].wasdirty&=~(1<<hr);
8810                 regs[i].dirty&=~(1<<hr);
8811               }
8812             }
8813           }
8814         }
8815       }
8816     }
8817   }
8818
8819   /* Pass 6 - Optimize clean/dirty state */
8820   clean_registers(0,slen-1,1);
8821
8822   /* Pass 7 - Identify 32-bit registers */
8823   for (i=slen-1;i>=0;i--)
8824   {
8825     if(itype[i]==CJUMP||itype[i]==SJUMP)
8826     {
8827       // Conditional branch
8828       if((source[i]>>16)!=0x1000&&i<slen-2) {
8829         // Mark this address as a branch target since it may be called
8830         // upon return from interrupt
8831         bt[i+2]=1;
8832       }
8833     }
8834   }
8835
8836   if(itype[slen-1]==SPAN) {
8837     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
8838   }
8839
8840 #ifdef DISASM
8841   /* Debug/disassembly */
8842   for(i=0;i<slen;i++)
8843   {
8844     printf("U:");
8845     int r;
8846     for(r=1;r<=CCREG;r++) {
8847       if((unneeded_reg[i]>>r)&1) {
8848         if(r==HIREG) printf(" HI");
8849         else if(r==LOREG) printf(" LO");
8850         else printf(" r%d",r);
8851       }
8852     }
8853     printf("\n");
8854     #if defined(__i386__) || defined(__x86_64__)
8855     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
8856     #endif
8857     #ifdef __arm__
8858     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
8859     #endif
8860     #if defined(__i386__) || defined(__x86_64__)
8861     printf("needs: ");
8862     if(needed_reg[i]&1) printf("eax ");
8863     if((needed_reg[i]>>1)&1) printf("ecx ");
8864     if((needed_reg[i]>>2)&1) printf("edx ");
8865     if((needed_reg[i]>>3)&1) printf("ebx ");
8866     if((needed_reg[i]>>5)&1) printf("ebp ");
8867     if((needed_reg[i]>>6)&1) printf("esi ");
8868     if((needed_reg[i]>>7)&1) printf("edi ");
8869     printf("\n");
8870     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
8871     printf("dirty: ");
8872     if(regs[i].wasdirty&1) printf("eax ");
8873     if((regs[i].wasdirty>>1)&1) printf("ecx ");
8874     if((regs[i].wasdirty>>2)&1) printf("edx ");
8875     if((regs[i].wasdirty>>3)&1) printf("ebx ");
8876     if((regs[i].wasdirty>>5)&1) printf("ebp ");
8877     if((regs[i].wasdirty>>6)&1) printf("esi ");
8878     if((regs[i].wasdirty>>7)&1) printf("edi ");
8879     #endif
8880     #ifdef __arm__
8881     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
8882     printf("dirty: ");
8883     if(regs[i].wasdirty&1) printf("r0 ");
8884     if((regs[i].wasdirty>>1)&1) printf("r1 ");
8885     if((regs[i].wasdirty>>2)&1) printf("r2 ");
8886     if((regs[i].wasdirty>>3)&1) printf("r3 ");
8887     if((regs[i].wasdirty>>4)&1) printf("r4 ");
8888     if((regs[i].wasdirty>>5)&1) printf("r5 ");
8889     if((regs[i].wasdirty>>6)&1) printf("r6 ");
8890     if((regs[i].wasdirty>>7)&1) printf("r7 ");
8891     if((regs[i].wasdirty>>8)&1) printf("r8 ");
8892     if((regs[i].wasdirty>>9)&1) printf("r9 ");
8893     if((regs[i].wasdirty>>10)&1) printf("r10 ");
8894     if((regs[i].wasdirty>>12)&1) printf("r12 ");
8895     #endif
8896     printf("\n");
8897     disassemble_inst(i);
8898     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
8899     #if defined(__i386__) || defined(__x86_64__)
8900     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
8901     if(regs[i].dirty&1) printf("eax ");
8902     if((regs[i].dirty>>1)&1) printf("ecx ");
8903     if((regs[i].dirty>>2)&1) printf("edx ");
8904     if((regs[i].dirty>>3)&1) printf("ebx ");
8905     if((regs[i].dirty>>5)&1) printf("ebp ");
8906     if((regs[i].dirty>>6)&1) printf("esi ");
8907     if((regs[i].dirty>>7)&1) printf("edi ");
8908     #endif
8909     #ifdef __arm__
8910     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
8911     if(regs[i].dirty&1) printf("r0 ");
8912     if((regs[i].dirty>>1)&1) printf("r1 ");
8913     if((regs[i].dirty>>2)&1) printf("r2 ");
8914     if((regs[i].dirty>>3)&1) printf("r3 ");
8915     if((regs[i].dirty>>4)&1) printf("r4 ");
8916     if((regs[i].dirty>>5)&1) printf("r5 ");
8917     if((regs[i].dirty>>6)&1) printf("r6 ");
8918     if((regs[i].dirty>>7)&1) printf("r7 ");
8919     if((regs[i].dirty>>8)&1) printf("r8 ");
8920     if((regs[i].dirty>>9)&1) printf("r9 ");
8921     if((regs[i].dirty>>10)&1) printf("r10 ");
8922     if((regs[i].dirty>>12)&1) printf("r12 ");
8923     #endif
8924     printf("\n");
8925     if(regs[i].isconst) {
8926       printf("constants: ");
8927       #if defined(__i386__) || defined(__x86_64__)
8928       if(regs[i].isconst&1) printf("eax=%x ",(u_int)constmap[i][0]);
8929       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(u_int)constmap[i][1]);
8930       if((regs[i].isconst>>2)&1) printf("edx=%x ",(u_int)constmap[i][2]);
8931       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(u_int)constmap[i][3]);
8932       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(u_int)constmap[i][5]);
8933       if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]);
8934       if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]);
8935       #endif
8936       #if defined(__arm__) || defined(__aarch64__)
8937       int r;
8938       for (r = 0; r < ARRAY_SIZE(constmap[i]); r++)
8939         if ((regs[i].isconst >> r) & 1)
8940           printf(" r%d=%x", r, (u_int)constmap[i][r]);
8941       #endif
8942       printf("\n");
8943     }
8944     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
8945       #if defined(__i386__) || defined(__x86_64__)
8946       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
8947       if(branch_regs[i].dirty&1) printf("eax ");
8948       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
8949       if((branch_regs[i].dirty>>2)&1) printf("edx ");
8950       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
8951       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
8952       if((branch_regs[i].dirty>>6)&1) printf("esi ");
8953       if((branch_regs[i].dirty>>7)&1) printf("edi ");
8954       #endif
8955       #ifdef __arm__
8956       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
8957       if(branch_regs[i].dirty&1) printf("r0 ");
8958       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
8959       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
8960       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
8961       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
8962       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
8963       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
8964       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
8965       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
8966       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
8967       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
8968       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
8969       #endif
8970     }
8971   }
8972 #endif // DISASM
8973
8974   /* Pass 8 - Assembly */
8975   linkcount=0;stubcount=0;
8976   ds=0;is_delayslot=0;
8977   u_int dirty_pre=0;
8978   void *beginning=start_block();
8979   if((u_int)addr&1) {
8980     ds=1;
8981     pagespan_ds();
8982   }
8983   void *instr_addr0_override = NULL;
8984
8985   if (start == 0x80030000) {
8986     // nasty hack for the fastbios thing
8987     // override block entry to this code
8988     instr_addr0_override = out;
8989     emit_movimm(start,0);
8990     // abuse io address var as a flag that we
8991     // have already returned here once
8992     emit_readword(&address,1);
8993     emit_writeword(0,&pcaddr);
8994     emit_writeword(0,&address);
8995     emit_cmp(0,1);
8996     #ifdef __aarch64__
8997     emit_jeq(out + 4*2);
8998     emit_far_jump(new_dyna_leave);
8999     #else
9000     emit_jne(new_dyna_leave);
9001     #endif
9002   }
9003   for(i=0;i<slen;i++)
9004   {
9005     //if(ds) printf("ds: ");
9006     disassemble_inst(i);
9007     if(ds) {
9008       ds=0; // Skip delay slot
9009       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
9010       instr_addr[i] = NULL;
9011     } else {
9012       speculate_register_values(i);
9013       #ifndef DESTRUCTIVE_WRITEBACK
9014       if (i < 2 || !is_ujump(i-2))
9015       {
9016         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,unneeded_reg[i]);
9017       }
9018       if((itype[i]==CJUMP||itype[i]==SJUMP)&&!likely[i]) {
9019         dirty_pre=branch_regs[i].dirty;
9020       }else{
9021         dirty_pre=regs[i].dirty;
9022       }
9023       #endif
9024       // write back
9025       if (i < 2 || !is_ujump(i-2))
9026       {
9027         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,unneeded_reg[i]);
9028         loop_preload(regmap_pre[i],regs[i].regmap_entry);
9029       }
9030       // branch target entry point
9031       instr_addr[i] = out;
9032       assem_debug("<->\n");
9033       drc_dbg_emit_do_cmp(i);
9034
9035       // load regs
9036       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
9037         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty);
9038       load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i],rs2[i]);
9039       address_generation(i,&regs[i],regs[i].regmap_entry);
9040       load_consts(regmap_pre[i],regs[i].regmap,i);
9041       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
9042       {
9043         // Load the delay slot registers if necessary
9044         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
9045           load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1]);
9046         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
9047           load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1]);
9048         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
9049           load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
9050       }
9051       else if(i+1<slen)
9052       {
9053         // Preload registers for following instruction
9054         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
9055           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
9056             load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1]);
9057         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
9058           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
9059             load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1]);
9060       }
9061       // TODO: if(is_ooo(i)) address_generation(i+1);
9062       if(itype[i]==CJUMP)
9063         load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
9064       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
9065         load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
9066       // assemble
9067       switch(itype[i]) {
9068         case ALU:
9069           alu_assemble(i,&regs[i]);break;
9070         case IMM16:
9071           imm16_assemble(i,&regs[i]);break;
9072         case SHIFT:
9073           shift_assemble(i,&regs[i]);break;
9074         case SHIFTIMM:
9075           shiftimm_assemble(i,&regs[i]);break;
9076         case LOAD:
9077           load_assemble(i,&regs[i]);break;
9078         case LOADLR:
9079           loadlr_assemble(i,&regs[i]);break;
9080         case STORE:
9081           store_assemble(i,&regs[i]);break;
9082         case STORELR:
9083           storelr_assemble(i,&regs[i]);break;
9084         case COP0:
9085           cop0_assemble(i,&regs[i]);break;
9086         case COP1:
9087           cop1_assemble(i,&regs[i]);break;
9088         case C1LS:
9089           c1ls_assemble(i,&regs[i]);break;
9090         case COP2:
9091           cop2_assemble(i,&regs[i]);break;
9092         case C2LS:
9093           c2ls_assemble(i,&regs[i]);break;
9094         case C2OP:
9095           c2op_assemble(i,&regs[i]);break;
9096         case MULTDIV:
9097           multdiv_assemble(i,&regs[i]);break;
9098         case MOV:
9099           mov_assemble(i,&regs[i]);break;
9100         case SYSCALL:
9101           syscall_assemble(i,&regs[i]);break;
9102         case HLECALL:
9103           hlecall_assemble(i,&regs[i]);break;
9104         case INTCALL:
9105           intcall_assemble(i,&regs[i]);break;
9106         case UJUMP:
9107           ujump_assemble(i,&regs[i]);ds=1;break;
9108         case RJUMP:
9109           rjump_assemble(i,&regs[i]);ds=1;break;
9110         case CJUMP:
9111           cjump_assemble(i,&regs[i]);ds=1;break;
9112         case SJUMP:
9113           sjump_assemble(i,&regs[i]);ds=1;break;
9114         case SPAN:
9115           pagespan_assemble(i,&regs[i]);break;
9116       }
9117       if (is_ujump(i))
9118         literal_pool(1024);
9119       else
9120         literal_pool_jumpover(256);
9121     }
9122   }
9123   //assert(is_ujump(i-2));
9124   // If the block did not end with an unconditional branch,
9125   // add a jump to the next instruction.
9126   if(i>1) {
9127     if(!is_ujump(i-2)&&itype[i-1]!=SPAN) {
9128       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP);
9129       assert(i==slen);
9130       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP) {
9131         store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
9132         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
9133           emit_loadreg(CCREG,HOST_CCREG);
9134         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
9135       }
9136       else if(!likely[i-2])
9137       {
9138         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].dirty,start+i*4);
9139         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
9140       }
9141       else
9142       {
9143         store_regs_bt(regs[i-2].regmap,regs[i-2].dirty,start+i*4);
9144         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
9145       }
9146       add_to_linker(out,start+i*4,0);
9147       emit_jmp(0);
9148     }
9149   }
9150   else
9151   {
9152     assert(i>0);
9153     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP);
9154     store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
9155     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
9156       emit_loadreg(CCREG,HOST_CCREG);
9157     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
9158     add_to_linker(out,start+i*4,0);
9159     emit_jmp(0);
9160   }
9161
9162   // TODO: delay slot stubs?
9163   // Stubs
9164   for(i=0;i<stubcount;i++)
9165   {
9166     switch(stubs[i].type)
9167     {
9168       case LOADB_STUB:
9169       case LOADH_STUB:
9170       case LOADW_STUB:
9171       case LOADD_STUB:
9172       case LOADBU_STUB:
9173       case LOADHU_STUB:
9174         do_readstub(i);break;
9175       case STOREB_STUB:
9176       case STOREH_STUB:
9177       case STOREW_STUB:
9178       case STORED_STUB:
9179         do_writestub(i);break;
9180       case CC_STUB:
9181         do_ccstub(i);break;
9182       case INVCODE_STUB:
9183         do_invstub(i);break;
9184       case FP_STUB:
9185         do_cop1stub(i);break;
9186       case STORELR_STUB:
9187         do_unalignedwritestub(i);break;
9188     }
9189   }
9190
9191   if (instr_addr0_override)
9192     instr_addr[0] = instr_addr0_override;
9193
9194   /* Pass 9 - Linker */
9195   for(i=0;i<linkcount;i++)
9196   {
9197     assem_debug("%p -> %8x\n",link_addr[i].addr,link_addr[i].target);
9198     literal_pool(64);
9199     if (!link_addr[i].ext)
9200     {
9201       void *stub = out;
9202       void *addr = check_addr(link_addr[i].target);
9203       emit_extjump(link_addr[i].addr, link_addr[i].target);
9204       if (addr) {
9205         set_jump_target(link_addr[i].addr, addr);
9206         add_link(link_addr[i].target,stub);
9207       }
9208       else
9209         set_jump_target(link_addr[i].addr, stub);
9210     }
9211     else
9212     {
9213       // Internal branch
9214       int target=(link_addr[i].target-start)>>2;
9215       assert(target>=0&&target<slen);
9216       assert(instr_addr[target]);
9217       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9218       //set_jump_target_fillslot(link_addr[i].addr,instr_addr[target],link_addr[i].ext>>1);
9219       //#else
9220       set_jump_target(link_addr[i].addr, instr_addr[target]);
9221       //#endif
9222     }
9223   }
9224   // External Branch Targets (jump_in)
9225   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
9226   for(i=0;i<slen;i++)
9227   {
9228     if(bt[i]||i==0)
9229     {
9230       if(instr_addr[i]) // TODO - delay slots (=null)
9231       {
9232         u_int vaddr=start+i*4;
9233         u_int page=get_page(vaddr);
9234         u_int vpage=get_vpage(vaddr);
9235         literal_pool(256);
9236         {
9237           assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
9238           assem_debug("jump_in: %x\n",start+i*4);
9239           ll_add(jump_dirty+vpage,vaddr,out);
9240           void *entry_point = do_dirty_stub(i);
9241           ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
9242           // If there was an existing entry in the hash table,
9243           // replace it with the new address.
9244           // Don't add new entries.  We'll insert the
9245           // ones that actually get used in check_addr().
9246           struct ht_entry *ht_bin = hash_table_get(vaddr);
9247           if (ht_bin->vaddr[0] == vaddr)
9248             ht_bin->tcaddr[0] = entry_point;
9249           if (ht_bin->vaddr[1] == vaddr)
9250             ht_bin->tcaddr[1] = entry_point;
9251         }
9252       }
9253     }
9254   }
9255   // Write out the literal pool if necessary
9256   literal_pool(0);
9257   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9258   // Align code
9259   if(((u_int)out)&7) emit_addnop(13);
9260   #endif
9261   assert(out - (u_char *)beginning < MAX_OUTPUT_BLOCK_SIZE);
9262   //printf("shadow buffer: %p-%p\n",copy,(u_char *)copy+slen*4);
9263   memcpy(copy,source,slen*4);
9264   copy+=slen*4;
9265
9266   end_block(beginning);
9267
9268   // If we're within 256K of the end of the buffer,
9269   // start over from the beginning. (Is 256K enough?)
9270   if (out > ndrc->translation_cache + sizeof(ndrc->translation_cache) - MAX_OUTPUT_BLOCK_SIZE)
9271     out = ndrc->translation_cache;
9272
9273   // Trap writes to any of the pages we compiled
9274   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
9275     invalid_code[i]=0;
9276   }
9277   inv_code_start=inv_code_end=~0;
9278
9279   // for PCSX we need to mark all mirrors too
9280   if(get_page(start)<(RAM_SIZE>>12))
9281     for(i=start>>12;i<=(start+slen*4)>>12;i++)
9282       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
9283       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
9284       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
9285
9286   /* Pass 10 - Free memory by expiring oldest blocks */
9287
9288   int end=(((out-ndrc->translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535;
9289   while(expirep!=end)
9290   {
9291     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
9292     uintptr_t base=(uintptr_t)ndrc->translation_cache+((expirep>>13)<<shift); // Base address of this block
9293     inv_debug("EXP: Phase %d\n",expirep);
9294     switch((expirep>>11)&3)
9295     {
9296       case 0:
9297         // Clear jump_in and jump_dirty
9298         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
9299         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
9300         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
9301         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
9302         break;
9303       case 1:
9304         // Clear pointers
9305         ll_kill_pointers(jump_out[expirep&2047],base,shift);
9306         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
9307         break;
9308       case 2:
9309         // Clear hash table
9310         for(i=0;i<32;i++) {
9311           struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
9312           if (((uintptr_t)ht_bin->tcaddr[1]>>shift) == (base>>shift) ||
9313              (((uintptr_t)ht_bin->tcaddr[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
9314             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
9315             ht_bin->vaddr[1] = -1;
9316             ht_bin->tcaddr[1] = NULL;
9317           }
9318           if (((uintptr_t)ht_bin->tcaddr[0]>>shift) == (base>>shift) ||
9319              (((uintptr_t)ht_bin->tcaddr[0]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
9320             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
9321             ht_bin->vaddr[0] = ht_bin->vaddr[1];
9322             ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
9323             ht_bin->vaddr[1] = -1;
9324             ht_bin->tcaddr[1] = NULL;
9325           }
9326         }
9327         break;
9328       case 3:
9329         // Clear jump_out
9330         if((expirep&2047)==0)
9331           do_clear_cache();
9332         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
9333         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
9334         break;
9335     }
9336     expirep=(expirep+1)&65535;
9337   }
9338   return 0;
9339 }
9340
9341 // vim:shiftwidth=2:expandtab