drc: use helpers for jump checks
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h"
39 #include "../psxinterpreter.h"
40 #include "emu_if.h" //emulator interface
41
42 #define noinline __attribute__((noinline,noclone))
43 #ifndef ARRAY_SIZE
44 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
45 #endif
46
47 //#define DISASM
48 //#define assem_debug printf
49 //#define inv_debug printf
50 #define assem_debug(...)
51 #define inv_debug(...)
52
53 #ifdef __i386__
54 #include "assem_x86.h"
55 #endif
56 #ifdef __x86_64__
57 #include "assem_x64.h"
58 #endif
59 #ifdef __arm__
60 #include "assem_arm.h"
61 #endif
62 #ifdef __aarch64__
63 #include "assem_arm64.h"
64 #endif
65
66 #define MAXBLOCK 4096
67 #define MAX_OUTPUT_BLOCK_SIZE 262144
68
69 struct ndrc_mem
70 {
71   u_char translation_cache[1 << TARGET_SIZE_2];
72   struct
73   {
74     struct tramp_insns ops[2048 / sizeof(struct tramp_insns)];
75     const void *f[2048 / sizeof(void *)];
76   } tramp;
77 };
78
79 #ifdef BASE_ADDR_DYNAMIC
80 static struct ndrc_mem *ndrc;
81 #else
82 static struct ndrc_mem ndrc_ __attribute__((aligned(4096)));
83 static struct ndrc_mem *ndrc = &ndrc_;
84 #endif
85
86 // stubs
87 enum stub_type {
88   CC_STUB = 1,
89   FP_STUB = 2,
90   LOADB_STUB = 3,
91   LOADH_STUB = 4,
92   LOADW_STUB = 5,
93   LOADD_STUB = 6,
94   LOADBU_STUB = 7,
95   LOADHU_STUB = 8,
96   STOREB_STUB = 9,
97   STOREH_STUB = 10,
98   STOREW_STUB = 11,
99   STORED_STUB = 12,
100   STORELR_STUB = 13,
101   INVCODE_STUB = 14,
102 };
103
104 struct regstat
105 {
106   signed char regmap_entry[HOST_REGS];
107   signed char regmap[HOST_REGS];
108   uint64_t wasdirty;
109   uint64_t dirty;
110   uint64_t u;
111   u_int wasconst;
112   u_int isconst;
113   u_int loadedconst;             // host regs that have constants loaded
114   u_int waswritten;              // MIPS regs that were used as store base before
115 };
116
// note: asm depends on this layout
struct ll_entry
{
  u_int vaddr;           // guest (MIPS) start address of the block
  u_int reg_sv_flags;    // set by ll_add_flags(); 0 for plain ll_add()
  void *addr;            // host address of the translated code
  struct ll_entry *next; // singly-linked list, newest entry first
};
125
// 2-way hash table bin; slot 0 is the most recently promoted entry
// (see hash_table_add(), which demotes slot 0 into slot 1)
struct ht_entry
{
  u_int vaddr[2];
  void *tcaddr[2];
};
131
// deferred code fragment emitted after the main block body
// (queued via add_stub()/add_stub_r(), see the stubs[] array)
struct code_stub
{
  enum stub_type type;
  void *addr;     // stub location in the translation cache
  void *retaddr;  // where execution resumes after the stub
  u_int a;        // a..e: stub-type-specific arguments
  uintptr_t b;
  uintptr_t c;
  u_int d;
  u_int e;
};
143
// one entry of link_addr[], filled by add_to_linker()
// NOTE(review): appears to record a branch site to patch once the target
// block address is known — confirm against the linker/patch code
struct link_entry
{
  void *addr;    // host location of the branch to patch
  u_int target;  // guest address the branch goes to
  u_int ext;
};
150
151   // used by asm:
152   u_char *out;
153   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
154   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
155   struct ll_entry *jump_dirty[4096];
156
157   static struct ll_entry *jump_out[4096];
158   static u_int start;
159   static u_int *source;
160   static char insn[MAXBLOCK][10];
161   static u_char itype[MAXBLOCK];
162   static u_char opcode[MAXBLOCK];
163   static u_char opcode2[MAXBLOCK];
164   static u_char bt[MAXBLOCK];
165   static u_char rs1[MAXBLOCK];
166   static u_char rs2[MAXBLOCK];
167   static u_char rt1[MAXBLOCK];
168   static u_char rt2[MAXBLOCK];
169   static u_char dep1[MAXBLOCK];
170   static u_char dep2[MAXBLOCK];
171   static u_char lt1[MAXBLOCK];
172   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
173   static uint64_t gte_rt[MAXBLOCK];
174   static uint64_t gte_unneeded[MAXBLOCK];
175   static u_int smrv[32]; // speculated MIPS register values
176   static u_int smrv_strong; // mask or regs that are likely to have correct values
177   static u_int smrv_weak; // same, but somewhat less likely
178   static u_int smrv_strong_next; // same, but after current insn executes
179   static u_int smrv_weak_next;
180   static int imm[MAXBLOCK];
181   static u_int ba[MAXBLOCK];
182   static char likely[MAXBLOCK];
183   static char is_ds[MAXBLOCK];
184   static char ooo[MAXBLOCK];
185   static uint64_t unneeded_reg[MAXBLOCK];
186   static uint64_t branch_unneeded_reg[MAXBLOCK];
187   static signed char regmap_pre[MAXBLOCK][HOST_REGS]; // pre-instruction i?
188   // contains 'real' consts at [i] insn, but may differ from what's actually
189   // loaded in host reg as 'final' value is always loaded, see get_final_value()
190   static uint32_t current_constmap[HOST_REGS];
191   static uint32_t constmap[MAXBLOCK][HOST_REGS];
192   static struct regstat regs[MAXBLOCK];
193   static struct regstat branch_regs[MAXBLOCK];
194   static signed char minimum_free_regs[MAXBLOCK];
195   static u_int needed_reg[MAXBLOCK];
196   static u_int wont_dirty[MAXBLOCK];
197   static u_int will_dirty[MAXBLOCK];
198   static int ccadj[MAXBLOCK];
199   static int slen;
200   static void *instr_addr[MAXBLOCK];
201   static struct link_entry link_addr[MAXBLOCK];
202   static int linkcount;
203   static struct code_stub stubs[MAXBLOCK*3];
204   static int stubcount;
205   static u_int literals[1024][2];
206   static int literalcount;
207   static int is_delayslot;
208   static char shadow[1048576]  __attribute__((aligned(16)));
209   static void *copy;
210   static int expirep;
211   static u_int stop_after_jal;
212 #ifndef RAM_FIXED
213   static uintptr_t ram_offset;
214 #else
215   static const uintptr_t ram_offset=0;
216 #endif
217
218   int new_dynarec_hacks;
219   int new_dynarec_hacks_pergame;
220   int new_dynarec_did_compile;
221
222   #define HACK_ENABLED(x) ((new_dynarec_hacks | new_dynarec_hacks_pergame) & (x))
223
224   extern int cycle_count; // ... until end of the timeslice, counts -N -> 0
225   extern int last_count;  // last absolute target, often = next_interupt
226   extern int pcaddr;
227   extern int pending_exception;
228   extern int branch_target;
229   extern uintptr_t mini_ht[32][2];
230   extern u_char restore_candidate[512];
231
232   /* registers that may be allocated */
233   /* 1-31 gpr */
234 #define LOREG 32 // lo
235 #define HIREG 33 // hi
236 //#define FSREG 34 // FPU status (FCSR)
237 #define CSREG 35 // Coprocessor status
238 #define CCREG 36 // Cycle count
239 #define INVCP 37 // Pointer to invalid_code
240 //#define MMREG 38 // Pointer to memory_map
241 //#define ROREG 39 // ram offset (if rdram!=0x80000000)
242 #define TEMPREG 40
243 #define FTEMP 40 // FPU temporary register
244 #define PTEMP 41 // Prefetch temporary register
245 //#define TLREG 42 // TLB mapping offset
246 #define RHASH 43 // Return address hash
247 #define RHTBL 44 // Return address hash table address
248 #define RTEMP 45 // JR/JALR address register
249 #define MAXREG 45
250 #define AGEN1 46 // Address generation temporary register
251 //#define AGEN2 47 // Address generation temporary register
252 //#define MGEN1 48 // Maptable address generation temporary register
253 //#define MGEN2 49 // Maptable address generation temporary register
254 #define BTREG 50 // Branch target temporary register
255
256   /* instruction types */
257 #define NOP 0     // No operation
258 #define LOAD 1    // Load
259 #define STORE 2   // Store
260 #define LOADLR 3  // Unaligned load
261 #define STORELR 4 // Unaligned store
262 #define MOV 5     // Move
263 #define ALU 6     // Arithmetic/logic
264 #define MULTDIV 7 // Multiply/divide
265 #define SHIFT 8   // Shift by register
266 #define SHIFTIMM 9// Shift by immediate
267 #define IMM16 10  // 16-bit immediate
268 #define RJUMP 11  // Unconditional jump to register
269 #define UJUMP 12  // Unconditional jump
270 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
271 #define SJUMP 14  // Conditional branch (regimm format)
272 #define COP0 15   // Coprocessor 0
273 #define COP1 16   // Coprocessor 1
274 #define C1LS 17   // Coprocessor 1 load/store
275 //#define FJUMP 18  // Conditional branch (floating point)
276 //#define FLOAT 19  // Floating point unit
277 //#define FCONV 20  // Convert integer to float
278 //#define FCOMP 21  // Floating point compare (sets FSREG)
279 #define SYSCALL 22// SYSCALL
280 #define OTHER 23  // Other
281 #define SPAN 24   // Branch/delay slot spans 2 pages
282 #define NI 25     // Not implemented
283 #define HLECALL 26// PCSX fake opcodes for HLE
284 #define COP2 27   // Coprocessor 2 move
285 #define C2LS 28   // Coprocessor 2 load/store
286 #define C2OP 29   // Coprocessor 2 operation
287 #define INTCALL 30// Call interpreter to handle rare corner cases
288
289   /* branch codes */
290 #define TAKEN 1
291 #define NOTTAKEN 2
292 #define NULLDS 3
293
294 #define DJT_1 (void *)1l // no function, just a label in assem_debug log
295 #define DJT_2 (void *)2l
296
297 // asm linkage
298 int new_recompile_block(u_int addr);
299 void *get_addr_ht(u_int vaddr);
300 void invalidate_block(u_int block);
301 void invalidate_addr(u_int addr);
302 void remove_hash(int vaddr);
303 void dyna_linker();
304 void dyna_linker_ds();
305 void verify_code();
306 void verify_code_ds();
307 void cc_interrupt();
308 void fp_exception();
309 void fp_exception_ds();
310 void jump_to_new_pc();
311 void new_dyna_leave();
312
313 // Needed by assembler
314 static void wb_register(signed char r,signed char regmap[],uint64_t dirty);
315 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty);
316 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr);
317 static void load_all_regs(signed char i_regmap[]);
318 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
319 static void load_regs_entry(int t);
320 static void load_all_consts(signed char regmap[],u_int dirty,int i);
321
322 static int verify_dirty(const u_int *ptr);
323 static int get_final_value(int hr, int i, int *value);
324 static void add_stub(enum stub_type type, void *addr, void *retaddr,
325   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
326 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
327   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist);
328 static void add_to_linker(void *addr, u_int target, int ext);
329 static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override);
330 static void *get_direct_memhandler(void *table, u_int addr,
331   enum stub_type type, uintptr_t *addr_host);
332 static void pass_args(int a0, int a1);
333 static void emit_far_jump(const void *f);
334 static void emit_far_call(const void *f);
335
// Toggle the translation cache between writable and executable on
// platforms that enforce W^X (NO_WRITE_EXEC); no-op otherwise.
// is_x != 0 requests read+exec, is_x == 0 requests read+write.
static void mprotect_w_x(void *start, void *end, int is_x)
{
#ifdef NO_WRITE_EXEC
  #if defined(VITA)
  // *Open* enables write on all memory that was
  // allocated by sceKernelAllocMemBlockForVM()?
  if (is_x)
    sceKernelCloseVMDomain();
  else
    sceKernelOpenVMDomain();
  #else
  // round start down to a page boundary; mprotect requires page alignment
  u_long mstart = (u_long)start & ~4095ul;
  u_long mend = (u_long)end;
  if (mprotect((void *)mstart, mend - mstart,
               PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
    SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
  #endif
#endif
}
355
// make [start, end) writable before emitting or patching code there
static void start_tcache_write(void *start, void *end)
{
  mprotect_w_x(start, end, 0);
}
360
// Flush the instruction cache for [start, end) and flip the region back
// to executable. Cache maintenance is only needed on ARM targets; each
// platform gets its own flush primitive below.
static void end_tcache_write(void *start, void *end)
{
#if defined(__arm__) || defined(__aarch64__)
  size_t len = (char *)end - (char *)start;
  #if   defined(__BLACKBERRY_QNX__)
  msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
  #elif defined(__MACH__)
  sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
  #elif defined(VITA)
  sceKernelSyncVMDomain(sceBlock, start, len);
  #elif defined(_3DS)
  ctr_flush_invalidate_cache();
  #elif defined(__aarch64__)
  // as of 2021, __clear_cache() is still broken on arm64
  // so here is a custom one :(
  clear_cache_arm64(start, end);
  #else
  __clear_cache(start, end);
  #endif
  (void)len;  // some branches above don't use len
#endif

  mprotect_w_x(start, end, 1);
}
385
386 static void *start_block(void)
387 {
388   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
389   if (end > ndrc->translation_cache + sizeof(ndrc->translation_cache))
390     end = ndrc->translation_cache + sizeof(ndrc->translation_cache);
391   start_tcache_write(out, end);
392   return out;
393 }
394
// finish emitting a block begun with start_block(): flush icache and
// restore exec permission for everything written up to 'out'
static void end_block(void *start)
{
  end_tcache_write(start, out);
}
399
400 // also takes care of w^x mappings when patching code
401 static u_int needs_clear_cache[1<<(TARGET_SIZE_2-17)];
402
403 static void mark_clear_cache(void *target)
404 {
405   uintptr_t offset = (u_char *)target - ndrc->translation_cache;
406   u_int mask = 1u << ((offset >> 12) & 31);
407   if (!(needs_clear_cache[offset >> 17] & mask)) {
408     char *start = (char *)((uintptr_t)target & ~4095l);
409     start_tcache_write(start, start + 4095);
410     needs_clear_cache[offset >> 17] |= mask;
411   }
412 }
413
// Clearing the cache is rather slow on ARM Linux, so mark the areas
// that need to be cleared, and then only clear these areas once.
static void do_clear_cache(void)
{
  int i, j;
  for (i = 0; i < (1<<(TARGET_SIZE_2-17)); i++)
  {
    u_int bitmap = needs_clear_cache[i];
    if (!bitmap)
      continue;
    for (j = 0; j < 32; j++)
    {
      u_char *start, *end;
      if (!(bitmap & (1<<j)))
        continue;

      // bit j set: page j of 128KB region i was marked by mark_clear_cache()
      start = ndrc->translation_cache + i*131072 + j*4096;
      end = start + 4095;
      // coalesce a run of consecutive marked pages into one flush call;
      // note this advances the outer j past the run
      for (j++; j < 32; j++) {
        if (!(bitmap & (1<<j)))
          break;
        end += 4096;
      }
      end_tcache_write(start, end);
    }
    needs_clear_cache[i] = 0;
  }
}
442
443 //#define DEBUG_CYCLE_COUNT 1
444
445 #define NO_CYCLE_PENALTY_THR 12
446
447 int cycle_multiplier; // 100 for 1.0
448 int cycle_multiplier_override;
449
450 static int CLOCK_ADJUST(int x)
451 {
452   int m = cycle_multiplier_override
453         ? cycle_multiplier_override : cycle_multiplier;
454   int s=(x>>31)|1;
455   return (x * m + s * 50) / 100;
456 }
457
458 // is the op an unconditional jump?
459 static int is_ujump(int i)
460 {
461   return itype[i] == UJUMP || itype[i] == RJUMP
462     || (source[i] >> 16) == 0x1000; // beq r0, r0, offset // b offset
463 }
464
465 static int is_jump(int i)
466 {
467   return itype[i] == RJUMP || itype[i] == UJUMP || itype[i] == CJUMP || itype[i] == SJUMP;
468 }
469
// Map a PSX virtual address to a lookup-table page index (0..4095).
// kseg bits are stripped, RAM mirrors are folded onto the base image,
// and pages above 2048 are aliased into the 2048..4095 range.
static u_int get_page(u_int vaddr)
{
  u_int page = vaddr & ~0xe0000000;  // strip kseg0/kseg1 bits
  if (page < 0x1000000)
    page &= ~0x0e00000;              // fold RAM mirrors
  page >>= 12;
  return (page > 2048) ? 2048 + (page & 2047) : page;
}
479
// no virtual mem in PCSX
// (virtual page == physical page; kept as a separate function because
// jump_dirty[] is indexed by vpage while jump_in[] uses page)
static u_int get_vpage(u_int vaddr)
{
  return get_page(vaddr);
}
485
486 static struct ht_entry *hash_table_get(u_int vaddr)
487 {
488   return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
489 }
490
491 static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
492 {
493   ht_bin->vaddr[1] = ht_bin->vaddr[0];
494   ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
495   ht_bin->vaddr[0] = vaddr;
496   ht_bin->tcaddr[0] = tcaddr;
497 }
498
499 // some messy ari64's code, seems to rely on unsigned 32bit overflow
500 static int doesnt_expire_soon(void *tcaddr)
501 {
502   u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
503   return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
504 }
505
// Get address from virtual address
// This is called from the recompiled JR/JALR instructions
// Lookup order: clean blocks (jump_in), then revalidatable dirty blocks
// (jump_dirty), then compile; on compile failure raise a COP0 fault.
void noinline *get_addr(u_int vaddr)
{
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  struct ll_entry *head;
  //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
  // 1) look for a clean compiled block for this address
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
  //printf("TRACE: count=%d next=%d (get_addr match %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // cache the hit so the next lookup takes the get_addr_ht() fast path
      hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
      return head->addr;
    }
    head=head->next;
  }
  // 2) try to revalidate a dirty block whose source hasn't actually changed
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr))
      if (verify_dirty(head->addr)) {
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        invalid_code[vaddr>>12]=0;
        inv_code_start=inv_code_end=~0;
        if(vpage<2048) {
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        struct ht_entry *ht_bin = hash_table_get(vaddr);
        if (ht_bin->vaddr[0] == vaddr)
          ht_bin->tcaddr[0] = head->addr; // Replace existing entry
        else
          hash_table_add(ht_bin, vaddr, head->addr);

        return head->addr;
      }
    }
    head=head->next;
  }
  //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
  // 3) not compiled yet - compile now, then retry the lookup
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault execption
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}
560 // Look up address in hash table first
561 void *get_addr_ht(u_int vaddr)
562 {
563   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
564   const struct ht_entry *ht_bin = hash_table_get(vaddr);
565   if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
566   if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
567   return get_addr(vaddr);
568 }
569
570 void clear_all_regs(signed char regmap[])
571 {
572   int hr;
573   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
574 }
575
576 static signed char get_reg(const signed char regmap[],int r)
577 {
578   int hr;
579   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
580   return -1;
581 }
582
583 // Find a register that is available for two consecutive cycles
584 static signed char get_reg2(signed char regmap1[], const signed char regmap2[], int r)
585 {
586   int hr;
587   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
588   return -1;
589 }
590
591 int count_free_regs(signed char regmap[])
592 {
593   int count=0;
594   int hr;
595   for(hr=0;hr<HOST_REGS;hr++)
596   {
597     if(hr!=EXCLUDE_REG) {
598       if(regmap[hr]<0) count++;
599     }
600   }
601   return count;
602 }
603
604 void dirty_reg(struct regstat *cur,signed char reg)
605 {
606   int hr;
607   if(!reg) return;
608   for (hr=0;hr<HOST_REGS;hr++) {
609     if((cur->regmap[hr]&63)==reg) {
610       cur->dirty|=1<<hr;
611     }
612   }
613 }
614
615 static void set_const(struct regstat *cur, signed char reg, uint32_t value)
616 {
617   int hr;
618   if(!reg) return;
619   for (hr=0;hr<HOST_REGS;hr++) {
620     if(cur->regmap[hr]==reg) {
621       cur->isconst|=1<<hr;
622       current_constmap[hr]=value;
623     }
624   }
625 }
626
627 static void clear_const(struct regstat *cur, signed char reg)
628 {
629   int hr;
630   if(!reg) return;
631   for (hr=0;hr<HOST_REGS;hr++) {
632     if((cur->regmap[hr]&63)==reg) {
633       cur->isconst&=~(1<<hr);
634     }
635   }
636 }
637
638 static int is_const(struct regstat *cur, signed char reg)
639 {
640   int hr;
641   if(reg<0) return 0;
642   if(!reg) return 1;
643   for (hr=0;hr<HOST_REGS;hr++) {
644     if((cur->regmap[hr]&63)==reg) {
645       return (cur->isconst>>hr)&1;
646     }
647   }
648   return 0;
649 }
650
651 static uint32_t get_const(struct regstat *cur, signed char reg)
652 {
653   int hr;
654   if(!reg) return 0;
655   for (hr=0;hr<HOST_REGS;hr++) {
656     if(cur->regmap[hr]==reg) {
657       return current_constmap[hr];
658     }
659   }
660   SysPrintf("Unknown constant in r%d\n",reg);
661   abort();
662 }
663
// Least soon needed registers
// Look at the next ten instructions and see which registers
// will be used.  Try not to reallocate these.
// On return, hsn[reg] holds the distance (in insns) to that reg's
// nearest upcoming use; smaller = needed sooner.
void lsn(u_char hsn[], int i, int *preferred_reg)
{
  int j;
  int b=-1;
  // determine how far ahead to scan: stop at block end or just past
  // an unconditional jump
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if (is_ujump(i+j))
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
  }
  // scan backwards so nearer uses overwrite farther ones
  for(;j>=0;j--)
  {
    if(rs1[i+j]) hsn[rs1[i+j]]=j;
    if(rs2[i+j]) hsn[rs2[i+j]]=j;
    if(rt1[i+j]) hsn[rt1[i+j]]=j;
    if(rt2[i+j]) hsn[rt2[i+j]]=j;
    if(itype[i+j]==STORE || itype[i+j]==STORELR) {
      // Stores can allocate zero
      hsn[rs1[i+j]]=j;
      hsn[rs2[i+j]]=j;
    }
    // On some architectures stores need invc_ptr
    #if defined(HOST_IMM8)
    if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
      hsn[INVCP]=j;
    }
    #endif
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
    {
      // branches need the cycle count register; remember nearest branch in b
      hsn[CCREG]=j;
      b=j;
    }
  }
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        // uses at the branch target count as "j+b+2 insns away"
        if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
        if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
        //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
        //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
      }
    }
    // TODO: preferred register based on backward branch
  }
  // Delay slot should preferably not overwrite branch conditions or cycle count
  if (i > 0 && is_jump(i-1)) {
    if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
    if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
    hsn[CCREG]=1;
    // ...or hash tables
    hsn[RHASH]=1;
    hsn[RHTBL]=1;
  }
  // Coprocessor load/store needs FTEMP, even if not declared
  if(itype[i]==C1LS||itype[i]==C2LS) {
    hsn[FTEMP]=0;
  }
  // Load L/R also uses FTEMP as a temporary register
  if(itype[i]==LOADLR) {
    hsn[FTEMP]=0;
  }
  // Also SWL/SWR/SDL/SDR
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
    hsn[FTEMP]=0;
  }
  // Don't remove the miniht registers
  if(itype[i]==UJUMP||itype[i]==RJUMP)
  {
    hsn[RHASH]=0;
    hsn[RHTBL]=0;
  }
}
752
// We only want to allocate registers if we're going to use them again soon
// Returns 1 if guest reg r is used within the next few instructions.
int needed_again(int r, int i)
{
  int j;
  int b=-1;
  int rn=10;  // distance to nearest use; 10 == "not needed soon"

  if (i > 0 && is_ujump(i-1))
  {
    if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
      return 0; // Don't need any registers if exiting the block
  }
  // determine scan window: stop at block end, past an unconditional
  // jump, or at a syscall/hlecall/intcall/break
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if (is_ujump(i+j))
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
    if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
    {
      break;
    }
  }
  // scan backwards: nearest use wins; a point where r is unneeded resets it
  for(;j>=1;j--)
  {
    if(rs1[i+j]==r) rn=j;
    if(rs2[i+j]==r) rn=j;
    if((unneeded_reg[i+j]>>r)&1) rn=10;
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
    {
      b=j;
    }
  }
  /*
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int o=rn;
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(!((unneeded_reg[t+j]>>r)&1)) {
          if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
          if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
        }
        else rn=o;
      }
    }
  }*/
  if(rn<10) return 1;
  (void)b;
  return 0;
}
815
// Try to match register allocations at the end of a loop with those
// at the beginning
// Returns the host reg holding r at a backward branch target, or hr
// (the caller's default) when no better match exists.
int loop_reg(int i, int r, int hr)
{
  int j,k;
  // determine the forward scan window (same bounds as lsn/needed_again)
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if (is_ujump(i+j))
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
  }
  k=0;
  // include the preceding branch (this insn may be its delay slot)
  if(i>0){
    if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)
      k--;
  }
  for(;k<j;k++)
  {
    assert(r < 64);
    if((unneeded_reg[i+k]>>r)&1) return hr;
    if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP))
    {
      // only backward branches (loops) are interesting here
      if(ba[i+k]>=start && ba[i+k]<(start+i*4))
      {
        int t=(ba[i+k]-start)>>2;
        int reg=get_reg(regs[t].regmap_entry,r);
        if(reg>=0) return reg;
        //reg=get_reg(regs[t+1].regmap_entry,r);
        //if(reg>=0) return reg;
      }
    }
  }
  return hr;
}
857
858
859 // Allocate every register, preserving source/target regs
860 void alloc_all(struct regstat *cur,int i)
861 {
862   int hr;
863
864   for(hr=0;hr<HOST_REGS;hr++) {
865     if(hr!=EXCLUDE_REG) {
866       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
867          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
868       {
869         cur->regmap[hr]=-1;
870         cur->dirty&=~(1<<hr);
871       }
872       // Don't need zeros
873       if((cur->regmap[hr]&63)==0)
874       {
875         cur->regmap[hr]=-1;
876         cur->dirty&=~(1<<hr);
877       }
878     }
879   }
880 }
881
// Debug-only tracking that the assembler's scratch register is not
// acquired twice at once; compiles to empty functions in NDEBUG builds.
#ifndef NDEBUG
static int host_tempreg_in_use;

static void host_tempreg_acquire(void)
{
  assert(!host_tempreg_in_use);
  host_tempreg_in_use = 1;
}

static void host_tempreg_release(void)
{
  host_tempreg_in_use = 0;
}
#else
static void host_tempreg_acquire(void) {}
static void host_tempreg_release(void) {}
#endif
899
// Debug-build table mapping host code addresses to readable names for
// trace output; func_name() degrades to "" outside DRC_DBG builds.
#ifdef DRC_DBG
extern void gen_interupt();
extern void do_insn_cmp();
#define FUNCNAME(f) { f, " " #f }
static const struct {
  void *addr;
  const char *name;
} function_names[] = {
  FUNCNAME(cc_interrupt),
  FUNCNAME(gen_interupt),
  FUNCNAME(get_addr_ht),
  FUNCNAME(get_addr),
  FUNCNAME(jump_handler_read8),
  FUNCNAME(jump_handler_read16),
  FUNCNAME(jump_handler_read32),
  FUNCNAME(jump_handler_write8),
  FUNCNAME(jump_handler_write16),
  FUNCNAME(jump_handler_write32),
  FUNCNAME(invalidate_addr),
  FUNCNAME(jump_to_new_pc),
  FUNCNAME(new_dyna_leave),
  FUNCNAME(pcsx_mtc0),
  FUNCNAME(pcsx_mtc0_ds),
  FUNCNAME(do_insn_cmp),
#ifdef __arm__
  FUNCNAME(verify_code),
#endif
};

// look up the name for address a; returns "" when not in the table.
// use size_t + ARRAY_SIZE to avoid the signed/unsigned comparison the
// old 'int i < sizeof(...)/sizeof(...)' loop produced
static const char *func_name(const void *a)
{
  size_t i;
  for (i = 0; i < ARRAY_SIZE(function_names); i++)
    if (function_names[i].addr == a)
      return function_names[i].name;
  return "";
}
#else
#define func_name(x) ""
#endif
940
941 #ifdef __i386__
942 #include "assem_x86.c"
943 #endif
944 #ifdef __x86_64__
945 #include "assem_x64.c"
946 #endif
947 #ifdef __arm__
948 #include "assem_arm.c"
949 #endif
950 #ifdef __aarch64__
951 #include "assem_arm64.c"
952 #endif
953
954 static void *get_trampoline(const void *f)
955 {
956   size_t i;
957
958   for (i = 0; i < ARRAY_SIZE(ndrc->tramp.f); i++) {
959     if (ndrc->tramp.f[i] == f || ndrc->tramp.f[i] == NULL)
960       break;
961   }
962   if (i == ARRAY_SIZE(ndrc->tramp.f)) {
963     SysPrintf("trampoline table is full, last func %p\n", f);
964     abort();
965   }
966   if (ndrc->tramp.f[i] == NULL) {
967     start_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
968     ndrc->tramp.f[i] = f;
969     end_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
970   }
971   return &ndrc->tramp.ops[i];
972 }
973
// emit a jump to f, routing through a trampoline when f is out of
// direct branch range for this target
static void emit_far_jump(const void *f)
{
  if (!can_jump_or_call(f))
    f = get_trampoline(f);
  emit_jmp(f);
}
984
// emit a call to f, routing through a trampoline when f is out of
// direct branch range for this target
static void emit_far_call(const void *f)
{
  if (!can_jump_or_call(f))
    f = get_trampoline(f);
  emit_call(f);
}
995
996 // Add virtual address mapping to linked list
997 void ll_add(struct ll_entry **head,int vaddr,void *addr)
998 {
999   struct ll_entry *new_entry;
1000   new_entry=malloc(sizeof(struct ll_entry));
1001   assert(new_entry!=NULL);
1002   new_entry->vaddr=vaddr;
1003   new_entry->reg_sv_flags=0;
1004   new_entry->addr=addr;
1005   new_entry->next=*head;
1006   *head=new_entry;
1007 }
1008
// like ll_add(), but also sets reg_sv_flags on the new (front) entry
void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
{
  ll_add(head,vaddr,addr);
  (*head)->reg_sv_flags=reg_sv_flags;
}
1014
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
// Looks in the hash table first, then the jump_in list; a jump_in hit
// is inserted back into the hash table with low priority.
void *check_addr(u_int vaddr)
{
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  size_t i;
  for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
    if (ht_bin->vaddr[i] == vaddr)
      // the -MAX_OUTPUT_BLOCK_SIZE bias makes the expiry check
      // conservative for blocks that extend past their start address
      if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
        if (isclean(ht_bin->tcaddr[i]))
          return ht_bin->tcaddr[i];
  }
  u_int page=get_page(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while (head != NULL) {
    if (head->vaddr == vaddr) {
      if (doesnt_expire_soon(head->addr)) {
        // Update existing entry with current address
        if (ht_bin->vaddr[0] == vaddr) {
          ht_bin->tcaddr[0] = head->addr;
          return head->addr;
        }
        if (ht_bin->vaddr[1] == vaddr) {
          ht_bin->tcaddr[1] = head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        if (ht_bin->vaddr[0] == -1) {
          ht_bin->vaddr[0] = vaddr;
          ht_bin->tcaddr[0] = head->addr;
        }
        else if (ht_bin->vaddr[1] == -1) {
          ht_bin->vaddr[1] = vaddr;
          ht_bin->tcaddr[1] = head->addr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  return 0;
}
1060
// Remove vaddr from its two-entry hash bin, if present.
// -1 marks an empty slot.
void remove_hash(int vaddr)
{
  //printf("remove hash: %x\n",vaddr);
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  if (ht_bin->vaddr[1] == vaddr) {
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
  if (ht_bin->vaddr[0] == vaddr) {
    // Shift slot 1 down so slot 0 remains the occupied one
    // (slot 1 may already have been cleared above).
    ht_bin->vaddr[0] = ht_bin->vaddr[1];
    ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
}
1076
1077 void ll_remove_matching_addrs(struct ll_entry **head,uintptr_t addr,int shift)
1078 {
1079   struct ll_entry *next;
1080   while(*head) {
1081     if(((uintptr_t)((*head)->addr)>>shift)==(addr>>shift) ||
1082        ((uintptr_t)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1083     {
1084       inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr);
1085       remove_hash((*head)->vaddr);
1086       next=(*head)->next;
1087       free(*head);
1088       *head=next;
1089     }
1090     else
1091     {
1092       head=&((*head)->next);
1093     }
1094   }
1095 }
1096
1097 // Remove all entries from linked list
1098 void ll_clear(struct ll_entry **head)
1099 {
1100   struct ll_entry *cur;
1101   struct ll_entry *next;
1102   if((cur=*head)) {
1103     *head=0;
1104     while(cur) {
1105       next=cur->next;
1106       free(cur);
1107       cur=next;
1108     }
1109   }
1110 }
1111
1112 // Dereference the pointers and remove if it matches
1113 static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift)
1114 {
1115   while(head) {
1116     uintptr_t ptr = (uintptr_t)get_pointer(head->addr);
1117     inv_debug("EXP: Lookup pointer to %lx at %p (%x)\n",(long)ptr,head->addr,head->vaddr);
1118     if(((ptr>>shift)==(addr>>shift)) ||
1119        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1120     {
1121       inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr);
1122       void *host_addr=find_extjump_insn(head->addr);
1123       mark_clear_cache(host_addr);
1124       set_jump_target(host_addr, head->addr);
1125     }
1126     head=head->next;
1127   }
1128 }
1129
1130 // This is called when we write to a compiled block (see do_invstub)
1131 static void invalidate_page(u_int page)
1132 {
1133   struct ll_entry *head;
1134   struct ll_entry *next;
1135   head=jump_in[page];
1136   jump_in[page]=0;
1137   while(head!=NULL) {
1138     inv_debug("INVALIDATE: %x\n",head->vaddr);
1139     remove_hash(head->vaddr);
1140     next=head->next;
1141     free(head);
1142     head=next;
1143   }
1144   head=jump_out[page];
1145   jump_out[page]=0;
1146   while(head!=NULL) {
1147     inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr);
1148     void *host_addr=find_extjump_insn(head->addr);
1149     mark_clear_cache(host_addr);
1150     set_jump_target(host_addr, head->addr);
1151     next=head->next;
1152     free(head);
1153     head=next;
1154   }
1155 }
1156
// Invalidate compiled code for the 4K pages around `block`;
// first/last are the page bounds discovered by the caller.
static void invalidate_block_range(u_int block, u_int first, u_int last)
{
  u_int page=get_page(block<<12);
  //printf("first=%d last=%d\n",first,last);
  // Always invalidate the page the block itself lives in.
  invalidate_page(page);
  assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
  assert(last<page+5);
  // Invalidate the adjacent pages if a block crosses a 4K boundary
  while(first<page) {
    invalidate_page(first);
    first++;
  }
  // NOTE(review): this loop excludes `last` itself (first<last);
  // confirm the caller's `last` is intended to be exclusive here.
  for(first=page+1;first<last;first++) {
    invalidate_page(first);
  }
  do_clear_cache();

  // Don't trap writes
  invalid_code[block]=1;

  #ifdef USE_MINI_HT
  memset(mini_ht,-1,sizeof(mini_ht));
  #endif
}
1181
// Invalidate the compiled code covering a 4K RAM block, widening the
// range to neighboring pages that any dirty block spanning it touches.
void invalidate_block(u_int block)
{
  u_int page=get_page(block<<12);
  u_int vpage=get_vpage(block<<12);
  inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
  //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
  u_int first,last;
  first=last=page;
  struct ll_entry *head;
  head=jump_dirty[vpage];
  //printf("page=%d vpage=%d\n",page,vpage);
  // Scan dirty blocks hashed to this virtual page and grow [first,last]
  // to cover every RAM page a matching block spills into.
  while(head!=NULL) {
    if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
      u_char *start, *end;
      get_bounds(head->addr, &start, &end);
      //printf("start: %p end: %p\n", start, end);
      if (page < 2048 && start >= rdram && end < rdram+RAM_SIZE) {
        // Only widen if this block actually overlaps our page.
        if (((start-rdram)>>12) <= page && ((end-1-rdram)>>12) >= page) {
          if ((((start-rdram)>>12)&2047) < first) first = ((start-rdram)>>12)&2047;
          if ((((end-1-rdram)>>12)&2047) > last)  last = ((end-1-rdram)>>12)&2047;
        }
      }
    }
    head=head->next;
  }
  invalidate_block_range(block,first,last);
}
1209
// Invalidate compiled code overlapping a single written address.
// For RAM also maintains the [inv_code_start, inv_code_end] window of
// addresses known to contain no compiled code, so future writes in that
// window can be skipped by the caller.
void invalidate_addr(u_int addr)
{
  //static int rhits;
  // this check is done by the caller
  //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
  u_int page=get_vpage(addr);
  if(page<2048) { // RAM
    struct ll_entry *head;
    u_int addr_min=~0, addr_max=0;
    u_int mask=RAM_SIZE-1;
    u_int addr_main=0x80000000|(addr&mask);
    int pg1;
    // Start with the whole surrounding 4K page as the "no code" window,
    // then shrink it below as nearby blocks are found.
    inv_code_start=addr_main&~0xfff;
    inv_code_end=addr_main|0xfff;
    pg1=page;
    if (pg1>0) {
      // must check previous page too because of spans..
      pg1--;
      inv_code_start-=0x1000;
    }
    for(;pg1<=page;pg1++) {
      for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
        u_char *start_h, *end_h;
        u_int start, end;
        get_bounds(head->addr, &start_h, &end_h);
        start = (uintptr_t)start_h - ram_offset;
        end = (uintptr_t)end_h - ram_offset;
        if(start<=addr_main&&addr_main<end) {
          // The write hits this block: remember the full extent to kill.
          if(start<addr_min) addr_min=start;
          if(end>addr_max) addr_max=end;
        }
        else if(addr_main<start) {
          // Block lies above the write: clip the window's upper bound.
          if(start<inv_code_end)
            inv_code_end=start-1;
        }
        else {
          // Block lies below the write: clip the window's lower bound.
          if(end>inv_code_start)
            inv_code_start=end;
        }
      }
    }
    if (addr_min!=~0) {
      inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
      inv_code_start=inv_code_end=~0;
      invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
      return;
    }
    else {
      // No code hit: publish the code-free window in the caller's
      // address space (mirror segment of the written address).
      inv_code_start=(addr&~mask)|(inv_code_start&mask);
      inv_code_end=(addr&~mask)|(inv_code_end&mask);
      inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
      return;
    }
  }
  invalidate_block(addr>>12);
}
1266
1267 // This is called when loading a save state.
1268 // Anything could have changed, so invalidate everything.
1269 void invalidate_all_pages(void)
1270 {
1271   u_int page;
1272   for(page=0;page<4096;page++)
1273     invalidate_page(page);
1274   for(page=0;page<1048576;page++)
1275     if(!invalid_code[page]) {
1276       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1277       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1278     }
1279   #ifdef USE_MINI_HT
1280   memset(mini_ht,-1,sizeof(mini_ht));
1281   #endif
1282   do_clear_cache();
1283 }
1284
1285 static void do_invstub(int n)
1286 {
1287   literal_pool(20);
1288   u_int reglist=stubs[n].a;
1289   set_jump_target(stubs[n].addr, out);
1290   save_regs(reglist);
1291   if(stubs[n].b!=0) emit_mov(stubs[n].b,0);
1292   emit_far_call(invalidate_addr);
1293   restore_regs(reglist);
1294   emit_jmp(stubs[n].retaddr); // return address
1295 }
1296
1297 // Add an entry to jump_out after making a link
1298 // src should point to code by emit_extjump2()
1299 void add_link(u_int vaddr,void *src)
1300 {
1301   u_int page=get_page(vaddr);
1302   inv_debug("add_link: %p -> %x (%d)\n",src,vaddr,page);
1303   check_extjump2(src);
1304   ll_add(jump_out+page,vaddr,src);
1305   //void *ptr=get_pointer(src);
1306   //inv_debug("add_link: Pointer is to %p\n",ptr);
1307 }
1308
// If a code block was found to be unmodified (bit was set in
// restore_candidate) and it remains unmodified (bit is clear
// in invalid_code) then move the entries for that 4K page from
// the dirty list to the clean list.
void clean_blocks(u_int page)
{
  struct ll_entry *head;
  inv_debug("INV: clean_blocks page=%d\n",page);
  head=jump_dirty[page];
  while(head!=NULL) {
    if(!invalid_code[head->vaddr>>12]) {
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr)) {
        if(verify_dirty(head->addr)) {
          u_char *start, *end;
          //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr);
          u_int i;
          u_int inv=0;
          get_bounds(head->addr, &start, &end);
          // A block may span several pages; it is only restorable if
          // every page it covers is still valid.
          if (start - rdram < RAM_SIZE) {
            for (i = (start-rdram+0x80000000)>>12; i <= (end-1-rdram+0x80000000)>>12; i++) {
              inv|=invalid_code[i];
            }
          }
          else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
            // Outside of RAM: never considered clean.
            inv=1;
          }
          if(!inv) {
            void *clean_addr = get_clean_addr(head->addr);
            if (doesnt_expire_soon(clean_addr)) {
              u_int ppage=page;
              inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr);
              //printf("page=%x, addr=%x\n",page,head->vaddr);
              //assert(head->vaddr>>12==(page|0x80000));
              ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
              struct ht_entry *ht_bin = hash_table_get(head->vaddr);
              if (ht_bin->vaddr[0] == head->vaddr)
                ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
              if (ht_bin->vaddr[1] == head->vaddr)
                ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
            }
          }
        }
      }
    }
    head=head->next;
  }
}
1357
1358 /* Register allocation */
1359
// Note: registers are allocated clean (unmodified state)
// if you intend to modify the register, you must call dirty_reg().
//
// Map guest register `reg` to some host register at instruction i,
// evicting another mapping if no host register is free.
static void alloc_reg(struct regstat *cur,int i,signed char reg)
{
  int r,hr;
  // Preferred host register: simple hash of the guest register number,
  // with fixed choices for the cycle counter and temporaries.
  int preferred_reg = (reg&7);
  if(reg==CCREG) preferred_reg=HOST_CCREG;
  if(reg==PTEMP||reg==FTEMP) preferred_reg=12;

  // Don't allocate unused registers
  if((cur->u>>reg)&1) return;

  // see if it's already allocated
  for(hr=0;hr<HOST_REGS;hr++)
  {
    if(cur->regmap[hr]==reg) return;
  }

  // Keep the same mapping if the register was already allocated in a loop
  preferred_reg = loop_reg(i,reg,preferred_reg);

  // Try to allocate the preferred register
  if(cur->regmap[preferred_reg]==-1) {
    cur->regmap[preferred_reg]=reg;
    cur->dirty&=~(1<<preferred_reg);
    cur->isconst&=~(1<<preferred_reg);
    return;
  }
  // Preferred register is taken; reclaim it if its current occupant
  // is no longer needed.
  r=cur->regmap[preferred_reg];
  assert(r < 64);
  if((cur->u>>r)&1) {
    cur->regmap[preferred_reg]=reg;
    cur->dirty&=~(1<<preferred_reg);
    cur->isconst&=~(1<<preferred_reg);
    return;
  }

  // Clear any unneeded registers
  // We try to keep the mapping consistent, if possible, because it
  // makes branches easier (especially loops).  So we try to allocate
  // first (see above) before removing old mappings.  If this is not
  // possible then go ahead and clear out the registers that are no
  // longer needed.
  for(hr=0;hr<HOST_REGS;hr++)
  {
    r=cur->regmap[hr];
    if(r>=0) {
      assert(r < 64);
      if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;}
    }
  }
  // Try to allocate any available register, but prefer
  // registers that have not been used recently.
  if(i>0) {
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
        if(regs[i-1].regmap[hr]!=rs1[i-1]&&regs[i-1].regmap[hr]!=rs2[i-1]&&regs[i-1].regmap[hr]!=rt1[i-1]&&regs[i-1].regmap[hr]!=rt2[i-1]) {
          cur->regmap[hr]=reg;
          cur->dirty&=~(1<<hr);
          cur->isconst&=~(1<<hr);
          return;
        }
      }
    }
  }
  // Try to allocate any available register
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
      cur->regmap[hr]=reg;
      cur->dirty&=~(1<<hr);
      cur->isconst&=~(1<<hr);
      return;
    }
  }

  // Ok, now we have to evict someone
  // Pick a register we hopefully won't need soon
  // hsn[] holds a "needed soon" score per guest register; higher means
  // its next use is farther away (10 = not seen by lsn()).
  u_char hsn[MAXREG+1];
  memset(hsn,10,sizeof(hsn));
  int j;
  lsn(hsn,i,&preferred_reg);
  //printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",cur->regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]);
  //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
  if(i>0) {
    // Don't evict the cycle count at entry points, otherwise the entry
    // stub will have to write it.
    if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
    if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP)) hsn[CCREG]=2;
    // Evict registers whose next use is farthest away first (j counts
    // down from the largest distance).
    for(j=10;j>=3;j--)
    {
      // Alloc preferred register if available
      if(hsn[r=cur->regmap[preferred_reg]&63]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          // Evict both parts of a 64-bit register
          if((cur->regmap[hr]&63)==r) {
            cur->regmap[hr]=-1;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
          }
        }
        cur->regmap[preferred_reg]=reg;
        return;
      }
      for(r=1;r<=MAXREG;r++)
      {
        // Skip registers used by the previous instruction.
        if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) {
          for(hr=0;hr<HOST_REGS;hr++) {
            if(hr!=HOST_CCREG||j<hsn[CCREG]) {
              if(cur->regmap[hr]==r) {
                cur->regmap[hr]=reg;
                cur->dirty&=~(1<<hr);
                cur->isconst&=~(1<<hr);
                return;
              }
            }
          }
        }
      }
    }
  }
  // Last resort: evict anything, most-distant next use first.
  for(j=10;j>=0;j--)
  {
    for(r=1;r<=MAXREG;r++)
    {
      if(hsn[r]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(cur->regmap[hr]==r) {
            cur->regmap[hr]=reg;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
            return;
          }
        }
      }
    }
  }
  SysPrintf("This shouldn't happen (alloc_reg)");abort();
}
1498
// Allocate a temporary register.  This is done without regard to
// dirty status or whether the register we request is on the unneeded list
// Note: This will only allocate one register, even if called multiple times
static void alloc_reg_temp(struct regstat *cur,int i,signed char reg)
{
  int r,hr;
  int preferred_reg = -1;

  // see if it's already allocated
  for(hr=0;hr<HOST_REGS;hr++)
  {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==reg) return;
  }

  // Try to allocate any available register
  // (scan from the top to stay away from the low argument registers)
  for(hr=HOST_REGS-1;hr>=0;hr--) {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
      cur->regmap[hr]=reg;
      cur->dirty&=~(1<<hr);
      cur->isconst&=~(1<<hr);
      return;
    }
  }

  // Find an unneeded register
  for(hr=HOST_REGS-1;hr>=0;hr--)
  {
    r=cur->regmap[hr];
    if(r>=0) {
      assert(r < 64);
      if((cur->u>>r)&1) {
        // Also require the register to be unneeded before this point,
        // so its value is not still in flight.
        if(i==0||((unneeded_reg[i-1]>>r)&1)) {
          cur->regmap[hr]=reg;
          cur->dirty&=~(1<<hr);
          cur->isconst&=~(1<<hr);
          return;
        }
      }
    }
  }

  // Ok, now we have to evict someone
  // Pick a register we hopefully won't need soon
  // TODO: we might want to follow unconditional jumps here
  // TODO: get rid of dupe code and make this into a function
  u_char hsn[MAXREG+1];
  memset(hsn,10,sizeof(hsn));
  int j;
  lsn(hsn,i,&preferred_reg);
  //printf("hsn: %d %d %d %d %d %d %d\n",hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
  if(i>0) {
    // Don't evict the cycle count at entry points, otherwise the entry
    // stub will have to write it.
    if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
    if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP)) hsn[CCREG]=2;
    // Evict registers whose next use is farthest away first.
    for(j=10;j>=3;j--)
    {
      for(r=1;r<=MAXREG;r++)
      {
        // Skip registers used by the previous instruction.
        if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) {
          for(hr=0;hr<HOST_REGS;hr++) {
            if(hr!=HOST_CCREG||hsn[CCREG]>2) {
              if(cur->regmap[hr]==r) {
                cur->regmap[hr]=reg;
                cur->dirty&=~(1<<hr);
                cur->isconst&=~(1<<hr);
                return;
              }
            }
          }
        }
      }
    }
  }
  // Last resort: evict anything, most-distant next use first.
  for(j=10;j>=0;j--)
  {
    for(r=1;r<=MAXREG;r++)
    {
      if(hsn[r]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(cur->regmap[hr]==r) {
            cur->regmap[hr]=reg;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
            return;
          }
        }
      }
    }
  }
  SysPrintf("This shouldn't happen");abort();
}
1591
1592 static void mov_alloc(struct regstat *current,int i)
1593 {
1594   // Note: Don't need to actually alloc the source registers
1595   //alloc_reg(current,i,rs1[i]);
1596   alloc_reg(current,i,rt1[i]);
1597
1598   clear_const(current,rs1[i]);
1599   clear_const(current,rt1[i]);
1600   dirty_reg(current,rt1[i]);
1601 }
1602
1603 static void shiftimm_alloc(struct regstat *current,int i)
1604 {
1605   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1606   {
1607     if(rt1[i]) {
1608       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1609       else lt1[i]=rs1[i];
1610       alloc_reg(current,i,rt1[i]);
1611       dirty_reg(current,rt1[i]);
1612       if(is_const(current,rs1[i])) {
1613         int v=get_const(current,rs1[i]);
1614         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1615         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1616         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1617       }
1618       else clear_const(current,rt1[i]);
1619     }
1620   }
1621   else
1622   {
1623     clear_const(current,rs1[i]);
1624     clear_const(current,rt1[i]);
1625   }
1626
1627   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1628   {
1629     assert(0);
1630   }
1631   if(opcode2[i]==0x3c) // DSLL32
1632   {
1633     assert(0);
1634   }
1635   if(opcode2[i]==0x3e) // DSRL32
1636   {
1637     assert(0);
1638   }
1639   if(opcode2[i]==0x3f) // DSRA32
1640   {
1641     assert(0);
1642   }
1643 }
1644
1645 static void shift_alloc(struct regstat *current,int i)
1646 {
1647   if(rt1[i]) {
1648     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1649     {
1650       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1651       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1652       alloc_reg(current,i,rt1[i]);
1653       if(rt1[i]==rs2[i]) {
1654         alloc_reg_temp(current,i,-1);
1655         minimum_free_regs[i]=1;
1656       }
1657     } else { // DSLLV/DSRLV/DSRAV
1658       assert(0);
1659     }
1660     clear_const(current,rs1[i]);
1661     clear_const(current,rs2[i]);
1662     clear_const(current,rt1[i]);
1663     dirty_reg(current,rt1[i]);
1664   }
1665 }
1666
1667 static void alu_alloc(struct regstat *current,int i)
1668 {
1669   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1670     if(rt1[i]) {
1671       if(rs1[i]&&rs2[i]) {
1672         alloc_reg(current,i,rs1[i]);
1673         alloc_reg(current,i,rs2[i]);
1674       }
1675       else {
1676         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1677         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1678       }
1679       alloc_reg(current,i,rt1[i]);
1680     }
1681   }
1682   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1683     if(rt1[i]) {
1684       alloc_reg(current,i,rs1[i]);
1685       alloc_reg(current,i,rs2[i]);
1686       alloc_reg(current,i,rt1[i]);
1687     }
1688   }
1689   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1690     if(rt1[i]) {
1691       if(rs1[i]&&rs2[i]) {
1692         alloc_reg(current,i,rs1[i]);
1693         alloc_reg(current,i,rs2[i]);
1694       }
1695       else
1696       {
1697         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1698         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1699       }
1700       alloc_reg(current,i,rt1[i]);
1701     }
1702   }
1703   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1704     assert(0);
1705   }
1706   clear_const(current,rs1[i]);
1707   clear_const(current,rs2[i]);
1708   clear_const(current,rt1[i]);
1709   dirty_reg(current,rt1[i]);
1710 }
1711
1712 static void imm16_alloc(struct regstat *current,int i)
1713 {
1714   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1715   else lt1[i]=rs1[i];
1716   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1717   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1718     assert(0);
1719   }
1720   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1721     clear_const(current,rs1[i]);
1722     clear_const(current,rt1[i]);
1723   }
1724   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1725     if(is_const(current,rs1[i])) {
1726       int v=get_const(current,rs1[i]);
1727       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1728       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1729       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1730     }
1731     else clear_const(current,rt1[i]);
1732   }
1733   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1734     if(is_const(current,rs1[i])) {
1735       int v=get_const(current,rs1[i]);
1736       set_const(current,rt1[i],v+imm[i]);
1737     }
1738     else clear_const(current,rt1[i]);
1739   }
1740   else {
1741     set_const(current,rt1[i],imm[i]<<16); // LUI
1742   }
1743   dirty_reg(current,rt1[i]);
1744 }
1745
// Register allocation for load instructions.
static void load_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
  if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  if(rt1[i]&&!((current->u>>rt1[i])&1)) {
    // Normal case: the loaded value is actually used.
    alloc_reg(current,i,rt1[i]);
    assert(get_reg(current->regmap,rt1[i])>=0);
    // 64-bit loads should never be decoded for this CPU.
    if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
    {
      assert(0);
    }
    else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      assert(0);
    }
    dirty_reg(current,rt1[i]);
    // LWL/LWR need a temporary register for the old value
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP);
      alloc_reg_temp(current,i,-1);
      minimum_free_regs[i]=1;
    }
  }
  else
  {
    // Load to r0 or unneeded register (dummy load)
    // but we still need a register to calculate the address
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
    }
    alloc_reg_temp(current,i,-1);
    minimum_free_regs[i]=1;
    if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      assert(0);
    }
  }
}
1788
// Register allocation for store instructions.
void store_alloc(struct regstat *current,int i)
{
  clear_const(current,rs2[i]);
  if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rs2[i]);
  if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
    assert(0);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  // (note: this `else` pairs with the 64-bit assert above)
  else alloc_reg(current,i,INVCP);
  #endif
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
    alloc_reg(current,i,FTEMP);
  }
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1809
// Register allocation for COP1 loads/stores (LWC1/SWC1).
void c1ls_alloc(struct regstat *current,int i)
{
  //clear_const(current,rs1[i]); // FIXME
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,CSREG); // Status
  alloc_reg(current,i,FTEMP);
  if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
    assert(0);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  // NOTE(review): unlike c2ls_alloc(), minimum_free_regs is not set
  // here — confirm whether that is intentional.
  alloc_reg_temp(current,i,-1);
}
1828
// Register allocation for COP2 loads/stores (LWC2/SWC2).
void c2ls_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,FTEMP);
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1843
#ifndef multdiv_alloc
// Register allocation for multiply/divide: results go to HI/LO.
void multdiv_alloc(struct regstat *current,int i)
{
  //  case 0x18: MULT
  //  case 0x19: MULTU
  //  case 0x1A: DIV
  //  case 0x1B: DIVU
  //  case 0x1C: DMULT
  //  case 0x1D: DMULTU
  //  case 0x1E: DDIV
  //  case 0x1F: DDIVU
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  if(rs1[i]&&rs2[i])
  {
    if((opcode2[i]&4)==0) // 32-bit
    {
      // HI/LO must stay allocatable even if currently marked unneeded.
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      alloc_reg(current,i,HIREG);
      alloc_reg(current,i,LOREG);
      alloc_reg(current,i,rs1[i]);
      alloc_reg(current,i,rs2[i]);
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
    }
    else // 64-bit
    {
      assert(0);
    }
  }
  else
  {
    // Multiply by zero is zero.
    // MIPS does not have a divide by zero exception.
    // The result is undefined, we return zero.
    alloc_reg(current,i,HIREG);
    alloc_reg(current,i,LOREG);
    dirty_reg(current,HIREG);
    dirty_reg(current,LOREG);
  }
}
#endif
1887
1888 void cop0_alloc(struct regstat *current,int i)
1889 {
1890   if(opcode2[i]==0) // MFC0
1891   {
1892     if(rt1[i]) {
1893       clear_const(current,rt1[i]);
1894       alloc_all(current,i);
1895       alloc_reg(current,i,rt1[i]);
1896       dirty_reg(current,rt1[i]);
1897     }
1898   }
1899   else if(opcode2[i]==4) // MTC0
1900   {
1901     if(rs1[i]){
1902       clear_const(current,rs1[i]);
1903       alloc_reg(current,i,rs1[i]);
1904       alloc_all(current,i);
1905     }
1906     else {
1907       alloc_all(current,i); // FIXME: Keep r0
1908       current->u&=~1LL;
1909       alloc_reg(current,i,0);
1910     }
1911   }
1912   else
1913   {
1914     // TLBR/TLBWI/TLBWR/TLBP/ERET
1915     assert(opcode2[i]==0x10);
1916     alloc_all(current,i);
1917   }
1918   minimum_free_regs[i]=HOST_REGS;
1919 }
1920
// Register allocation for COP1/COP2 register moves (MFCz/CFCz/MTCz/CTCz).
static void cop12_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  if(opcode2[i]<3) // MFC1/CFC1
  {
    if(rt1[i]){
      clear_const(current,rt1[i]);
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
    alloc_reg_temp(current,i,-1);
  }
  else if(opcode2[i]>3) // MTC1/CTC1
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
    }
    else {
      // Source is r0: make it allocatable so zero can be moved.
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
    alloc_reg_temp(current,i,-1);
  }
  // NOTE(review): opcode2[i]==3 allocates nothing beyond CSREG here.
  minimum_free_regs[i]=1;
}
1947
// Register allocation for COP2 operations: only a scratch register.
void c2op_alloc(struct regstat *current,int i)
{
  alloc_reg_temp(current,i,-1);
}
1952
// Register allocation for SYSCALL/BREAK: exceptions flush everything,
// so allocate the cycle count and reserve all host registers.
void syscall_alloc(struct regstat *current,int i)
{
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  alloc_all(current,i);
  minimum_free_regs[i]=HOST_REGS;
  current->isconst=0;
}
1961
// Register allocation for the instruction in a branch delay slot:
// dispatch to the per-type allocator.  A branch in a delay slot is not
// supported and disables speculative precompilation.
void delayslot_alloc(struct regstat *current,int i)
{
  switch(itype[i]) {
    case UJUMP:
    case CJUMP:
    case SJUMP:
    case RJUMP:
    case SYSCALL:
    case HLECALL:
    case SPAN:
      assem_debug("jump in the delay slot.  this shouldn't happen.\n");//abort();
      SysPrintf("Disabled speculative precompilation\n");
      stop_after_jal=1;
      break;
    case IMM16:
      imm16_alloc(current,i);
      break;
    case LOAD:
    case LOADLR:
      load_alloc(current,i);
      break;
    case STORE:
    case STORELR:
      store_alloc(current,i);
      break;
    case ALU:
      alu_alloc(current,i);
      break;
    case SHIFT:
      shift_alloc(current,i);
      break;
    case MULTDIV:
      multdiv_alloc(current,i);
      break;
    case SHIFTIMM:
      shiftimm_alloc(current,i);
      break;
    case MOV:
      mov_alloc(current,i);
      break;
    case COP0:
      cop0_alloc(current,i);
      break;
    case COP1:
    case COP2:
      cop12_alloc(current,i);
      break;
    case C1LS:
      c1ls_alloc(current,i);
      break;
    case C2LS:
      c2ls_alloc(current,i);
      break;
    case C2OP:
      c2op_alloc(current,i);
      break;
  }
}
2020
// Special case where a branch and delay slot span two pages in virtual memory
// Everything is flushed: all host registers are claimed and constants dropped,
// then only the registers the branch itself reads/writes are allocated.
static void pagespan_alloc(struct regstat *current,int i)
{
  current->isconst=0;
  current->wasconst=0;
  regs[i].wasconst=0;
  minimum_free_regs[i]=HOST_REGS;
  alloc_all(current,i);
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  if(opcode[i]==3) // JAL
  {
    alloc_reg(current,i,31); // link register
    dirty_reg(current,31);
  }
  // SPECIAL with funct 8 or 9 (mask 0x3E folds JR/JALR together)
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    alloc_reg(current,i,rs1[i]);
    if (rt1[i]!=0) { // JALR writes a link register
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  // mask 0x2E folds the likely variants onto the plain opcodes
  if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(rs2[i]) alloc_reg(current,i,rs2[i]);
  }
  else
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
  }
  //else ...
}
2056
2057 static void add_stub(enum stub_type type, void *addr, void *retaddr,
2058   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
2059 {
2060   assert(stubcount < ARRAY_SIZE(stubs));
2061   stubs[stubcount].type = type;
2062   stubs[stubcount].addr = addr;
2063   stubs[stubcount].retaddr = retaddr;
2064   stubs[stubcount].a = a;
2065   stubs[stubcount].b = b;
2066   stubs[stubcount].c = c;
2067   stubs[stubcount].d = d;
2068   stubs[stubcount].e = e;
2069   stubcount++;
2070 }
2071
2072 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
2073   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist)
2074 {
2075   add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
2076 }
2077
2078 // Write out a single register
2079 static void wb_register(signed char r,signed char regmap[],uint64_t dirty)
2080 {
2081   int hr;
2082   for(hr=0;hr<HOST_REGS;hr++) {
2083     if(hr!=EXCLUDE_REG) {
2084       if((regmap[hr]&63)==r) {
2085         if((dirty>>hr)&1) {
2086           assert(regmap[hr]<64);
2087           emit_storereg(r,hr);
2088         }
2089       }
2090     }
2091   }
2092 }
2093
// Write back host registers that were dirty before ('dirty_pre') but are no
// longer tracked as dirty ('dirty'), so their guest values are not lost.
// NOTE(review): the 'entry' parameter is unused here.
static void wb_valid(signed char pre[],signed char entry[],u_int dirty_pre,u_int dirty,uint64_t u)
{
  //if(dirty_pre==dirty) return;
  int hr,reg;
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      reg=pre[hr];
      if(((~u)>>(reg&63))&1) { // guest register is still needed
        if(reg>0) {
          if(((dirty_pre&~dirty)>>hr)&1) { // dirty before, clean after
            if(reg>0&&reg<34) { // regular MIPS GPR / HI / LO
              emit_storereg(reg,hr);
            }
            else if(reg>=64) {
              assert(0); // upper-half (64-bit) registers not expected here
            }
          }
        }
      }
    }
  }
}
2116
// trashes r2
// Move the values in host registers a0/a1 into argument registers r0/r1
// for a function call; a negative register number means "not present".
static void pass_args(int a0, int a1)
{
  if (a0==1 && a1==0) {
    // registers are crossed: must swap via r2
    emit_mov(a0,2); emit_mov(a1,1); emit_mov(2,0);
  }
  else if (a0!=0 && a1==0) {
    // a1 currently sits in r0: move it out before r0 is overwritten
    emit_mov(a1,1);
    if (a0>=0) emit_mov(a0,0);
  }
  else {
    // no conflict: place each argument if it isn't already there
    if (a0>0) emit_mov(a0,0);
    if (a1>=0 && a1!=1) emit_mov(a1,1);
  }
}
2133
// Emit native code for register-register ALU instructions (SPECIAL group):
// ADD/ADDU/SUB/SUBU, SLT/SLTU and AND/OR/XOR/NOR. The 64-bit doubleword
// variants assert since they are treated as unreachable in this build.
static void alu_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      signed char s1,s2,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      if(t>=0) {
        s1=get_reg(i_regs->regmap,rs1[i]);
        s2=get_reg(i_regs->regmap,rs2[i]);
        if(rs1[i]&&rs2[i]) {
          assert(s1>=0);
          assert(s2>=0);
          // bit 1 of the function code selects subtract vs add
          if(opcode2[i]&2) emit_sub(s1,s2,t);
          else emit_add(s1,s2,t);
        }
        else if(rs1[i]) {
          // rs2 is $zero: result is just rs1 (add/sub of zero)
          if(s1>=0) emit_mov(s1,t);
          else emit_loadreg(rs1[i],t);
        }
        else if(rs2[i]) {
          // rs1 is $zero: result is rs2, negated for SUB
          if(s2>=0) {
            if(opcode2[i]&2) emit_neg(s2,t);
            else emit_mov(s2,t);
          }
          else {
            emit_loadreg(rs2[i],t);
            if(opcode2[i]&2) emit_neg(t,t);
          }
        }
        else emit_zeroreg(t); // both sources are $zero
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    assert(0);
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      signed char s1l,s2l,t;
      {
        t=get_reg(i_regs->regmap,rt1[i]);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs2[i]==0) // rx<r0
          {
            if(opcode2[i]==0x2a&&rs1[i]!=0) { // SLT
              assert(s1l>=0);
              // signed x<0 is just the sign bit
              emit_shrimm(s1l,31,t);
            }
            else // SLTU (unsigned can not be less than zero, 0<0)
              emit_zeroreg(t);
          }
          else if(rs1[i]==0) // r0<rx
          {
            assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_gz32(s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz32(s2l,t);
          }
          else{
            assert(s1l>=0);assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_if_less32(s1l,s2l,t);
            else // SLTU
              emit_set_if_carry32(s1l,s2l,t);
          }
        }
      }
    }
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      signed char s1l,s2l,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      {
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs1[i]&&rs2[i]) {
            assert(s1l>=0);
            assert(s2l>=0);
            if(opcode2[i]==0x24) { // AND
              emit_and(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x25) { // OR
              emit_or(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_not(tl,tl);
            }
          }
          else
          {
            // at least one source is $zero: simplify
            if(opcode2[i]==0x24) { // AND
              emit_zeroreg(tl); // x & 0 == 0
            } else
            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
              // x|0 == x^0 == x: just copy the non-zero source
              if(rs1[i]){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
              }
              else emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              if(rs1[i]){
                if(s1l>=0) emit_not(s1l,tl);
                else {
                  emit_loadreg(rs1[i],tl);
                  emit_not(tl,tl);
                }
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_not(s2l,tl);
                else {
                  emit_loadreg(rs2[i],tl);
                  emit_not(tl,tl);
                }
              }
              else emit_movimm(-1,tl); // ~(0|0) == all ones
            }
          }
        }
      }
    }
  }
}
2273
// Emit native code for immediate-operand instructions:
// LUI, ADDI/ADDIU, SLTI/SLTIU, ANDI/ORI/XORI (DADDI asserts elsewhere as
// unreachable in this build). Uses constmap[] to fold known-constant
// sources at compile time, and skips emission entirely when the target
// host register already holds the (constant) result.
void imm16_assemble(int i,struct regstat *i_regs)
{
  if (opcode[i]==0x0f) { // LUI
    if(rt1[i]) {
      signed char t;
      t=get_reg(i_regs->regmap,rt1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(!((i_regs->isconst>>t)&1))
          emit_movimm(imm[i]<<16,t);
      }
    }
  }
  if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      if(rs1[i]) {
        //assert(t>=0);
        //assert(s>=0);
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1)) {
            if(s<0) {
              // source not in a register: reload it into the target first
              if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
              emit_addimm(t,imm[i],t);
            }else{
              if(!((i_regs->wasconst>>s)&1))
                emit_addimm(s,imm[i],t);
              else
                // fold constant source + immediate at compile time
                emit_movimm(constmap[i][s]+imm[i],t);
            }
          }
        }
      } else {
        // rs is $zero: result is just the immediate
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1))
            emit_movimm(imm[i],t);
        }
      }
    }
  }
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    if(rt1[i]) {
      signed char sl,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]) {
          assert(sl>=0);
          emit_addimm(sl,imm[i],tl);
        } else {
          emit_movimm(imm[i],tl);
        }
      }
    }
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    if(rt1[i]) {
      //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
      signed char sl,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(rs1[i]>0) {
            if(opcode[i]==0x0a) { // SLTI
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_slti32(t,imm[i],t);
              }else{
                emit_slti32(sl,imm[i],t);
              }
            }
            else { // SLTIU
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_sltiu32(t,imm[i],t);
              }else{
                emit_sltiu32(sl,imm[i],t);
              }
            }
        }else{
          // SLTI(U) with r0 is just stupid,
          // nonetheless examples can be found
          if(opcode[i]==0x0a) // SLTI
            if(0<imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          else // SLTIU
          {
            // unsigned: 0 < imm for any non-zero immediate
            if(imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          }
        }
      }
    }
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(rt1[i]) {
      signed char sl,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
        if(opcode[i]==0x0c) //ANDI
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
              emit_andimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_andimm(sl,imm[i],tl);
              else
                emit_movimm(constmap[i][sl]&imm[i],tl);
            }
          }
          else
            emit_zeroreg(tl); // 0 & imm == 0
        }
        else
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
            }
            if(opcode[i]==0x0d) { // ORI
              if(sl<0) {
                emit_orimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_orimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]|imm[i],tl);
              }
            }
            if(opcode[i]==0x0e) { // XORI
              if(sl<0) {
                emit_xorimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_xorimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]^imm[i],tl);
              }
            }
          }
          else {
            // 0|imm == 0^imm == imm
            emit_movimm(imm[i],tl);
          }
        }
      }
    }
  }
}
2428
// Emit native code for shift-by-immediate instructions (SLL/SRL/SRA).
// The 64-bit doubleword shifts assert as unreachable in this build.
void shiftimm_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0&&!((i_regs->isconst>>t)&1)){
        if(rs1[i]==0)
        {
          emit_zeroreg(t); // shifting $zero always yields zero
        }
        else
        {
          // source not in a register: reload into the target and shift in place
          if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
          if(imm[i]) {
            if(opcode2[i]==0) // SLL
            {
              emit_shlimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==2) // SRL
            {
              emit_shrimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==3) // SRA
            {
              emit_sarimm(s<0?t:s,imm[i],t);
            }
          }else{
            // Shift by zero
            if(s>=0 && s!=t) emit_mov(s,t);
          }
        }
      }
      //emit_storereg(rt1[i],t); //DEBUG
    }
  }
  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    assert(0);
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    assert(0);
  }
}
2485
2486 #ifndef shift_assemble
2487 static void shift_assemble(int i,struct regstat *i_regs)
2488 {
2489   signed char s,t,shift;
2490   if (rt1[i] == 0)
2491     return;
2492   assert(opcode2[i]<=0x07); // SLLV/SRLV/SRAV
2493   t = get_reg(i_regs->regmap, rt1[i]);
2494   s = get_reg(i_regs->regmap, rs1[i]);
2495   shift = get_reg(i_regs->regmap, rs2[i]);
2496   if (t < 0)
2497     return;
2498
2499   if(rs1[i]==0)
2500     emit_zeroreg(t);
2501   else if(rs2[i]==0) {
2502     assert(s>=0);
2503     if(s!=t) emit_mov(s,t);
2504   }
2505   else {
2506     host_tempreg_acquire();
2507     emit_andimm(shift,31,HOST_TEMPREG);
2508     switch(opcode2[i]) {
2509     case 4: // SLLV
2510       emit_shl(s,HOST_TEMPREG,t);
2511       break;
2512     case 6: // SRLV
2513       emit_shr(s,HOST_TEMPREG,t);
2514       break;
2515     case 7: // SRAV
2516       emit_sar(s,HOST_TEMPREG,t);
2517       break;
2518     default:
2519       assert(0);
2520     }
2521     host_tempreg_release();
2522   }
2523 }
2524
2525 #endif
2526
// Memory-region classes returned by get_ptr_mem_type() and consumed by
// emit_fastpath_cmp_jump() to pick the address rewrite for each mirror.
enum {
  MTYPE_8000 = 0, // default: kseg0 RAM, standard fast-path check
  MTYPE_8020,     // RAM mirror at 80200000+
  MTYPE_0000,     // RAM mirror at address 0
  MTYPE_A000,     // RAM mirror at a0000000
  MTYPE_1F80,     // scratchpad / hardware register area
};
2534
2535 static int get_ptr_mem_type(u_int a)
2536 {
2537   if(a < 0x00200000) {
2538     if(a<0x1000&&((start>>20)==0xbfc||(start>>24)==0xa0))
2539       // return wrong, must use memhandler for BIOS self-test to pass
2540       // 007 does similar stuff from a00 mirror, weird stuff
2541       return MTYPE_8000;
2542     return MTYPE_0000;
2543   }
2544   if(0x1f800000 <= a && a < 0x1f801000)
2545     return MTYPE_1F80;
2546   if(0x80200000 <= a && a < 0x80800000)
2547     return MTYPE_8020;
2548   if(0xa0000000 <= a && a < 0xa0200000)
2549     return MTYPE_A000;
2550   return MTYPE_8000;
2551 }
2552
// Emit the fast-path address check for a load/store: rewrites mirrored
// addresses into the canonical RAM range when possible, emits the RAM-bound
// compare, and returns the branch location to patch into the slow-path stub
// (NULL if no branch was emitted). May set *addr_reg_override to
// HOST_TEMPREG; the caller is responsible for releasing it.
static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override)
{
  void *jaddr = NULL;
  int type=0;
  int mr=rs1[i];
  if(((smrv_strong|smrv_weak)>>mr)&1) {
    // we have a speculated value for the base register: specialize for it
    type=get_ptr_mem_type(smrv[mr]);
    //printf("set %08x @%08x r%d %d\n", smrv[mr], start+i*4, mr, type);
  }
  else {
    // use the mirror we are running on
    type=get_ptr_mem_type(start);
    //printf("set nospec   @%08x r%d %d\n", start+i*4, mr, type);
  }

  if(type==MTYPE_8020) { // RAM 80200000+ mirror
    host_tempreg_acquire();
    emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0; // fall through to the normal RAM check on the rewritten address
  }
  else if(type==MTYPE_0000) { // RAM 0 mirror
    host_tempreg_acquire();
    emit_orimm(addr,0x80000000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_A000) { // RAM A mirror
    host_tempreg_acquire();
    emit_andimm(addr,~0x20000000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_1F80) { // scratchpad
    if (psxH == (void *)0x1f800000) {
      // scratchpad is mapped at its native address: range-check directly
      host_tempreg_acquire();
      emit_xorimm(addr,0x1f800000,HOST_TEMPREG);
      emit_cmpimm(HOST_TEMPREG,0x1000);
      host_tempreg_release();
      jaddr=out;
      emit_jc(0);
    }
    else {
      // do the usual RAM check, jump will go to the right handler
      type=0;
    }
  }

  if(type==0)
  {
    emit_cmpimm(addr,RAM_SIZE);
    jaddr=out;
    #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
    // Hint to branch predictor that the branch is unlikely to be taken
    if(rs1[i]>=28)
      emit_jno_unlikely(0);
    else
    #endif
      emit_jno(0);
    if(ram_offset!=0) {
      // RAM is not mapped at 0: bias the address by the host offset
      host_tempreg_acquire();
      emit_addimm(addr,ram_offset,HOST_TEMPREG);
      addr=*addr_reg_override=HOST_TEMPREG;
    }
  }

  return jaddr;
}
2621
2622 // return memhandler, or get directly accessable address and return 0
2623 static void *get_direct_memhandler(void *table, u_int addr,
2624   enum stub_type type, uintptr_t *addr_host)
2625 {
2626   uintptr_t l1, l2 = 0;
2627   l1 = ((uintptr_t *)table)[addr>>12];
2628   if ((l1 & (1ul << (sizeof(l1)*8-1))) == 0) {
2629     uintptr_t v = l1 << 1;
2630     *addr_host = v + addr;
2631     return NULL;
2632   }
2633   else {
2634     l1 <<= 1;
2635     if (type == LOADB_STUB || type == LOADBU_STUB || type == STOREB_STUB)
2636       l2 = ((uintptr_t *)l1)[0x1000/4 + 0x1000/2 + (addr&0xfff)];
2637     else if (type == LOADH_STUB || type == LOADHU_STUB || type == STOREH_STUB)
2638       l2=((uintptr_t *)l1)[0x1000/4 + (addr&0xfff)/2];
2639     else
2640       l2=((uintptr_t *)l1)[(addr&0xfff)/4];
2641     if ((l2 & (1<<31)) == 0) {
2642       uintptr_t v = l2 << 1;
2643       *addr_host = v + (addr&0xfff);
2644       return NULL;
2645     }
2646     return (void *)(l2 << 1);
2647   }
2648 }
2649
// Emit native code for load instructions (LB/LH/LW/LBU/LHU).
// Constant addresses proven to hit RAM are accessed directly; otherwise a
// fast-path range check is emitted with a slow-path stub, or the whole
// access is inlined as a read stub when the constant address misses RAM.
// Loads to $zero and loads whose result register is unneeded still perform
// the read (the target could be an I/O FIFO with read side effects).
static void load_assemble(int i,struct regstat *i_regs)
{
  int s,tl,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rt1[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  offset=imm[i];
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1; // base register holds a known constant
    if (c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  // FIXME: Even if the load is a NOP, we should check for pagefaults...
  if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
    ||rt1[i]==0) {
      // could be FIFO, must perform the read
      // ||dummy read
      assem_debug("(forced read)\n");
      tl=get_reg(i_regs->regmap,-1);
      assert(tl>=0);
  }
  if(offset||s<0||c) addr=tl;
  else addr=s;
  //if(tl<0) tl=get_reg(i_regs->regmap,-1);
 if(tl>=0) {
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
  reglist&=~(1<<tl);
  if(!c) {
    #ifdef R29_HACK
    // Strmnnrmn's speed hack
    if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
    #endif
    {
      jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
    }
  }
  else if(ram_offset&&memtarget) {
    // constant RAM address on a host with offset-mapped RAM
    host_tempreg_acquire();
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastio_reg_override=HOST_TEMPREG;
  }
  int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
  if (opcode[i]==0x20) { // LB
    if(!c||memtarget) {
      if(!dummy) {
        {
          int x=0,a=tl;
          if(!c) a=addr;
          if(fastio_reg_override>=0) a=fastio_reg_override;

          emit_movsbl_indexed(x,a,tl);
        }
      }
      if(jaddr)
        add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x21) { // LH
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_movswl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x23) { // LW
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_readword_indexed(0,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x24) { // LBU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;

        emit_movzbl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x25) { // LHU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_movzwl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x27) { // LWU
    assert(0);
  }
  if (opcode[i]==0x37) { // LD
    assert(0);
  }
 }
 if (fastio_reg_override == HOST_TEMPREG)
   host_tempreg_release();
}
2788
#ifndef loadlr_assemble
// Emit native code for unaligned loads (LWL/LWR): reads the containing
// aligned word, then shifts and merges it into the destination register,
// masking off the bytes that must be preserved. LDL/LDR assert as
// unreachable in this build.
static void loadlr_assemble(int i,struct regstat *i_regs)
{
  int s,tl,temp,temp2,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rt1[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,-1);    // holds the bit shift amount
  temp2=get_reg(i_regs->regmap,FTEMP); // holds the loaded word
  addr=get_reg(i_regs->regmap,AGEN1+(i&1));
  assert(addr<0);
  offset=imm[i];
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  reglist|=1<<temp;
  // NOTE(review): c is still 0 here; it is only set in the s>=0 block below,
  // so this test effectively reduces to (offset||s<0)
  if(offset||s<0||c) addr=temp2;
  else addr=s;
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1;
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  if(!c) {
    // compute shift = (addr&3)*8 and the aligned word address
    emit_shlimm(addr,3,temp);
    if (opcode[i]==0x22||opcode[i]==0x26) {
      emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR
    }else{
      emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR
    }
    jaddr=emit_fastpath_cmp_jump(i,temp2,&fastio_reg_override);
  }
  else {
    if(ram_offset&&memtarget) {
      host_tempreg_acquire();
      emit_addimm(temp2,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    if (opcode[i]==0x22||opcode[i]==0x26) {
      emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR
    }else{
      emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR
    }
  }
  if (opcode[i]==0x22||opcode[i]==0x26) { // LWL/LWR
    if(!c||memtarget) {
      int a=temp2;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_readword_indexed(0,a,temp2);
      if(fastio_reg_override==HOST_TEMPREG) host_tempreg_release();
      if(jaddr) add_stub_r(LOADW_STUB,jaddr,out,i,temp2,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj[i],reglist);
    if(rt1[i]) {
      assert(tl>=0);
      // merge the loaded word into the destination, keeping untouched bytes
      emit_andimm(temp,24,temp);
      if (opcode[i]==0x22) // LWL
        emit_xorimm(temp,24,temp);
      host_tempreg_acquire();
      emit_movimm(-1,HOST_TEMPREG);
      if (opcode[i]==0x26) {
        emit_shr(temp2,temp,temp2);
        emit_bic_lsr(tl,HOST_TEMPREG,temp,tl);
      }else{
        emit_shl(temp2,temp,temp2);
        emit_bic_lsl(tl,HOST_TEMPREG,temp,tl);
      }
      host_tempreg_release();
      emit_or(temp2,tl,tl);
    }
    //emit_storereg(rt1[i],tl); // DEBUG
  }
  if (opcode[i]==0x1A||opcode[i]==0x1B) { // LDL/LDR
    assert(0);
  }
}
#endif
2872
// Emit native code for store instructions (SB/SH/SW). Emits the fast-path
// range check (or uses a constant address directly), an invalid-code check
// for self-modifying code, and detects stores that hit the block currently
// being compiled (forcing a re-fetch through get_addr_ht).
void store_assemble(int i,struct regstat *i_regs)
{
  int s,tl;
  int addr,temp;
  int offset;
  void *jaddr=0;
  enum stub_type type;
  int memtarget=0,c=0;
  int agr=AGEN1+(i&1);
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rs2[i]);   // value to store
  s=get_reg(i_regs->regmap,rs1[i]);    // base address register
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1; // address is a known constant
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  assert(temp>=0);
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(offset||s<0||c) addr=temp;
  else addr=s;
  if(!c) {
    jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
  }
  else if(ram_offset&&memtarget) {
    host_tempreg_acquire();
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastio_reg_override=HOST_TEMPREG;
  }

  if (opcode[i]==0x28) { // SB
    if(!c||memtarget) {
      int x=0,a=temp;
      if(!c) a=addr;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writebyte_indexed(tl,x,a);
    }
    type=STOREB_STUB;
  }
  if (opcode[i]==0x29) { // SH
    if(!c||memtarget) {
      int x=0,a=temp;
      if(!c) a=addr;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writehword_indexed(tl,x,a);
    }
    type=STOREH_STUB;
  }
  if (opcode[i]==0x2B) { // SW
    if(!c||memtarget) {
      int a=addr;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
    type=STOREW_STUB;
  }
  if (opcode[i]==0x3F) { // SD
    assert(0);
    type=STORED_STUB;
  }
  if(fastio_reg_override==HOST_TEMPREG)
    host_tempreg_release();
  if(jaddr) {
    // PCSX store handlers don't check invcode again
    reglist|=1<<addr;
    add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    jaddr=0;
  }
  // self-modifying code check: call the invalidator if the target page
  // contains translated code
  if(!(i_regs->waswritten&(1<<rs1[i])) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
    if(!c||memtarget) {
      #ifdef DESTRUCTIVE_SHIFT
      // The x86 shift operation is 'destructive'; it overwrites the
      // source register, so we need to make a copy first and use that.
      addr=temp;
      #endif
      #if defined(HOST_IMM8)
      int ir=get_reg(i_regs->regmap,INVCP);
      assert(ir>=0);
      emit_cmpmem_indexedsr12_reg(ir,addr,1);
      #else
      emit_cmpmem_indexedsr12_imm(invalid_code,addr,1);
      #endif
      #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
      emit_callne(invalidate_addr_reg[addr]);
      #else
      void *jaddr2 = out;
      emit_jne(0);
      add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
      #endif
    }
  }
  // NOTE(review): constmap[i][s] is read even when s<0 (non-constant
  // address); addr_val is only consumed below when c is set, where s>=0
  u_int addr_val=constmap[i][s]+offset;
  if(jaddr) {
    add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
  } else if(c&&!memtarget) {
    inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
  }
  // basic current block modification detection..
  // not looking back as that should be in mips cache already
  // (see Spyro2 title->attract mode)
  if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
    SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
    assert(i_regs->regmap==regs[i].regmap); // not delay slot
    if(i_regs->regmap==regs[i].regmap) {
      load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
      wb_dirtys(regs[i].regmap_entry,regs[i].wasdirty);
      emit_movimm(start+i*4+4,0);
      emit_writeword(0,&pcaddr);
      emit_addimm(HOST_CCREG,2,HOST_CCREG);
      emit_far_call(get_addr_ht);
      emit_jmpreg(0);
    }
  }
}
2996
// Assemble SWL (opcode 0x2A) / SWR (opcode 0x2E): MIPS unaligned word
// stores.  The word is split into byte/halfword/word writes chosen at
// runtime by the low two bits of the address (cases 0-3 below).  The
// 64-bit SDL/SDR variants (0x2C/0x2D) are not implemented and assert.
static void storelr_assemble(int i,struct regstat *i_regs)
{
  int s,tl;
  int temp;
  int offset;
  void *jaddr=0;
  void *case1, *case2, *case3;
  void *done0, *done1, *done2;
  int memtarget=0,c=0;
  int agr=AGEN1+(i&1);
  u_int hr,reglist=0;
  // tl = host reg holding the value to store, s = host reg of the base,
  // temp = scratch/address register (AGEN slot or generic temp)
  tl=get_reg(i_regs->regmap,rs2[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    c=(i_regs->isconst>>s)&1;
    if(c) {
      // Constant base: decide at compile time whether the target is RAM
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  assert(temp>=0);
  if(!c) {
    // Dynamic address: range-check against RAM, jump to the stub on miss
    emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
    if(!offset&&s!=temp) emit_mov(s,temp);
    jaddr=out;
    emit_jno(0);
  }
  else
  {
    if(!memtarget||!rs1[i]) {
      // Known non-RAM target (or r0 base): always take the slow path
      jaddr=out;
      emit_jmp(0);
    }
  }
  if(ram_offset)
    emit_addimm_no_flags(ram_offset,temp);

  if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
    assert(0);
  }

  // xor 3 converts little-endian byte offsets to the dispatch order used
  // below, then branch on bits 1 and 0 of the (adjusted) address
  emit_xorimm(temp,3,temp);
  emit_testimm(temp,2);
  case2=out;
  emit_jne(0);
  emit_testimm(temp,1);
  case1=out;
  emit_jne(0);
  // 0
  if (opcode[i]==0x2A) { // SWL
    emit_writeword_indexed(tl,0,temp);
  }
  else if (opcode[i]==0x2E) { // SWR
    emit_writebyte_indexed(tl,3,temp);
  }
  else
    assert(0);
  done0=out;
  emit_jmp(0);
  // 1
  set_jump_target(case1, out);
  if (opcode[i]==0x2A) { // SWL
    // Write 3 msb into three least significant bytes
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writebyte_indexed(tl,1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  done1=out;
  emit_jmp(0);
  // 2
  set_jump_target(case2, out);
  emit_testimm(temp,1);
  case3=out;
  emit_jne(0);
  if (opcode[i]==0x2A) { // SWL
    // Write two msb into two least significant bytes
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writehword_indexed(tl,-2,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(rs2[i]) emit_rorimm(tl,24,tl);
  }
  done2=out;
  emit_jmp(0);
  // 3
  set_jump_target(case3, out);
  if (opcode[i]==0x2A) { // SWL
    // Write msb into least significant byte
    if(rs2[i]) emit_rorimm(tl,24,tl);
    emit_writebyte_indexed(tl,-3,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  set_jump_target(done0, out);
  set_jump_target(done1, out);
  set_jump_target(done2, out);
  // Slow-path stub for addresses that failed the RAM check above
  if(!c||!memtarget)
    add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
  // Self-modifying-code check: compare against invalid_code[] and
  // invalidate the translated block covering the written address
  if(!(i_regs->waswritten&(1<<rs1[i])) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
    emit_addimm_no_flags(-ram_offset,temp);
    #if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,temp,1);
    #else
    emit_cmpmem_indexedsr12_imm(invalid_code,temp,1);
    #endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[temp]);
    #else
    void *jaddr2 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
    #endif
  }
}
3133
// Assemble COP0 instructions: MFC0 (read a cop0 register into a GPR),
// MTC0 (write a GPR into a cop0 register, via the pcsx_mtc0 C helper),
// and RFE (restore the interrupt-enable bits of the Status register).
static void cop0_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]==0) // MFC0
  {
    signed char t=get_reg(i_regs->regmap,rt1[i]);
    u_int copr=(source[i]>>11)&0x1f;
    //assert(t>=0); // Why does this happen?  OOT is weird
    if(t>=0&&rt1[i]!=0) {
      emit_readword(&reg_cop0[copr],t);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    signed char s=get_reg(i_regs->regmap,rs1[i]);
    char copr=(source[i]>>11)&0x1f;
    assert(s>=0);
    wb_register(rs1[i],i_regs->regmap,i_regs->dirty);
    if(copr==9||copr==11||copr==12||copr==13) {
      // Count/compare/status/cause writes need an up-to-date Count:
      // materialize it from last_count + cycle counter before the call
      emit_readword(&last_count,HOST_TEMPREG);
      emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc
      emit_add(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
      emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
      emit_writeword(HOST_CCREG,&Count);
    }
    // What a mess.  The status register (12) can enable interrupts,
    // so needs a special case to handle a pending interrupt.
    // The interrupt must be taken immediately, because a subsequent
    // instruction might disable interrupts again.
    if(copr==12||copr==13) {
      if (is_delayslot) {
        // burn cycles to cause cc_interrupt, which will
        // reschedule next_interupt. Relies on CCREG from above.
        assem_debug("MTC0 DS %d\n", copr);
        emit_writeword(HOST_CCREG,&last_count);
        emit_movimm(0,HOST_CCREG);
        emit_storereg(CCREG,HOST_CCREG);
        emit_loadreg(rs1[i],1);
        emit_movimm(copr,0);
        emit_far_call(pcsx_mtc0_ds);
        emit_loadreg(rs1[i],s);
        return;
      }
      // Record the resume pc and clear pending_exception so we can
      // detect whether the MTC0 raised one
      emit_movimm(start+i*4+4,HOST_TEMPREG);
      emit_writeword(HOST_TEMPREG,&pcaddr);
      emit_movimm(0,HOST_TEMPREG);
      emit_writeword(HOST_TEMPREG,&pending_exception);
    }
    // Marshal args: r1 = new value, r0 = cop0 register index
    if(s==HOST_CCREG)
      emit_loadreg(rs1[i],1);
    else if(s!=1)
      emit_mov(s,1);
    emit_movimm(copr,0);
    emit_far_call(pcsx_mtc0);
    if(copr==9||copr==11||copr==12||copr==13) {
      // Re-derive the cycle counter from the (possibly changed) Count
      emit_readword(&Count,HOST_CCREG);
      emit_readword(&next_interupt,HOST_TEMPREG);
      emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
      emit_sub(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
      emit_writeword(HOST_TEMPREG,&last_count);
      emit_storereg(CCREG,HOST_CCREG);
    }
    if(copr==12||copr==13) {
      assert(!is_delayslot);
      // If pcsx_mtc0 flagged a pending exception, leave the block and
      // re-enter the dynarec at the handler's pc
      emit_readword(&pending_exception,14);
      emit_test(14,14);
      void *jaddr = out;
      emit_jeq(0);
      emit_readword(&pcaddr, 0);
      emit_addimm(HOST_CCREG,2,HOST_CCREG);
      emit_far_call(get_addr_ht);
      emit_jmpreg(0);
      set_jump_target(jaddr, out);
    }
    emit_loadreg(rs1[i],s);
  }
  else
  {
    assert(opcode2[i]==0x10);
    //if((source[i]&0x3f)==0x10) // RFE
    {
      // Shift the KUp/IEp interrupt stack down two positions in Status
      emit_readword(&Status,0);
      emit_andimm(0,0x3c,1);
      emit_andimm(0,~0xf,0);
      emit_orrshr_imm(1,2,0);
      emit_writeword(0,&Status);
    }
  }
}
3222
3223 static void cop1_unusable(int i,struct regstat *i_regs)
3224 {
3225   // XXX: should just just do the exception instead
3226   //if(!cop1_usable)
3227   {
3228     void *jaddr=out;
3229     emit_jmp(0);
3230     add_stub_r(FP_STUB,jaddr,out,i,0,i_regs,is_delayslot,0);
3231   }
3232 }
3233
// The PSX has no FPU: every COP1 instruction takes the unusable path.
static void cop1_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3238
// LWC1/SWC1 also hit the missing FPU: route to the unusable stub.
static void c1ls_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3243
// FP_STUB: out-of-line handler for COP1-unusable.  Writes back dirty
// registers, loads the faulting pc and cycle count, and tail-jumps to
// the C exception routine (delay-slot variant when ds is set).
static void do_cop1stub(int n)
{
  literal_pool(256);
  assem_debug("do_cop1stub %x\n",start+stubs[n].a*4);
  set_jump_target(stubs[n].addr, out);
  int i=stubs[n].a;
//  int rs=stubs[n].b;
  struct regstat *i_regs=(struct regstat *)stubs[n].c;
  int ds=stubs[n].d;
  if(!ds) {
    load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
    //if(i_regs!=&regs[i]) printf("oops: regs[i]=%x i_regs=%x",(int)&regs[i],(int)i_regs);
  }
  //else {printf("fp exception in delay slot\n");}
  wb_dirtys(i_regs->regmap_entry,i_regs->wasdirty);
  if(regs[i].regmap_entry[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
  emit_movimm(start+(i-ds)*4,EAX); // Get PC
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
  emit_far_jump(ds?fp_exception_ds:fp_exception);
}
3265
3266 static void cop2_get_dreg(u_int copr,signed char tl,signed char temp)
3267 {
3268   switch (copr) {
3269     case 1:
3270     case 3:
3271     case 5:
3272     case 8:
3273     case 9:
3274     case 10:
3275     case 11:
3276       emit_readword(&reg_cop2d[copr],tl);
3277       emit_signextend16(tl,tl);
3278       emit_writeword(tl,&reg_cop2d[copr]); // hmh
3279       break;
3280     case 7:
3281     case 16:
3282     case 17:
3283     case 18:
3284     case 19:
3285       emit_readword(&reg_cop2d[copr],tl);
3286       emit_andimm(tl,0xffff,tl);
3287       emit_writeword(tl,&reg_cop2d[copr]);
3288       break;
3289     case 15:
3290       emit_readword(&reg_cop2d[14],tl); // SXY2
3291       emit_writeword(tl,&reg_cop2d[copr]);
3292       break;
3293     case 28:
3294     case 29:
3295       c2op_mfc2_29_assemble(tl,temp);
3296       break;
3297     default:
3298       emit_readword(&reg_cop2d[copr],tl);
3299       break;
3300   }
3301 }
3302
// Write host register sl into GTE data register 'copr' (temp is scratch).
// Several registers have side effects on write: SXY FIFO rotation (15),
// IRGB color unpacking (28), and LZCS leading-bit counting (30).
static void cop2_put_dreg(u_int copr,signed char sl,signed char temp)
{
  switch (copr) {
    case 15:
      // SXYP write pushes the screen-XY FIFO: SXY1->SXY0, SXY2->SXY1,
      // new value becomes SXY2 (and is mirrored in reg 15)
      emit_readword(&reg_cop2d[13],temp);  // SXY1
      emit_writeword(sl,&reg_cop2d[copr]);
      emit_writeword(temp,&reg_cop2d[12]); // SXY0
      emit_readword(&reg_cop2d[14],temp);  // SXY2
      emit_writeword(sl,&reg_cop2d[14]);
      emit_writeword(temp,&reg_cop2d[13]); // SXY1
      break;
    case 28:
      // IRGB write unpacks 5:5:5 color into IR1/IR2/IR3 (regs 9-11)
      emit_andimm(sl,0x001f,temp);
      emit_shlimm(temp,7,temp);
      emit_writeword(temp,&reg_cop2d[9]);
      emit_andimm(sl,0x03e0,temp);
      emit_shlimm(temp,2,temp);
      emit_writeword(temp,&reg_cop2d[10]);
      emit_andimm(sl,0x7c00,temp);
      emit_shrimm(temp,3,temp);
      emit_writeword(temp,&reg_cop2d[11]);
      emit_writeword(sl,&reg_cop2d[28]);
      break;
    case 30:
      // LZCS write: LZCR (reg 31) becomes the count of leading bits
      // equal to the sign bit.  xor with the sign-propagated value turns
      // that into a count-leading-zeros problem.
      emit_xorsar_imm(sl,sl,31,temp);
#if defined(HAVE_ARMV5) || defined(__aarch64__)
      emit_clz(temp,temp);
#else
      // No CLZ instruction: count by shifting in a short emitted loop
      emit_movs(temp,HOST_TEMPREG);
      emit_movimm(0,temp);
      emit_jeq((int)out+4*4);
      emit_addpl_imm(temp,1,temp);
      emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG);
      emit_jns((int)out-2*4);
#endif
      emit_writeword(sl,&reg_cop2d[30]);
      emit_writeword(temp,&reg_cop2d[31]);
      break;
    case 31:
      // LZCR is read-only
      break;
    default:
      emit_writeword(sl,&reg_cop2d[copr]);
      break;
  }
}
3348
// Assemble LWC2 (0x32) / SWC2 (0x3a): word transfers between memory and
// GTE data registers.  Fast path for RAM targets, stub for everything
// else, plus the usual self-modifying-code check on stores.
static void c2ls_assemble(int i,struct regstat *i_regs)
{
  int s,tl;
  int ar;
  int offset;
  int memtarget=0,c=0;
  void *jaddr2=NULL;
  enum stub_type type;
  int agr=AGEN1+(i&1);
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  u_int copr=(source[i]>>16)&0x1f;
  s=get_reg(i_regs->regmap,rs1[i]);
  tl=get_reg(i_regs->regmap,FTEMP);
  offset=imm[i];
  assert(rs1[i]>0);
  assert(tl>=0);

  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG)
    reglist&=~(1<<HOST_CCREG);

  // get the address
  if (opcode[i]==0x3a) { // SWC2
    ar=get_reg(i_regs->regmap,agr);
    if(ar<0) ar=get_reg(i_regs->regmap,-1);
    reglist|=1<<ar;
  } else { // LWC2
    ar=tl;
  }
  if(s>=0) c=(i_regs->wasconst>>s)&1;
  memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
  if (!offset&&!c&&s>=0) ar=s;
  assert(ar>=0);

  if (opcode[i]==0x3a) { // SWC2
    // Pull the GTE register value into FTEMP before storing it
    cop2_get_dreg(copr,tl,-1);
    type=STOREW_STUB;
  }
  else
    type=LOADW_STUB;

  if(c&&!memtarget) {
    // Known non-RAM address: go straight to the stub
    jaddr2=out;
    emit_jmp(0); // inline_readstub/inline_writestub?
  }
  else {
    if(!c) {
      // Dynamic address: emit the RAM range check, stub on failure
      jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
    }
    else if(ram_offset&&memtarget) {
      host_tempreg_acquire();
      emit_addimm(ar,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    if (opcode[i]==0x32) { // LWC2
      int a=ar;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_readword_indexed(0,a,tl);
    }
    if (opcode[i]==0x3a) { // SWC2
      #ifdef DESTRUCTIVE_SHIFT
      if(!offset&&!c&&s>=0) emit_mov(s,ar);
      #endif
      int a=ar;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
  }
  if(fastio_reg_override==HOST_TEMPREG)
    host_tempreg_release();
  if(jaddr2)
    add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
  // Self-modifying-code check for stores (same pattern as store_assemble)
  if(opcode[i]==0x3a) // SWC2
  if(!(i_regs->waswritten&(1<<rs1[i])) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
#if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,ar,1);
#else
    emit_cmpmem_indexedsr12_imm(invalid_code,ar,1);
#endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[ar]);
    #else
    void *jaddr3 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
    #endif
  }
  if (opcode[i]==0x32) { // LWC2
    // Commit the loaded word into the GTE register (with side effects)
    host_tempreg_acquire();
    cop2_put_dreg(copr,tl,HOST_TEMPREG);
    host_tempreg_release();
  }
}
3447
3448 static void cop2_assemble(int i,struct regstat *i_regs)
3449 {
3450   u_int copr=(source[i]>>11)&0x1f;
3451   signed char temp=get_reg(i_regs->regmap,-1);
3452   if (opcode2[i]==0) { // MFC2
3453     signed char tl=get_reg(i_regs->regmap,rt1[i]);
3454     if(tl>=0&&rt1[i]!=0)
3455       cop2_get_dreg(copr,tl,temp);
3456   }
3457   else if (opcode2[i]==4) { // MTC2
3458     signed char sl=get_reg(i_regs->regmap,rs1[i]);
3459     cop2_put_dreg(copr,sl,temp);
3460   }
3461   else if (opcode2[i]==2) // CFC2
3462   {
3463     signed char tl=get_reg(i_regs->regmap,rt1[i]);
3464     if(tl>=0&&rt1[i]!=0)
3465       emit_readword(&reg_cop2c[copr],tl);
3466   }
3467   else if (opcode2[i]==6) // CTC2
3468   {
3469     signed char sl=get_reg(i_regs->regmap,rs1[i]);
3470     switch(copr) {
3471       case 4:
3472       case 12:
3473       case 20:
3474       case 26:
3475       case 27:
3476       case 29:
3477       case 30:
3478         emit_signextend16(sl,temp);
3479         break;
3480       case 31:
3481         c2op_ctc2_31_assemble(sl,temp);
3482         break;
3483       default:
3484         temp=sl;
3485         break;
3486     }
3487     emit_writeword(temp,&reg_cop2c[copr]);
3488     assert(sl>=0);
3489   }
3490 }
3491
// Out-of-line handler for SWL/SWR that missed the fast path.  The active
// implementation (#if 1) simply forwards address and value to the C
// helpers jump_handle_swl/jump_handle_swr; the #else branch is an older
// inline read-modify-write sequence kept for reference.
static void do_unalignedwritestub(int n)
{
  assem_debug("do_unalignedwritestub %x\n",start+stubs[n].a*4);
  literal_pool(256);
  set_jump_target(stubs[n].addr, out);

  int i=stubs[n].a;
  struct regstat *i_regs=(struct regstat *)stubs[n].c;
  int addr=stubs[n].b;
  u_int reglist=stubs[n].e;
  signed char *i_regmap=i_regs->regmap;
  int temp2=get_reg(i_regmap,FTEMP);
  int rt;
  rt=get_reg(i_regmap,rs2[i]);
  assert(rt>=0);
  assert(addr>=0);
  assert(opcode[i]==0x2a||opcode[i]==0x2e); // SWL/SWR only implemented
  reglist|=(1<<addr);
  reglist&=~(1<<temp2);

#if 1
  // don't bother with it and call write handler
  save_regs(reglist);
  pass_args(addr,rt);
  // Pass the current cycle count in r2; undo the adjustment afterwards
  int cc=get_reg(i_regmap,CCREG);
  if(cc<0)
    emit_loadreg(CCREG,2);
  emit_addimm(cc<0?2:cc,CLOCK_ADJUST((int)stubs[n].d+1),2);
  emit_far_call((opcode[i]==0x2a?jump_handle_swl:jump_handle_swr));
  emit_addimm(0,-CLOCK_ADJUST((int)stubs[n].d+1),cc<0?2:cc);
  if(cc<0)
    emit_storereg(CCREG,2);
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
#else
  emit_andimm(addr,0xfffffffc,temp2);
  emit_writeword(temp2,&address);

  save_regs(reglist);
  emit_shrimm(addr,16,1);
  int cc=get_reg(i_regmap,CCREG);
  if(cc<0) {
    emit_loadreg(CCREG,2);
  }
  emit_movimm((u_int)readmem,0);
  emit_addimm(cc<0?2:cc,2*stubs[n].d+2,2);
  emit_call((int)&indirect_jump_indexed);
  restore_regs(reglist);

  emit_readword(&readmem_dword,temp2);
  int temp=addr; //hmh
  emit_shlimm(addr,3,temp);
  emit_andimm(temp,24,temp);
  if (opcode[i]==0x2a) // SWL
    emit_xorimm(temp,24,temp);
  emit_movimm(-1,HOST_TEMPREG);
  if (opcode[i]==0x2a) { // SWL
    emit_bic_lsr(temp2,HOST_TEMPREG,temp,temp2);
    emit_orrshr(rt,temp,temp2);
  }else{
    emit_bic_lsl(temp2,HOST_TEMPREG,temp,temp2);
    emit_orrshl(rt,temp,temp2);
  }
  emit_readword(&address,addr);
  emit_writeword(temp2,&word);
  //save_regs(reglist); // don't need to, no state changes
  emit_shrimm(addr,16,1);
  emit_movimm((u_int)writemem,0);
  //emit_call((int)&indirect_jump_indexed);
  emit_mov(15,14);
  emit_readword_dualindexedx4(0,1,15);
  emit_readword(&Count,HOST_TEMPREG);
  emit_readword(&next_interupt,2);
  emit_addimm(HOST_TEMPREG,-2*stubs[n].d-2,HOST_TEMPREG);
  emit_writeword(2,&last_count);
  emit_sub(HOST_TEMPREG,2,cc<0?HOST_TEMPREG:cc);
  if(cc<0) {
    emit_storereg(CCREG,HOST_TEMPREG);
  }
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
#endif
}
3575
#ifndef multdiv_assemble
// Fallback when no per-architecture MULT/DIV assembler is provided:
// the core cannot run without one, so fail loudly.
void multdiv_assemble(int i,struct regstat *i_regs)
{
  printf("Need multdiv_assemble for this architecture.\n");
  abort();
}
#endif
3583
3584 static void mov_assemble(int i,struct regstat *i_regs)
3585 {
3586   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3587   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3588   if(rt1[i]) {
3589     signed char sl,tl;
3590     tl=get_reg(i_regs->regmap,rt1[i]);
3591     //assert(tl>=0);
3592     if(tl>=0) {
3593       sl=get_reg(i_regs->regmap,rs1[i]);
3594       if(sl>=0) emit_mov(sl,tl);
3595       else emit_loadreg(rs1[i],tl);
3596     }
3597   }
3598 }
3599
// call interpreter, exception handler, things that change pc/regs/cycles ...
// Flushes pc and an up-to-date cycle count into psxRegs, calls the given
// C function, then re-enters the dynarec at whatever pc it set.
static void call_c_cpu_handler(int i, const struct regstat *i_regs, u_int pc, void *func)
{
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  (void)ccreg;

  emit_movimm(pc,3); // Get PC
  emit_readword(&last_count,2);
  emit_writeword(3,&psxRegs.pc);
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
  emit_add(2,HOST_CCREG,2);
  emit_writeword(2,&psxRegs.cycle);
  emit_far_call(func);
  emit_far_jump(jump_to_new_pc);
}
3617
// Assemble SYSCALL: raise the exception via the C psxException handler.
static void syscall_assemble(int i,struct regstat *i_regs)
{
  emit_movimm(0x20,0); // cause code (syscall)
  emit_movimm(0,1);    // not in delay slot
  call_c_cpu_handler(i,i_regs,start+i*4,psxException);
}
3624
3625 static void hlecall_assemble(int i,struct regstat *i_regs)
3626 {
3627   void *hlefunc = psxNULL;
3628   uint32_t hleCode = source[i] & 0x03ffffff;
3629   if (hleCode < ARRAY_SIZE(psxHLEt))
3630     hlefunc = psxHLEt[hleCode];
3631
3632   call_c_cpu_handler(i,i_regs,start+i*4+4,hlefunc);
3633 }
3634
// Punt this instruction to the interpreter (execI).
static void intcall_assemble(int i,struct regstat *i_regs)
{
  call_c_cpu_handler(i,i_regs,start+i*4,execI);
}
3639
3640 static void speculate_mov(int rs,int rt)
3641 {
3642   if(rt!=0) {
3643     smrv_strong_next|=1<<rt;
3644     smrv[rt]=smrv[rs];
3645   }
3646 }
3647
3648 static void speculate_mov_weak(int rs,int rt)
3649 {
3650   if(rt!=0) {
3651     smrv_weak_next|=1<<rt;
3652     smrv[rt]=smrv[rs];
3653   }
3654 }
3655
// Track likely register values (smrv[]) across the block so loads and
// stores can speculate on their target region.  "strong" bits mark
// values known from constants; "weak" bits mark carried-over guesses.
static void speculate_register_values(int i)
{
  if(i==0) {
    memcpy(smrv,psxRegs.GPR.r,sizeof(smrv));
    // gp,sp are likely to stay the same throughout the block
    smrv_strong_next=(1<<28)|(1<<29)|(1<<30);
    smrv_weak_next=~smrv_strong_next;
    //printf(" llr %08x\n", smrv[4]);
  }
  smrv_strong=smrv_strong_next;
  smrv_weak=smrv_weak_next;
  switch(itype[i]) {
    case ALU:
      // Result inherits speculation from whichever operand is known
      if     ((smrv_strong>>rs1[i])&1) speculate_mov(rs1[i],rt1[i]);
      else if((smrv_strong>>rs2[i])&1) speculate_mov(rs2[i],rt1[i]);
      else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]);
      else if((smrv_weak>>rs2[i])&1) speculate_mov_weak(rs2[i],rt1[i]);
      else {
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
    case SHIFTIMM:
      smrv_strong_next&=~(1<<rt1[i]);
      smrv_weak_next&=~(1<<rt1[i]);
      // fallthrough
    case IMM16:
      if(rt1[i]&&is_const(&regs[i],rt1[i])) {
        // Constant result: record its final value as a strong guess
        int value,hr=get_reg(regs[i].regmap,rt1[i]);
        if(hr>=0) {
          if(get_final_value(hr,i,&value))
               smrv[rt1[i]]=value;
          else smrv[rt1[i]]=constmap[i][hr];
          smrv_strong_next|=1<<rt1[i];
        }
      }
      else {
        if     ((smrv_strong>>rs1[i])&1) speculate_mov(rs1[i],rt1[i]);
        else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]);
      }
      break;
    case LOAD:
      if(start<0x2000&&(rt1[i]==26||(smrv[rt1[i]]>>24)==0xa0)) {
        // special case for BIOS
        smrv[rt1[i]]=0xa0000000;
        smrv_strong_next|=1<<rt1[i];
        break;
      }
      // fallthrough
    case SHIFT:
    case LOADLR:
    case MOV:
      // Result unpredictable: drop any speculation on the destination
      smrv_strong_next&=~(1<<rt1[i]);
      smrv_weak_next&=~(1<<rt1[i]);
      break;
    case COP0:
    case COP2:
      if(opcode2[i]==0||opcode2[i]==2) { // MFC/CFC
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
    case C2LS:
      if (opcode[i]==0x32) { // LWC2
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
  }
#if 0
  int r=4;
  printf("x %08x %08x %d %d c %08x %08x\n",smrv[r],start+i*4,
    ((smrv_strong>>r)&1),(smrv_weak>>r)&1,regs[i].isconst,regs[i].wasconst);
#endif
}
3731
// Assemble the instruction occupying a branch delay slot.  Sets the
// is_delayslot flag around the dispatch so the per-type assemblers can
// special-case delay-slot behavior; branches cannot appear here.
static void ds_assemble(int i,struct regstat *i_regs)
{
  speculate_register_values(i);
  is_delayslot=1;
  switch(itype[i]) {
    case ALU:
      alu_assemble(i,i_regs);break;
    case IMM16:
      imm16_assemble(i,i_regs);break;
    case SHIFT:
      shift_assemble(i,i_regs);break;
    case SHIFTIMM:
      shiftimm_assemble(i,i_regs);break;
    case LOAD:
      load_assemble(i,i_regs);break;
    case LOADLR:
      loadlr_assemble(i,i_regs);break;
    case STORE:
      store_assemble(i,i_regs);break;
    case STORELR:
      storelr_assemble(i,i_regs);break;
    case COP0:
      cop0_assemble(i,i_regs);break;
    case COP1:
      cop1_assemble(i,i_regs);break;
    case C1LS:
      c1ls_assemble(i,i_regs);break;
    case COP2:
      cop2_assemble(i,i_regs);break;
    case C2LS:
      c2ls_assemble(i,i_regs);break;
    case C2OP:
      c2op_assemble(i,i_regs);break;
    case MULTDIV:
      multdiv_assemble(i,i_regs);break;
    case MOV:
      mov_assemble(i,i_regs);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  is_delayslot=0;
}
3781
3782 // Is the branch target a valid internal jump?
3783 static int internal_branch(int addr)
3784 {
3785   if(addr&1) return 0; // Indirect (register) jump
3786   if(addr>=start && addr<start+slen*4-4)
3787   {
3788     return 1;
3789   }
3790   return 0;
3791 }
3792
3793 static void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t u)
3794 {
3795   int hr;
3796   for(hr=0;hr<HOST_REGS;hr++) {
3797     if(hr!=EXCLUDE_REG) {
3798       if(pre[hr]!=entry[hr]) {
3799         if(pre[hr]>=0) {
3800           if((dirty>>hr)&1) {
3801             if(get_reg(entry,pre[hr])<0) {
3802               assert(pre[hr]<64);
3803               if(!((u>>pre[hr])&1))
3804                 emit_storereg(pre[hr],hr);
3805             }
3806           }
3807         }
3808       }
3809     }
3810   }
3811   // Move from one register to another (no writeback)
3812   for(hr=0;hr<HOST_REGS;hr++) {
3813     if(hr!=EXCLUDE_REG) {
3814       if(pre[hr]!=entry[hr]) {
3815         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3816           int nr;
3817           if((nr=get_reg(entry,pre[hr]))>=0) {
3818             emit_mov(hr,nr);
3819           }
3820         }
3821       }
3822     }
3823   }
3824 }
3825
3826 // Load the specified registers
3827 // This only loads the registers given as arguments because
3828 // we don't want to load things that will be overwritten
3829 static void load_regs(signed char entry[],signed char regmap[],int rs1,int rs2)
3830 {
3831   int hr;
3832   // Load 32-bit regs
3833   for(hr=0;hr<HOST_REGS;hr++) {
3834     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3835       if(entry[hr]!=regmap[hr]) {
3836         if(regmap[hr]==rs1||regmap[hr]==rs2)
3837         {
3838           if(regmap[hr]==0) {
3839             emit_zeroreg(hr);
3840           }
3841           else
3842           {
3843             emit_loadreg(regmap[hr],hr);
3844           }
3845         }
3846       }
3847     }
3848   }
3849 }
3850
3851 // Load registers prior to the start of a loop
3852 // so that they are not loaded within the loop
3853 static void loop_preload(signed char pre[],signed char entry[])
3854 {
3855   int hr;
3856   for(hr=0;hr<HOST_REGS;hr++) {
3857     if(hr!=EXCLUDE_REG) {
3858       if(pre[hr]!=entry[hr]) {
3859         if(entry[hr]>=0) {
3860           if(get_reg(pre,entry[hr])<0) {
3861             assem_debug("loop preload:\n");
3862             //printf("loop preload: %d\n",hr);
3863             if(entry[hr]==0) {
3864               emit_zeroreg(hr);
3865             }
3866             else if(entry[hr]<TEMPREG)
3867             {
3868               emit_loadreg(entry[hr],hr);
3869             }
3870             else if(entry[hr]-64<TEMPREG)
3871             {
3872               emit_loadreg(entry[hr],hr);
3873             }
3874           }
3875         }
3876       }
3877     }
3878   }
3879 }
3880
// Generate address for load/store instruction
// goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
// Computes base+offset into the chosen address register, exploiting
// known constants, and preloads the next instruction's constant address.
void address_generation(int i,struct regstat *i_regs,signed char entry[])
{
  if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
    int ra=-1;
    int agr=AGEN1+(i&1);
    // Pick the destination register for the generated address
    if(itype[i]==LOAD) {
      ra=get_reg(i_regs->regmap,rt1[i]);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
      assert(ra>=0);
    }
    if(itype[i]==LOADLR) {
      ra=get_reg(i_regs->regmap,FTEMP);
    }
    if(itype[i]==STORE||itype[i]==STORELR) {
      ra=get_reg(i_regs->regmap,agr);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
    }
    if(itype[i]==C1LS||itype[i]==C2LS) {
      if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
        ra=get_reg(i_regs->regmap,FTEMP);
      else { // SWC1/SDC1/SWC2/SDC2
        ra=get_reg(i_regs->regmap,agr);
        if(ra<0) ra=get_reg(i_regs->regmap,-1);
      }
    }
    int rs=get_reg(i_regs->regmap,rs1[i]);
    if(ra>=0) {
      int offset=imm[i];
      int c=(i_regs->wasconst>>rs)&1;
      if(rs1[i]==0) {
        // Using r0 as a base address
        if(!entry||entry[ra]!=agr) {
          if (opcode[i]==0x22||opcode[i]==0x26) {
            emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
          }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
            emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
          }else{
            emit_movimm(offset,ra);
          }
        } // else did it in the previous cycle
      }
      else if(rs<0) {
        // Base register is spilled: load it into the address register
        if(!entry||entry[ra]!=rs1[i])
          emit_loadreg(rs1[i],ra);
        //if(!entry||entry[ra]!=rs1[i])
        //  printf("poor load scheduling!\n");
      }
      else if(c) {
        // Constant base: materialize base+offset directly
        if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
          if(!entry||entry[ra]!=agr) {
            if (opcode[i]==0x22||opcode[i]==0x26) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
            }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
            }else{
              emit_movimm(constmap[i][rs]+offset,ra);
              regs[i].loadedconst|=1<<ra;
            }
          } // else did it in the previous cycle
        } // else load_consts already did it
      }
      // Non-constant base with a nonzero offset: add at run time
      if(offset&&!c&&rs1[i]) {
        if(rs>=0) {
          emit_addimm(rs,offset,ra);
        }else{
          emit_addimm(ra,offset,ra);
        }
      }
    }
  }
  // Preload constants for next instruction
  if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
    int agr,ra;
    // Actual address
    agr=AGEN1+((i+1)&1);
    ra=get_reg(i_regs->regmap,agr);
    if(ra>=0) {
      int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
      int offset=imm[i+1];
      int c=(regs[i+1].wasconst>>rs)&1;
      if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(constmap[i+1][rs]+offset,ra);
          regs[i+1].loadedconst|=1<<ra;
        }
      }
      else if(rs1[i+1]==0) {
        // Using r0 as a base address
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(offset,ra);
        }
      }
    }
  }
}
3986
// Determine the final constant that host register hr must hold at
// instruction i.  Scans forward while the same guest register stays mapped
// to hr with a constant value (stopping at a remap, a non-const state, or a
// branch target), then checks whether the constant can be fused with a
// following load's immediate to precompute the load address.
// Returns 1 if the value must be materialized, 0 if it is unneeded
// (the guest register is dead at the next instruction).
static int get_final_value(int hr, int i, int *value)
{
  int reg=regs[i].regmap[hr];
  // Advance past instructions where hr keeps the same constant mapping
  while(i<slen-1) {
    if(regs[i+1].regmap[hr]!=reg) break;
    if(!((regs[i+1].isconst>>hr)&1)) break;
    if(bt[i+1]) break; // branch target: entry state is fixed, stop here
    i++;
  }
  if(i<slen-1) {
    if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
      // Stopped on a branch: keep the constant as-is
      *value=constmap[i][hr];
      return 1;
    }
    if(!bt[i+1]) {
      if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
        // Load in delay slot, out-of-order execution
        if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
        {
          // Precompute load address
          *value=constmap[i][hr]+imm[i+2];
          return 1;
        }
      }
      if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
      {
        // Precompute load address
        *value=constmap[i][hr]+imm[i+1];
        //printf("c=%x imm=%lx\n",(long)constmap[i][hr],imm[i+1]);
        return 1;
      }
    }
  }
  *value=constmap[i][hr];
  //printf("c=%lx\n",(long)constmap[i][hr]);
  if(i==slen-1) return 1;
  assert(reg < 64);
  // Only needed if the guest register isn't marked unneeded at i+1
  return !((unneeded_reg[i+1]>>reg)&1);
}
4026
4027 // Load registers with known constants
// Emit code to materialize known constants into the host registers mapped
// for instruction i, skipping registers whose constant is already loaded
// (tracked per host reg in regs[].loadedconst).
// pre[] is the previous mapping, regmap[] the current one.
static void load_consts(signed char pre[],signed char regmap[],int i)
{
  int hr,hr2;
  // propagate loaded constant flags
  if(i==0||bt[i])
    regs[i].loadedconst=0; // block entry or branch target: nothing loaded yet
  else {
    for(hr=0;hr<HOST_REGS;hr++) {
      // Carry the flag forward only if the same guest reg stays in the same
      // host reg and held a loaded constant at the previous instruction
      if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
         &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
      {
        regs[i].loadedconst|=1<<hr;
      }
    }
  }
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(!((regs[i].loadedconst>>hr)&1)) {
        assert(regmap[hr]<64);
        if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
          int value,similar=0;
          if(get_final_value(hr,i,&value)) {
            // see if some other register has similar value
            for(hr2=0;hr2<HOST_REGS;hr2++) {
              if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
                if(is_similar_value(value,constmap[i][hr2])) {
                  similar=1;
                  break;
                }
              }
            }
            if(similar) {
              // Derive the constant from hr2's value — presumably a cheaper
              // encoding than a full movimm (see emit_movimm_from)
              int value2;
              if(get_final_value(hr2,i,&value2)) // is this needed?
                emit_movimm_from(value2,hr2,value,hr);
              else
                emit_movimm(value,hr);
            }
            else if(value==0) {
              emit_zeroreg(hr);
            }
            else {
              emit_movimm(value,hr);
            }
          }
          regs[i].loadedconst|=1<<hr;
        }
      }
    }
  }
}
4081
4082 void load_all_consts(signed char regmap[], u_int dirty, int i)
4083 {
4084   int hr;
4085   // Load 32-bit regs
4086   for(hr=0;hr<HOST_REGS;hr++) {
4087     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4088       assert(regmap[hr] < 64);
4089       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
4090         int value=constmap[i][hr];
4091         if(value==0) {
4092           emit_zeroreg(hr);
4093         }
4094         else {
4095           emit_movimm(value,hr);
4096         }
4097       }
4098     }
4099   }
4100 }
4101
4102 // Write out all dirty registers (except cycle count)
4103 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty)
4104 {
4105   int hr;
4106   for(hr=0;hr<HOST_REGS;hr++) {
4107     if(hr!=EXCLUDE_REG) {
4108       if(i_regmap[hr]>0) {
4109         if(i_regmap[hr]!=CCREG) {
4110           if((i_dirty>>hr)&1) {
4111             assert(i_regmap[hr]<64);
4112             emit_storereg(i_regmap[hr],hr);
4113           }
4114         }
4115       }
4116     }
4117   }
4118 }
4119
4120 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4121 // This writes the registers not written by store_regs_bt
4122 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr)
4123 {
4124   int hr;
4125   int t=(addr-start)>>2;
4126   for(hr=0;hr<HOST_REGS;hr++) {
4127     if(hr!=EXCLUDE_REG) {
4128       if(i_regmap[hr]>0) {
4129         if(i_regmap[hr]!=CCREG) {
4130           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1)) {
4131             if((i_dirty>>hr)&1) {
4132               assert(i_regmap[hr]<64);
4133               emit_storereg(i_regmap[hr],hr);
4134             }
4135           }
4136         }
4137       }
4138     }
4139   }
4140 }
4141
4142 // Load all registers (except cycle count)
4143 void load_all_regs(signed char i_regmap[])
4144 {
4145   int hr;
4146   for(hr=0;hr<HOST_REGS;hr++) {
4147     if(hr!=EXCLUDE_REG) {
4148       if(i_regmap[hr]==0) {
4149         emit_zeroreg(hr);
4150       }
4151       else
4152       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4153       {
4154         emit_loadreg(i_regmap[hr],hr);
4155       }
4156     }
4157   }
4158 }
4159
4160 // Load all current registers also needed by next instruction
4161 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4162 {
4163   int hr;
4164   for(hr=0;hr<HOST_REGS;hr++) {
4165     if(hr!=EXCLUDE_REG) {
4166       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4167         if(i_regmap[hr]==0) {
4168           emit_zeroreg(hr);
4169         }
4170         else
4171         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4172         {
4173           emit_loadreg(i_regmap[hr],hr);
4174         }
4175       }
4176     }
4177   }
4178 }
4179
4180 // Load all regs, storing cycle count if necessary
4181 void load_regs_entry(int t)
4182 {
4183   int hr;
4184   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4185   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4186   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4187     emit_storereg(CCREG,HOST_CCREG);
4188   }
4189   // Load 32-bit regs
4190   for(hr=0;hr<HOST_REGS;hr++) {
4191     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4192       if(regs[t].regmap_entry[hr]==0) {
4193         emit_zeroreg(hr);
4194       }
4195       else if(regs[t].regmap_entry[hr]!=CCREG)
4196       {
4197         emit_loadreg(regs[t].regmap_entry[hr],hr);
4198       }
4199     }
4200   }
4201 }
4202
4203 // Store dirty registers prior to branch
4204 void store_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4205 {
4206   if(internal_branch(addr))
4207   {
4208     int t=(addr-start)>>2;
4209     int hr;
4210     for(hr=0;hr<HOST_REGS;hr++) {
4211       if(hr!=EXCLUDE_REG) {
4212         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4213           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1)) {
4214             if((i_dirty>>hr)&1) {
4215               assert(i_regmap[hr]<64);
4216               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4217                 emit_storereg(i_regmap[hr],hr);
4218             }
4219           }
4220         }
4221       }
4222     }
4223   }
4224   else
4225   {
4226     // Branch out of this block, write out all dirty regs
4227     wb_dirtys(i_regmap,i_dirty);
4228   }
4229 }
4230
4231 // Load all needed registers for branch target
4232 static void load_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4233 {
4234   //if(addr>=start && addr<(start+slen*4))
4235   if(internal_branch(addr))
4236   {
4237     int t=(addr-start)>>2;
4238     int hr;
4239     // Store the cycle count before loading something else
4240     if(i_regmap[HOST_CCREG]!=CCREG) {
4241       assert(i_regmap[HOST_CCREG]==-1);
4242     }
4243     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4244       emit_storereg(CCREG,HOST_CCREG);
4245     }
4246     // Load 32-bit regs
4247     for(hr=0;hr<HOST_REGS;hr++) {
4248       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4249         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4250           if(regs[t].regmap_entry[hr]==0) {
4251             emit_zeroreg(hr);
4252           }
4253           else if(regs[t].regmap_entry[hr]!=CCREG)
4254           {
4255             emit_loadreg(regs[t].regmap_entry[hr],hr);
4256           }
4257         }
4258       }
4259     }
4260   }
4261 }
4262
// Check whether the current register state (mapping i_regmap + dirty bits
// i_dirty) is compatible with the entry state required at branch target
// addr, i.e. whether we can jump there directly without any register
// stores/loads in between.  Returns 1 on match, 0 otherwise.
static int match_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
{
  if(addr>=start && addr<start+slen*4-4) // target inside this block?
  {
    int t=(addr-start)>>2;
    int hr;
    // The target must keep the cycle count in HOST_CCREG
    if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]!=regs[t].regmap_entry[hr])
        {
          // Mapping differs: fails if the target actually needs this reg
          if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
          {
            return 0;
          }
          else
          if((i_dirty>>hr)&1)
          {
            // Our dirty value would have to be written back first,
            // unless the target marks the guest register unneeded
            if(i_regmap[hr]<TEMPREG)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
                return 0;
            }
            else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
            {
              assert(0);
            }
          }
        }
        else // Same register but is it 32-bit or dirty?
        if(i_regmap[hr]>=0)
        {
          if(!((regs[t].dirty>>hr)&1))
          {
            // Target expects it clean; a dirty live value forces a store
            if((i_dirty>>hr)&1)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
              {
                //printf("%x: dirty no match\n",addr);
                return 0;
              }
            }
          }
        }
      }
    }
    // Delay slots are not valid branch targets
    //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP)) return 0;
    // Delay slots require additional processing, so do not match
    if(is_ds[t]) return 0;
  }
  else
  {
    // External target: everything dirty would need a writeback,
    // so only match when nothing (other than CC) is dirty
    int hr;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]>=0)
        {
          if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
          {
            if((i_dirty>>hr)&1)
            {
              return 0;
            }
          }
        }
      }
    }
  }
  return 1;
}
4338
#ifdef DRC_DBG
// Debug builds only: emit a call to do_insn_cmp() before instruction i so
// the recompiler's state can be compared against the interpreter's.
// Saves/restores the live register set around the call.
static void drc_dbg_emit_do_cmp(int i)
{
  extern void do_insn_cmp();
  //extern int cycle;
  u_int hr,reglist=0;

  assem_debug("//do_insn_cmp %08x\n", start+i*4);
  for (hr = 0; hr < HOST_REGS; hr++)
    if(regs[i].regmap[hr]>=0) reglist|=1<<hr;
  save_regs(reglist);
  // write out changed consts to match the interpreter
  if (i > 0 && !bt[i]) {
    for (hr = 0; hr < HOST_REGS; hr++) {
      int reg = regs[i-1].regmap[hr];
      if (hr == EXCLUDE_REG || reg < 0)
        continue;
      if (!((regs[i-1].isconst >> hr) & 1))
        continue;
      // skip consts unchanged since the instruction before that
      if (i > 1 && reg == regs[i-2].regmap[hr] && constmap[i-1][hr] == constmap[i-2][hr])
        continue;
      emit_movimm(constmap[i-1][hr],0);
      emit_storereg(reg, 0);
    }
  }
  // hand the current PC to the comparison hook via pcaddr
  emit_movimm(start+i*4,0);
  emit_writeword(0,&pcaddr);
  emit_far_call(do_insn_cmp);
  //emit_readword(&cycle,0);
  //emit_addimm(0,2,0);
  //emit_writeword(0,&cycle);
  (void)get_reg2;
  restore_regs(reglist);
  assem_debug("\\\\do_insn_cmp\n");
}
#else
#define drc_dbg_emit_do_cmp(x)
#endif
4377
4378 // Used when a branch jumps into the delay slot of another branch
// Used when a branch jumps into the delay slot of another branch:
// assemble a standalone copy of that delay-slot instruction (at target
// ba[i]) and then jump to the instruction after it.
static void ds_assemble_entry(int i)
{
  int t=(ba[i]-start)>>2; // index of the delay-slot instruction
  if (!instr_addr[t])
    instr_addr[t] = out;
  assem_debug("Assemble delay slot at %x\n",ba[i]);
  assem_debug("<->\n");
  drc_dbg_emit_do_cmp(t);
  // Spill CC if the entry state has it in a register but t's body doesn't
  if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty);
  load_regs(regs[t].regmap_entry,regs[t].regmap,rs1[t],rs2[t]);
  address_generation(t,&regs[t],regs[t].regmap_entry);
  // Stores (including SWC1/SDC1, SWC2/SDC2 by opcode mask) need INVCP
  if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
    load_regs(regs[t].regmap_entry,regs[t].regmap,INVCP,INVCP);
  is_delayslot=0;
  // Dispatch on instruction type, mirroring the main assembly loop
  switch(itype[t]) {
    case ALU:
      alu_assemble(t,&regs[t]);break;
    case IMM16:
      imm16_assemble(t,&regs[t]);break;
    case SHIFT:
      shift_assemble(t,&regs[t]);break;
    case SHIFTIMM:
      shiftimm_assemble(t,&regs[t]);break;
    case LOAD:
      load_assemble(t,&regs[t]);break;
    case LOADLR:
      loadlr_assemble(t,&regs[t]);break;
    case STORE:
      store_assemble(t,&regs[t]);break;
    case STORELR:
      storelr_assemble(t,&regs[t]);break;
    case COP0:
      cop0_assemble(t,&regs[t]);break;
    case COP1:
      cop1_assemble(t,&regs[t]);break;
    case C1LS:
      c1ls_assemble(t,&regs[t]);break;
    case COP2:
      cop2_assemble(t,&regs[t]);break;
    case C2LS:
      c2ls_assemble(t,&regs[t]);break;
    case C2OP:
      c2op_assemble(t,&regs[t]);break;
    case MULTDIV:
      multdiv_assemble(t,&regs[t]);break;
    case MOV:
      mov_assemble(t,&regs[t]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Continue at the instruction after the delay slot
  store_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
  load_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
  if(internal_branch(ba[i]+4))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  assert(internal_branch(ba[i]+4));
  add_to_linker(out,ba[i]+4,internal_branch(ba[i]+4));
  emit_jmp(0);
}
4447
// Patch an external jump at addr to go through the normal dynamic linker.
static void emit_extjump(void *addr, u_int target)
{
  emit_extjump2(addr, target, dyna_linker);
}
4452
// Same as emit_extjump, but via the delay-slot variant of the linker.
static void emit_extjump_ds(void *addr, u_int target)
{
  emit_extjump2(addr, target, dyna_linker_ds);
}
4457
4458 // Load 2 immediates optimizing for small code size
4459 static void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2)
4460 {
4461   emit_movimm(imm1,rt1);
4462   emit_movimm_from(imm1,rt1,imm2,rt2);
4463 }
4464
// Emit the cycle-count check for the branch at instruction i and register a
// CC_STUB for the "cycles expired" path.
// *adj receives the cycle adjustment already applied at the target
// (-1 when branching into a delay slot, 0 for external/RJUMP targets);
// addr/taken describe where execution resumes; invert forces the
// add-and-test form used for inverted branches.
void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
{
  int count;
  void *jaddr;
  void *idle=NULL;
  int t=0;
  if(itype[i]==RJUMP)
  {
    *adj=0;
  }
  //if(ba[i]>=start && ba[i]<(start+slen*4))
  if(internal_branch(ba[i]))
  {
    t=(ba[i]-start)>>2;
    if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
    else *adj=ccadj[t];
  }
  else
  {
    *adj=0;
  }
  count=ccadj[i];
  // Branch to self with a zero (NOP) delay slot: treat as an idle loop
  if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
    // Idle loop
    if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
    idle=out;
    //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
    emit_andimm(HOST_CCREG,3,HOST_CCREG);
    jaddr=out;
    emit_jmp(0);
  }
  else if(*adj==0||invert) {
    int cycles=CLOCK_ADJUST(count+2);
    // faster loop HACK
#if 0
    if (t&&*adj) {
      int rel=t-i;
      if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
        cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
    }
#endif
    // Add cycles up front and branch to the stub if the count went negative
    emit_addimm_and_set_flags(cycles,HOST_CCREG);
    jaddr=out;
    emit_jns(0);
  }
  else
  {
    // Target already adjusts the count: just compare here
    emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
    jaddr=out;
    emit_jns(0);
  }
  add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
}
4518
4519 static void do_ccstub(int n)
4520 {
4521   literal_pool(256);
4522   assem_debug("do_ccstub %x\n",start+(u_int)stubs[n].b*4);
4523   set_jump_target(stubs[n].addr, out);
4524   int i=stubs[n].b;
4525   if(stubs[n].d==NULLDS) {
4526     // Delay slot instruction is nullified ("likely" branch)
4527     wb_dirtys(regs[i].regmap,regs[i].dirty);
4528   }
4529   else if(stubs[n].d!=TAKEN) {
4530     wb_dirtys(branch_regs[i].regmap,branch_regs[i].dirty);
4531   }
4532   else {
4533     if(internal_branch(ba[i]))
4534       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4535   }
4536   if(stubs[n].c!=-1)
4537   {
4538     // Save PC as return address
4539     emit_movimm(stubs[n].c,EAX);
4540     emit_writeword(EAX,&pcaddr);
4541   }
4542   else
4543   {
4544     // Return address depends on which way the branch goes
4545     if(itype[i]==CJUMP||itype[i]==SJUMP)
4546     {
4547       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4548       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4549       if(rs1[i]==0)
4550       {
4551         s1l=s2l;
4552         s2l=-1;
4553       }
4554       else if(rs2[i]==0)
4555       {
4556         s2l=-1;
4557       }
4558       assert(s1l>=0);
4559       #ifdef DESTRUCTIVE_WRITEBACK
4560       if(rs1[i]) {
4561         if((branch_regs[i].dirty>>s1l)&&1)
4562           emit_loadreg(rs1[i],s1l);
4563       }
4564       else {
4565         if((branch_regs[i].dirty>>s1l)&1)
4566           emit_loadreg(rs2[i],s1l);
4567       }
4568       if(s2l>=0)
4569         if((branch_regs[i].dirty>>s2l)&1)
4570           emit_loadreg(rs2[i],s2l);
4571       #endif
4572       int hr=0;
4573       int addr=-1,alt=-1,ntaddr=-1;
4574       while(hr<HOST_REGS)
4575       {
4576         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4577            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4578            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4579         {
4580           addr=hr++;break;
4581         }
4582         hr++;
4583       }
4584       while(hr<HOST_REGS)
4585       {
4586         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4587            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4588            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4589         {
4590           alt=hr++;break;
4591         }
4592         hr++;
4593       }
4594       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4595       {
4596         while(hr<HOST_REGS)
4597         {
4598           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4599              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4600              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4601           {
4602             ntaddr=hr;break;
4603           }
4604           hr++;
4605         }
4606         assert(hr<HOST_REGS);
4607       }
4608       if((opcode[i]&0x2f)==4) // BEQ
4609       {
4610         #ifdef HAVE_CMOV_IMM
4611         if(s2l>=0) emit_cmp(s1l,s2l);
4612         else emit_test(s1l,s1l);
4613         emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4614         #else
4615         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4616         if(s2l>=0) emit_cmp(s1l,s2l);
4617         else emit_test(s1l,s1l);
4618         emit_cmovne_reg(alt,addr);
4619         #endif
4620       }
4621       if((opcode[i]&0x2f)==5) // BNE
4622       {
4623         #ifdef HAVE_CMOV_IMM
4624         if(s2l>=0) emit_cmp(s1l,s2l);
4625         else emit_test(s1l,s1l);
4626         emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4627         #else
4628         emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4629         if(s2l>=0) emit_cmp(s1l,s2l);
4630         else emit_test(s1l,s1l);
4631         emit_cmovne_reg(alt,addr);
4632         #endif
4633       }
4634       if((opcode[i]&0x2f)==6) // BLEZ
4635       {
4636         //emit_movimm(ba[i],alt);
4637         //emit_movimm(start+i*4+8,addr);
4638         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4639         emit_cmpimm(s1l,1);
4640         emit_cmovl_reg(alt,addr);
4641       }
4642       if((opcode[i]&0x2f)==7) // BGTZ
4643       {
4644         //emit_movimm(ba[i],addr);
4645         //emit_movimm(start+i*4+8,ntaddr);
4646         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4647         emit_cmpimm(s1l,1);
4648         emit_cmovl_reg(ntaddr,addr);
4649       }
4650       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4651       {
4652         //emit_movimm(ba[i],alt);
4653         //emit_movimm(start+i*4+8,addr);
4654         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4655         emit_test(s1l,s1l);
4656         emit_cmovs_reg(alt,addr);
4657       }
4658       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4659       {
4660         //emit_movimm(ba[i],addr);
4661         //emit_movimm(start+i*4+8,alt);
4662         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4663         emit_test(s1l,s1l);
4664         emit_cmovs_reg(alt,addr);
4665       }
4666       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4667         if(source[i]&0x10000) // BC1T
4668         {
4669           //emit_movimm(ba[i],alt);
4670           //emit_movimm(start+i*4+8,addr);
4671           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4672           emit_testimm(s1l,0x800000);
4673           emit_cmovne_reg(alt,addr);
4674         }
4675         else // BC1F
4676         {
4677           //emit_movimm(ba[i],addr);
4678           //emit_movimm(start+i*4+8,alt);
4679           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4680           emit_testimm(s1l,0x800000);
4681           emit_cmovne_reg(alt,addr);
4682         }
4683       }
4684       emit_writeword(addr,&pcaddr);
4685     }
4686     else
4687     if(itype[i]==RJUMP)
4688     {
4689       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4690       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4691         r=get_reg(branch_regs[i].regmap,RTEMP);
4692       }
4693       emit_writeword(r,&pcaddr);
4694     }
4695     else {SysPrintf("Unknown branch type in do_ccstub\n");abort();}
4696   }
4697   // Update cycle count
4698   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4699   if(stubs[n].a) emit_addimm(HOST_CCREG,CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4700   emit_far_call(cc_interrupt);
4701   if(stubs[n].a) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4702   if(stubs[n].d==TAKEN) {
4703     if(internal_branch(ba[i]))
4704       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4705     else if(itype[i]==RJUMP) {
4706       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4707         emit_readword(&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4708       else
4709         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4710     }
4711   }else if(stubs[n].d==NOTTAKEN) {
4712     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4713     else load_all_regs(branch_regs[i].regmap);
4714   }else if(stubs[n].d==NULLDS) {
4715     // Delay slot instruction is nullified ("likely" branch)
4716     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4717     else load_all_regs(regs[i].regmap);
4718   }else{
4719     load_all_regs(branch_regs[i].regmap);
4720   }
4721   if (stubs[n].retaddr)
4722     emit_jmp(stubs[n].retaddr);
4723   else
4724     do_jump_vaddr(stubs[n].e);
4725 }
4726
4727 static void add_to_linker(void *addr, u_int target, int ext)
4728 {
4729   assert(linkcount < ARRAY_SIZE(link_addr));
4730   link_addr[linkcount].addr = addr;
4731   link_addr[linkcount].target = target;
4732   link_addr[linkcount].ext = ext;
4733   linkcount++;
4734 }
4735
// Write the return address (PC+8) into the link register ($31) for a JAL
// at instruction i, using the post-delay-slot register mapping.
static void ujump_assemble_write_ra(int i)
{
  int rt;
  unsigned int return_address;
  rt=get_reg(branch_regs[i].regmap,31);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  //assert(rt>=0);
  return_address=start+i*4+8;
  if(rt>=0) {
    #ifdef USE_MINI_HT
    if(internal_branch(return_address)&&rt1[i+1]!=31) {
      int temp=-1; // note: must be ds-safe
      #ifdef HOST_TEMPREG
      temp=HOST_TEMPREG;
      #endif
      if(temp>=0) do_miniht_insert(return_address,rt,temp);
      else emit_movimm(return_address,rt);
    }
    else
    #endif
    {
      #ifdef REG_PREFETCH
      // NOTE(review): 'temp' and 'i_regmap' are not declared in this
      // function; this block looks like it would not compile with
      // REG_PREFETCH defined — confirm before enabling that option.
      if(temp>=0)
      {
        if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
      }
      #endif
      emit_movimm(return_address,rt); // PC into link register
      #ifdef IMM_PREFETCH
      emit_prefetch(hash_table_get(return_address));
      #endif
    }
  }
}
4770
// Assemble an unconditional jump (J/JAL) at instruction i:
// delay slot first, then $31 writeback (for JAL), cycle check, and either
// an inlined delay-slot entry or a linkable jump to the target.
static void ujump_assemble(int i,struct regstat *i_regs)
{
  int ra_done=0;
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  int temp=get_reg(branch_regs[i].regmap,PTEMP);
  if(rt1[i]==31&&temp>=0)
  {
    signed char *i_regmap=i_regs->regmap;
    int return_address=start+i*4+8;
    if(get_reg(branch_regs[i].regmap,31)>0)
    if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  // If the delay slot reads $31, write the return address before it runs
  if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    ujump_assemble_write_ra(i); // writeback ra for DS
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<rt1[i]);
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
  if(!ra_done&&rt1[i]==31)
    ujump_assemble_write_ra(i);
  int cc,adj;
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
  if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
  load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  if(internal_branch(ba[i]))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  // A branch into another branch's delay slot needs a synthesized entry
  if(internal_branch(ba[i])&&is_ds[(ba[i]-start)>>2]) {
    ds_assemble_entry(i);
  }
  else {
    add_to_linker(out,ba[i],internal_branch(ba[i]));
    emit_jmp(0);
  }
}
4819
// Write the return address (PC+8) into the link register for a JALR at
// instruction i, using the post-delay-slot register mapping.
static void rjump_assemble_write_ra(int i)
{
  int rt,return_address;
  assert(rt1[i+1]!=rt1[i]);
  assert(rt2[i+1]!=rt1[i]);
  rt=get_reg(branch_regs[i].regmap,rt1[i]);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  assert(rt>=0);
  return_address=start+i*4+8;
  #ifdef REG_PREFETCH
  // NOTE(review): 'temp' and 'i_regmap' are not declared in this function;
  // this block looks like it would not compile with REG_PREFETCH defined.
  if(temp>=0)
  {
    if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  emit_movimm(return_address,rt); // PC into link register
  #ifdef IMM_PREFETCH
  emit_prefetch(hash_table_get(return_address));
  #endif
}
4840
// Assemble a register-indirect jump (JR/JALR) at instruction i: protect the
// target register from delay-slot clobbering, assemble the delay slot,
// write the link register if needed, then dispatch through do_jump_vaddr
// (or the mini hash table for JR $ra).
static void rjump_assemble(int i,struct regstat *i_regs)
{
  int temp;
  int rs,cc;
  int ra_done=0;
  rs=get_reg(branch_regs[i].regmap,rs1[i]);
  assert(rs>=0);
  if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
    // Delay slot abuse, make a copy of the branch address register
    temp=get_reg(branch_regs[i].regmap,RTEMP);
    assert(temp>=0);
    assert(regs[i].regmap[temp]==RTEMP);
    emit_mov(rs,temp);
    rs=temp;
  }
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  if(rt1[i]==31)
  {
    if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
      signed char *i_regmap=i_regs->regmap;
      int return_address=start+i*4+8;
      if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
    }
  }
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    int rh=get_reg(regs[i].regmap,RHASH);
    if(rh>=0) do_preload_rhash(rh);
  }
  #endif
  // If the delay slot reads the link register, write it before the slot runs
  if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    rjump_assemble_write_ra(i);
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<rt1[i]);
  bc_unneeded&=~(1LL<<rs1[i]); // jump target register is still needed
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],CCREG);
  if(!ra_done&&rt1[i]!=0)
    rjump_assemble_write_ra(i);
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  (void)cc;
  #ifdef USE_MINI_HT
  int rh=get_reg(branch_regs[i].regmap,RHASH);
  int ht=get_reg(branch_regs[i].regmap,RHTBL);
  if(rs1[i]==31) {
    if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
    do_preload_rhtbl(ht);
    do_rhash(rs,rh);
  }
  #endif
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
  #ifdef DESTRUCTIVE_WRITEBACK
  if((branch_regs[i].dirty>>rs)&1) {
    if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
      emit_loadreg(rs1[i],rs);
    }
  }
  #endif
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_load(ht,rh);
  }
  #endif
  //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
  //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
  //assert(adj==0);
  emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  add_stub(CC_STUB,out,NULL,0,i,-1,TAKEN,rs);
  if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
    // special case for RFE
    emit_jmp(0);
  else
    emit_jns(0);
  //load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_jump(rs,rh,ht);
  }
  else
  #endif
  {
    do_jump_vaddr(rs);
  }
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
  #endif
}
4937
// Assemble a MIPS conditional branch: BEQ(4)/BNE(5)/BLEZ(6)/BGTZ(7) and
// their "likely" variants (0x14-0x17, which nullify the delay slot on the
// not-taken path).  Two scheduling modes are handled:
//  - ooo[i] set:   the delay slot was hoisted and is assembled before the
//                  compare, so a single (possibly inverted) conditional
//                  jump suffices;
//  - in-order:     the compare is emitted first and the delay slot is
//                  assembled separately on the taken and not-taken paths.
static void cjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  // match!=0 when the register allocation at the branch target agrees with
  // ours, so we can jump there directly; otherwise the condition is
  // inverted and the taken path (with register writeback) emitted inline.
  match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  assem_debug("match=%d\n",match);
  int s1l,s2l;
  int unconditional=0,nop=0;
  int invert=0;
  int internal=internal_branch(ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;
  #endif
  #ifdef __aarch64__
  invert=1; // because of near cond. branches
  #endif

  // Host registers holding the two source operands (or -1 if unallocated).
  if(ooo[i]) {
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
    s2l=get_reg(branch_regs[i].regmap,rs2[i]);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
    s2l=get_reg(i_regmap,rs2[i]);
  }
  if(rs1[i]==0&&rs2[i]==0)
  {
    // Both operands are $zero: BEQ-style (even opcode) is always taken,
    // BNE-style (odd opcode) is never taken.
    if(opcode[i]&1) nop=1;
    else unconditional=1;
    //assert(opcode[i]!=5);
    //assert(opcode[i]!=7);
    //assert(opcode[i]!=0x15);
    //assert(opcode[i]!=0x17);
  }
  else if(rs1[i]==0)
  {
    // One operand is $zero: fold to a single-register test.
    s1l=s2l;
    s2l=-1;
  }
  else if(rs2[i]==0)
  {
    s2l=-1;
  }

  if(ooo[i]) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    // Registers unneeded after the branch, except the branch sources.
    uint64_t bc_unneeded=branch_regs[i].u;
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
    load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],rs2[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    //assem_debug("cycle count (adj)\n");
    if(unconditional) {
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      // Skip emitting the jump for a self-loop with an empty delay slot
      // (detected as an idle loop above).
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nop) {
      // Never-taken branch: just account the cycles and test for events
      // (negative cycle count) via the CC stub.
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);

      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      assert(s1l>=0);
      // Emit the compare and either a direct jump to the target (linker
      // patches it) or, when inverted, a short jump over the taken path.
      if(opcode[i]==4) // BEQ
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        if(invert){
          nottaken=out;
          emit_jne(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jeq(0);
        }
      }
      if(opcode[i]==5) // BNE
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        if(invert){
          nottaken=out;
          emit_jeq(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jne(0);
        }
      }
      if(opcode[i]==6) // BLEZ
      {
        // s1 <= 0  <=>  s1 < 1 (signed compare against 1)
        emit_cmpimm(s1l,1);
        if(invert){
          nottaken=out;
          emit_jge(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jl(0);
        }
      }
      if(opcode[i]==7) // BGTZ
      {
        // s1 > 0  <=>  s1 >= 1 (signed compare against 1)
        emit_cmpimm(s1l,1);
        if(invert){
          nottaken=out;
          emit_jl(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jge(0);
        }
      }
      if(invert) {
        // Inverted condition: the taken path is emitted inline here, and
        // the short conditional jump above lands just past it.
        if(taken) set_jump_target(taken, out);
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken, out);
      }

      if(nottaken1) set_jump_target(nottaken1, out);
      if(adj) {
        // Undo the cycle-count adjustment on the fall-through path.
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //if(likely[i]) printf("IOL\n");
    //else
    //printf("IOE\n");
    void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
    if(!unconditional&&!nop) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      assert(s1l>=0);
      // Test the condition before the delay slot executes; jump over the
      // taken path if the branch is not taken.
      if((opcode[i]&0x2f)==4) // BEQ
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        nottaken=out;
        emit_jne(DJT_2);
      }
      if((opcode[i]&0x2f)==5) // BNE
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        nottaken=out;
        emit_jeq(DJT_2);
      }
      if((opcode[i]&0x2f)==6) // BLEZ
      {
        emit_cmpimm(s1l,1);
        nottaken=out;
        emit_jge(DJT_2);
      }
      if((opcode[i]&0x2f)==7) // BGTZ
      {
        emit_cmpimm(s1l,1);
        nottaken=out;
        emit_jl(DJT_2);
      }
    } // if(!unconditional)
    int adj;
    // Registers unneeded after the branch, except the delay slot sources.
    uint64_t ds_unneeded=branch_regs[i].u;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded|=1;
    // branch taken: assemble the delay slot here, then jump to the target.
    if(!nop) {
      if(taken) set_jump_target(taken, out);
      assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken: for "likely" branches the delay slot is nullified,
    // so it is only assembled here for ordinary branches.
    if(!unconditional) {
      if(nottaken1) set_jump_target(nottaken1, out);
      set_jump_target(nottaken, out);
      assem_debug("2:\n");
      if(!likely[i]) {
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
        load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}
5229
// Assemble a MIPS "regimm" sign-test branch: BLTZ/BGEZ and the linking
// forms BLTZAL/BGEZAL (which write the return address to $ra even when the
// branch is not taken).  Structure mirrors cjump_assemble: an out-of-order
// path (delay slot already assembled, single conditional jump) and an
// in-order path (test first, delay slot on both outcomes).
static void sjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  // As in cjump_assemble: invert the condition when the target block's
  // register allocation doesn't match ours.
  match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  assem_debug("smatch=%d\n",match);
  int s1l;
  int unconditional=0,nevertaken=0;
  int invert=0;
  int internal=internal_branch(ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;
  #endif
  #ifdef __aarch64__
  invert=1; // because of near cond. branches
  #endif

  //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
  //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)

  // Host register holding the (single) source operand.
  if(ooo[i]) {
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
  }
  if(rs1[i]==0)
  {
    // Testing $zero: BGEZ-style (odd opcode2) is always taken (0>=0),
    // BLTZ-style is never taken.
    if(opcode2[i]&1) unconditional=1;
    else nevertaken=1;
    // These are never taken (r0 is never less than zero)
    //assert(opcode2[i]!=0);
    //assert(opcode2[i]!=2);
    //assert(opcode2[i]!=0x10);
    //assert(opcode2[i]!=0x12);
  }

  if(ooo[i]) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    // Registers unneeded after the branch, except the branch sources.
    uint64_t bc_unneeded=branch_regs[i].u;
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
    load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],rs1[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
    if(rt1[i]==31) {
      // Linking form (BLTZAL/BGEZAL): $ra gets PC+8 unconditionally.
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        if(!nevertaken) emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    assem_debug("cycle count (adj)\n");
    if(unconditional) {
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      // Skip emitting the jump for an idle self-loop with empty delay slot.
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nevertaken) {
      // Never-taken branch: account cycles and test for events only.
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      void *nottaken = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      {
        assert(s1l>=0);
        // Sign-test the operand; jump on the appropriate sign flag.
        if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_jns(DJT_1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_js(0);
          }
        }
        if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_js(DJT_1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_jns(0);
          }
        }
      }

      if(invert) {
        // Inverted condition: taken path emitted inline, short jump above
        // skips it on the not-taken case.
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken, out);
      }

      if(adj) {
        // Undo the cycle-count adjustment on the fall-through path.
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //printf("IOE\n");
    void *nottaken = NULL;
    if(rt1[i]==31) {
      // Linking form: write the return address before testing.
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    if(!unconditional) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
        assert(s1l>=0);
        if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_jns(DJT_1);
        }
        if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_js(DJT_1);
        }
    } // if(!unconditional)
    int adj;
    // Registers unneeded after the branch, except the delay slot sources.
    uint64_t ds_unneeded=branch_regs[i].u;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded|=1;
    // branch taken: assemble the delay slot here, then jump to the target.
    if(!nevertaken) {
      //assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken: delay slot only assembled for non-"likely" forms.
    if(!unconditional) {
      set_jump_target(nottaken, out);
      assem_debug("1:\n");
      if(!likely[i]) {
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
        load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}
5496
// Assemble a branch whose delay slot lies on the next virtual page
// ("page-spanning" branch).  Instead of emitting the delay slot inline,
// this computes the branch target into HOST_BTREG and jumps to a separately
// compiled copy of the delay slot (see pagespan_ds), which then dispatches
// on the saved target.  Conditional targets are selected with conditional
// moves rather than conditional jumps where possible.
static void pagespan_assemble(int i,struct regstat *i_regs)
{
  int s1l=get_reg(i_regs->regmap,rs1[i]);
  int s2l=get_reg(i_regs->regmap,rs2[i]);
  void *taken = NULL;
  void *nottaken = NULL;
  int unconditional=0;
  if(rs1[i]==0)
  {
    // Fold a $zero operand into a single-register test.
    s1l=s2l;
    s2l=-1;
  }
  else if(rs2[i]==0)
  {
    s2l=-1;
  }
  int hr=0;
  int addr=-1,alt=-1,ntaddr=-1;
  // Scavenge scratch host registers for the target address (addr), the
  // alternate (not-taken) address (alt) and, for BLEZ/BGTZ, a third
  // temporary (ntaddr) -- none may clobber the operands or the cycle count.
  if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
  else {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        addr=hr++;break;
      }
      hr++;
    }
  }
  while(hr<HOST_REGS)
  {
    if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
       (i_regs->regmap[hr]&63)!=rs1[i] &&
       (i_regs->regmap[hr]&63)!=rs2[i] )
    {
      alt=hr++;break;
    }
    hr++;
  }
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
  {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        ntaddr=hr;break;
      }
      hr++;
    }
  }
  assert(hr<HOST_REGS);
  if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
    load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
  }
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  if(opcode[i]==2) // J
  {
    unconditional=1;
  }
  if(opcode[i]==3) // JAL
  {
    // TODO: mini_ht
    int rt=get_reg(i_regs->regmap,31);
    emit_movimm(start+i*4+8,rt);
    unconditional=1;
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    emit_mov(s1l,addr);
    if(opcode2[i]==9) // JALR
    {
      int rt=get_reg(i_regs->regmap,rt1[i]);
      emit_movimm(start+i*4+8,rt);
    }
  }
  if((opcode[i]&0x3f)==4) // BEQ
  {
    if(rs1[i]==rs2[i])
    {
      unconditional=1;
    }
    else
    #ifdef HAVE_CMOV_IMM
    if(1) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
    }
    else
    #endif
    {
      // Select target vs. fall-through address with a conditional move.
      assert(s1l>=0);
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==5) // BNE
  {
    #ifdef HAVE_CMOV_IMM
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
    #else
    assert(s1l>=0);
    emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    emit_cmovne_reg(alt,addr);
    #endif
  }
  if((opcode[i]&0x3f)==0x14) // BEQL
  {
    // "Likely" forms branch over the (nullified) delay slot instead.
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    if(nottaken) set_jump_target(nottaken, out);
    nottaken=out;
    emit_jne(0);
  }
  if((opcode[i]&0x3f)==0x15) // BNEL
  {
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    nottaken=out;
    emit_jeq(0);
    if(taken) set_jump_target(taken, out);
  }
  if((opcode[i]&0x3f)==6) // BLEZ
  {
    emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
    emit_cmpimm(s1l,1);
    emit_cmovl_reg(alt,addr);
  }
  if((opcode[i]&0x3f)==7) // BGTZ
  {
    emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
    emit_cmpimm(s1l,1);
    emit_cmovl_reg(ntaddr,addr);
  }
  if((opcode[i]&0x3f)==0x16) // BLEZL
  {
    assert((opcode[i]&0x3f)!=0x16);
  }
  if((opcode[i]&0x3f)==0x17) // BGTZL
  {
    assert((opcode[i]&0x3f)!=0x17);
  }
  assert(opcode[i]!=1); // BLTZ/BGEZ

  //FIXME: Check CSREG
  if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
    if((source[i]&0x30000)==0) // BC1F
    {
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x10000) // BC1T
    {
      emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x20000) // BC1FL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jne(0);
    }
    if((source[i]&0x30000)==0x30000) // BC1TL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jeq(0);
    }
  }

  assert(i_regs->regmap[HOST_CCREG]==CCREG);
  wb_dirtys(regs[i].regmap,regs[i].dirty);
  // Stash the resolved branch target in HOST_BTREG for pagespan_ds.
  if(likely[i]||unconditional)
  {
    emit_movimm(ba[i],HOST_BTREG);
  }
  else if(addr!=HOST_BTREG)
  {
    emit_mov(addr,HOST_BTREG);
  }
  void *branch_addr=out;
  emit_jmp(0);
  // Target is the delay slot address (+4) with bit 0 set to mark a
  // delay-slot entry point (pagespan_ds registers itself at start+1).
  int target_addr=start+i*4+5;
  void *stub=out;
  void *compiled_target_addr=check_addr(target_addr);
  emit_extjump_ds(branch_addr, target_addr);
  if(compiled_target_addr) {
    set_jump_target(branch_addr, compiled_target_addr);
    add_link(target_addr,stub);
  }
  else set_jump_target(branch_addr, stub);
  if(likely[i]) {
    // Not-taken path
    set_jump_target(nottaken, out);
    wb_dirtys(regs[i].regmap,regs[i].dirty);
    void *branch_addr=out;
    emit_jmp(0);
    int target_addr=start+i*4+8;
    void *stub=out;
    void *compiled_target_addr=check_addr(target_addr);
    emit_extjump_ds(branch_addr, target_addr);
    if(compiled_target_addr) {
      set_jump_target(branch_addr, compiled_target_addr);
      add_link(target_addr,stub);
    }
    else set_jump_target(branch_addr, stub);
  }
}
5717
5718 // Assemble the delay slot for the above
// Assemble the delay slot for the above
// (pagespan_assemble).  This compiles instruction 0 of the current block as
// a stand-alone delay slot, registered under the odd address start+1 so it
// is only reachable from a page-spanning branch.  Afterwards it dispatches
// on the branch target saved in HOST_BTREG/branch_target: fall through to
// start+4 if that was the target, otherwise do an indirect jump.
static void pagespan_ds()
{
  assem_debug("initial delay slot:\n");
  u_int vaddr=start+1; // bit 0 set marks a delay-slot entry point
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  ll_add(jump_dirty+vpage,vaddr,(void *)out);
  do_dirty_stub_ds();
  ll_add(jump_in+page,vaddr,(void *)out);
  assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty);
  // Spill the branch target to memory if BTREG isn't kept in its register.
  if(regs[0].regmap[HOST_BTREG]!=BTREG)
    emit_writeword(HOST_BTREG,&branch_target);
  load_regs(regs[0].regmap_entry,regs[0].regmap,rs1[0],rs2[0]);
  address_generation(0,&regs[0],regs[0].regmap_entry);
  if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
    load_regs(regs[0].regmap_entry,regs[0].regmap,INVCP,INVCP);
  is_delayslot=0;
  // Assemble instruction 0 by type, like the main assembly loop does.
  switch(itype[0]) {
    case ALU:
      alu_assemble(0,&regs[0]);break;
    case IMM16:
      imm16_assemble(0,&regs[0]);break;
    case SHIFT:
      shift_assemble(0,&regs[0]);break;
    case SHIFTIMM:
      shiftimm_assemble(0,&regs[0]);break;
    case LOAD:
      load_assemble(0,&regs[0]);break;
    case LOADLR:
      loadlr_assemble(0,&regs[0]);break;
    case STORE:
      store_assemble(0,&regs[0]);break;
    case STORELR:
      storelr_assemble(0,&regs[0]);break;
    case COP0:
      cop0_assemble(0,&regs[0]);break;
    case COP1:
      cop1_assemble(0,&regs[0]);break;
    case C1LS:
      c1ls_assemble(0,&regs[0]);break;
    case COP2:
      cop2_assemble(0,&regs[0]);break;
    case C2LS:
      c2ls_assemble(0,&regs[0]);break;
    case C2OP:
      c2op_assemble(0,&regs[0]);break;
    case MULTDIV:
      multdiv_assemble(0,&regs[0]);break;
    case MOV:
      mov_assemble(0,&regs[0]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Fetch the saved branch target, from a register or from memory.
  int btaddr=get_reg(regs[0].regmap,BTREG);
  if(btaddr<0) {
    btaddr=get_reg(regs[0].regmap,-1); // any free host register
    emit_readword(&branch_target,btaddr);
  }
  assert(btaddr!=HOST_CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
#ifdef HOST_IMM8
  // Host can't encode a 32-bit immediate compare; use a temp register.
  host_tempreg_acquire();
  emit_movimm(start+4,HOST_TEMPREG);
  emit_cmp(btaddr,HOST_TEMPREG);
  host_tempreg_release();
#else
  emit_cmpimm(btaddr,start+4);
#endif
  void *branch = out;
  emit_jeq(0);
  // Target != start+4: flush registers and jump indirectly to the target.
  store_regs_bt(regs[0].regmap,regs[0].dirty,-1);
  do_jump_vaddr(btaddr);
  set_jump_target(branch, out);
  // Target == start+4: fall through into the rest of this block.
  store_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
  load_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
}
5804
// Basic liveness analysis for MIPS registers.
// Walks instructions [istart,iend] backwards and computes, per instruction,
// a 64-bit mask of guest registers whose current value will never be read
// again (bit n set in unneeded_reg[i] => reg n is dead at i and need not be
// preserved/written back).  gte_unneeded[] tracks the same for GTE (COP2)
// registers.  r is the recursion depth, used to bound re-analysis of
// backward-branch targets (see the r<2 check below).
void unneeded_registers(int istart,int iend,int r)
{
  int i;
  uint64_t u,gte_u,b,gte_b;
  uint64_t temp_u,temp_gte_u=0;
  uint64_t gte_u_unknown=0;
  // With the GTE hack enabled, treat all GTE regs as unneeded across
  // unknown block boundaries instead of none (more aggressive, less safe).
  if (HACK_ENABLED(NDHACK_GTE_UNNEEDED))
    gte_u_unknown=~0ll;
  if(iend==slen-1) {
    // End of block: nothing is known dead except r0 (bit 0 is always set).
    u=1;
    gte_u=gte_u_unknown;
  }else{
    //u=unneeded_reg[iend+1];
    u=1;
    gte_u=gte_unneeded[iend+1];
  }

  for (i=iend;i>=istart;i--)
  {
    //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
    {
      // If subroutine call, flag return address as a possible branch target
      if(rt1[i]==31 && i<slen-2) bt[i+2]=1;

      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, flush all regs
        // (nothing is known live at an external target, so assume all needed)
        u=1;
        gte_u=gte_u_unknown;
        branch_unneeded_reg[i]=u;
        // Merge in delay slot: its targets become dead, its sources live
        u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
        u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
        u|=1; // r0 is always unneeded
        gte_u|=gte_rt[i+1];
        gte_u&=~gte_rs[i+1];
        // If branch is "likely" (and conditional)
        // then we skip the delay slot on the fall-thru path
        if(likely[i]) {
          if(i<slen-1) {
            u&=unneeded_reg[i+2];
            gte_u&=gte_unneeded[i+2];
          }
          else
          {
            u=1;
            gte_u=gte_u_unknown;
          }
        }
      }
      else
      {
        // Internal branch, flag target
        bt[(ba[i]-start)>>2]=1;
        if(ba[i]<=start+i*4) {
          // Backward branch: compute liveness at the branch itself into
          // temp_* first, then re-run the analysis over the loop body.
          if(is_ujump(i))
          {
            // Unconditional branch
            temp_u=1;
            temp_gte_u=0;
          } else {
            // Conditional branch (not taken case)
            temp_u=unneeded_reg[i+2];
            // NOTE(review): uses '&=' here while temp_u above uses plain
            // assignment; this only ever keeps temp_gte_u conservative
            // (fewer regs marked unneeded) — confirm this asymmetry is
            // intended rather than a leftover.
            temp_gte_u&=gte_unneeded[i+2];
          }
          // Merge in delay slot
          temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
          temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
          temp_u|=1;
          temp_gte_u|=gte_rt[i+1];
          temp_gte_u&=~gte_rs[i+1];
          // If branch is "likely" (and conditional)
          // then we skip the delay slot on the fall-thru path
          if(likely[i]) {
            if(i<slen-1) {
              temp_u&=unneeded_reg[i+2];
              temp_gte_u&=gte_unneeded[i+2];
            }
            else
            {
              temp_u=1;
              temp_gte_u=gte_u_unknown;
            }
          }
          // Merge in the branch instruction itself
          temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
          temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
          temp_u|=1;
          temp_gte_u|=gte_rt[i];
          temp_gte_u&=~gte_rs[i];
          unneeded_reg[i]=temp_u;
          gte_unneeded[i]=temp_gte_u;
          // Only go three levels deep.  This recursion can take an
          // excessive amount of time if there are a lot of nested loops.
          if(r<2) {
            unneeded_registers((ba[i]-start)>>2,i-1,r+1);
          }else{
            // Recursion limit hit: conservatively assume everything is
            // needed at the branch target.
            unneeded_reg[(ba[i]-start)>>2]=1;
            gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
          }
        } /*else*/ if(1) {
          // Forward (or re-visited backward) internal branch: merge the
          // liveness already computed at the target into u/gte_u.
          if (is_ujump(i))
          {
            // Unconditional branch
            u=unneeded_reg[(ba[i]-start)>>2];
            gte_u=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=u;
            // Merge in delay slot
            u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
            u|=1;
            gte_u|=gte_rt[i+1];
            gte_u&=~gte_rs[i+1];
          } else {
            // Conditional branch: taken-path liveness goes into b/gte_b,
            // then is intersected with the fall-through path below.
            b=unneeded_reg[(ba[i]-start)>>2];
            gte_b=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=b;
            // Branch delay slot
            b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
            b|=1;
            gte_b|=gte_rt[i+1];
            gte_b&=~gte_rs[i+1];
            // If branch is "likely" then we skip the
            // delay slot on the fall-thru path
            if(likely[i]) {
              u=b;
              gte_u=gte_b;
              if(i<slen-1) {
                u&=unneeded_reg[i+2];
                gte_u&=gte_unneeded[i+2];
              }
            } else {
              // A reg is unneeded only if dead on BOTH paths
              u&=b;
              gte_u&=gte_b;
            }
            if(i<slen-1) {
              branch_unneeded_reg[i]&=unneeded_reg[i+2];
            } else {
              branch_unneeded_reg[i]=1;
            }
          }
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      // Execution leaves the block; assume everything is needed after it.
      u=1;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      u=1;
    }
    //u=1; // DEBUG
    // Written registers are unneeded (their old value is overwritten)
    u|=1LL<<rt1[i];
    u|=1LL<<rt2[i];
    gte_u|=gte_rt[i];
    // Accessed registers are needed
    u&=~(1LL<<rs1[i]);
    u&=~(1LL<<rs2[i]);
    gte_u&=~gte_rs[i];
    if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
      gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
    // Source-target dependencies
    // R0 is always unneeded
    u|=1;
    // Save it
    unneeded_reg[i]=u;
    gte_unneeded[i]=gte_u;
    /*
    printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
    printf("U:");
    int r;
    for(r=1;r<=CCREG;r++) {
      if((unneeded_reg[i]>>r)&1) {
        if(r==HIREG) printf(" HI");
        else if(r==LOREG) printf(" LO");
        else printf(" r%d",r);
      }
    }
    printf("\n");
    */
  }
}
5995
// Write back dirty registers as soon as we will no longer modify them,
// so that we don't end up with lots of writes at the branches.
//
// Backward dataflow pass over instructions [istart,iend] computing, per
// instruction, two host-register bitmasks:
//   will_dirty[i] - host regs certain to be dirtied again later (writing
//                   them back now would be wasted work)
//   wont_dirty[i] - host regs that will not be dirtied again (safe to
//                   write back and mark clean here)
// When wr is nonzero the results are applied to the regs[]/branch_regs[]
// dirty/wasdirty bits; with wr==0 this is an analysis-only pass, used to
// limit recursion into backward-branch targets.
void clean_registers(int istart,int iend,int wr)
{
  int i;
  int r;
  u_int will_dirty_i,will_dirty_next,temp_will_dirty;
  u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
  if(iend==slen-1) {
    // End of block: nothing known about what follows
    will_dirty_i=will_dirty_next=0;
    wont_dirty_i=wont_dirty_next=0;
  }else{
    // Seed from the already-computed state of the next instruction
    will_dirty_i=will_dirty_next=will_dirty[iend+1];
    wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
  }
  for (i=iend;i>=istart;i--)
  {
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
    {
      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, flush all regs
        if (is_ujump(i))
        {
          // Unconditional branch
          will_dirty_i=0;
          wont_dirty_i=0;
          // Merge in delay slot (will dirty)
          // A host reg holding the branch's or delay slot's target guest
          // reg will be dirtied; regmap values >33 (temporaries) and <=0
          // (unmapped/r0) never count.  CCREG (cycle count) always does.
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
              if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
              if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
              if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
              if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
            }
          }
        }
        else
        {
          // Conditional branch
          will_dirty_i=0;
          wont_dirty_i=wont_dirty_next;
          // Merge in delay slot (will dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if(!likely[i]) {
                // Might not dirty if likely branch is not taken
                if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                // NOTE(review): '==0' here while the parallel paths use
                // '<=0' — only affects negative (unmapped) entries, and
                // only conservatively; confirm this is intentional.
                if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              }
            }
          }
        }
        // Merge in delay slot (wont dirty)
        for(r=0;r<HOST_REGS;r++) {
          if(r!=EXCLUDE_REG) {
            if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
            if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
            if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
          }
        }
        if(wr) {
          #ifndef DESTRUCTIVE_WRITEBACK
          branch_regs[i].dirty&=wont_dirty_i;
          #endif
          branch_regs[i].dirty|=will_dirty_i;
        }
      }
      else
      {
        // Internal branch
        if(ba[i]<=start+i*4) {
          // Backward branch: compute into temp_* so the current state
          // isn't clobbered, store, and recurse over the loop body.
          if (is_ujump(i))
          {
            // Unconditional branch
            temp_will_dirty=0;
            temp_wont_dirty=0;
            // Merge in delay slot (will dirty)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
              }
            }
          } else {
            // Conditional branch (not taken case)
            temp_will_dirty=will_dirty_next;
            temp_wont_dirty=wont_dirty_next;
            // Merge in delay slot (will dirty)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(!likely[i]) {
                  // Will not dirty if likely branch is not taken
                  if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                  // NOTE(review): '==0' vs '<=0' elsewhere — see note in
                  // the external-branch case above.
                  if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
                  if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                  if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                  if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                }
              }
            }
          }
          // Merge in delay slot (wont dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
              if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
              if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
            }
          }
          // Deal with changed mappings
          if(i<iend) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]!=regmap_pre[i][r]) {
                  temp_will_dirty&=~(1<<r);
                  temp_wont_dirty&=~(1<<r);
                  if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
                    temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
                    temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
                  } else {
                    temp_will_dirty|=1<<r;
                    temp_wont_dirty|=1<<r;
                  }
                }
              }
            }
          }
          if(wr) {
            will_dirty[i]=temp_will_dirty;
            wont_dirty[i]=temp_wont_dirty;
            // Recurse over the loop body; wr==0 in the recursive call
            // prevents further recursion from there.
            clean_registers((ba[i]-start)>>2,i-1,0);
          }else{
            // Limit recursion.  It can take an excessive amount
            // of time if there are a lot of nested loops.
            will_dirty[(ba[i]-start)>>2]=0;
            wont_dirty[(ba[i]-start)>>2]=-1;
          }
        }
        /*else*/ if(1)
        {
          // Forward internal branch (also re-runs after the backward case
          // above): merge in state already computed for the branch target.
          if (is_ujump(i))
          {
            // Unconditional branch
            will_dirty_i=0;
            wont_dirty_i=0;
          //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                if(branch_regs[i].regmap[r]>=0) {
                  // Unneeded guest regs can be treated as both will- and
                  // wont-dirty (their value never needs writing back)
                  will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
                  wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
                }
              }
            }
          //}
            // Merge in delay slot
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              }
            }
          } else {
            // Conditional branch
            will_dirty_i=will_dirty_next;
            wont_dirty_i=wont_dirty_next;
          //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                signed char target_reg=branch_regs[i].regmap[r];
                if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  // NOTE(review): '&=' with a single-bit mask clears all
                  // other host-reg bits accumulated so far; this matches
                  // upstream and is only over-conservative (forces more
                  // writebacks), but looks asymmetric vs the '|=' on
                  // wont_dirty_i — confirm intended.
                  will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                else if(target_reg>=0) {
                  will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                  wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                }
                // Treat delay slot as part of branch too
                /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                else
                {
                  will_dirty[i+1]&=~(1<<r);
                }*/
              }
            }
          //}
            // Merge in delay slot
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(!likely[i]) {
                  // Might not dirty if likely branch is not taken
                  if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                  if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                  if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                  if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                  if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                }
              }
            }
          }
          // Merge in delay slot (won't dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
              if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
              if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
            }
          }
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            branch_regs[i].dirty&=wont_dirty_i;
            #endif
            branch_regs[i].dirty|=will_dirty_i;
          }
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      // Execution leaves the block; no assumptions about later dirtying.
      will_dirty_i=0;
      wont_dirty_i=0;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      will_dirty_i=0;
      wont_dirty_i=0;
    }
    // Remember the successor's state before folding in instruction i
    will_dirty_next=will_dirty_i;
    wont_dirty_next=wont_dirty_i;
    for(r=0;r<HOST_REGS;r++) {
      if(r!=EXCLUDE_REG) {
        if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
        if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
        if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
        if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
        if(i>istart) {
          if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP)
          {
            // Don't store a register immediately after writing it,
            // may prevent dual-issue.
            if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
          }
        }
      }
    }
    // Save it
    will_dirty[i]=will_dirty_i;
    wont_dirty[i]=wont_dirty_i;
    // Mark registers that won't be dirtied as not dirty
    if(wr) {
      /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
      for(r=0;r<HOST_REGS;r++) {
        if((will_dirty_i>>r)&1) {
          printf(" r%d",r);
        }
      }
      printf("\n");*/

      //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP)) {
        regs[i].dirty|=will_dirty_i;
        #ifndef DESTRUCTIVE_WRITEBACK
        regs[i].dirty&=wont_dirty_i;
        if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
        {
          // Propagate cleanliness into the instruction after the delay
          // slot where the host-reg mapping is unchanged
          if (i < iend-1 && !is_ujump(i)) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
                  regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
                }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
              }
            }
          }
        }
        else
        {
          if(i<iend) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
                  regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
                }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
              }
            }
          }
        }
        #endif
      //}
    }
    // Deal with changed mappings: translate the masks from this
    // instruction's exit mapping back to its entry mapping
    temp_will_dirty=will_dirty_i;
    temp_wont_dirty=wont_dirty_i;
    for(r=0;r<HOST_REGS;r++) {
      if(r!=EXCLUDE_REG) {
        int nr;
        if(regs[i].regmap[r]==regmap_pre[i][r]) {
          // Mapping unchanged across this instruction
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            regs[i].wasdirty&=wont_dirty_i|~(1<<r);
            #endif
            regs[i].wasdirty|=will_dirty_i&(1<<r);
          }
        }
        else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
          // Register moved to a different register
          will_dirty_i&=~(1<<r);
          wont_dirty_i&=~(1<<r);
          will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
          wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            regs[i].wasdirty&=wont_dirty_i|~(1<<r);
            #endif
            regs[i].wasdirty|=will_dirty_i&(1<<r);
          }
        }
        else {
          // Guest reg was dropped from the mapping
          will_dirty_i&=~(1<<r);
          wont_dirty_i&=~(1<<r);
          if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
            will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
            wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
          } else {
            wont_dirty_i|=1<<r;
            /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
          }
        }
      }
    }
  }
}
6429
#ifdef DISASM
  /* disassembly */
// Print a one-line human-readable listing of decoded instruction i to
// stdout: guest PC, mnemonic (insn[i]) and operands, formatted per
// instruction class (itype[i]).  A leading '*' marks branch targets.
void disassemble_inst(int i)
{
    if (bt[i]) printf("*"); else printf(" ");
    switch(itype[i]) {
      case UJUMP:
        printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
      case CJUMP:
        // Target recomputed from the immediate; NOTE(review): for i==0
        // the expression falls back to *ba (== ba[0]) — presumably the
        // precomputed target of the first slot; confirm.
        printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
      case SJUMP:
        printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
      case RJUMP:
        // JALR with a non-$ra destination prints both regs
        if (opcode[i]==0x9&&rt1[i]!=31)
          printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
        else
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        break;
      case SPAN:
        printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
      case IMM16:
        if(opcode[i]==0xf) //LUI
          printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
        else
          printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case LOAD:
      case LOADLR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case STORE:
      case STORELR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
        break;
      case ALU:
      case SHIFT:
        printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
        break;
      case MULTDIV:
        printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
        break;
      case SHIFTIMM:
        printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case MOV:
        // MFHI/MFLO write rt1, MTHI/MTLO read rs1
        if((opcode2[i]&0x1d)==0x10)
          printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
        else if((opcode2[i]&0x1d)==0x11)
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        else
          printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP0:
        if(opcode2[i]==0)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
        else if(opcode2[i]==4)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP1:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP2:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case C1LS:
        printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case C2LS:
        printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case INTCALL:
        printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
        break;
      default:
        //printf (" %s %8x\n",insn[i],source[i]);
        printf (" %x: %s\n",start+i*4,insn[i]);
    }
}
#else
// No-op stub when the disassembler is compiled out
static void disassemble_inst(int i) {}
#endif // DISASM
6520
6521 #define DRC_TEST_VAL 0x74657374
6522
6523 static void new_dynarec_test(void)
6524 {
6525   int (*testfunc)(void);
6526   void *beginning;
6527   int ret[2];
6528   size_t i;
6529
6530   // check structure linkage
6531   if ((u_char *)rcnts - (u_char *)&psxRegs != sizeof(psxRegs))
6532   {
6533     SysPrintf("linkage_arm* miscompilation/breakage detected.\n");
6534   }
6535
6536   SysPrintf("testing if we can run recompiled code...\n");
6537   ((volatile u_int *)out)[0]++; // make cache dirty
6538
6539   for (i = 0; i < ARRAY_SIZE(ret); i++) {
6540     out = ndrc->translation_cache;
6541     beginning = start_block();
6542     emit_movimm(DRC_TEST_VAL + i, 0); // test
6543     emit_ret();
6544     literal_pool(0);
6545     end_block(beginning);
6546     testfunc = beginning;
6547     ret[i] = testfunc();
6548   }
6549
6550   if (ret[0] == DRC_TEST_VAL && ret[1] == DRC_TEST_VAL + 1)
6551     SysPrintf("test passed.\n");
6552   else
6553     SysPrintf("test failed, will likely crash soon (r=%08x %08x)\n", ret[0], ret[1]);
6554   out = ndrc->translation_cache;
6555 }
6556
6557 // clear the state completely, instead of just marking
6558 // things invalid like invalidate_all_pages() does
6559 void new_dynarec_clear_full(void)
6560 {
6561   int n;
6562   out = ndrc->translation_cache;
6563   memset(invalid_code,1,sizeof(invalid_code));
6564   memset(hash_table,0xff,sizeof(hash_table));
6565   memset(mini_ht,-1,sizeof(mini_ht));
6566   memset(restore_candidate,0,sizeof(restore_candidate));
6567   memset(shadow,0,sizeof(shadow));
6568   copy=shadow;
6569   expirep=16384; // Expiry pointer, +2 blocks
6570   pending_exception=0;
6571   literalcount=0;
6572   stop_after_jal=0;
6573   inv_code_start=inv_code_end=~0;
6574   // TLB
6575   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6576   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6577   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6578 }
6579
// One-time setup: obtain an executable translation cache (platform specific),
// reset all recompiler state and run a quick execute-from-cache self test.
void new_dynarec_init(void)
{
  SysPrintf("Init new dynarec\n");

#ifdef BASE_ADDR_DYNAMIC
  #ifdef VITA
  // Vita: executable memory must be allocated as a kernel VM block
  sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
  if (sceBlock < 0)
    SysPrintf("sceKernelAllocMemBlockForVM failed\n");
  int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&ndrc);
  if (ret < 0)
    SysPrintf("sceKernelGetMemBlockBase failed\n");
  #else
  uintptr_t desired_addr = 0;
  #ifdef __ELF__
  // hint mmap to place the cache just past the program image, rounded up
  // to a 16MB boundary — presumably to keep generated code close to the
  // static code/data (TODO confirm the motivation)
  extern char _end;
  desired_addr = ((uintptr_t)&_end + 0xffffff) & ~0xffffffl;
  #endif
  ndrc = mmap((void *)desired_addr, sizeof(*ndrc),
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (ndrc == MAP_FAILED) {
    SysPrintf("mmap() failed: %s\n", strerror(errno));
    abort();
  }
  #endif
#else
  // static cache: just flip on write+exec permissions where needed
  #ifndef NO_WRITE_EXEC
  // not all systems allow execute in data segment by default
  if (mprotect(ndrc, sizeof(ndrc->translation_cache) + sizeof(ndrc->tramp.ops),
               PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    SysPrintf("mprotect() failed: %s\n", strerror(errno));
  #endif
#endif
  out = ndrc->translation_cache;
  cycle_multiplier=200;
  new_dynarec_clear_full();
#ifdef HOST_IMM8
  // Copy this into local area so we don't have to put it in every literal pool
  invc_ptr=invalid_code;
#endif
  arch_init();
  new_dynarec_test();
#ifndef RAM_FIXED
  ram_offset=(uintptr_t)rdram-0x80000000;
#endif
  if (ram_offset!=0)
    SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
}
6629
// Counterpart of new_dynarec_init(): release the translation cache
// (where it was dynamically allocated) and drop all jump lists.
void new_dynarec_cleanup(void)
{
  int n;
#ifdef BASE_ADDR_DYNAMIC
  #ifdef VITA
  sceKernelFreeMemBlock(sceBlock);
  sceBlock = -1;
  #else
  if (munmap(ndrc, sizeof(*ndrc)) < 0)
    SysPrintf("munmap() failed\n");
  #endif
#endif
  // free every linked list of block entry/exit records
  for(n=0;n<4096;n++) ll_clear(jump_in+n);
  for(n=0;n<4096;n++) ll_clear(jump_out+n);
  for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
  #ifdef ROM_COPY
  if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
  #endif
}
6649
6650 static u_int *get_source_start(u_int addr, u_int *limit)
6651 {
6652   if (!HACK_ENABLED(NDHACK_OVERRIDE_CYCLE_M))
6653     cycle_multiplier_override = 0;
6654
6655   if (addr < 0x00200000 ||
6656     (0xa0000000 <= addr && addr < 0xa0200000))
6657   {
6658     // used for BIOS calls mostly?
6659     *limit = (addr&0xa0000000)|0x00200000;
6660     return (u_int *)(rdram + (addr&0x1fffff));
6661   }
6662   else if (!Config.HLE && (
6663     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
6664     (0xbfc00000 <= addr && addr < 0xbfc80000)))
6665   {
6666     // BIOS. The multiplier should be much higher as it's uncached 8bit mem,
6667     // but timings in PCSX are too tied to the interpreter's BIAS
6668     if (!HACK_ENABLED(NDHACK_OVERRIDE_CYCLE_M))
6669       cycle_multiplier_override = 200;
6670
6671     *limit = (addr & 0xfff00000) | 0x80000;
6672     return (u_int *)((u_char *)psxR + (addr&0x7ffff));
6673   }
6674   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
6675     *limit = (addr & 0x80600000) + 0x00200000;
6676     return (u_int *)(rdram + (addr&0x1fffff));
6677   }
6678   return NULL;
6679 }
6680
6681 static u_int scan_for_ret(u_int addr)
6682 {
6683   u_int limit = 0;
6684   u_int *mem;
6685
6686   mem = get_source_start(addr, &limit);
6687   if (mem == NULL)
6688     return addr;
6689
6690   if (limit > addr + 0x1000)
6691     limit = addr + 0x1000;
6692   for (; addr < limit; addr += 4, mem++) {
6693     if (*mem == 0x03e00008) // jr $ra
6694       return addr + 8;
6695   }
6696   return addr;
6697 }
6698
// One compiled-block record stored in a savestate: the block's guest start
// address and the bitmask of GPRs that held speculation-relevant values.
struct savestate_block {
  uint32_t addr;
  uint32_t regflags;
};

// qsort comparator ordering savestate blocks by address.
// Uses explicit comparisons rather than subtraction: subtracting two
// uint32_t values and truncating to int can wrap and report the wrong
// sign when the addresses differ by more than INT_MAX (e.g. 0x00000000
// vs 0x80000001), which makes the comparator inconsistent for qsort.
static int addr_cmp(const void *p1_, const void *p2_)
{
  const struct savestate_block *p1 = p1_, *p2 = p2_;
  return (p1->addr > p2->addr) - (p1->addr < p2->addr);
}
6709
6710 int new_dynarec_save_blocks(void *save, int size)
6711 {
6712   struct savestate_block *blocks = save;
6713   int maxcount = size / sizeof(blocks[0]);
6714   struct savestate_block tmp_blocks[1024];
6715   struct ll_entry *head;
6716   int p, s, d, o, bcnt;
6717   u_int addr;
6718
6719   o = 0;
6720   for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
6721     bcnt = 0;
6722     for (head = jump_in[p]; head != NULL; head = head->next) {
6723       tmp_blocks[bcnt].addr = head->vaddr;
6724       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
6725       bcnt++;
6726     }
6727     if (bcnt < 1)
6728       continue;
6729     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
6730
6731     addr = tmp_blocks[0].addr;
6732     for (s = d = 0; s < bcnt; s++) {
6733       if (tmp_blocks[s].addr < addr)
6734         continue;
6735       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
6736         tmp_blocks[d++] = tmp_blocks[s];
6737       addr = scan_for_ret(tmp_blocks[s].addr);
6738     }
6739
6740     if (o + d > maxcount)
6741       d = maxcount - o;
6742     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
6743     o += d;
6744   }
6745
6746   return o * sizeof(blocks[0]);
6747 }
6748
6749 void new_dynarec_load_blocks(const void *save, int size)
6750 {
6751   const struct savestate_block *blocks = save;
6752   int count = size / sizeof(blocks[0]);
6753   u_int regs_save[32];
6754   uint32_t f;
6755   int i, b;
6756
6757   get_addr(psxRegs.pc);
6758
6759   // change GPRs for speculation to at least partially work..
6760   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
6761   for (i = 1; i < 32; i++)
6762     psxRegs.GPR.r[i] = 0x80000000;
6763
6764   for (b = 0; b < count; b++) {
6765     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6766       if (f & 1)
6767         psxRegs.GPR.r[i] = 0x1f800000;
6768     }
6769
6770     get_addr(blocks[b].addr);
6771
6772     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6773       if (f & 1)
6774         psxRegs.GPR.r[i] = 0x80000000;
6775     }
6776   }
6777
6778   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
6779 }
6780
6781 int new_recompile_block(u_int addr)
6782 {
6783   u_int pagelimit = 0;
6784   u_int state_rflags = 0;
6785   int i;
6786
6787   assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out);
6788   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
6789   //if(debug)
6790   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
6791
6792   // this is just for speculation
6793   for (i = 1; i < 32; i++) {
6794     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
6795       state_rflags |= 1 << i;
6796   }
6797
6798   start = (u_int)addr&~3;
6799   //assert(((u_int)addr&1)==0); // start-in-delay-slot flag
6800   new_dynarec_did_compile=1;
6801   if (Config.HLE && start == 0x80001000) // hlecall
6802   {
6803     // XXX: is this enough? Maybe check hleSoftCall?
6804     void *beginning=start_block();
6805     u_int page=get_page(start);
6806
6807     invalid_code[start>>12]=0;
6808     emit_movimm(start,0);
6809     emit_writeword(0,&pcaddr);
6810     emit_far_jump(new_dyna_leave);
6811     literal_pool(0);
6812     end_block(beginning);
6813     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
6814     return 0;
6815   }
6816
6817   source = get_source_start(start, &pagelimit);
6818   if (source == NULL) {
6819     SysPrintf("Compile at bogus memory address: %08x\n", addr);
6820     abort();
6821   }
6822
6823   /* Pass 1: disassemble */
6824   /* Pass 2: register dependencies, branch targets */
6825   /* Pass 3: register allocation */
6826   /* Pass 4: branch dependencies */
6827   /* Pass 5: pre-alloc */
6828   /* Pass 6: optimize clean/dirty state */
6829   /* Pass 7: flag 32-bit registers */
6830   /* Pass 8: assembly */
6831   /* Pass 9: linker */
6832   /* Pass 10: garbage collection / free memory */
6833
6834   int j;
6835   int done=0;
6836   unsigned int type,op,op2;
6837
6838   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
6839
6840   /* Pass 1 disassembly */
6841
6842   for(i=0;!done;i++) {
6843     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
6844     minimum_free_regs[i]=0;
6845     opcode[i]=op=source[i]>>26;
6846     switch(op)
6847     {
6848       case 0x00: strcpy(insn[i],"special"); type=NI;
6849         op2=source[i]&0x3f;
6850         switch(op2)
6851         {
6852           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
6853           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
6854           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
6855           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
6856           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
6857           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
6858           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
6859           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
6860           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
6861           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
6862           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
6863           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
6864           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
6865           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
6866           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
6867           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
6868           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
6869           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
6870           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
6871           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
6872           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
6873           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
6874           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
6875           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
6876           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
6877           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
6878           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
6879           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
6880           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
6881           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
6882           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
6883           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
6884           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
6885           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
6886           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
6887 #if 0
6888           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
6889           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
6890           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
6891           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
6892           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
6893           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
6894           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
6895           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
6896           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
6897           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
6898           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
6899           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
6900           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
6901           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
6902           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
6903           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
6904           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
6905 #endif
6906         }
6907         break;
6908       case 0x01: strcpy(insn[i],"regimm"); type=NI;
6909         op2=(source[i]>>16)&0x1f;
6910         switch(op2)
6911         {
6912           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
6913           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
6914           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
6915           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
6916           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
6917           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
6918           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
6919           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
6920           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
6921           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
6922           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
6923           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
6924           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
6925           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
6926         }
6927         break;
6928       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
6929       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
6930       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
6931       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
6932       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
6933       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
6934       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
6935       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
6936       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
6937       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
6938       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
6939       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
6940       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
6941       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
6942       case 0x10: strcpy(insn[i],"cop0"); type=NI;
6943         op2=(source[i]>>21)&0x1f;
6944         switch(op2)
6945         {
6946           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
6947           case 0x02: strcpy(insn[i],"CFC0"); type=COP0; break;
6948           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
6949           case 0x06: strcpy(insn[i],"CTC0"); type=COP0; break;
6950           case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
6951         }
6952         break;
6953       case 0x11: strcpy(insn[i],"cop1"); type=COP1;
6954         op2=(source[i]>>21)&0x1f;
6955         break;
6956 #if 0
6957       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
6958       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
6959       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
6960       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
6961       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
6962       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
6963       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
6964       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
6965 #endif
6966       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
6967       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
6968       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
6969       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
6970       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
6971       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
6972       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
6973 #if 0
6974       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
6975 #endif
6976       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
6977       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
6978       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
6979       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
6980 #if 0
6981       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
6982       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
6983 #endif
6984       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
6985       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
6986       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
6987       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
6988 #if 0
6989       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
6990       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
6991       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
6992 #endif
6993       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
6994       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
6995 #if 0
6996       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
6997       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
6998       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
6999 #endif
7000       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7001         op2=(source[i]>>21)&0x1f;
7002         //if (op2 & 0x10)
7003         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7004           if (gte_handlers[source[i]&0x3f]!=NULL) {
7005             if (gte_regnames[source[i]&0x3f]!=NULL)
7006               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7007             else
7008               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7009             type=C2OP;
7010           }
7011         }
7012         else switch(op2)
7013         {
7014           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7015           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7016           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7017           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7018         }
7019         break;
7020       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7021       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7022       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7023       default: strcpy(insn[i],"???"); type=NI;
7024         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7025         break;
7026     }
7027     itype[i]=type;
7028     opcode2[i]=op2;
7029     /* Get registers/immediates */
7030     lt1[i]=0;
7031     dep1[i]=0;
7032     dep2[i]=0;
7033     gte_rs[i]=gte_rt[i]=0;
7034     switch(type) {
7035       case LOAD:
7036         rs1[i]=(source[i]>>21)&0x1f;
7037         rs2[i]=0;
7038         rt1[i]=(source[i]>>16)&0x1f;
7039         rt2[i]=0;
7040         imm[i]=(short)source[i];
7041         break;
7042       case STORE:
7043       case STORELR:
7044         rs1[i]=(source[i]>>21)&0x1f;
7045         rs2[i]=(source[i]>>16)&0x1f;
7046         rt1[i]=0;
7047         rt2[i]=0;
7048         imm[i]=(short)source[i];
7049         break;
7050       case LOADLR:
7051         // LWL/LWR only load part of the register,
7052         // therefore the target register must be treated as a source too
7053         rs1[i]=(source[i]>>21)&0x1f;
7054         rs2[i]=(source[i]>>16)&0x1f;
7055         rt1[i]=(source[i]>>16)&0x1f;
7056         rt2[i]=0;
7057         imm[i]=(short)source[i];
7058         if(op==0x26) dep1[i]=rt1[i]; // LWR
7059         break;
7060       case IMM16:
7061         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7062         else rs1[i]=(source[i]>>21)&0x1f;
7063         rs2[i]=0;
7064         rt1[i]=(source[i]>>16)&0x1f;
7065         rt2[i]=0;
7066         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7067           imm[i]=(unsigned short)source[i];
7068         }else{
7069           imm[i]=(short)source[i];
7070         }
7071         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7072         break;
7073       case UJUMP:
7074         rs1[i]=0;
7075         rs2[i]=0;
7076         rt1[i]=0;
7077         rt2[i]=0;
7078         // The JAL instruction writes to r31.
7079         if (op&1) {
7080           rt1[i]=31;
7081         }
7082         rs2[i]=CCREG;
7083         break;
7084       case RJUMP:
7085         rs1[i]=(source[i]>>21)&0x1f;
7086         rs2[i]=0;
7087         rt1[i]=0;
7088         rt2[i]=0;
7089         // The JALR instruction writes to rd.
7090         if (op2&1) {
7091           rt1[i]=(source[i]>>11)&0x1f;
7092         }
7093         rs2[i]=CCREG;
7094         break;
7095       case CJUMP:
7096         rs1[i]=(source[i]>>21)&0x1f;
7097         rs2[i]=(source[i]>>16)&0x1f;
7098         rt1[i]=0;
7099         rt2[i]=0;
7100         if(op&2) { // BGTZ/BLEZ
7101           rs2[i]=0;
7102         }
7103         likely[i]=op>>4;
7104         break;
7105       case SJUMP:
7106         rs1[i]=(source[i]>>21)&0x1f;
7107         rs2[i]=CCREG;
7108         rt1[i]=0;
7109         rt2[i]=0;
7110         if(op2&0x10) { // BxxAL
7111           rt1[i]=31;
7112           // NOTE: If the branch is not taken, r31 is still overwritten
7113         }
7114         likely[i]=(op2&2)>>1;
7115         break;
7116       case ALU:
7117         rs1[i]=(source[i]>>21)&0x1f; // source
7118         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7119         rt1[i]=(source[i]>>11)&0x1f; // destination
7120         rt2[i]=0;
7121         if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7122           dep1[i]=rs1[i];dep2[i]=rs2[i];
7123         }
7124         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7125           dep1[i]=rs1[i];dep2[i]=rs2[i];
7126         }
7127         break;
7128       case MULTDIV:
7129         rs1[i]=(source[i]>>21)&0x1f; // source
7130         rs2[i]=(source[i]>>16)&0x1f; // divisor
7131         rt1[i]=HIREG;
7132         rt2[i]=LOREG;
7133         break;
7134       case MOV:
7135         rs1[i]=0;
7136         rs2[i]=0;
7137         rt1[i]=0;
7138         rt2[i]=0;
7139         if(op2==0x10) rs1[i]=HIREG; // MFHI
7140         if(op2==0x11) rt1[i]=HIREG; // MTHI
7141         if(op2==0x12) rs1[i]=LOREG; // MFLO
7142         if(op2==0x13) rt1[i]=LOREG; // MTLO
7143         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7144         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7145         dep1[i]=rs1[i];
7146         break;
7147       case SHIFT:
7148         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7149         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7150         rt1[i]=(source[i]>>11)&0x1f; // destination
7151         rt2[i]=0;
7152         break;
7153       case SHIFTIMM:
7154         rs1[i]=(source[i]>>16)&0x1f;
7155         rs2[i]=0;
7156         rt1[i]=(source[i]>>11)&0x1f;
7157         rt2[i]=0;
7158         imm[i]=(source[i]>>6)&0x1f;
7159         // DSxx32 instructions
7160         if(op2>=0x3c) imm[i]|=0x20;
7161         break;
7162       case COP0:
7163         rs1[i]=0;
7164         rs2[i]=0;
7165         rt1[i]=0;
7166         rt2[i]=0;
7167         if(op2==0||op2==2) rt1[i]=(source[i]>>16)&0x1F; // MFC0/CFC0
7168         if(op2==4||op2==6) rs1[i]=(source[i]>>16)&0x1F; // MTC0/CTC0
7169         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7170         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7171         break;
7172       case COP1:
7173         rs1[i]=0;
7174         rs2[i]=0;
7175         rt1[i]=0;
7176         rt2[i]=0;
7177         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7178         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7179         rs2[i]=CSREG;
7180         break;
7181       case COP2:
7182         rs1[i]=0;
7183         rs2[i]=0;
7184         rt1[i]=0;
7185         rt2[i]=0;
7186         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7187         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7188         rs2[i]=CSREG;
7189         int gr=(source[i]>>11)&0x1F;
7190         switch(op2)
7191         {
7192           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7193           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7194           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7195           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7196         }
7197         break;
7198       case C1LS:
7199         rs1[i]=(source[i]>>21)&0x1F;
7200         rs2[i]=CSREG;
7201         rt1[i]=0;
7202         rt2[i]=0;
7203         imm[i]=(short)source[i];
7204         break;
7205       case C2LS:
7206         rs1[i]=(source[i]>>21)&0x1F;
7207         rs2[i]=0;
7208         rt1[i]=0;
7209         rt2[i]=0;
7210         imm[i]=(short)source[i];
7211         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7212         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7213         break;
7214       case C2OP:
7215         rs1[i]=0;
7216         rs2[i]=0;
7217         rt1[i]=0;
7218         rt2[i]=0;
7219         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7220         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7221         gte_rt[i]|=1ll<<63; // every op changes flags
7222         if((source[i]&0x3f)==GTE_MVMVA) {
7223           int v = (source[i] >> 15) & 3;
7224           gte_rs[i]&=~0xe3fll;
7225           if(v==3) gte_rs[i]|=0xe00ll;
7226           else gte_rs[i]|=3ll<<(v*2);
7227         }
7228         break;
7229       case SYSCALL:
7230       case HLECALL:
7231       case INTCALL:
7232         rs1[i]=CCREG;
7233         rs2[i]=0;
7234         rt1[i]=0;
7235         rt2[i]=0;
7236         break;
7237       default:
7238         rs1[i]=0;
7239         rs2[i]=0;
7240         rt1[i]=0;
7241         rt2[i]=0;
7242     }
7243     /* Calculate branch target addresses */
7244     if(type==UJUMP)
7245       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7246     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7247       ba[i]=start+i*4+8; // Ignore never taken branch
7248     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7249       ba[i]=start+i*4+8; // Ignore never taken branch
7250     else if(type==CJUMP||type==SJUMP)
7251       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7252     else ba[i]=-1;
7253     if (i > 0 && is_jump(i-1)) {
7254       int do_in_intrp=0;
7255       // branch in delay slot?
7256       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP) {
7257         // don't handle first branch and call interpreter if it's hit
7258         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7259         do_in_intrp=1;
7260       }
7261       // basic load delay detection
7262       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7263         int t=(ba[i-1]-start)/4;
7264         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7265           // jump target wants DS result - potential load delay effect
7266           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7267           do_in_intrp=1;
7268           bt[t+1]=1; // expected return from interpreter
7269         }
7270         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7271               !(i>=3&&is_jump(i-3))) {
7272           // v0 overwrite like this is a sign of trouble, bail out
7273           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7274           do_in_intrp=1;
7275         }
7276       }
7277       if(do_in_intrp) {
7278         rs1[i-1]=CCREG;
7279         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7280         ba[i-1]=-1;
7281         itype[i-1]=INTCALL;
7282         done=2;
7283         i--; // don't compile the DS
7284       }
7285     }
7286     /* Is this the end of the block? */
7287     if (i > 0 && is_ujump(i-1)) {
7288       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7289         done=2;
7290       }
7291       else {
7292         if(stop_after_jal) done=1;
7293         // Stop on BREAK
7294         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7295       }
7296       // Don't recompile stuff that's already compiled
7297       if(check_addr(start+i*4+4)) done=1;
7298       // Don't get too close to the limit
7299       if(i>MAXBLOCK/2) done=1;
7300     }
7301     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7302     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7303     if(done==2) {
7304       // Does the block continue due to a branch?
7305       for(j=i-1;j>=0;j--)
7306       {
7307         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7308         if(ba[j]==start+i*4+4) done=j=0;
7309         if(ba[j]==start+i*4+8) done=j=0;
7310       }
7311     }
7312     //assert(i<MAXBLOCK-1);
7313     if(start+i*4==pagelimit-4) done=1;
7314     assert(start+i*4<pagelimit);
7315     if (i==MAXBLOCK-1) done=1;
7316     // Stop if we're compiling junk
7317     if(itype[i]==NI&&opcode[i]==0x11) {
7318       done=stop_after_jal=1;
7319       SysPrintf("Disabled speculative precompilation\n");
7320     }
7321   }
7322   slen=i;
7323   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP) {
7324     if(start+i*4==pagelimit) {
7325       itype[i-1]=SPAN;
7326     }
7327   }
7328   assert(slen>0);
7329
7330   /* Pass 2 - Register dependencies and branch targets */
7331
7332   unneeded_registers(0,slen-1,0);
7333
7334   /* Pass 3 - Register allocation */
7335
7336   struct regstat current; // Current register allocations/status
7337   current.dirty=0;
7338   current.u=unneeded_reg[0];
7339   clear_all_regs(current.regmap);
7340   alloc_reg(&current,0,CCREG);
7341   dirty_reg(&current,CCREG);
7342   current.isconst=0;
7343   current.wasconst=0;
7344   current.waswritten=0;
7345   int ds=0;
7346   int cc=0;
7347   int hr=-1;
7348
7349   if((u_int)addr&1) {
7350     // First instruction is delay slot
7351     cc=-1;
7352     bt[1]=1;
7353     ds=1;
7354     unneeded_reg[0]=1;
7355     current.regmap[HOST_BTREG]=BTREG;
7356   }
7357
7358   for(i=0;i<slen;i++)
7359   {
7360     if(bt[i])
7361     {
7362       int hr;
7363       for(hr=0;hr<HOST_REGS;hr++)
7364       {
7365         // Is this really necessary?
7366         if(current.regmap[hr]==0) current.regmap[hr]=-1;
7367       }
7368       current.isconst=0;
7369       current.waswritten=0;
7370     }
7371
7372     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
7373     regs[i].wasconst=current.isconst;
7374     regs[i].wasdirty=current.dirty;
7375     regs[i].loadedconst=0;
7376     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP) {
7377       if(i+1<slen) {
7378         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7379         current.u|=1;
7380       } else {
7381         current.u=1;
7382       }
7383     } else {
7384       if(i+1<slen) {
7385         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
7386         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7387         current.u|=1;
7388       } else { SysPrintf("oops, branch at end of block with no delay slot\n");abort(); }
7389     }
7390     is_ds[i]=ds;
7391     if(ds) {
7392       ds=0; // Skip delay slot, already allocated as part of branch
7393       // ...but we need to alloc it in case something jumps here
7394       if(i+1<slen) {
7395         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
7396       }else{
7397         current.u=branch_unneeded_reg[i-1];
7398       }
7399       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7400       current.u|=1;
7401       struct regstat temp;
7402       memcpy(&temp,&current,sizeof(current));
7403       temp.wasdirty=temp.dirty;
7404       // TODO: Take into account unconditional branches, as below
7405       delayslot_alloc(&temp,i);
7406       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
7407       regs[i].wasdirty=temp.wasdirty;
7408       regs[i].dirty=temp.dirty;
7409       regs[i].isconst=0;
7410       regs[i].wasconst=0;
7411       current.isconst=0;
7412       // Create entry (branch target) regmap
7413       for(hr=0;hr<HOST_REGS;hr++)
7414       {
7415         int r=temp.regmap[hr];
7416         if(r>=0) {
7417           if(r!=regmap_pre[i][hr]) {
7418             regs[i].regmap_entry[hr]=-1;
7419           }
7420           else
7421           {
7422               assert(r < 64);
7423               if((current.u>>r)&1) {
7424                 regs[i].regmap_entry[hr]=-1;
7425                 regs[i].regmap[hr]=-1;
7426                 //Don't clear regs in the delay slot as the branch might need them
7427                 //current.regmap[hr]=-1;
7428               }else
7429                 regs[i].regmap_entry[hr]=r;
7430           }
7431         } else {
7432           // First instruction expects CCREG to be allocated
7433           if(i==0&&hr==HOST_CCREG)
7434             regs[i].regmap_entry[hr]=CCREG;
7435           else
7436             regs[i].regmap_entry[hr]=-1;
7437         }
7438       }
7439     }
7440     else { // Not delay slot
7441       switch(itype[i]) {
7442         case UJUMP:
7443           //current.isconst=0; // DEBUG
7444           //current.wasconst=0; // DEBUG
7445           //regs[i].wasconst=0; // DEBUG
7446           clear_const(&current,rt1[i]);
7447           alloc_cc(&current,i);
7448           dirty_reg(&current,CCREG);
7449           if (rt1[i]==31) {
7450             alloc_reg(&current,i,31);
7451             dirty_reg(&current,31);
7452             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
7453             //assert(rt1[i+1]!=rt1[i]);
7454             #ifdef REG_PREFETCH
7455             alloc_reg(&current,i,PTEMP);
7456             #endif
7457           }
7458           ooo[i]=1;
7459           delayslot_alloc(&current,i+1);
7460           //current.isconst=0; // DEBUG
7461           ds=1;
7462           //printf("i=%d, isconst=%x\n",i,current.isconst);
7463           break;
7464         case RJUMP:
7465           //current.isconst=0;
7466           //current.wasconst=0;
7467           //regs[i].wasconst=0;
7468           clear_const(&current,rs1[i]);
7469           clear_const(&current,rt1[i]);
7470           alloc_cc(&current,i);
7471           dirty_reg(&current,CCREG);
7472           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
7473             alloc_reg(&current,i,rs1[i]);
7474             if (rt1[i]!=0) {
7475               alloc_reg(&current,i,rt1[i]);
7476               dirty_reg(&current,rt1[i]);
7477               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
7478               assert(rt1[i+1]!=rt1[i]);
7479               #ifdef REG_PREFETCH
7480               alloc_reg(&current,i,PTEMP);
7481               #endif
7482             }
7483             #ifdef USE_MINI_HT
7484             if(rs1[i]==31) { // JALR
7485               alloc_reg(&current,i,RHASH);
7486               alloc_reg(&current,i,RHTBL);
7487             }
7488             #endif
7489             delayslot_alloc(&current,i+1);
7490           } else {
7491             // The delay slot overwrites our source register,
7492             // allocate a temporary register to hold the old value.
7493             current.isconst=0;
7494             current.wasconst=0;
7495             regs[i].wasconst=0;
7496             delayslot_alloc(&current,i+1);
7497             current.isconst=0;
7498             alloc_reg(&current,i,RTEMP);
7499           }
7500           //current.isconst=0; // DEBUG
7501           ooo[i]=1;
7502           ds=1;
7503           break;
7504         case CJUMP:
7505           //current.isconst=0;
7506           //current.wasconst=0;
7507           //regs[i].wasconst=0;
7508           clear_const(&current,rs1[i]);
7509           clear_const(&current,rs2[i]);
7510           if((opcode[i]&0x3E)==4) // BEQ/BNE
7511           {
7512             alloc_cc(&current,i);
7513             dirty_reg(&current,CCREG);
7514             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7515             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
7516             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
7517                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
7518               // The delay slot overwrites one of our conditions.
7519               // Allocate the branch condition registers instead.
7520               current.isconst=0;
7521               current.wasconst=0;
7522               regs[i].wasconst=0;
7523               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7524               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
7525             }
7526             else
7527             {
7528               ooo[i]=1;
7529               delayslot_alloc(&current,i+1);
7530             }
7531           }
7532           else
7533           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
7534           {
7535             alloc_cc(&current,i);
7536             dirty_reg(&current,CCREG);
7537             alloc_reg(&current,i,rs1[i]);
7538             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
7539               // The delay slot overwrites one of our conditions.
7540               // Allocate the branch condition registers instead.
7541               current.isconst=0;
7542               current.wasconst=0;
7543               regs[i].wasconst=0;
7544               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7545             }
7546             else
7547             {
7548               ooo[i]=1;
7549               delayslot_alloc(&current,i+1);
7550             }
7551           }
7552           else
7553           // Don't alloc the delay slot yet because we might not execute it
7554           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
7555           {
7556             current.isconst=0;
7557             current.wasconst=0;
7558             regs[i].wasconst=0;
7559             alloc_cc(&current,i);
7560             dirty_reg(&current,CCREG);
7561             alloc_reg(&current,i,rs1[i]);
7562             alloc_reg(&current,i,rs2[i]);
7563           }
7564           else
7565           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
7566           {
7567             current.isconst=0;
7568             current.wasconst=0;
7569             regs[i].wasconst=0;
7570             alloc_cc(&current,i);
7571             dirty_reg(&current,CCREG);
7572             alloc_reg(&current,i,rs1[i]);
7573           }
7574           ds=1;
7575           //current.isconst=0;
7576           break;
7577         case SJUMP:
7578           //current.isconst=0;
7579           //current.wasconst=0;
7580           //regs[i].wasconst=0;
7581           clear_const(&current,rs1[i]);
7582           clear_const(&current,rt1[i]);
7583           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
7584           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
7585           {
7586             alloc_cc(&current,i);
7587             dirty_reg(&current,CCREG);
7588             alloc_reg(&current,i,rs1[i]);
7589             if (rt1[i]==31) { // BLTZAL/BGEZAL
7590               alloc_reg(&current,i,31);
7591               dirty_reg(&current,31);
7592               //#ifdef REG_PREFETCH
7593               //alloc_reg(&current,i,PTEMP);
7594               //#endif
7595             }
7596             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
7597                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
7598               // Allocate the branch condition registers instead.
7599               current.isconst=0;
7600               current.wasconst=0;
7601               regs[i].wasconst=0;
7602               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7603             }
7604             else
7605             {
7606               ooo[i]=1;
7607               delayslot_alloc(&current,i+1);
7608             }
7609           }
7610           else
7611           // Don't alloc the delay slot yet because we might not execute it
7612           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
7613           {
7614             current.isconst=0;
7615             current.wasconst=0;
7616             regs[i].wasconst=0;
7617             alloc_cc(&current,i);
7618             dirty_reg(&current,CCREG);
7619             alloc_reg(&current,i,rs1[i]);
7620           }
7621           ds=1;
7622           //current.isconst=0;
7623           break;
7624         case IMM16:
7625           imm16_alloc(&current,i);
7626           break;
7627         case LOAD:
7628         case LOADLR:
7629           load_alloc(&current,i);
7630           break;
7631         case STORE:
7632         case STORELR:
7633           store_alloc(&current,i);
7634           break;
7635         case ALU:
7636           alu_alloc(&current,i);
7637           break;
7638         case SHIFT:
7639           shift_alloc(&current,i);
7640           break;
7641         case MULTDIV:
7642           multdiv_alloc(&current,i);
7643           break;
7644         case SHIFTIMM:
7645           shiftimm_alloc(&current,i);
7646           break;
7647         case MOV:
7648           mov_alloc(&current,i);
7649           break;
7650         case COP0:
7651           cop0_alloc(&current,i);
7652           break;
7653         case COP1:
7654         case COP2:
7655           cop12_alloc(&current,i);
7656           break;
7657         case C1LS:
7658           c1ls_alloc(&current,i);
7659           break;
7660         case C2LS:
7661           c2ls_alloc(&current,i);
7662           break;
7663         case C2OP:
7664           c2op_alloc(&current,i);
7665           break;
7666         case SYSCALL:
7667         case HLECALL:
7668         case INTCALL:
7669           syscall_alloc(&current,i);
7670           break;
7671         case SPAN:
7672           pagespan_alloc(&current,i);
7673           break;
7674       }
7675
7676       // Create entry (branch target) regmap
7677       for(hr=0;hr<HOST_REGS;hr++)
7678       {
7679         int r,or;
7680         r=current.regmap[hr];
7681         if(r>=0) {
7682           if(r!=regmap_pre[i][hr]) {
7683             // TODO: delay slot (?)
7684             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
7685             if(or<0||(r&63)>=TEMPREG){
7686               regs[i].regmap_entry[hr]=-1;
7687             }
7688             else
7689             {
7690               // Just move it to a different register
7691               regs[i].regmap_entry[hr]=r;
7692               // If it was dirty before, it's still dirty
7693               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
7694             }
7695           }
7696           else
7697           {
7698             // Unneeded
7699             if(r==0){
7700               regs[i].regmap_entry[hr]=0;
7701             }
7702             else
7703             {
7704               assert(r<64);
7705               if((current.u>>r)&1) {
7706                 regs[i].regmap_entry[hr]=-1;
7707                 //regs[i].regmap[hr]=-1;
7708                 current.regmap[hr]=-1;
7709               }else
7710                 regs[i].regmap_entry[hr]=r;
7711             }
7712           }
7713         } else {
7714           // Branches expect CCREG to be allocated at the target
7715           if(regmap_pre[i][hr]==CCREG)
7716             regs[i].regmap_entry[hr]=CCREG;
7717           else
7718             regs[i].regmap_entry[hr]=-1;
7719         }
7720       }
7721       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
7722     }
7723
7724     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
7725       current.waswritten|=1<<rs1[i-1];
7726     current.waswritten&=~(1<<rt1[i]);
7727     current.waswritten&=~(1<<rt2[i]);
7728     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
7729       current.waswritten&=~(1<<rs1[i]);
7730
7731     /* Branch post-alloc */
7732     if(i>0)
7733     {
7734       current.wasdirty=current.dirty;
7735       switch(itype[i-1]) {
7736         case UJUMP:
7737           memcpy(&branch_regs[i-1],&current,sizeof(current));
7738           branch_regs[i-1].isconst=0;
7739           branch_regs[i-1].wasconst=0;
7740           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
7741           alloc_cc(&branch_regs[i-1],i-1);
7742           dirty_reg(&branch_regs[i-1],CCREG);
7743           if(rt1[i-1]==31) { // JAL
7744             alloc_reg(&branch_regs[i-1],i-1,31);
7745             dirty_reg(&branch_regs[i-1],31);
7746           }
7747           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7748           memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
7749           break;
7750         case RJUMP:
7751           memcpy(&branch_regs[i-1],&current,sizeof(current));
7752           branch_regs[i-1].isconst=0;
7753           branch_regs[i-1].wasconst=0;
7754           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
7755           alloc_cc(&branch_regs[i-1],i-1);
7756           dirty_reg(&branch_regs[i-1],CCREG);
7757           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
7758           if(rt1[i-1]!=0) { // JALR
7759             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
7760             dirty_reg(&branch_regs[i-1],rt1[i-1]);
7761           }
7762           #ifdef USE_MINI_HT
7763           if(rs1[i-1]==31) { // JALR
7764             alloc_reg(&branch_regs[i-1],i-1,RHASH);
7765             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
7766           }
7767           #endif
7768           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7769           memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
7770           break;
7771         case CJUMP:
7772           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
7773           {
7774             alloc_cc(&current,i-1);
7775             dirty_reg(&current,CCREG);
7776             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
7777                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
7778               // The delay slot overwrote one of our conditions
7779               // Delay slot goes after the test (in order)
7780               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7781               current.u|=1;
7782               delayslot_alloc(&current,i);
7783               current.isconst=0;
7784             }
7785             else
7786             {
7787               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
7788               // Alloc the branch condition registers
7789               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
7790               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
7791             }
7792             memcpy(&branch_regs[i-1],&current,sizeof(current));
7793             branch_regs[i-1].isconst=0;
7794             branch_regs[i-1].wasconst=0;
7795             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
7796             memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
7797           }
7798           else
7799           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
7800           {
7801             alloc_cc(&current,i-1);
7802             dirty_reg(&current,CCREG);
7803             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
7804               // The delay slot overwrote the branch condition
7805               // Delay slot goes after the test (in order)
7806               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7807               current.u|=1;
7808               delayslot_alloc(&current,i);
7809               current.isconst=0;
7810             }
7811             else
7812             {
7813               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
7814               // Alloc the branch condition register
7815               alloc_reg(&current,i-1,rs1[i-1]);
7816             }
7817             memcpy(&branch_regs[i-1],&current,sizeof(current));
7818             branch_regs[i-1].isconst=0;
7819             branch_regs[i-1].wasconst=0;
7820             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
7821             memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
7822           }
7823           else
7824           // Alloc the delay slot in case the branch is taken
7825           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
7826           {
7827             memcpy(&branch_regs[i-1],&current,sizeof(current));
7828             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
7829             alloc_cc(&branch_regs[i-1],i);
7830             dirty_reg(&branch_regs[i-1],CCREG);
7831             delayslot_alloc(&branch_regs[i-1],i);
7832             branch_regs[i-1].isconst=0;
7833             alloc_reg(&current,i,CCREG); // Not taken path
7834             dirty_reg(&current,CCREG);
7835             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7836           }
7837           else
7838           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
7839           {
7840             memcpy(&branch_regs[i-1],&current,sizeof(current));
7841             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
7842             alloc_cc(&branch_regs[i-1],i);
7843             dirty_reg(&branch_regs[i-1],CCREG);
7844             delayslot_alloc(&branch_regs[i-1],i);
7845             branch_regs[i-1].isconst=0;
7846             alloc_reg(&current,i,CCREG); // Not taken path
7847             dirty_reg(&current,CCREG);
7848             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7849           }
7850           break;
7851         case SJUMP:
7852           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
7853           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
7854           {
7855             alloc_cc(&current,i-1);
7856             dirty_reg(&current,CCREG);
7857             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
7858               // The delay slot overwrote the branch condition
7859               // Delay slot goes after the test (in order)
7860               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7861               current.u|=1;
7862               delayslot_alloc(&current,i);
7863               current.isconst=0;
7864             }
7865             else
7866             {
7867               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
7868               // Alloc the branch condition register
7869               alloc_reg(&current,i-1,rs1[i-1]);
7870             }
7871             memcpy(&branch_regs[i-1],&current,sizeof(current));
7872             branch_regs[i-1].isconst=0;
7873             branch_regs[i-1].wasconst=0;
7874             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
7875             memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
7876           }
7877           else
7878           // Alloc the delay slot in case the branch is taken
7879           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
7880           {
7881             memcpy(&branch_regs[i-1],&current,sizeof(current));
7882             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
7883             alloc_cc(&branch_regs[i-1],i);
7884             dirty_reg(&branch_regs[i-1],CCREG);
7885             delayslot_alloc(&branch_regs[i-1],i);
7886             branch_regs[i-1].isconst=0;
7887             alloc_reg(&current,i,CCREG); // Not taken path
7888             dirty_reg(&current,CCREG);
7889             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7890           }
7891           // FIXME: BLTZAL/BGEZAL
7892           if(opcode2[i-1]&0x10) { // BxxZAL
7893             alloc_reg(&branch_regs[i-1],i-1,31);
7894             dirty_reg(&branch_regs[i-1],31);
7895           }
7896           break;
7897       }
7898
7899       if (is_ujump(i-1))
7900       {
7901         if(rt1[i-1]==31) // JAL/JALR
7902         {
7903           // Subroutine call will return here, don't alloc any registers
7904           current.dirty=0;
7905           clear_all_regs(current.regmap);
7906           alloc_reg(&current,i,CCREG);
7907           dirty_reg(&current,CCREG);
7908         }
7909         else if(i+1<slen)
7910         {
7911           // Internal branch will jump here, match registers to caller
7912           current.dirty=0;
7913           clear_all_regs(current.regmap);
7914           alloc_reg(&current,i,CCREG);
7915           dirty_reg(&current,CCREG);
7916           for(j=i-1;j>=0;j--)
7917           {
7918             if(ba[j]==start+i*4+4) {
7919               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
7920               current.dirty=branch_regs[j].dirty;
7921               break;
7922             }
7923           }
7924           while(j>=0) {
7925             if(ba[j]==start+i*4+4) {
7926               for(hr=0;hr<HOST_REGS;hr++) {
7927                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
7928                   current.regmap[hr]=-1;
7929                 }
7930                 current.dirty&=branch_regs[j].dirty;
7931               }
7932             }
7933             j--;
7934           }
7935         }
7936       }
7937     }
7938
7939     // Count cycles in between branches
7940     ccadj[i]=cc;
7941     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
7942     {
7943       cc=0;
7944     }
7945 #if !defined(DRC_DBG)
7946     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
7947     {
7948       // GTE runs in parallel until accessed, divide by 2 for a rough guess
7949       cc+=gte_cycletab[source[i]&0x3f]/2;
7950     }
7951     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues
7952     {
7953       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
7954     }
7955     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
7956     {
7957       cc+=4;
7958     }
7959     else if(itype[i]==C2LS)
7960     {
7961       cc+=4;
7962     }
7963 #endif
7964     else
7965     {
7966       cc++;
7967     }
7968
7969     if(!is_ds[i]) {
7970       regs[i].dirty=current.dirty;
7971       regs[i].isconst=current.isconst;
7972       memcpy(constmap[i],current_constmap,sizeof(constmap[i]));
7973     }
7974     for(hr=0;hr<HOST_REGS;hr++) {
7975       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
7976         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
7977           regs[i].wasconst&=~(1<<hr);
7978         }
7979       }
7980     }
7981     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
7982     regs[i].waswritten=current.waswritten;
7983   }
7984
7985   /* Pass 4 - Cull unused host registers */
7986
7987   uint64_t nr=0;
7988
7989   for (i=slen-1;i>=0;i--)
7990   {
7991     int hr;
7992     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
7993     {
7994       if(ba[i]<start || ba[i]>=(start+slen*4))
7995       {
7996         // Branch out of this block, don't need anything
7997         nr=0;
7998       }
7999       else
8000       {
8001         // Internal branch
8002         // Need whatever matches the target
8003         nr=0;
8004         int t=(ba[i]-start)>>2;
8005         for(hr=0;hr<HOST_REGS;hr++)
8006         {
8007           if(regs[i].regmap_entry[hr]>=0) {
8008             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8009           }
8010         }
8011       }
8012       // Conditional branch may need registers for following instructions
8013       if (!is_ujump(i))
8014       {
8015         if(i<slen-2) {
8016           nr|=needed_reg[i+2];
8017           for(hr=0;hr<HOST_REGS;hr++)
8018           {
8019             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8020             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8021           }
8022         }
8023       }
8024       // Don't need stuff which is overwritten
8025       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8026       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8027       // Merge in delay slot
8028       for(hr=0;hr<HOST_REGS;hr++)
8029       {
8030         if(!likely[i]) {
8031           // These are overwritten unless the branch is "likely"
8032           // and the delay slot is nullified if not taken
8033           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8034           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8035         }
8036         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8037         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8038         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8039         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8040         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8041           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8042           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8043         }
8044       }
8045     }
8046     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8047     {
8048       // SYSCALL instruction (software interrupt)
8049       nr=0;
8050     }
8051     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8052     {
8053       // ERET instruction (return from interrupt)
8054       nr=0;
8055     }
8056     else // Non-branch
8057     {
8058       if(i<slen-1) {
8059         for(hr=0;hr<HOST_REGS;hr++) {
8060           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
8061           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
8062           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8063           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8064         }
8065       }
8066     }
8067     for(hr=0;hr<HOST_REGS;hr++)
8068     {
8069       // Overwritten registers are not needed
8070       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8071       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8072       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8073       // Source registers are needed
8074       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
8075       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
8076       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8077       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8078       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
8079         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8080         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8081       }
8082       // Don't store a register immediately after writing it,
8083       // may prevent dual-issue.
8084       // But do so if this is a branch target, otherwise we
8085       // might have to load the register before the branch.
8086       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
8087         if((regmap_pre[i][hr]>0&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1))) {
8088           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8089           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8090         }
8091         if((regs[i].regmap_entry[hr]>0&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) {
8092           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8093           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8094         }
8095       }
8096     }
8097     // Cycle count is needed at branches.  Assume it is needed at the target too.
8098     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==SPAN) {
8099       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8100       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8101     }
8102     // Save it
8103     needed_reg[i]=nr;
8104
8105     // Deallocate unneeded registers
8106     for(hr=0;hr<HOST_REGS;hr++)
8107     {
8108       if(!((nr>>hr)&1)) {
8109         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
8110         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8111            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8112            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
8113         {
8114           if (!is_ujump(i))
8115           {
8116             if(likely[i]) {
8117               regs[i].regmap[hr]=-1;
8118               regs[i].isconst&=~(1<<hr);
8119               if(i<slen-2) {
8120                 regmap_pre[i+2][hr]=-1;
8121                 regs[i+2].wasconst&=~(1<<hr);
8122               }
8123             }
8124           }
8125         }
8126         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
8127         {
8128           int map=0,temp=0;
8129           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
8130              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8131             map=INVCP;
8132           }
8133           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
8134              itype[i+1]==C1LS || itype[i+1]==C2LS)
8135             temp=FTEMP;
8136           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8137              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8138              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
8139              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
8140              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
8141              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
8142              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
8143              regs[i].regmap[hr]!=map )
8144           {
8145             regs[i].regmap[hr]=-1;
8146             regs[i].isconst&=~(1<<hr);
8147             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
8148                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
8149                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
8150                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
8151                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
8152                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
8153                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
8154                branch_regs[i].regmap[hr]!=map)
8155             {
8156               branch_regs[i].regmap[hr]=-1;
8157               branch_regs[i].regmap_entry[hr]=-1;
8158               if (!is_ujump(i))
8159               {
8160                 if(!likely[i]&&i<slen-2) {
8161                   regmap_pre[i+2][hr]=-1;
8162                   regs[i+2].wasconst&=~(1<<hr);
8163                 }
8164               }
8165             }
8166           }
8167         }
8168         else
8169         {
8170           // Non-branch
8171           if(i>0)
8172           {
8173             int map=-1,temp=-1;
8174             if(itype[i]==STORE || itype[i]==STORELR ||
8175                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8176               map=INVCP;
8177             }
8178             if(itype[i]==LOADLR || itype[i]==STORELR ||
8179                itype[i]==C1LS || itype[i]==C2LS)
8180               temp=FTEMP;
8181             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8182                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
8183                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
8184                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
8185             {
8186               if(i<slen-1&&!is_ds[i]) {
8187                 assert(regs[i].regmap[hr]<64);
8188                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]>0)
8189                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
8190                 {
8191                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
8192                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
8193                 }
8194                 regmap_pre[i+1][hr]=-1;
8195                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
8196                 regs[i+1].wasconst&=~(1<<hr);
8197               }
8198               regs[i].regmap[hr]=-1;
8199               regs[i].isconst&=~(1<<hr);
8200             }
8201           }
8202         }
8203       } // if needed
8204     } // for hr
8205   }
8206
8207   /* Pass 5 - Pre-allocate registers */
8208
8209   // If a register is allocated during a loop, try to allocate it for the
8210   // entire loop, if possible.  This avoids loading/storing registers
8211   // inside of the loop.
8212
8213   signed char f_regmap[HOST_REGS];
8214   clear_all_regs(f_regmap);
8215   for(i=0;i<slen-1;i++)
8216   {
8217     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
8218     {
8219       if(ba[i]>=start && ba[i]<(start+i*4))
8220       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
8221       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
8222       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
8223       ||itype[i+1]==SHIFT||itype[i+1]==COP1
8224       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
8225       {
8226         int t=(ba[i]-start)>>2;
8227         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP)) // loop_preload can't handle jumps into delay slots
8228         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
8229         for(hr=0;hr<HOST_REGS;hr++)
8230         {
8231           if(regs[i].regmap[hr]>=0) {
8232             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8233               // dealloc old register
8234               int n;
8235               for(n=0;n<HOST_REGS;n++)
8236               {
8237                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8238               }
8239               // and alloc new one
8240               f_regmap[hr]=regs[i].regmap[hr];
8241             }
8242           }
8243           if(branch_regs[i].regmap[hr]>=0) {
8244             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
8245               // dealloc old register
8246               int n;
8247               for(n=0;n<HOST_REGS;n++)
8248               {
8249                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
8250               }
8251               // and alloc new one
8252               f_regmap[hr]=branch_regs[i].regmap[hr];
8253             }
8254           }
8255           if(ooo[i]) {
8256             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
8257               f_regmap[hr]=branch_regs[i].regmap[hr];
8258           }else{
8259             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
8260               f_regmap[hr]=branch_regs[i].regmap[hr];
8261           }
8262           // Avoid dirty->clean transition
8263           #ifdef DESTRUCTIVE_WRITEBACK
8264           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
8265           #endif
8266           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
8267           // case above, however it's always a good idea.  We can't hoist the
8268           // load if the register was already allocated, so there's no point
8269           // wasting time analyzing most of these cases.  It only "succeeds"
8270           // when the mapping was different and the load can be replaced with
8271           // a mov, which is of negligible benefit.  So such cases are
8272           // skipped below.
8273           if(f_regmap[hr]>0) {
8274             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
8275               int r=f_regmap[hr];
8276               for(j=t;j<=i;j++)
8277               {
8278                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
8279                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
8280                 assert(r < 64);
8281                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
8282                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
8283                   int k;
8284                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
8285                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
8286                     if(r>63) {
8287                       if(get_reg(regs[i].regmap,r&63)<0) break;
8288                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
8289                     }
8290                     k=i;
8291                     while(k>1&&regs[k-1].regmap[hr]==-1) {
8292                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
8293                         //printf("no free regs for store %x\n",start+(k-1)*4);
8294                         break;
8295                       }
8296                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
8297                         //printf("no-match due to different register\n");
8298                         break;
8299                       }
8300                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP) {
8301                         //printf("no-match due to branch\n");
8302                         break;
8303                       }
8304                       // call/ret fast path assumes no registers allocated
8305                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
8306                         break;
8307                       }
8308                       assert(r < 64);
8309                       k--;
8310                     }
8311                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
8312                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
8313                       while(k<i) {
8314                         regs[k].regmap_entry[hr]=f_regmap[hr];
8315                         regs[k].regmap[hr]=f_regmap[hr];
8316                         regmap_pre[k+1][hr]=f_regmap[hr];
8317                         regs[k].wasdirty&=~(1<<hr);
8318                         regs[k].dirty&=~(1<<hr);
8319                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
8320                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
8321                         regs[k].wasconst&=~(1<<hr);
8322                         regs[k].isconst&=~(1<<hr);
8323                         k++;
8324                       }
8325                     }
8326                     else {
8327                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
8328                       break;
8329                     }
8330                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
8331                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
8332                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
8333                       regs[i].regmap_entry[hr]=f_regmap[hr];
8334                       regs[i].regmap[hr]=f_regmap[hr];
8335                       regs[i].wasdirty&=~(1<<hr);
8336                       regs[i].dirty&=~(1<<hr);
8337                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
8338                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
8339                       regs[i].wasconst&=~(1<<hr);
8340                       regs[i].isconst&=~(1<<hr);
8341                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
8342                       branch_regs[i].wasdirty&=~(1<<hr);
8343                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
8344                       branch_regs[i].regmap[hr]=f_regmap[hr];
8345                       branch_regs[i].dirty&=~(1<<hr);
8346                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
8347                       branch_regs[i].wasconst&=~(1<<hr);
8348                       branch_regs[i].isconst&=~(1<<hr);
8349                       if (!is_ujump(i)) {
8350                         regmap_pre[i+2][hr]=f_regmap[hr];
8351                         regs[i+2].wasdirty&=~(1<<hr);
8352                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
8353                       }
8354                     }
8355                   }
8356                   for(k=t;k<j;k++) {
8357                     // Alloc register clean at beginning of loop,
8358                     // but may dirty it in pass 6
8359                     regs[k].regmap_entry[hr]=f_regmap[hr];
8360                     regs[k].regmap[hr]=f_regmap[hr];
8361                     regs[k].dirty&=~(1<<hr);
8362                     regs[k].wasconst&=~(1<<hr);
8363                     regs[k].isconst&=~(1<<hr);
8364                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP) {
8365                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
8366                       branch_regs[k].regmap[hr]=f_regmap[hr];
8367                       branch_regs[k].dirty&=~(1<<hr);
8368                       branch_regs[k].wasconst&=~(1<<hr);
8369                       branch_regs[k].isconst&=~(1<<hr);
8370                       if (!is_ujump(k)) {
8371                         regmap_pre[k+2][hr]=f_regmap[hr];
8372                         regs[k+2].wasdirty&=~(1<<hr);
8373                       }
8374                     }
8375                     else
8376                     {
8377                       regmap_pre[k+1][hr]=f_regmap[hr];
8378                       regs[k+1].wasdirty&=~(1<<hr);
8379                     }
8380                   }
8381                   if(regs[j].regmap[hr]==f_regmap[hr])
8382                     regs[j].regmap_entry[hr]=f_regmap[hr];
8383                   break;
8384                 }
8385                 if(j==i) break;
8386                 if(regs[j].regmap[hr]>=0)
8387                   break;
8388                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
8389                   //printf("no-match due to different register\n");
8390                   break;
8391                 }
8392                 if (is_ujump(j))
8393                 {
8394                   // Stop on unconditional branch
8395                   break;
8396                 }
8397                 if(itype[j]==CJUMP||itype[j]==SJUMP)
8398                 {
8399                   if(ooo[j]) {
8400                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
8401                       break;
8402                   }else{
8403                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
8404                       break;
8405                   }
8406                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
8407                     //printf("no-match due to different register (branch)\n");
8408                     break;
8409                   }
8410                 }
8411                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
8412                   //printf("No free regs for store %x\n",start+j*4);
8413                   break;
8414                 }
8415                 assert(f_regmap[hr]<64);
8416               }
8417             }
8418           }
8419         }
8420       }
8421     }else{
8422       // Non branch or undetermined branch target
8423       for(hr=0;hr<HOST_REGS;hr++)
8424       {
8425         if(hr!=EXCLUDE_REG) {
8426           if(regs[i].regmap[hr]>=0) {
8427             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8428               // dealloc old register
8429               int n;
8430               for(n=0;n<HOST_REGS;n++)
8431               {
8432                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8433               }
8434               // and alloc new one
8435               f_regmap[hr]=regs[i].regmap[hr];
8436             }
8437           }
8438         }
8439       }
8440       // Try to restore cycle count at branch targets
8441       if(bt[i]) {
8442         for(j=i;j<slen-1;j++) {
8443           if(regs[j].regmap[HOST_CCREG]!=-1) break;
8444           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
8445             //printf("no free regs for store %x\n",start+j*4);
8446             break;
8447           }
8448         }
8449         if(regs[j].regmap[HOST_CCREG]==CCREG) {
8450           int k=i;
8451           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
8452           while(k<j) {
8453             regs[k].regmap_entry[HOST_CCREG]=CCREG;
8454             regs[k].regmap[HOST_CCREG]=CCREG;
8455             regmap_pre[k+1][HOST_CCREG]=CCREG;
8456             regs[k+1].wasdirty|=1<<HOST_CCREG;
8457             regs[k].dirty|=1<<HOST_CCREG;
8458             regs[k].wasconst&=~(1<<HOST_CCREG);
8459             regs[k].isconst&=~(1<<HOST_CCREG);
8460             k++;
8461           }
8462           regs[j].regmap_entry[HOST_CCREG]=CCREG;
8463         }
8464         // Work backwards from the branch target
8465         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
8466         {
8467           //printf("Extend backwards\n");
8468           int k;
8469           k=i;
8470           while(regs[k-1].regmap[HOST_CCREG]==-1) {
8471             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
8472               //printf("no free regs for store %x\n",start+(k-1)*4);
8473               break;
8474             }
8475             k--;
8476           }
8477           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
8478             //printf("Extend CC, %x ->\n",start+k*4);
8479             while(k<=i) {
8480               regs[k].regmap_entry[HOST_CCREG]=CCREG;
8481               regs[k].regmap[HOST_CCREG]=CCREG;
8482               regmap_pre[k+1][HOST_CCREG]=CCREG;
8483               regs[k+1].wasdirty|=1<<HOST_CCREG;
8484               regs[k].dirty|=1<<HOST_CCREG;
8485               regs[k].wasconst&=~(1<<HOST_CCREG);
8486               regs[k].isconst&=~(1<<HOST_CCREG);
8487               k++;
8488             }
8489           }
8490           else {
8491             //printf("Fail Extend CC, %x ->\n",start+k*4);
8492           }
8493         }
8494       }
8495       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
8496          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
8497          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1)
8498       {
8499         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
8500       }
8501     }
8502   }
8503
8504   // This allocates registers (if possible) one instruction prior
8505   // to use, which can avoid a load-use penalty on certain CPUs.
8506   for(i=0;i<slen-1;i++)
8507   {
8508     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP))
8509     {
8510       if(!bt[i+1])
8511       {
8512         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
8513            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
8514         {
8515           if(rs1[i+1]) {
8516             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
8517             {
8518               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8519               {
8520                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
8521                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
8522                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
8523                 regs[i].isconst&=~(1<<hr);
8524                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8525                 constmap[i][hr]=constmap[i+1][hr];
8526                 regs[i+1].wasdirty&=~(1<<hr);
8527                 regs[i].dirty&=~(1<<hr);
8528               }
8529             }
8530           }
8531           if(rs2[i+1]) {
8532             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
8533             {
8534               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8535               {
8536                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
8537                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
8538                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
8539                 regs[i].isconst&=~(1<<hr);
8540                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8541                 constmap[i][hr]=constmap[i+1][hr];
8542                 regs[i+1].wasdirty&=~(1<<hr);
8543                 regs[i].dirty&=~(1<<hr);
8544               }
8545             }
8546           }
8547           // Preload target address for load instruction (non-constant)
8548           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8549             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
8550             {
8551               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8552               {
8553                 regs[i].regmap[hr]=rs1[i+1];
8554                 regmap_pre[i+1][hr]=rs1[i+1];
8555                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8556                 regs[i].isconst&=~(1<<hr);
8557                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8558                 constmap[i][hr]=constmap[i+1][hr];
8559                 regs[i+1].wasdirty&=~(1<<hr);
8560                 regs[i].dirty&=~(1<<hr);
8561               }
8562             }
8563           }
8564           // Load source into target register
8565           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8566             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
8567             {
8568               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8569               {
8570                 regs[i].regmap[hr]=rs1[i+1];
8571                 regmap_pre[i+1][hr]=rs1[i+1];
8572                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8573                 regs[i].isconst&=~(1<<hr);
8574                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8575                 constmap[i][hr]=constmap[i+1][hr];
8576                 regs[i+1].wasdirty&=~(1<<hr);
8577                 regs[i].dirty&=~(1<<hr);
8578               }
8579             }
8580           }
8581           // Address for store instruction (non-constant)
8582           if(itype[i+1]==STORE||itype[i+1]==STORELR
8583              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
8584             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8585               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
8586               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
8587               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
8588               assert(hr>=0);
8589               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8590               {
8591                 regs[i].regmap[hr]=rs1[i+1];
8592                 regmap_pre[i+1][hr]=rs1[i+1];
8593                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8594                 regs[i].isconst&=~(1<<hr);
8595                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8596                 constmap[i][hr]=constmap[i+1][hr];
8597                 regs[i+1].wasdirty&=~(1<<hr);
8598                 regs[i].dirty&=~(1<<hr);
8599               }
8600             }
8601           }
8602           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
8603             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8604               int nr;
8605               hr=get_reg(regs[i+1].regmap,FTEMP);
8606               assert(hr>=0);
8607               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8608               {
8609                 regs[i].regmap[hr]=rs1[i+1];
8610                 regmap_pre[i+1][hr]=rs1[i+1];
8611                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8612                 regs[i].isconst&=~(1<<hr);
8613                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8614                 constmap[i][hr]=constmap[i+1][hr];
8615                 regs[i+1].wasdirty&=~(1<<hr);
8616                 regs[i].dirty&=~(1<<hr);
8617               }
8618               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
8619               {
8620                 // move it to another register
8621                 regs[i+1].regmap[hr]=-1;
8622                 regmap_pre[i+2][hr]=-1;
8623                 regs[i+1].regmap[nr]=FTEMP;
8624                 regmap_pre[i+2][nr]=FTEMP;
8625                 regs[i].regmap[nr]=rs1[i+1];
8626                 regmap_pre[i+1][nr]=rs1[i+1];
8627                 regs[i+1].regmap_entry[nr]=rs1[i+1];
8628                 regs[i].isconst&=~(1<<nr);
8629                 regs[i+1].isconst&=~(1<<nr);
8630                 regs[i].dirty&=~(1<<nr);
8631                 regs[i+1].wasdirty&=~(1<<nr);
8632                 regs[i+1].dirty&=~(1<<nr);
8633                 regs[i+2].wasdirty&=~(1<<nr);
8634               }
8635             }
8636           }
8637           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
8638             if(itype[i+1]==LOAD)
8639               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
8640             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
8641               hr=get_reg(regs[i+1].regmap,FTEMP);
8642             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
8643               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
8644               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
8645             }
8646             if(hr>=0&&regs[i].regmap[hr]<0) {
8647               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
8648               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
8649                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
8650                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
8651                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
8652                 regs[i].isconst&=~(1<<hr);
8653                 regs[i+1].wasdirty&=~(1<<hr);
8654                 regs[i].dirty&=~(1<<hr);
8655               }
8656             }
8657           }
8658         }
8659       }
8660     }
8661   }
8662
8663   /* Pass 6 - Optimize clean/dirty state */
8664   clean_registers(0,slen-1,1);
8665
8666   /* Pass 7 - Identify 32-bit registers */
8667   for (i=slen-1;i>=0;i--)
8668   {
8669     if(itype[i]==CJUMP||itype[i]==SJUMP)
8670     {
8671       // Conditional branch
8672       if((source[i]>>16)!=0x1000&&i<slen-2) {
8673         // Mark this address as a branch target since it may be called
8674         // upon return from interrupt
8675         bt[i+2]=1;
8676       }
8677     }
8678   }
8679
8680   if(itype[slen-1]==SPAN) {
8681     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
8682   }
8683
8684 #ifdef DISASM
8685   /* Debug/disassembly */
8686   for(i=0;i<slen;i++)
8687   {
8688     printf("U:");
8689     int r;
8690     for(r=1;r<=CCREG;r++) {
8691       if((unneeded_reg[i]>>r)&1) {
8692         if(r==HIREG) printf(" HI");
8693         else if(r==LOREG) printf(" LO");
8694         else printf(" r%d",r);
8695       }
8696     }
8697     printf("\n");
8698     #if defined(__i386__) || defined(__x86_64__)
8699     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
8700     #endif
8701     #ifdef __arm__
8702     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
8703     #endif
8704     #if defined(__i386__) || defined(__x86_64__)
8705     printf("needs: ");
8706     if(needed_reg[i]&1) printf("eax ");
8707     if((needed_reg[i]>>1)&1) printf("ecx ");
8708     if((needed_reg[i]>>2)&1) printf("edx ");
8709     if((needed_reg[i]>>3)&1) printf("ebx ");
8710     if((needed_reg[i]>>5)&1) printf("ebp ");
8711     if((needed_reg[i]>>6)&1) printf("esi ");
8712     if((needed_reg[i]>>7)&1) printf("edi ");
8713     printf("\n");
8714     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
8715     printf("dirty: ");
8716     if(regs[i].wasdirty&1) printf("eax ");
8717     if((regs[i].wasdirty>>1)&1) printf("ecx ");
8718     if((regs[i].wasdirty>>2)&1) printf("edx ");
8719     if((regs[i].wasdirty>>3)&1) printf("ebx ");
8720     if((regs[i].wasdirty>>5)&1) printf("ebp ");
8721     if((regs[i].wasdirty>>6)&1) printf("esi ");
8722     if((regs[i].wasdirty>>7)&1) printf("edi ");
8723     #endif
8724     #ifdef __arm__
8725     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
8726     printf("dirty: ");
8727     if(regs[i].wasdirty&1) printf("r0 ");
8728     if((regs[i].wasdirty>>1)&1) printf("r1 ");
8729     if((regs[i].wasdirty>>2)&1) printf("r2 ");
8730     if((regs[i].wasdirty>>3)&1) printf("r3 ");
8731     if((regs[i].wasdirty>>4)&1) printf("r4 ");
8732     if((regs[i].wasdirty>>5)&1) printf("r5 ");
8733     if((regs[i].wasdirty>>6)&1) printf("r6 ");
8734     if((regs[i].wasdirty>>7)&1) printf("r7 ");
8735     if((regs[i].wasdirty>>8)&1) printf("r8 ");
8736     if((regs[i].wasdirty>>9)&1) printf("r9 ");
8737     if((regs[i].wasdirty>>10)&1) printf("r10 ");
8738     if((regs[i].wasdirty>>12)&1) printf("r12 ");
8739     #endif
8740     printf("\n");
8741     disassemble_inst(i);
8742     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
8743     #if defined(__i386__) || defined(__x86_64__)
8744     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
8745     if(regs[i].dirty&1) printf("eax ");
8746     if((regs[i].dirty>>1)&1) printf("ecx ");
8747     if((regs[i].dirty>>2)&1) printf("edx ");
8748     if((regs[i].dirty>>3)&1) printf("ebx ");
8749     if((regs[i].dirty>>5)&1) printf("ebp ");
8750     if((regs[i].dirty>>6)&1) printf("esi ");
8751     if((regs[i].dirty>>7)&1) printf("edi ");
8752     #endif
8753     #ifdef __arm__
8754     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
8755     if(regs[i].dirty&1) printf("r0 ");
8756     if((regs[i].dirty>>1)&1) printf("r1 ");
8757     if((regs[i].dirty>>2)&1) printf("r2 ");
8758     if((regs[i].dirty>>3)&1) printf("r3 ");
8759     if((regs[i].dirty>>4)&1) printf("r4 ");
8760     if((regs[i].dirty>>5)&1) printf("r5 ");
8761     if((regs[i].dirty>>6)&1) printf("r6 ");
8762     if((regs[i].dirty>>7)&1) printf("r7 ");
8763     if((regs[i].dirty>>8)&1) printf("r8 ");
8764     if((regs[i].dirty>>9)&1) printf("r9 ");
8765     if((regs[i].dirty>>10)&1) printf("r10 ");
8766     if((regs[i].dirty>>12)&1) printf("r12 ");
8767     #endif
8768     printf("\n");
8769     if(regs[i].isconst) {
8770       printf("constants: ");
8771       #if defined(__i386__) || defined(__x86_64__)
8772       if(regs[i].isconst&1) printf("eax=%x ",(u_int)constmap[i][0]);
8773       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(u_int)constmap[i][1]);
8774       if((regs[i].isconst>>2)&1) printf("edx=%x ",(u_int)constmap[i][2]);
8775       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(u_int)constmap[i][3]);
8776       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(u_int)constmap[i][5]);
8777       if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]);
8778       if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]);
8779       #endif
8780       #if defined(__arm__) || defined(__aarch64__)
8781       int r;
8782       for (r = 0; r < ARRAY_SIZE(constmap[i]); r++)
8783         if ((regs[i].isconst >> r) & 1)
8784           printf(" r%d=%x", r, (u_int)constmap[i][r]);
8785       #endif
8786       printf("\n");
8787     }
8788     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
8789       #if defined(__i386__) || defined(__x86_64__)
8790       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
8791       if(branch_regs[i].dirty&1) printf("eax ");
8792       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
8793       if((branch_regs[i].dirty>>2)&1) printf("edx ");
8794       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
8795       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
8796       if((branch_regs[i].dirty>>6)&1) printf("esi ");
8797       if((branch_regs[i].dirty>>7)&1) printf("edi ");
8798       #endif
8799       #ifdef __arm__
8800       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
8801       if(branch_regs[i].dirty&1) printf("r0 ");
8802       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
8803       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
8804       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
8805       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
8806       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
8807       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
8808       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
8809       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
8810       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
8811       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
8812       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
8813       #endif
8814     }
8815   }
8816 #endif // DISASM
8817
  /* Pass 8 - Assembly */
  // Emit host code for every guest instruction: write back / reload host
  // registers according to the allocation decided in the earlier passes,
  // then dispatch to the per-itype assembler.
  linkcount=0;stubcount=0;
  ds=0;is_delayslot=0;
  u_int dirty_pre=0;
  void *beginning=start_block();
  // NOTE(review): the low bit of addr appears to flag a block entered via a
  // delay slot spanning a page boundary (pagespan) -- confirm with callers.
  if((u_int)addr&1) {
    ds=1;
    pagespan_ds();
  }
  void *instr_addr0_override = NULL;

  if (start == 0x80030000) {
    // nasty hack for the fastbios thing
    // override block entry to this code
    instr_addr0_override = out;
    emit_movimm(start,0);
    // abuse io address var as a flag that we
    // have already returned here once
    emit_readword(&address,1);
    emit_writeword(0,&pcaddr);
    emit_writeword(0,&address);
    emit_cmp(0,1);
    #ifdef __aarch64__
    emit_jeq(out + 4*2);
    emit_far_jump(new_dyna_leave);
    #else
    emit_jne(new_dyna_leave);
    #endif
  }
  for(i=0;i<slen;i++)
  {
    //if(ds) printf("ds: ");
    disassemble_inst(i);
    if(ds) {
      // This slot was already assembled together with its branch.
      ds=0; // Skip delay slot
      if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
      instr_addr[i] = NULL;
    } else {
      speculate_register_values(i);
      #ifndef DESTRUCTIVE_WRITEBACK
      // Write back dirty registers that stay allocated, unless the
      // instruction at i-2 was an unconditional jump (its delay slot is at
      // i-1, so there is no fall-through into i).
      if (i < 2 || !is_ujump(i-2))
      {
        wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,unneeded_reg[i]);
      }
      // Track the dirty set to carry into the next iteration; non-likely
      // conditional branches use the branch-time register state.
      if((itype[i]==CJUMP||itype[i]==SJUMP)&&!likely[i]) {
        dirty_pre=branch_regs[i].dirty;
      }else{
        dirty_pre=regs[i].dirty;
      }
      #endif
      // write back
      if (i < 2 || !is_ujump(i-2))
      {
        wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,unneeded_reg[i]);
        loop_preload(regmap_pre[i],regs[i].regmap_entry);
      }
      // branch target entry point
      instr_addr[i] = out;
      assem_debug("<->\n");
      drc_dbg_emit_do_cmp(i);

      // load regs
      if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
        wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty);
      load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i],rs2[i]);
      address_generation(i,&regs[i],regs[i].regmap_entry);
      load_consts(regmap_pre[i],regs[i].regmap,i);
      if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
      {
        // Load the delay slot registers if necessary
        if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
          load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1]);
        if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
          load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1]);
        // NOTE(review): the opcode masks appear to match coprocessor stores
        // (SWCx/SDCx); stores need INVCP for the invalidation check.
        if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
          load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
      }
      else if(i+1<slen)
      {
        // Preload registers for following instruction
        if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
          if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
            load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1]);
        if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
          if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
            load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1]);
      }
      // TODO: if(is_ooo(i)) address_generation(i+1);
      if(itype[i]==CJUMP)
        load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
      if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
        load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
      // assemble
      switch(itype[i]) {
        case ALU:
          alu_assemble(i,&regs[i]);break;
        case IMM16:
          imm16_assemble(i,&regs[i]);break;
        case SHIFT:
          shift_assemble(i,&regs[i]);break;
        case SHIFTIMM:
          shiftimm_assemble(i,&regs[i]);break;
        case LOAD:
          load_assemble(i,&regs[i]);break;
        case LOADLR:
          loadlr_assemble(i,&regs[i]);break;
        case STORE:
          store_assemble(i,&regs[i]);break;
        case STORELR:
          storelr_assemble(i,&regs[i]);break;
        case COP0:
          cop0_assemble(i,&regs[i]);break;
        case COP1:
          cop1_assemble(i,&regs[i]);break;
        case C1LS:
          c1ls_assemble(i,&regs[i]);break;
        case COP2:
          cop2_assemble(i,&regs[i]);break;
        case C2LS:
          c2ls_assemble(i,&regs[i]);break;
        case C2OP:
          c2op_assemble(i,&regs[i]);break;
        case MULTDIV:
          multdiv_assemble(i,&regs[i]);break;
        case MOV:
          mov_assemble(i,&regs[i]);break;
        case SYSCALL:
          syscall_assemble(i,&regs[i]);break;
        case HLECALL:
          hlecall_assemble(i,&regs[i]);break;
        case INTCALL:
          intcall_assemble(i,&regs[i]);break;
        // Branch assemblers consume the following delay slot too (ds=1
        // makes the next iteration skip it).
        case UJUMP:
          ujump_assemble(i,&regs[i]);ds=1;break;
        case RJUMP:
          rjump_assemble(i,&regs[i]);ds=1;break;
        case CJUMP:
          cjump_assemble(i,&regs[i]);ds=1;break;
        case SJUMP:
          sjump_assemble(i,&regs[i]);ds=1;break;
        case SPAN:
          pagespan_assemble(i,&regs[i]);break;
      }
      // Flush literals; after an unconditional jump control never falls
      // through, so no jump over the pool is needed.
      if (is_ujump(i))
        literal_pool(1024);
      else
        literal_pool_jumpover(256);
    }
  }
  //assert(is_ujump(i-2));
  // If the block did not end with an unconditional branch,
  // add a jump to the next instruction.
  if(i>1) {
    if(!is_ujump(i-2)&&itype[i-1]!=SPAN) {
      assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP);
      assert(i==slen);
      if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP) {
        // Plain fall-through: flush dirty registers for the branch target
        // and charge the elapsed cycles to CCREG.
        store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
        if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
          emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
      }
      else if(!likely[i-2])
      {
        // Fall-through from a non-likely conditional branch: use the
        // branch-time register state; CCREG must already be allocated.
        store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].dirty,start+i*4);
        assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
      }
      else
      {
        // Fall-through from a "likely" branch: use the pre-branch state.
        store_regs_bt(regs[i-2].regmap,regs[i-2].dirty,start+i*4);
        assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
      }
      // Emit the jump to the next block and record it for the linker pass.
      add_to_linker(out,start+i*4,0);
      emit_jmp(0);
    }
  }
  else
  {
    // Single-instruction block (i==1).
    assert(i>0);
    assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP);
    store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
    if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
      emit_loadreg(CCREG,HOST_CCREG);
    emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
    add_to_linker(out,start+i*4,0);
    emit_jmp(0);
  }
9005
  // TODO: delay slot stubs?
  // Stubs
  // Emit the out-of-line stubs recorded while assembling above (slow paths
  // for memory access, cycle-count (CC) checks, code invalidation, etc.).
  for(i=0;i<stubcount;i++)
  {
    switch(stubs[i].type)
    {
      case LOADB_STUB:
      case LOADH_STUB:
      case LOADW_STUB:
      case LOADD_STUB:
      case LOADBU_STUB:
      case LOADHU_STUB:
        do_readstub(i);break;
      case STOREB_STUB:
      case STOREH_STUB:
      case STOREW_STUB:
      case STORED_STUB:
        do_writestub(i);break;
      case CC_STUB:
        do_ccstub(i);break;
      case INVCODE_STUB:
        do_invstub(i);break;
      case FP_STUB:
        do_cop1stub(i);break;
      case STORELR_STUB:
        do_unalignedwritestub(i);break;
      // NOTE(review): no default case -- an unknown stub type is silently
      // skipped here.
    }
  }

  // The fastbios hack (start == 0x80030000) replaces the block's first
  // entry point with its own prologue.
  if (instr_addr0_override)
    instr_addr[0] = instr_addr0_override;
9037
  /* Pass 9 - Linker */
  // Patch the branch sites recorded via add_to_linker() during assembly.
  for(i=0;i<linkcount;i++)
  {
    assem_debug("%p -> %8x\n",link_addr[i].addr,link_addr[i].target);
    literal_pool(64);
    if (!link_addr[i].ext)
    {
      // Branch to another block: if the target is already compiled
      // (check_addr), link to it directly and record the link via
      // add_link; otherwise point the branch at the extjump stub.
      void *stub = out;
      void *addr = check_addr(link_addr[i].target);
      emit_extjump(link_addr[i].addr, link_addr[i].target);
      if (addr) {
        set_jump_target(link_addr[i].addr, addr);
        add_link(link_addr[i].target,stub);
      }
      else
        set_jump_target(link_addr[i].addr, stub);
    }
    else
    {
      // Internal branch
      // Target lies inside this block: patch straight to its entry point.
      int target=(link_addr[i].target-start)>>2;
      assert(target>=0&&target<slen);
      assert(instr_addr[target]);
      //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
      //set_jump_target_fillslot(link_addr[i].addr,instr_addr[target],link_addr[i].ext>>1);
      //#else
      set_jump_target(link_addr[i].addr, instr_addr[target]);
      //#endif
    }
  }
  // External Branch Targets (jump_in)
  // Wrap the shadow (source-copy) pointer when its buffer is full.
  if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
  for(i=0;i<slen;i++)
  {
    // Register an entry point at the block start and at every branch target.
    if(bt[i]||i==0)
    {
      if(instr_addr[i]) // TODO - delay slots (=null)
      {
        u_int vaddr=start+i*4;
        u_int page=get_page(vaddr);
        u_int vpage=get_vpage(vaddr);
        literal_pool(256);
        {
          assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
          assem_debug("jump_in: %x\n",start+i*4);
          // Dirty-check stub goes into jump_dirty; its clean entry point
          // goes into jump_in.
          ll_add(jump_dirty+vpage,vaddr,out);
          void *entry_point = do_dirty_stub(i);
          ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
          // If there was an existing entry in the hash table,
          // replace it with the new address.
          // Don't add new entries.  We'll insert the
          // ones that actually get used in check_addr().
          struct ht_entry *ht_bin = hash_table_get(vaddr);
          if (ht_bin->vaddr[0] == vaddr)
            ht_bin->tcaddr[0] = entry_point;
          if (ht_bin->vaddr[1] == vaddr)
            ht_bin->tcaddr[1] = entry_point;
        }
      }
    }
  }
  // Write out the literal pool if necessary
  literal_pool(0);
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  // Align code
  if(((u_int)out)&7) emit_addnop(13);
  #endif
  assert(out - (u_char *)beginning < MAX_OUTPUT_BLOCK_SIZE);
  //printf("shadow buffer: %p-%p\n",copy,(u_char *)copy+slen*4);
  // NOTE(review): the copied source is presumably what the dirty stubs
  // compare against to detect modified code -- confirm in do_dirty_stub.
  memcpy(copy,source,slen*4);
  copy+=slen*4;

  end_block(beginning);

  // If we're within 256K of the end of the buffer,
  // start over from the beginning. (Is 256K enough?)
  if (out > ndrc->translation_cache + sizeof(ndrc->translation_cache) - MAX_OUTPUT_BLOCK_SIZE)
    out = ndrc->translation_cache;

  // Trap writes to any of the pages we compiled
  for(i=start>>12;i<=(start+slen*4)>>12;i++) {
    invalid_code[i]=0;
  }
  inv_code_start=inv_code_end=~0;

  // for PCSX we need to mark all mirrors too
  // ((i&0x1ff) covers 512 4K pages = 2MB of RAM within each mirror segment)
  if(get_page(start)<(RAM_SIZE>>12))
    for(i=start>>12;i<=(start+slen*4)>>12;i++)
      invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
      invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
      invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
  /* Pass 10 - Free memory by expiring oldest blocks */

  // The expire pointer chases the output pointer around the circular
  // translation cache; the +16384 offset keeps it 1/4 of the 65536-step
  // cycle ahead of the current output position.
  int end=(((out-ndrc->translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535;
  while(expirep!=end)
  {
    int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
    uintptr_t base=(uintptr_t)ndrc->translation_cache+((expirep>>13)<<shift); // Base address of this block
    inv_debug("EXP: Phase %d\n",expirep);
    // (expirep>>11)&3 selects which table is cleaned this step, so each
    // cache region is swept in four phases.
    switch((expirep>>11)&3)
    {
      case 0:
        // Clear jump_in and jump_dirty
        ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
        break;
      case 1:
        // Clear pointers
        // Unlink branches that point into the region being freed.
        ll_kill_pointers(jump_out[expirep&2047],base,shift);
        ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
        break;
      case 2:
        // Clear hash table
        // Drop hash entries whose code address falls in the freed region
        // (the -MAX_OUTPUT_BLOCK_SIZE test also catches blocks that start
        // just before it); slot 1 shifts into slot 0 when 0 is removed.
        for(i=0;i<32;i++) {
          struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
          if (((uintptr_t)ht_bin->tcaddr[1]>>shift) == (base>>shift) ||
             (((uintptr_t)ht_bin->tcaddr[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
            inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
            ht_bin->vaddr[1] = -1;
            ht_bin->tcaddr[1] = NULL;
          }
          if (((uintptr_t)ht_bin->tcaddr[0]>>shift) == (base>>shift) ||
             (((uintptr_t)ht_bin->tcaddr[0]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
            inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
            ht_bin->vaddr[0] = ht_bin->vaddr[1];
            ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
            ht_bin->vaddr[1] = -1;
            ht_bin->tcaddr[1] = NULL;
          }
        }
        break;
      case 3:
        // Clear jump_out
        if((expirep&2047)==0)
          do_clear_cache();
        ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
        break;
    }
    expirep=(expirep+1)&65535;
  }
  return 0;
}
9184
9185 // vim:shiftwidth=2:expandtab