drc: add a timing hack for Internal Section
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h"
39 #include "../psxinterpreter.h"
40 #include "emu_if.h" //emulator interface
41
42 #define noinline __attribute__((noinline,noclone))
43 #ifndef ARRAY_SIZE
44 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
45 #endif
46
47 //#define DISASM
48 //#define assem_debug printf
49 //#define inv_debug printf
50 #define assem_debug(...)
51 #define inv_debug(...)
52
53 #ifdef __i386__
54 #include "assem_x86.h"
55 #endif
56 #ifdef __x86_64__
57 #include "assem_x64.h"
58 #endif
59 #ifdef __arm__
60 #include "assem_arm.h"
61 #endif
62 #ifdef __aarch64__
63 #include "assem_arm64.h"
64 #endif
65
66 #define MAXBLOCK 4096
67 #define MAX_OUTPUT_BLOCK_SIZE 262144
68
// Backing store for the dynarec: the translation cache itself plus a
// small trampoline area used by emit_far_jump()/emit_far_call() for
// targets out of direct branch range. ops[] holds the jump thunks,
// f[] the target function pointers (presumably loaded by the thunks —
// see the per-arch assem_*.c for the exact encoding).
struct ndrc_mem
{
  u_char translation_cache[1 << TARGET_SIZE_2];
  struct
  {
    struct tramp_insns ops[2048 / sizeof(struct tramp_insns)];
    const void *f[2048 / sizeof(void *)];
  } tramp;
};

#ifdef BASE_ADDR_DYNAMIC
static struct ndrc_mem *ndrc;      // mapped at runtime
#else
static struct ndrc_mem ndrc_ __attribute__((aligned(4096)));
static struct ndrc_mem *ndrc = &ndrc_;
#endif
85
// stubs
// Kinds of out-of-line code fragments queued in stubs[] while a block
// is assembled; each is emitted after the block body by a matching
// handler (handlers live in the per-arch assem_*.c, not visible here).
enum stub_type {
  CC_STUB = 1,
  FP_STUB = 2,
  LOADB_STUB = 3,
  LOADH_STUB = 4,
  LOADW_STUB = 5,
  LOADD_STUB = 6,
  LOADBU_STUB = 7,
  LOADHU_STUB = 8,
  STOREB_STUB = 9,
  STOREH_STUB = 10,
  STOREW_STUB = 11,
  STORED_STUB = 12,
  STORELR_STUB = 13,
  INVCODE_STUB = 14,
};
103
// Per-instruction register allocation state, indexed by host register.
struct regstat
{
  // guest reg expected in each host reg at block entry
  // (queried via get_reg(regs[t].regmap_entry, r) in loop_reg())
  signed char regmap_entry[HOST_REGS];
  signed char regmap[HOST_REGS]; // guest reg currently held (-1 = free)
  uint64_t wasdirty;             // dirty bits (per host reg) before this insn
  uint64_t dirty;                // host regs holding values not yet written back
  uint64_t u;                    // NOTE(review): presumably unneeded guest regs — confirm
  u_int wasconst;                // host regs that held known constants before this insn
  u_int isconst;                 // host regs currently holding known constants
  u_int loadedconst;             // host regs that have constants loaded
  u_int waswritten;              // MIPS regs that were used as store base before
};

// note: asm depends on this layout
struct ll_entry
{
  u_int vaddr;           // guest address of the compiled block
  u_int reg_sv_flags;    // register flags set via ll_add_flags()
  void *addr;            // translated code address
  struct ll_entry *next; // singly-linked chain within the page bucket
};

// Two-entry hash bin: slot 0 is the most recently inserted
// (see hash_table_add()); empty slots hold vaddr == -1.
struct ht_entry
{
  u_int vaddr[2];
  void *tcaddr[2];
};

// A pending out-of-line stub; a..e are type-specific arguments
// interpreted by the stub handler for `type`.
struct code_stub
{
  enum stub_type type;
  void *addr;    // stub location in the translation cache
  void *retaddr; // return point in the compiled block
  u_int a;
  uintptr_t b;
  uintptr_t c;
  u_int d;
  u_int e;
};

// A branch recorded during assembly to be patched to its target later.
struct link_entry
{
  void *addr;   // patch location in the translation cache
  u_int target; // guest target address
  u_int ext;    // nonzero if externally linkable — TODO confirm semantics
};
150
151   // used by asm:
152   u_char *out;
153   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
154   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
155   struct ll_entry *jump_dirty[4096];
156
157   static struct ll_entry *jump_out[4096];
158   static u_int start;
159   static u_int *source;
160   static char insn[MAXBLOCK][10];
161   static u_char itype[MAXBLOCK];
162   static u_char opcode[MAXBLOCK];
163   static u_char opcode2[MAXBLOCK];
164   static u_char bt[MAXBLOCK];
165   static u_char rs1[MAXBLOCK];
166   static u_char rs2[MAXBLOCK];
167   static u_char rt1[MAXBLOCK];
168   static u_char rt2[MAXBLOCK];
169   static u_char dep1[MAXBLOCK];
170   static u_char dep2[MAXBLOCK];
171   static u_char lt1[MAXBLOCK];
172   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
173   static uint64_t gte_rt[MAXBLOCK];
174   static uint64_t gte_unneeded[MAXBLOCK];
175   static u_int smrv[32]; // speculated MIPS register values
176   static u_int smrv_strong; // mask or regs that are likely to have correct values
177   static u_int smrv_weak; // same, but somewhat less likely
178   static u_int smrv_strong_next; // same, but after current insn executes
179   static u_int smrv_weak_next;
180   static int imm[MAXBLOCK];
181   static u_int ba[MAXBLOCK];
182   static char likely[MAXBLOCK];
183   static char is_ds[MAXBLOCK];
184   static char ooo[MAXBLOCK];
185   static uint64_t unneeded_reg[MAXBLOCK];
186   static uint64_t branch_unneeded_reg[MAXBLOCK];
187   static signed char regmap_pre[MAXBLOCK][HOST_REGS]; // pre-instruction i?
188   // contains 'real' consts at [i] insn, but may differ from what's actually
189   // loaded in host reg as 'final' value is always loaded, see get_final_value()
190   static uint32_t current_constmap[HOST_REGS];
191   static uint32_t constmap[MAXBLOCK][HOST_REGS];
192   static struct regstat regs[MAXBLOCK];
193   static struct regstat branch_regs[MAXBLOCK];
194   static signed char minimum_free_regs[MAXBLOCK];
195   static u_int needed_reg[MAXBLOCK];
196   static u_int wont_dirty[MAXBLOCK];
197   static u_int will_dirty[MAXBLOCK];
198   static int ccadj[MAXBLOCK];
199   static int slen;
200   static void *instr_addr[MAXBLOCK];
201   static struct link_entry link_addr[MAXBLOCK];
202   static int linkcount;
203   static struct code_stub stubs[MAXBLOCK*3];
204   static int stubcount;
205   static u_int literals[1024][2];
206   static int literalcount;
207   static int is_delayslot;
208   static char shadow[1048576]  __attribute__((aligned(16)));
209   static void *copy;
210   static int expirep;
211   static u_int stop_after_jal;
212 #ifndef RAM_FIXED
213   static uintptr_t ram_offset;
214 #else
215   static const uintptr_t ram_offset=0;
216 #endif
217
218   int new_dynarec_hacks;
219   int new_dynarec_did_compile;
220
221   extern int cycle_count; // ... until end of the timeslice, counts -N -> 0
222   extern int last_count;  // last absolute target, often = next_interupt
223   extern int pcaddr;
224   extern int pending_exception;
225   extern int branch_target;
226   extern uintptr_t mini_ht[32][2];
227   extern u_char restore_candidate[512];
228
229   /* registers that may be allocated */
230   /* 1-31 gpr */
231 #define LOREG 32 // lo
232 #define HIREG 33 // hi
233 //#define FSREG 34 // FPU status (FCSR)
234 #define CSREG 35 // Coprocessor status
235 #define CCREG 36 // Cycle count
236 #define INVCP 37 // Pointer to invalid_code
237 //#define MMREG 38 // Pointer to memory_map
238 //#define ROREG 39 // ram offset (if rdram!=0x80000000)
239 #define TEMPREG 40
240 #define FTEMP 40 // FPU temporary register
241 #define PTEMP 41 // Prefetch temporary register
242 //#define TLREG 42 // TLB mapping offset
243 #define RHASH 43 // Return address hash
244 #define RHTBL 44 // Return address hash table address
245 #define RTEMP 45 // JR/JALR address register
246 #define MAXREG 45
247 #define AGEN1 46 // Address generation temporary register
248 //#define AGEN2 47 // Address generation temporary register
249 //#define MGEN1 48 // Maptable address generation temporary register
250 //#define MGEN2 49 // Maptable address generation temporary register
251 #define BTREG 50 // Branch target temporary register
252
253   /* instruction types */
254 #define NOP 0     // No operation
255 #define LOAD 1    // Load
256 #define STORE 2   // Store
257 #define LOADLR 3  // Unaligned load
258 #define STORELR 4 // Unaligned store
259 #define MOV 5     // Move
260 #define ALU 6     // Arithmetic/logic
261 #define MULTDIV 7 // Multiply/divide
262 #define SHIFT 8   // Shift by register
263 #define SHIFTIMM 9// Shift by immediate
264 #define IMM16 10  // 16-bit immediate
265 #define RJUMP 11  // Unconditional jump to register
266 #define UJUMP 12  // Unconditional jump
267 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
268 #define SJUMP 14  // Conditional branch (regimm format)
269 #define COP0 15   // Coprocessor 0
270 #define COP1 16   // Coprocessor 1
271 #define C1LS 17   // Coprocessor 1 load/store
272 //#define FJUMP 18  // Conditional branch (floating point)
273 //#define FLOAT 19  // Floating point unit
274 //#define FCONV 20  // Convert integer to float
275 //#define FCOMP 21  // Floating point compare (sets FSREG)
276 #define SYSCALL 22// SYSCALL
277 #define OTHER 23  // Other
278 #define SPAN 24   // Branch/delay slot spans 2 pages
279 #define NI 25     // Not implemented
280 #define HLECALL 26// PCSX fake opcodes for HLE
281 #define COP2 27   // Coprocessor 2 move
282 #define C2LS 28   // Coprocessor 2 load/store
283 #define C2OP 29   // Coprocessor 2 operation
284 #define INTCALL 30// Call interpreter to handle rare corner cases
285
286   /* branch codes */
287 #define TAKEN 1
288 #define NOTTAKEN 2
289 #define NULLDS 3
290
291 #define DJT_1 (void *)1l // no function, just a label in assem_debug log
292 #define DJT_2 (void *)2l
293
294 // asm linkage
295 int new_recompile_block(u_int addr);
296 void *get_addr_ht(u_int vaddr);
297 void invalidate_block(u_int block);
298 void invalidate_addr(u_int addr);
299 void remove_hash(int vaddr);
300 void dyna_linker();
301 void dyna_linker_ds();
302 void verify_code();
303 void verify_code_ds();
304 void cc_interrupt();
305 void fp_exception();
306 void fp_exception_ds();
307 void jump_to_new_pc();
308 void new_dyna_leave();
309
310 // Needed by assembler
311 static void wb_register(signed char r,signed char regmap[],uint64_t dirty);
312 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty);
313 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr);
314 static void load_all_regs(signed char i_regmap[]);
315 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
316 static void load_regs_entry(int t);
317 static void load_all_consts(signed char regmap[],u_int dirty,int i);
318
319 static int verify_dirty(const u_int *ptr);
320 static int get_final_value(int hr, int i, int *value);
321 static void add_stub(enum stub_type type, void *addr, void *retaddr,
322   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
323 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
324   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist);
325 static void add_to_linker(void *addr, u_int target, int ext);
326 static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override);
327 static void *get_direct_memhandler(void *table, u_int addr,
328   enum stub_type type, uintptr_t *addr_host);
329 static void pass_args(int a0, int a1);
330 static void emit_far_jump(const void *f);
331 static void emit_far_call(const void *f);
332
// Toggle [start, end) between writable (is_x == 0) and executable
// (is_x != 0) on platforms that forbid W|X mappings (NO_WRITE_EXEC);
// a no-op everywhere else.
static void mprotect_w_x(void *start, void *end, int is_x)
{
#ifdef NO_WRITE_EXEC
  #if defined(VITA)
  // *Open* enables write on all memory that was
  // allocated by sceKernelAllocMemBlockForVM()?
  if (is_x)
    sceKernelCloseVMDomain();
  else
    sceKernelOpenVMDomain();
  #else
  u_long mstart = (u_long)start & ~4095ul; // round down to page boundary
  u_long mend = (u_long)end;
  // On failure just log; callers have no recovery path.
  if (mprotect((void *)mstart, mend - mstart,
               PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
    SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
  #endif
#endif
}
352
// Open a write window over [start, end) before emitting or patching
// translated code there.
static void start_tcache_write(void *start, void *end)
{
  mprotect_w_x(start, end, 0 /* writable, not executable */);
}
357
// Finish writing translated code to [start, end): perform the
// platform's icache flush/invalidate (needed on ARM only) and flip the
// region back to executable.
static void end_tcache_write(void *start, void *end)
{
#if defined(__arm__) || defined(__aarch64__)
  size_t len = (char *)end - (char *)start;
  #if   defined(__BLACKBERRY_QNX__)
  msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
  #elif defined(__MACH__)
  sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
  #elif defined(VITA)
  sceKernelSyncVMDomain(sceBlock, start, len);
  #elif defined(_3DS)
  ctr_flush_invalidate_cache();
  #elif defined(__aarch64__)
  // as of 2021, __clear_cache() is still broken on arm64
  // so here is a custom one :(
  clear_cache_arm64(start, end);
  #else
  __clear_cache(start, end);
  #endif
  (void)len; // some branches above don't use it
#endif

  mprotect_w_x(start, end, 1);
}
382
383 static void *start_block(void)
384 {
385   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
386   if (end > ndrc->translation_cache + sizeof(ndrc->translation_cache))
387     end = ndrc->translation_cache + sizeof(ndrc->translation_cache);
388   start_tcache_write(out, end);
389   return out;
390 }
391
392 static void end_block(void *start)
393 {
394   end_tcache_write(start, out);
395 }
396
// also takes care of w^x mappings when patching code
// One bit per 4K page of the translation cache, 32 pages (128K) per entry.
static u_int needs_clear_cache[1<<(TARGET_SIZE_2-17)];

// Record that the 4K page containing `target` was patched so
// do_clear_cache() can flush it later; the first time a page is marked,
// also open a write window on it (w^x handling).
static void mark_clear_cache(void *target)
{
  uintptr_t offset = (u_char *)target - ndrc->translation_cache;
  u_int mask = 1u << ((offset >> 12) & 31);
  if (!(needs_clear_cache[offset >> 17] & mask)) {
    char *start = (char *)((uintptr_t)target & ~4095l); // page base
    start_tcache_write(start, start + 4095);
    needs_clear_cache[offset >> 17] |= mask;
  }
}
410
// Clearing the cache is rather slow on ARM Linux, so mark the areas
// that need to be cleared, and then only clear these areas once.
// Flushes every page marked by mark_clear_cache() and resets the marks.
static void do_clear_cache(void)
{
  int i, j;
  for (i = 0; i < (1<<(TARGET_SIZE_2-17)); i++)
  {
    u_int bitmap = needs_clear_cache[i];
    if (!bitmap)
      continue;
    for (j = 0; j < 32; j++)
    {
      u_char *start, *end;
      if (!(bitmap & (1<<j)))
        continue;

      start = ndrc->translation_cache + i*131072 + j*4096;
      end = start + 4095;
      // Coalesce a run of consecutive marked pages into one flush.
      // Note this inner loop deliberately advances the outer j past
      // the run.
      for (j++; j < 32; j++) {
        if (!(bitmap & (1<<j)))
          break;
        end += 4096;
      }
      end_tcache_write(start, end);
    }
    needs_clear_cache[i] = 0;
  }
}
439
440 //#define DEBUG_CYCLE_COUNT 1
441
442 #define NO_CYCLE_PENALTY_THR 12
443
444 int cycle_multiplier; // 100 for 1.0
445 int cycle_multiplier_override;
446
447 static int CLOCK_ADJUST(int x)
448 {
449   int m = cycle_multiplier_override
450         ? cycle_multiplier_override : cycle_multiplier;
451   int s=(x>>31)|1;
452   return (x * m + s * 50) / 100;
453 }
454
455 static u_int get_page(u_int vaddr)
456 {
457   u_int page=vaddr&~0xe0000000;
458   if (page < 0x1000000)
459     page &= ~0x0e00000; // RAM mirrors
460   page>>=12;
461   if(page>2048) page=2048+(page&2047);
462   return page;
463 }
464
// no virtual mem in PCSX
// Kept as a separate hook from get_page(); callers use it for the
// jump_dirty table.
static u_int get_vpage(u_int vaddr)
{
  return get_page(vaddr);
}
470
471 static struct ht_entry *hash_table_get(u_int vaddr)
472 {
473   return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
474 }
475
476 static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
477 {
478   ht_bin->vaddr[1] = ht_bin->vaddr[0];
479   ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
480   ht_bin->vaddr[0] = vaddr;
481   ht_bin->tcaddr[0] = tcaddr;
482 }
483
// some messy ari64's code, seems to rely on unsigned 32bit overflow
// Nonzero when tcaddr is far enough ahead of the current emit pointer
// (distance scaled into the full 32-bit range) that the expiry sweep
// won't reclaim it soon.
static int doesnt_expire_soon(void *tcaddr)
{
  u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
  return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
}
490
// Get address from virtual address
// This is called from the recompiled JR/JALR instructions
// Resolution order: clean blocks (jump_in), then dirty blocks that can
// be revalidated (jump_dirty), then compile via new_recompile_block().
void noinline *get_addr(u_int vaddr)
{
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  struct ll_entry *head;
  //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
  //printf("TRACE: count=%d next=%d (get_addr match %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // Cache the hit so the next lookup takes the hash-table fast path.
      hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
      return head->addr;
    }
    head=head->next;
  }
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr))
      if (verify_dirty(head->addr)) {
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        // Source still matches: mark the page valid again and flag the
        // block as a restore candidate.
        invalid_code[vaddr>>12]=0;
        inv_code_start=inv_code_end=~0;
        if(vpage<2048) {
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        struct ht_entry *ht_bin = hash_table_get(vaddr);
        if (ht_bin->vaddr[0] == vaddr)
          ht_bin->tcaddr[0] = head->addr; // Replace existing entry
        else
          hash_table_add(ht_bin, vaddr, head->addr);

        return head->addr;
      }
    }
    head=head->next;
  }
  //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault exception
  // (COP0 state setup inherited from the N64 dynarec; then resume at
  // the exception vector)
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}
545 // Look up address in hash table first
546 void *get_addr_ht(u_int vaddr)
547 {
548   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
549   const struct ht_entry *ht_bin = hash_table_get(vaddr);
550   if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
551   if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
552   return get_addr(vaddr);
553 }
554
555 void clear_all_regs(signed char regmap[])
556 {
557   int hr;
558   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
559 }
560
561 static signed char get_reg(const signed char regmap[],int r)
562 {
563   int hr;
564   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
565   return -1;
566 }
567
568 // Find a register that is available for two consecutive cycles
569 static signed char get_reg2(signed char regmap1[], const signed char regmap2[], int r)
570 {
571   int hr;
572   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
573   return -1;
574 }
575
576 int count_free_regs(signed char regmap[])
577 {
578   int count=0;
579   int hr;
580   for(hr=0;hr<HOST_REGS;hr++)
581   {
582     if(hr!=EXCLUDE_REG) {
583       if(regmap[hr]<0) count++;
584     }
585   }
586   return count;
587 }
588
589 void dirty_reg(struct regstat *cur,signed char reg)
590 {
591   int hr;
592   if(!reg) return;
593   for (hr=0;hr<HOST_REGS;hr++) {
594     if((cur->regmap[hr]&63)==reg) {
595       cur->dirty|=1<<hr;
596     }
597   }
598 }
599
600 static void set_const(struct regstat *cur, signed char reg, uint32_t value)
601 {
602   int hr;
603   if(!reg) return;
604   for (hr=0;hr<HOST_REGS;hr++) {
605     if(cur->regmap[hr]==reg) {
606       cur->isconst|=1<<hr;
607       current_constmap[hr]=value;
608     }
609   }
610 }
611
612 static void clear_const(struct regstat *cur, signed char reg)
613 {
614   int hr;
615   if(!reg) return;
616   for (hr=0;hr<HOST_REGS;hr++) {
617     if((cur->regmap[hr]&63)==reg) {
618       cur->isconst&=~(1<<hr);
619     }
620   }
621 }
622
623 static int is_const(struct regstat *cur, signed char reg)
624 {
625   int hr;
626   if(reg<0) return 0;
627   if(!reg) return 1;
628   for (hr=0;hr<HOST_REGS;hr++) {
629     if((cur->regmap[hr]&63)==reg) {
630       return (cur->isconst>>hr)&1;
631     }
632   }
633   return 0;
634 }
635
636 static uint32_t get_const(struct regstat *cur, signed char reg)
637 {
638   int hr;
639   if(!reg) return 0;
640   for (hr=0;hr<HOST_REGS;hr++) {
641     if(cur->regmap[hr]==reg) {
642       return current_constmap[hr];
643     }
644   }
645   SysPrintf("Unknown constant in r%d\n",reg);
646   abort();
647 }
648
// Least soon needed registers
// Look at the next ten instructions and see which registers
// will be used.  Try not to reallocate these.
// On return hsn[r] holds the distance (in insns) to the next use of
// guest reg r; smaller = needed sooner. preferred_reg is currently
// unused (see TODO below).
void lsn(u_char hsn[], int i, int *preferred_reg)
{
  int j;
  int b=-1;
  // Find the scan horizon: up to 9 insns ahead, clamped at block end
  // or just past an unconditional jump (0x1000xxxx == beq $0,$0).
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
  }
  // Walk backwards from the horizon so nearer uses overwrite farther ones.
  for(;j>=0;j--)
  {
    if(rs1[i+j]) hsn[rs1[i+j]]=j;
    if(rs2[i+j]) hsn[rs2[i+j]]=j;
    if(rt1[i+j]) hsn[rt1[i+j]]=j;
    if(rt2[i+j]) hsn[rt2[i+j]]=j;
    if(itype[i+j]==STORE || itype[i+j]==STORELR) {
      // Stores can allocate zero
      hsn[rs1[i+j]]=j;
      hsn[rs2[i+j]]=j;
    }
    // On some architectures stores need invc_ptr
    #if defined(HOST_IMM8)
    if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
      hsn[INVCP]=j;
    }
    #endif
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
    {
      // Branches need the cycle count; remember the branch position so
      // its target can be followed below.
      hsn[CCREG]=j;
      b=j;
    }
  }
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        // Uses past the branch count as farther away (j+b+2 bias).
        if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
        if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
        //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
        //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
      }
    }
    // TODO: preferred register based on backward branch
  }
  // Delay slot should preferably not overwrite branch conditions or cycle count
  if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)) {
    if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
    if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
    hsn[CCREG]=1;
    // ...or hash tables
    hsn[RHASH]=1;
    hsn[RHTBL]=1;
  }
  // Coprocessor load/store needs FTEMP, even if not declared
  if(itype[i]==C1LS||itype[i]==C2LS) {
    hsn[FTEMP]=0;
  }
  // Load L/R also uses FTEMP as a temporary register
  if(itype[i]==LOADLR) {
    hsn[FTEMP]=0;
  }
  // Also SWL/SWR/SDL/SDR
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
    hsn[FTEMP]=0;
  }
  // Don't remove the miniht registers
  if(itype[i]==UJUMP||itype[i]==RJUMP)
  {
    hsn[RHASH]=0;
    hsn[RHTBL]=0;
  }
}
737
// We only want to allocate registers if we're going to use them again soon
// Returns 1 if guest reg r is read within the next few insns (and not
// marked unneeded before that use), 0 otherwise.
int needed_again(int r, int i)
{
  int j;
  int b=-1;
  int rn=10; // distance to the next use; 10 means "no use found"

  // If the previous insn exits the block entirely, nothing is needed.
  if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
  {
    if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
      return 0; // Don't need any registers if exiting the block
  }
  // Find the scan horizon (same pattern as lsn()), additionally
  // stopping at syscalls/HLE/interpreter calls and break (0x0d).
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
    if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
    {
      break;
    }
  }
  // Walk backwards looking for the nearest read of r; a point where r
  // becomes unneeded resets the search.
  for(;j>=1;j--)
  {
    if(rs1[i+j]==r) rn=j;
    if(rs2[i+j]==r) rn=j;
    if((unneeded_reg[i+j]>>r)&1) rn=10;
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
    {
      b=j;
    }
  }
  /*
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int o=rn;
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(!((unneeded_reg[t+j]>>r)&1)) {
          if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
          if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
        }
        else rn=o;
      }
    }
  }*/
  if(rn<10) return 1;
  (void)b;
  return 0;
}
800
// Try to match register allocations at the end of a loop with those
// at the beginning
// If a backward branch in the scan window targets an earlier point in
// this block (and r stays needed until then), return the host reg that
// holds r at the target's entry; otherwise return the caller's
// suggestion `hr` unchanged.
int loop_reg(int i, int r, int hr)
{
  int j,k;
  // Scan horizon: same pattern as lsn()/needed_again().
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
  }
  k=0;
  // Include the preceding branch's delay slot in the scan.
  if(i>0){
    if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)
      k--;
  }
  for(;k<j;k++)
  {
    assert(r < 64);
    if((unneeded_reg[i+k]>>r)&1) return hr; // r dies before the branch
    if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP))
    {
      // Backward branch into this block: try to reuse the allocation
      // at its target.
      if(ba[i+k]>=start && ba[i+k]<(start+i*4))
      {
        int t=(ba[i+k]-start)>>2;
        int reg=get_reg(regs[t].regmap_entry,r);
        if(reg>=0) return reg;
        //reg=get_reg(regs[t+1].regmap_entry,r);
        //if(reg>=0) return reg;
      }
    }
  }
  return hr;
}
842
843
844 // Allocate every register, preserving source/target regs
845 void alloc_all(struct regstat *cur,int i)
846 {
847   int hr;
848
849   for(hr=0;hr<HOST_REGS;hr++) {
850     if(hr!=EXCLUDE_REG) {
851       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
852          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
853       {
854         cur->regmap[hr]=-1;
855         cur->dirty&=~(1<<hr);
856       }
857       // Don't need zeros
858       if((cur->regmap[hr]&63)==0)
859       {
860         cur->regmap[hr]=-1;
861         cur->dirty&=~(1<<hr);
862       }
863     }
864   }
865 }
866
#ifndef NDEBUG
// Debug-build guard that catches nested/overlapping use of the
// assembler's scratch register.
static int host_tempreg_in_use;

static void host_tempreg_acquire(void)
{
  assert(!host_tempreg_in_use); // nested acquire is a bug
  host_tempreg_in_use = 1;
}

static void host_tempreg_release(void)
{
  host_tempreg_in_use = 0;
}
#else
// Release builds compile the bookkeeping away entirely.
static void host_tempreg_acquire(void) {}
static void host_tempreg_release(void) {}
#endif
884
#ifdef DRC_DBG
extern void gen_interupt();
extern void do_insn_cmp();
// Builds a { pointer, " name" } pair for the lookup table below.
#define FUNCNAME(f) { f, " " #f }
static const struct {
  void *addr;
  const char *name;
} function_names[] = {
  FUNCNAME(cc_interrupt),
  FUNCNAME(gen_interupt),
  FUNCNAME(get_addr_ht),
  FUNCNAME(get_addr),
  FUNCNAME(jump_handler_read8),
  FUNCNAME(jump_handler_read16),
  FUNCNAME(jump_handler_read32),
  FUNCNAME(jump_handler_write8),
  FUNCNAME(jump_handler_write16),
  FUNCNAME(jump_handler_write32),
  FUNCNAME(invalidate_addr),
  FUNCNAME(jump_to_new_pc),
  FUNCNAME(new_dyna_leave),
  FUNCNAME(pcsx_mtc0),
  FUNCNAME(pcsx_mtc0_ds),
  FUNCNAME(do_insn_cmp),
#ifdef __arm__
  FUNCNAME(verify_code),
#endif
};

// Map a code address to a printable name for debug traces;
// returns "" for unknown addresses.
static const char *func_name(const void *a)
{
  size_t i; // size_t: avoids signed/unsigned compare against sizeof
  for (i = 0; i < sizeof(function_names)/sizeof(function_names[0]); i++)
    if (function_names[i].addr == a)
      return function_names[i].name;
  return "";
}
#else
#define func_name(x) ""
#endif
925
926 #ifdef __i386__
927 #include "assem_x86.c"
928 #endif
929 #ifdef __x86_64__
930 #include "assem_x64.c"
931 #endif
932 #ifdef __arm__
933 #include "assem_arm.c"
934 #endif
935 #ifdef __aarch64__
936 #include "assem_arm64.c"
937 #endif
938
// Return a near-callable trampoline slot that transfers to the far
// function f, allocating a new slot on first use. Aborts when the
// fixed-size table fills up.
static void *get_trampoline(const void *f)
{
  size_t i;

  // Find f's existing slot, or the first free one (f[] fills from the
  // front, so the first NULL ends the used region).
  for (i = 0; i < ARRAY_SIZE(ndrc->tramp.f); i++) {
    if (ndrc->tramp.f[i] == f || ndrc->tramp.f[i] == NULL)
      break;
  }
  if (i == ARRAY_SIZE(ndrc->tramp.f)) {
    SysPrintf("trampoline table is full, last func %p\n", f);
    abort();
  }
  if (ndrc->tramp.f[i] == NULL) {
    // New slot: publish the target pointer (w^x safe). The matching
    // ops[i] thunk presumably loads it — see the per-arch assembler.
    start_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
    ndrc->tramp.f[i] = f;
    end_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
  }
  return &ndrc->tramp.ops[i];
}
958
// Emit a jump to f; routes through a trampoline when f is out of
// direct branch range.
static void emit_far_jump(const void *f)
{
  if (!can_jump_or_call(f))
    f = get_trampoline(f);
  emit_jmp(f);
}
969
// Emit a call to f; routes through a trampoline when f is out of
// direct call range.
static void emit_far_call(const void *f)
{
  if (!can_jump_or_call(f))
    f = get_trampoline(f);
  emit_call(f);
}
980
981 // Add virtual address mapping to linked list
982 void ll_add(struct ll_entry **head,int vaddr,void *addr)
983 {
984   struct ll_entry *new_entry;
985   new_entry=malloc(sizeof(struct ll_entry));
986   assert(new_entry!=NULL);
987   new_entry->vaddr=vaddr;
988   new_entry->reg_sv_flags=0;
989   new_entry->addr=addr;
990   new_entry->next=*head;
991   *head=new_entry;
992 }
993
994 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
995 {
996   ll_add(head,vaddr,addr);
997   (*head)->reg_sv_flags=reg_sv_flags;
998 }
999
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
// Unlike get_addr(), this never compiles and never revives dirty
// blocks; returns 0 on a miss.
void *check_addr(u_int vaddr)
{
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  size_t i;
  for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
    if (ht_bin->vaddr[i] == vaddr)
      // The -MAX_OUTPUT_BLOCK_SIZE bias conservatively accounts for
      // where the block containing tcaddr may start.
      if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
        if (isclean(ht_bin->tcaddr[i]))
          return ht_bin->tcaddr[i];
  }
  u_int page=get_page(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while (head != NULL) {
    if (head->vaddr == vaddr) {
      if (doesnt_expire_soon(head->addr)) {
        // Update existing entry with current address
        if (ht_bin->vaddr[0] == vaddr) {
          ht_bin->tcaddr[0] = head->addr;
          return head->addr;
        }
        if (ht_bin->vaddr[1] == vaddr) {
          ht_bin->tcaddr[1] = head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        if (ht_bin->vaddr[0] == -1) {
          ht_bin->vaddr[0] = vaddr;
          ht_bin->tcaddr[0] = head->addr;
        }
        else if (ht_bin->vaddr[1] == -1) {
          ht_bin->vaddr[1] = vaddr;
          ht_bin->tcaddr[1] = head->addr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  return 0;
}
1045
// Drop any hash table entries for vaddr.  Slot 1 is cleared first; if
// slot 0 also matches, slot 1 is shifted down into slot 0 so that the
// occupied slot (if any) stays in position 0 and the free slot is
// always slot 1.
void remove_hash(int vaddr)
{
  //printf("remove hash: %x\n",vaddr);
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  if (ht_bin->vaddr[1] == vaddr) {
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
  if (ht_bin->vaddr[0] == vaddr) {
    ht_bin->vaddr[0] = ht_bin->vaddr[1];
    ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
}
1061
// Remove (and free) every list entry whose translation cache address
// falls in the (addr >> shift) bucket, either directly or within
// MAX_OUTPUT_BLOCK_SIZE before it (a block's output may straddle the
// bucket boundary).  Matching entries are also dropped from the hash
// table.  Used when a region of the translation cache is expired.
void ll_remove_matching_addrs(struct ll_entry **head,uintptr_t addr,int shift)
{
  struct ll_entry *next;
  while(*head) {
    if(((uintptr_t)((*head)->addr)>>shift)==(addr>>shift) ||
       ((uintptr_t)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
    {
      inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr);
      remove_hash((*head)->vaddr);
      next=(*head)->next;
      free(*head);
      *head=next;  // unlink in place; *head now refers to the next node
    }
    else
    {
      head=&((*head)->next);  // keep node, advance the link pointer
    }
  }
}
1081
1082 // Remove all entries from linked list
1083 void ll_clear(struct ll_entry **head)
1084 {
1085   struct ll_entry *cur;
1086   struct ll_entry *next;
1087   if((cur=*head)) {
1088     *head=0;
1089     while(cur) {
1090       next=cur->next;
1091       free(cur);
1092       cur=next;
1093     }
1094   }
1095 }
1096
// Dereference the pointers and remove if it matches
// For each jump-out stub in the list, look up where its emitted branch
// currently points; if that target lies in the cache region being
// expired (same (addr>>shift) bucket, allowing MAX_OUTPUT_BLOCK_SIZE
// of slack), retarget the branch back to head->addr so it re-resolves
// on next execution instead of jumping into freed code.
static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift)
{
  while(head) {
    uintptr_t ptr = (uintptr_t)get_pointer(head->addr);
    inv_debug("EXP: Lookup pointer to %lx at %p (%x)\n",(long)ptr,head->addr,head->vaddr);
    if(((ptr>>shift)==(addr>>shift)) ||
       (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
    {
      inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr);
      void *host_addr=find_extjump_insn(head->addr);
      mark_clear_cache(host_addr);
      set_jump_target(host_addr, head->addr);
    }
    head=head->next;
  }
}
1114
1115 // This is called when we write to a compiled block (see do_invstub)
1116 static void invalidate_page(u_int page)
1117 {
1118   struct ll_entry *head;
1119   struct ll_entry *next;
1120   head=jump_in[page];
1121   jump_in[page]=0;
1122   while(head!=NULL) {
1123     inv_debug("INVALIDATE: %x\n",head->vaddr);
1124     remove_hash(head->vaddr);
1125     next=head->next;
1126     free(head);
1127     head=next;
1128   }
1129   head=jump_out[page];
1130   jump_out[page]=0;
1131   while(head!=NULL) {
1132     inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr);
1133     void *host_addr=find_extjump_insn(head->addr);
1134     mark_clear_cache(host_addr);
1135     set_jump_target(host_addr, head->addr);
1136     next=head->next;
1137     free(head);
1138     head=next;
1139   }
1140 }
1141
// Invalidate the page containing `block` plus the neighbouring pages
// in [first,last] that the block's code spans, then permit writes to
// the block's RAM page again.
static void invalidate_block_range(u_int block, u_int first, u_int last)
{
  u_int page=get_page(block<<12);
  //printf("first=%d last=%d\n",first,last);
  invalidate_page(page);
  assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
  assert(last<page+5);
  // Invalidate the adjacent pages if a block crosses a 4K boundary
  while(first<page) {
    invalidate_page(first);
    first++;
  }
  // NOTE(review): this loop stops at last-1, so page `last` itself is
  // only invalidated when last==page — confirm this is intentional.
  for(first=page+1;first<last;first++) {
    invalidate_page(first);
  }
  do_clear_cache();

  // Don't trap writes
  invalid_code[block]=1;

  #ifdef USE_MINI_HT
  memset(mini_ht,-1,sizeof(mini_ht));
  #endif
}
1166
// Invalidate the 4K page containing `block`, widening the page range
// when a dirty block overlapping it spans a page boundary.
void invalidate_block(u_int block)
{
  u_int page=get_page(block<<12);
  u_int vpage=get_vpage(block<<12);
  inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
  //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
  u_int first,last;
  first=last=page;
  struct ll_entry *head;
  head=jump_dirty[vpage];
  //printf("page=%d vpage=%d\n",page,vpage);
  while(head!=NULL) {
    if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
      u_char *start, *end;
      get_bounds(head->addr, &start, &end);
      //printf("start: %p end: %p\n", start, end);
      if (page < 2048 && start >= rdram && end < rdram+RAM_SIZE) {
        // Block lies in RAM; if it overlaps this page, grow
        // [first,last] to cover the block's full page extent.
        if (((start-rdram)>>12) <= page && ((end-1-rdram)>>12) >= page) {
          if ((((start-rdram)>>12)&2047) < first) first = ((start-rdram)>>12)&2047;
          if ((((end-1-rdram)>>12)&2047) > last)  last = ((end-1-rdram)>>12)&2047;
        }
      }
    }
    head=head->next;
  }
  invalidate_block_range(block,first,last);
}
1194
// Invalidate compiled code overlapping a single written address.
// For RAM, scans jump_dirty around the address: on a hit the covered
// page range is invalidated; on a miss the surrounding code-free
// window is recorded in inv_code_start/inv_code_end so the caller can
// skip checks for subsequent nearby writes.
void invalidate_addr(u_int addr)
{
  //static int rhits;
  // this check is done by the caller
  //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
  u_int page=get_vpage(addr);
  if(page<2048) { // RAM
    struct ll_entry *head;
    u_int addr_min=~0, addr_max=0;
    u_int mask=RAM_SIZE-1;
    u_int addr_main=0x80000000|(addr&mask);
    int pg1;
    // Start with the full 4K page as the candidate code-free window.
    inv_code_start=addr_main&~0xfff;
    inv_code_end=addr_main|0xfff;
    pg1=page;
    if (pg1>0) {
      // must check previous page too because of spans..
      pg1--;
      inv_code_start-=0x1000;
    }
    for(;pg1<=page;pg1++) {
      for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
        u_char *start_h, *end_h;
        u_int start, end;
        get_bounds(head->addr, &start_h, &end_h);
        start = (uintptr_t)start_h - ram_offset;
        end = (uintptr_t)end_h - ram_offset;
        if(start<=addr_main&&addr_main<end) {
          // Block covers the written address: widen the hit range.
          if(start<addr_min) addr_min=start;
          if(end>addr_max) addr_max=end;
        }
        else if(addr_main<start) {
          // Block lies above the address: shrink the window's top.
          if(start<inv_code_end)
            inv_code_end=start-1;
        }
        else {
          // Block lies below the address: raise the window's bottom.
          if(end>inv_code_start)
            inv_code_start=end;
        }
      }
    }
    if (addr_min!=~0) {
      inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
      inv_code_start=inv_code_end=~0;
      invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
      return;
    }
    else {
      // No code here; publish the window in the caller's address space.
      inv_code_start=(addr&~mask)|(inv_code_start&mask);
      inv_code_end=(addr&~mask)|(inv_code_end&mask);
      inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
      return;
    }
  }
  // Not a RAM page: invalidate just this 4K block.
  invalidate_block(addr>>12);
}
1251
1252 // This is called when loading a save state.
1253 // Anything could have changed, so invalidate everything.
1254 void invalidate_all_pages(void)
1255 {
1256   u_int page;
1257   for(page=0;page<4096;page++)
1258     invalidate_page(page);
1259   for(page=0;page<1048576;page++)
1260     if(!invalid_code[page]) {
1261       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1262       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1263     }
1264   #ifdef USE_MINI_HT
1265   memset(mini_ht,-1,sizeof(mini_ht));
1266   #endif
1267   do_clear_cache();
1268 }
1269
// Emit the out-of-line invalidation stub #n: save the live registers,
// move the faulting address into argument register 0 (unless it is
// already there, i.e. stubs[n].b==0), call invalidate_addr(), restore
// the registers and jump back to the compiled code.
static void do_invstub(int n)
{
  literal_pool(20);
  u_int reglist=stubs[n].a;
  set_jump_target(stubs[n].addr, out);
  save_regs(reglist);
  if(stubs[n].b!=0) emit_mov(stubs[n].b,0);
  emit_far_call(invalidate_addr);
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
}
1281
// Add an entry to jump_out after making a link
// src should point to code by emit_extjump2()
// Records the patched branch so invalidate_page()/ll_kill_pointers()
// can later unlink it if the target page is invalidated.
void add_link(u_int vaddr,void *src)
{
  u_int page=get_page(vaddr);
  inv_debug("add_link: %p -> %x (%d)\n",src,vaddr,page);
  check_extjump2(src);
  ll_add(jump_out+page,vaddr,src);
  //void *ptr=get_pointer(src);
  //inv_debug("add_link: Pointer is to %p\n",ptr);
}
1293
// If a code block was found to be unmodified (bit was set in
// restore_candidate) and it remains unmodified (bit is clear
// in invalid_code) then move the entries for that 4K page from
// the dirty list to the clean list.
void clean_blocks(u_int page)
{
  struct ll_entry *head;
  inv_debug("INV: clean_blocks page=%d\n",page);
  head=jump_dirty[page];
  while(head!=NULL) {
    if(!invalid_code[head->vaddr>>12]) {
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr)) {
        if(verify_dirty(head->addr)) {
          u_char *start, *end;
          //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr);
          u_int i;
          u_int inv=0;
          get_bounds(head->addr, &start, &end);
          if (start - rdram < RAM_SIZE) {
            // Block is in RAM: it is only restorable if every page it
            // spans is still valid.
            for (i = (start-rdram+0x80000000)>>12; i <= (end-1-rdram+0x80000000)>>12; i++) {
              inv|=invalid_code[i];
            }
          }
          else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
            // Outside RAM: never restore.
            inv=1;
          }
          if(!inv) {
            void *clean_addr = get_clean_addr(head->addr);
            if (doesnt_expire_soon(clean_addr)) {
              u_int ppage=page;
              inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr);
              //printf("page=%x, addr=%x\n",page,head->vaddr);
              //assert(head->vaddr>>12==(page|0x80000));
              // Re-register the clean entry point and refresh any
              // existing hash table slots for this vaddr.
              ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
              struct ht_entry *ht_bin = hash_table_get(head->vaddr);
              if (ht_bin->vaddr[0] == head->vaddr)
                ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
              if (ht_bin->vaddr[1] == head->vaddr)
                ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
            }
          }
        }
      }
    }
    head=head->next;
  }
}
1342
1343 /* Register allocation */
1344
// Note: registers are allocated clean (unmodified state)
// if you intend to modify the register, you must call dirty_reg().
//
// Allocate a host register for guest register `reg` at instruction i,
// trying in order: the preferred host register, any free register
// (preferring ones not used by the previous instruction), and finally
// evicting the register least likely to be needed soon (per lsn()).
static void alloc_reg(struct regstat *cur,int i,signed char reg)
{
  int r,hr;
  int preferred_reg = (reg&7);
  if(reg==CCREG) preferred_reg=HOST_CCREG;
  if(reg==PTEMP||reg==FTEMP) preferred_reg=12;

  // Don't allocate unused registers
  if((cur->u>>reg)&1) return;

  // see if it's already allocated
  for(hr=0;hr<HOST_REGS;hr++)
  {
    if(cur->regmap[hr]==reg) return;
  }

  // Keep the same mapping if the register was already allocated in a loop
  preferred_reg = loop_reg(i,reg,preferred_reg);

  // Try to allocate the preferred register
  if(cur->regmap[preferred_reg]==-1) {
    cur->regmap[preferred_reg]=reg;
    cur->dirty&=~(1<<preferred_reg);
    cur->isconst&=~(1<<preferred_reg);
    return;
  }
  // Preferred register is taken; reuse it anyway if its current
  // occupant is no longer needed.
  r=cur->regmap[preferred_reg];
  assert(r < 64);
  if((cur->u>>r)&1) {
    cur->regmap[preferred_reg]=reg;
    cur->dirty&=~(1<<preferred_reg);
    cur->isconst&=~(1<<preferred_reg);
    return;
  }

  // Clear any unneeded registers
  // We try to keep the mapping consistent, if possible, because it
  // makes branches easier (especially loops).  So we try to allocate
  // first (see above) before removing old mappings.  If this is not
  // possible then go ahead and clear out the registers that are no
  // longer needed.
  for(hr=0;hr<HOST_REGS;hr++)
  {
    r=cur->regmap[hr];
    if(r>=0) {
      assert(r < 64);
      if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;}
    }
  }
  // Try to allocate any available register, but prefer
  // registers that have not been used recently.
  if(i>0) {
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
        if(regs[i-1].regmap[hr]!=rs1[i-1]&&regs[i-1].regmap[hr]!=rs2[i-1]&&regs[i-1].regmap[hr]!=rt1[i-1]&&regs[i-1].regmap[hr]!=rt2[i-1]) {
          cur->regmap[hr]=reg;
          cur->dirty&=~(1<<hr);
          cur->isconst&=~(1<<hr);
          return;
        }
      }
    }
  }
  // Try to allocate any available register
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
      cur->regmap[hr]=reg;
      cur->dirty&=~(1<<hr);
      cur->isconst&=~(1<<hr);
      return;
    }
  }

  // Ok, now we have to evict someone
  // Pick a register we hopefully won't need soon
  u_char hsn[MAXREG+1];
  memset(hsn,10,sizeof(hsn));
  int j;
  lsn(hsn,i,&preferred_reg);
  //printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",cur->regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]);
  //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
  if(i>0) {
    // Don't evict the cycle count at entry points, otherwise the entry
    // stub will have to write it.
    if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
    if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP)) hsn[CCREG]=2;
    // Evict the register with the highest "not needed soon" score
    // first (j counts down from the least-soon-needed).
    for(j=10;j>=3;j--)
    {
      // Alloc preferred register if available
      if(hsn[r=cur->regmap[preferred_reg]&63]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          // Evict both parts of a 64-bit register
          if((cur->regmap[hr]&63)==r) {
            cur->regmap[hr]=-1;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
          }
        }
        cur->regmap[preferred_reg]=reg;
        return;
      }
      for(r=1;r<=MAXREG;r++)
      {
        // Avoid registers used by the previous instruction.
        if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) {
          for(hr=0;hr<HOST_REGS;hr++) {
            if(hr!=HOST_CCREG||j<hsn[CCREG]) {
              if(cur->regmap[hr]==r) {
                cur->regmap[hr]=reg;
                cur->dirty&=~(1<<hr);
                cur->isconst&=~(1<<hr);
                return;
              }
            }
          }
        }
      }
    }
  }
  // Last resort: evict anything, in score order.
  for(j=10;j>=0;j--)
  {
    for(r=1;r<=MAXREG;r++)
    {
      if(hsn[r]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(cur->regmap[hr]==r) {
            cur->regmap[hr]=reg;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
            return;
          }
        }
      }
    }
  }
  SysPrintf("This shouldn't happen (alloc_reg)");abort();
}
1483
// Allocate a temporary register.  This is done without regard to
// dirty status or whether the register we request is on the unneeded list
// Note: This will only allocate one register, even if called multiple times
static void alloc_reg_temp(struct regstat *cur,int i,signed char reg)
{
  int r,hr;
  int preferred_reg = -1;

  // see if it's already allocated
  for(hr=0;hr<HOST_REGS;hr++)
  {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==reg) return;
  }

  // Try to allocate any available register
  // (scanned top-down, unlike alloc_reg, to reduce collisions)
  for(hr=HOST_REGS-1;hr>=0;hr--) {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
      cur->regmap[hr]=reg;
      cur->dirty&=~(1<<hr);
      cur->isconst&=~(1<<hr);
      return;
    }
  }

  // Find an unneeded register
  for(hr=HOST_REGS-1;hr>=0;hr--)
  {
    r=cur->regmap[hr];
    if(r>=0) {
      assert(r < 64);
      if((cur->u>>r)&1) {
        if(i==0||((unneeded_reg[i-1]>>r)&1)) {
          cur->regmap[hr]=reg;
          cur->dirty&=~(1<<hr);
          cur->isconst&=~(1<<hr);
          return;
        }
      }
    }
  }

  // Ok, now we have to evict someone
  // Pick a register we hopefully won't need soon
  // TODO: we might want to follow unconditional jumps here
  // TODO: get rid of dupe code and make this into a function
  u_char hsn[MAXREG+1];
  memset(hsn,10,sizeof(hsn));
  int j;
  lsn(hsn,i,&preferred_reg);
  //printf("hsn: %d %d %d %d %d %d %d\n",hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
  if(i>0) {
    // Don't evict the cycle count at entry points, otherwise the entry
    // stub will have to write it.
    if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
    if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP)) hsn[CCREG]=2;
    // Evict in "needed soonness" order, skipping registers used by
    // the previous instruction.
    for(j=10;j>=3;j--)
    {
      for(r=1;r<=MAXREG;r++)
      {
        if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) {
          for(hr=0;hr<HOST_REGS;hr++) {
            if(hr!=HOST_CCREG||hsn[CCREG]>2) {
              if(cur->regmap[hr]==r) {
                cur->regmap[hr]=reg;
                cur->dirty&=~(1<<hr);
                cur->isconst&=~(1<<hr);
                return;
              }
            }
          }
        }
      }
    }
  }
  // Last resort: evict anything, in score order.
  for(j=10;j>=0;j--)
  {
    for(r=1;r<=MAXREG;r++)
    {
      if(hsn[r]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(cur->regmap[hr]==r) {
            cur->regmap[hr]=reg;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
            return;
          }
        }
      }
    }
  }
  SysPrintf("This shouldn't happen");abort();
}
1576
// Register allocation for a MOV-type instruction: only the destination
// needs a host register; constness is cleared on both operands.
static void mov_alloc(struct regstat *current,int i)
{
  // Note: Don't need to actually alloc the source registers
  //alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rt1[i]);

  clear_const(current,rs1[i]);
  clear_const(current,rt1[i]);
  dirty_reg(current,rt1[i]);
}
1587
// Register allocation for shift-by-immediate.  Propagates constants
// through SLL/SRL/SRA when the source is known constant.  The 64-bit
// forms cannot occur on the 32-bit PSX CPU, hence assert(0).
static void shiftimm_alloc(struct regstat *current,int i)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
      else lt1[i]=rs1[i];
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
      if(is_const(current,rs1[i])) {
        // Constant-fold the shift at compile time.
        int v=get_const(current,rs1[i]);
        if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
        if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
        if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
      }
      else clear_const(current,rt1[i]);
    }
  }
  else
  {
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }

  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    assert(0);
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    assert(0);
  }
}
1629
// Register allocation for shift-by-register (SLLV/SRLV/SRAV).  64-bit
// variants cannot occur on the PSX CPU.
static void shift_alloc(struct regstat *current,int i)
{
  if(rt1[i]) {
    if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
    {
      if(rs1[i]) alloc_reg(current,i,rs1[i]);
      if(rs2[i]) alloc_reg(current,i,rs2[i]);
      alloc_reg(current,i,rt1[i]);
      if(rt1[i]==rs2[i]) {
        // Destination aliases the shift amount; need a scratch register.
        alloc_reg_temp(current,i,-1);
        minimum_free_regs[i]=1;
      }
    } else { // DSLLV/DSRLV/DSRAV
      assert(0);
    }
    clear_const(current,rs1[i]);
    clear_const(current,rs2[i]);
    clear_const(current,rt1[i]);
    dirty_reg(current,rt1[i]);
  }
}
1651
// Register allocation for three-operand ALU instructions (SPECIAL
// function field in opcode2).  64-bit DADD-family cannot occur on PSX.
static void alu_alloc(struct regstat *current,int i)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else {
        // One source is r0; only allocate the other if it is live later.
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
    }
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      alloc_reg(current,i,rs1[i]);
      alloc_reg(current,i,rs2[i]);
      alloc_reg(current,i,rt1[i]);
    }
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else
      {
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    assert(0);
  }
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  clear_const(current,rt1[i]);
  dirty_reg(current,rt1[i]);
}
1696
// Register allocation for 16-bit-immediate instructions; propagates
// constants through ANDI/ORI/XORI, ADDI/ADDIU and LUI.
static void imm16_alloc(struct regstat *current,int i)
{
  if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  else lt1[i]=rs1[i];
  if(rt1[i]) alloc_reg(current,i,rt1[i]);
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    assert(0);
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(is_const(current,rs1[i])) {
      // Constant-fold the bitwise op at compile time.
      int v=get_const(current,rs1[i]);
      if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
      if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
      if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
    }
    else clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      set_const(current,rt1[i],v+imm[i]);
    }
    else clear_const(current,rt1[i]);
  }
  else {
    set_const(current,rt1[i],imm[i]<<16); // LUI
  }
  dirty_reg(current,rt1[i]);
}
1730
// Register allocation for loads.  LWL/LWR additionally need FTEMP for
// the unaligned merge; loads to r0 or dead registers still allocate a
// temp for the address calculation (the access itself must happen for
// possible side effects / exceptions).
static void load_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
  if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  if(rt1[i]&&!((current->u>>rt1[i])&1)) {
    alloc_reg(current,i,rt1[i]);
    assert(get_reg(current->regmap,rt1[i])>=0);
    if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
    {
      assert(0);
    }
    else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      assert(0);
    }
    dirty_reg(current,rt1[i]);
    // LWL/LWR need a temporary register for the old value
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP);
      alloc_reg_temp(current,i,-1);
      minimum_free_regs[i]=1;
    }
  }
  else
  {
    // Load to r0 or unneeded register (dummy load)
    // but we still need a register to calculate the address
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
    }
    alloc_reg_temp(current,i,-1);
    minimum_free_regs[i]=1;
    if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      assert(0);
    }
  }
}
1773
// Register allocation for stores: address base (rs1), store data (rs2),
// and a temporary for address generation.
void store_alloc(struct regstat *current,int i)
{
  clear_const(current,rs2[i]);
  if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rs2[i]);
  if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
    assert(0);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else alloc_reg(current,i,INVCP);
  #endif
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
    alloc_reg(current,i,FTEMP);
  }
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1794
// Register allocation for COP1 loads/stores (LWC1/SWC1).  CSREG holds
// the coprocessor status needed to raise the CpU exception check.
// NOTE(review): unlike c2ls_alloc()/store_alloc(), this does not set
// minimum_free_regs[i] — confirm that is intentional.
void c1ls_alloc(struct regstat *current,int i)
{
  //clear_const(current,rs1[i]); // FIXME
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,CSREG); // Status
  alloc_reg(current,i,FTEMP);
  if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
    assert(0);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
}
1813
// Register allocation for COP2 (GTE) loads/stores (LWC2/SWC2).
void c2ls_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,FTEMP);
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1828
1829 #ifndef multdiv_alloc
// Register allocation for MULT/MULTU/DIV/DIVU: results go to HI/LO.
// If either operand is r0, no computation is emitted — HI/LO are just
// allocated and marked dirty (result will be zero/undefined-as-zero).
void multdiv_alloc(struct regstat *current,int i)
{
  //  case 0x18: MULT
  //  case 0x19: MULTU
  //  case 0x1A: DIV
  //  case 0x1B: DIVU
  //  case 0x1C: DMULT
  //  case 0x1D: DMULTU
  //  case 0x1E: DDIV
  //  case 0x1F: DDIVU
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  if(rs1[i]&&rs2[i])
  {
    if((opcode2[i]&4)==0) // 32-bit
    {
      // Force HI/LO to be treated as needed so they get allocated.
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      alloc_reg(current,i,HIREG);
      alloc_reg(current,i,LOREG);
      alloc_reg(current,i,rs1[i]);
      alloc_reg(current,i,rs2[i]);
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
    }
    else // 64-bit
    {
      assert(0);
    }
  }
  else
  {
    // Multiply by zero is zero.
    // MIPS does not have a divide by zero exception.
    // The result is undefined, we return zero.
    alloc_reg(current,i,HIREG);
    alloc_reg(current,i,LOREG);
    dirty_reg(current,HIREG);
    dirty_reg(current,LOREG);
  }
}
1871 #endif
1872
// Register allocation for COP0 instructions.  All of these call out to
// C helpers, so every host register is allocated (alloc_all) and
// minimum_free_regs is raised to HOST_REGS.
void cop0_alloc(struct regstat *current,int i)
{
  if(opcode2[i]==0) // MFC0
  {
    if(rt1[i]) {
      clear_const(current,rt1[i]);
      alloc_all(current,i);
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
      alloc_all(current,i);
    }
    else {
      // MTC0 from r0: temporarily mark r0 needed so it can be allocated.
      alloc_all(current,i); // FIXME: Keep r0
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
  }
  else
  {
    // TLBR/TLBWI/TLBWR/TLBP/ERET
    assert(opcode2[i]==0x10);
    alloc_all(current,i);
  }
  minimum_free_regs[i]=HOST_REGS;
}
1905
// Register allocation for COP1/COP2 register moves (MFC/CFC/MTC/CTC).
// CSREG (coprocessor status) is loaded for the usability check.
static void cop12_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  if(opcode2[i]<3) // MFC1/CFC1
  {
    if(rt1[i]){
      clear_const(current,rt1[i]);
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
    alloc_reg_temp(current,i,-1);
  }
  else if(opcode2[i]>3) // MTC1/CTC1
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
    }
    else {
      // Move from r0: temporarily mark r0 needed so it can be allocated.
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
    alloc_reg_temp(current,i,-1);
  }
  minimum_free_regs[i]=1;
}
1932
// Register allocation for GTE (COP2) operations: just one temporary.
void c2op_alloc(struct regstat *current,int i)
{
  alloc_reg_temp(current,i,-1);
}
1937
// Register allocation for SYSCALL/BREAK: the exception handler needs
// the cycle count and a full register flush, so allocate everything.
void syscall_alloc(struct regstat *current,int i)
{
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  alloc_all(current,i);
  minimum_free_regs[i]=HOST_REGS;
  current->isconst=0;
}
1946
// Dispatch register allocation for the instruction in a branch delay
// slot based on its decoded type.  A branch in a delay slot is not
// supported: speculative precompilation is disabled instead of
// aborting, and the block is ended.
void delayslot_alloc(struct regstat *current,int i)
{
  switch(itype[i]) {
    case UJUMP:
    case CJUMP:
    case SJUMP:
    case RJUMP:
    case SYSCALL:
    case HLECALL:
    case SPAN:
      assem_debug("jump in the delay slot.  this shouldn't happen.\n");//abort();
      SysPrintf("Disabled speculative precompilation\n");
      stop_after_jal=1;
      break;
    case IMM16:
      imm16_alloc(current,i);
      break;
    case LOAD:
    case LOADLR:
      load_alloc(current,i);
      break;
    case STORE:
    case STORELR:
      store_alloc(current,i);
      break;
    case ALU:
      alu_alloc(current,i);
      break;
    case SHIFT:
      shift_alloc(current,i);
      break;
    case MULTDIV:
      multdiv_alloc(current,i);
      break;
    case SHIFTIMM:
      shiftimm_alloc(current,i);
      break;
    case MOV:
      mov_alloc(current,i);
      break;
    case COP0:
      cop0_alloc(current,i);
      break;
    case COP1:
    case COP2:
      cop12_alloc(current,i);
      break;
    case C1LS:
      c1ls_alloc(current,i);
      break;
    case C2LS:
      c2ls_alloc(current,i);
      break;
    case C2OP:
      c2op_alloc(current,i);
      break;
  }
}
2005
// Special case where a branch and delay slot span two pages in virtual memory
// Everything gets flushed: all host registers are claimed, the cycle
// counter is live/dirty, and the branch's source/destination registers
// are allocated according to the opcode bits.
static void pagespan_alloc(struct regstat *current,int i)
{
  current->isconst=0;
  current->wasconst=0;
  regs[i].wasconst=0;
  minimum_free_regs[i]=HOST_REGS;
  alloc_all(current,i);
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  if(opcode[i]==3) // JAL
  {
    // JAL writes the return address to $ra (r31)
    alloc_reg(current,i,31);
    dirty_reg(current,31);
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    alloc_reg(current,i,rs1[i]); // jump target register
    if (rt1[i]!=0) {
      // JALR's link register (usually r31, but can be any GPR)
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(rs2[i]) alloc_reg(current,i,rs2[i]);
  }
  else
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
  }
  //else ...
}
2041
2042 static void add_stub(enum stub_type type, void *addr, void *retaddr,
2043   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
2044 {
2045   assert(stubcount < ARRAY_SIZE(stubs));
2046   stubs[stubcount].type = type;
2047   stubs[stubcount].addr = addr;
2048   stubs[stubcount].retaddr = retaddr;
2049   stubs[stubcount].a = a;
2050   stubs[stubcount].b = b;
2051   stubs[stubcount].c = c;
2052   stubs[stubcount].d = d;
2053   stubs[stubcount].e = e;
2054   stubcount++;
2055 }
2056
// Convenience wrapper around add_stub for the common parameter set:
// instruction index, address register, register state, cycle adjustment
// and live-register list.
static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
  int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist)
{
  add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
}
2062
2063 // Write out a single register
2064 static void wb_register(signed char r,signed char regmap[],uint64_t dirty)
2065 {
2066   int hr;
2067   for(hr=0;hr<HOST_REGS;hr++) {
2068     if(hr!=EXCLUDE_REG) {
2069       if((regmap[hr]&63)==r) {
2070         if((dirty>>hr)&1) {
2071           assert(regmap[hr]<64);
2072           emit_storereg(r,hr);
2073         }
2074       }
2075     }
2076   }
2077 }
2078
2079 static void wb_valid(signed char pre[],signed char entry[],u_int dirty_pre,u_int dirty,uint64_t u)
2080 {
2081   //if(dirty_pre==dirty) return;
2082   int hr,reg;
2083   for(hr=0;hr<HOST_REGS;hr++) {
2084     if(hr!=EXCLUDE_REG) {
2085       reg=pre[hr];
2086       if(((~u)>>(reg&63))&1) {
2087         if(reg>0) {
2088           if(((dirty_pre&~dirty)>>hr)&1) {
2089             if(reg>0&&reg<34) {
2090               emit_storereg(reg,hr);
2091             }
2092             else if(reg>=64) {
2093               assert(0);
2094             }
2095           }
2096         }
2097       }
2098     }
2099   }
2100 }
2101
// trashes r2
// Move values from host regs a0/a1 into argument registers 0 and 1,
// taking care of the cases where one source occupies the other's
// destination.  A negative register number means "no value to move".
static void pass_args(int a0, int a1)
{
  if(a0==1&&a1==0) {
    // must swap
    emit_mov(a0,2); emit_mov(a1,1); emit_mov(2,0);
  }
  else if(a0!=0&&a1==0) {
    // a1 currently sits in reg 0 (a0's destination); move it out first
    emit_mov(a1,1);
    if (a0>=0) emit_mov(a0,0);
  }
  else {
    // no conflict: move each value only if present and not in place
    if(a0>=0&&a0!=0) emit_mov(a0,0);
    if(a1>=0&&a1!=1) emit_mov(a1,1);
  }
}
2118
// Assemble an R-type ALU instruction (ADD/SUB, SLT/SLTU, AND/OR/XOR/NOR).
// 64-bit forms (DADD etc.) do not exist on the R3000 and assert.
// Operands involving $zero are specialized to moves/negates/constants.
static void alu_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      signed char s1,s2,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      if(t>=0) {
        s1=get_reg(i_regs->regmap,rs1[i]);
        s2=get_reg(i_regs->regmap,rs2[i]);
        if(rs1[i]&&rs2[i]) {
          assert(s1>=0);
          assert(s2>=0);
          // bit 1 of the function code distinguishes SUB from ADD
          if(opcode2[i]&2) emit_sub(s1,s2,t);
          else emit_add(s1,s2,t);
        }
        else if(rs1[i]) {
          // rs2 is $zero: result is just rs1
          if(s1>=0) emit_mov(s1,t);
          else emit_loadreg(rs1[i],t);
        }
        else if(rs2[i]) {
          // rs1 is $zero: result is rs2, negated for SUB
          if(s2>=0) {
            if(opcode2[i]&2) emit_neg(s2,t);
            else emit_mov(s2,t);
          }
          else {
            emit_loadreg(rs2[i],t);
            if(opcode2[i]&2) emit_neg(t,t);
          }
        }
        else emit_zeroreg(t); // both sources are $zero
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    assert(0);
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      signed char s1l,s2l,t;
      {
        t=get_reg(i_regs->regmap,rt1[i]);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs2[i]==0) // rx<r0
          {
            if(opcode2[i]==0x2a&&rs1[i]!=0) { // SLT
              assert(s1l>=0);
              // rs1<0 iff its sign bit is set
              emit_shrimm(s1l,31,t);
            }
            else // SLTU (unsigned can not be less than zero, 0<0)
              emit_zeroreg(t);
          }
          else if(rs1[i]==0) // r0<rx
          {
            assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_gz32(s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz32(s2l,t);
          }
          else{
            assert(s1l>=0);assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_if_less32(s1l,s2l,t);
            else // SLTU
              emit_set_if_carry32(s1l,s2l,t);
          }
        }
      }
    }
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      signed char s1l,s2l,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      {
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs1[i]&&rs2[i]) {
            assert(s1l>=0);
            assert(s2l>=0);
            if(opcode2[i]==0x24) { // AND
              emit_and(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x25) { // OR
              emit_or(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_not(tl,tl);
            }
          }
          else
          {
            // at least one source is $zero
            if(opcode2[i]==0x24) { // AND
              emit_zeroreg(tl); // x & 0 == 0
            } else
            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
              // x|0 == x^0 == x: move the non-zero source
              if(rs1[i]){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
              }
              else emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              // ~(x|0) == ~x
              if(rs1[i]){
                if(s1l>=0) emit_not(s1l,tl);
                else {
                  emit_loadreg(rs1[i],tl);
                  emit_not(tl,tl);
                }
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_not(s2l,tl);
                else {
                  emit_loadreg(rs2[i],tl);
                  emit_not(tl,tl);
                }
              }
              else emit_movimm(-1,tl); // ~(0|0) == 0xffffffff
            }
          }
        }
      }
    }
  }
}
2258
// Assemble an I-type immediate instruction (LUI, ADDI(U), SLTI(U),
// ANDI/ORI/XORI).  Targets already holding the (constant-propagated)
// result are skipped via the isconst bitmap; constant sources use
// constmap to fold the operation at compile time.
void imm16_assemble(int i,struct regstat *i_regs)
{
  if (opcode[i]==0x0f) { // LUI
    if(rt1[i]) {
      signed char t;
      t=get_reg(i_regs->regmap,rt1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(!((i_regs->isconst>>t)&1))
          emit_movimm(imm[i]<<16,t);
      }
    }
  }
  if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      if(rs1[i]) {
        //assert(t>=0);
        //assert(s>=0);
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1)) {
            if(s<0) {
              // source not in a register: reload into the target first
              if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
              emit_addimm(t,imm[i],t);
            }else{
              if(!((i_regs->wasconst>>s)&1))
                emit_addimm(s,imm[i],t);
              else
                // source was a known constant: fold the add
                emit_movimm(constmap[i][s]+imm[i],t);
            }
          }
        }
      } else {
        // rs is $zero: ADDI becomes a load-immediate
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1))
            emit_movimm(imm[i],t);
        }
      }
    }
  }
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    if(rt1[i]) {
      signed char sl,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]) {
          assert(sl>=0);
          emit_addimm(sl,imm[i],tl);
        } else {
          emit_movimm(imm[i],tl);
        }
      }
    }
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    if(rt1[i]) {
      //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
      signed char sl,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(rs1[i]>0) {
            if(opcode[i]==0x0a) { // SLTI
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_slti32(t,imm[i],t);
              }else{
                emit_slti32(sl,imm[i],t);
              }
            }
            else { // SLTIU
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_sltiu32(t,imm[i],t);
              }else{
                emit_sltiu32(sl,imm[i],t);
              }
            }
        }else{
          // SLTI(U) with r0 is just stupid,
          // nonetheless examples can be found
          // (note: the "else emit_zeroreg" below binds to the inner if,
          //  which is the intended parse)
          if(opcode[i]==0x0a) // SLTI
            if(0<imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          else // SLTIU
          {
            if(imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          }
        }
      }
    }
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(rt1[i]) {
      signed char sl,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
        if(opcode[i]==0x0c) //ANDI
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
              emit_andimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_andimm(sl,imm[i],tl);
              else
                emit_movimm(constmap[i][sl]&imm[i],tl);
            }
          }
          else
            emit_zeroreg(tl); // 0 & imm == 0
        }
        else
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
            }
            if(opcode[i]==0x0d) { // ORI
              if(sl<0) {
                emit_orimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_orimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]|imm[i],tl);
              }
            }
            if(opcode[i]==0x0e) { // XORI
              if(sl<0) {
                emit_xorimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_xorimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]^imm[i],tl);
              }
            }
          }
          else {
            // rs is $zero: OR/XOR with 0 is just the immediate
            emit_movimm(imm[i],tl);
          }
        }
      }
    }
  }
}
2413
// Assemble an immediate-shift instruction (SLL/SRL/SRA).  64-bit forms
// (DSLL etc.) do not exist on the R3000 and assert.
void shiftimm_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0&&!((i_regs->isconst>>t)&1)){
        if(rs1[i]==0)
        {
          // shifting $zero always yields zero
          emit_zeroreg(t);
        }
        else
        {
          // if the source isn't in a register, reload it into the target
          if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
          if(imm[i]) {
            if(opcode2[i]==0) // SLL
            {
              emit_shlimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==2) // SRL
            {
              emit_shrimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==3) // SRA
            {
              emit_sarimm(s<0?t:s,imm[i],t);
            }
          }else{
            // Shift by zero
            if(s>=0 && s!=t) emit_mov(s,t);
          }
        }
      }
      //emit_storereg(rt1[i],t); //DEBUG
    }
  }
  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    assert(0);
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    assert(0);
  }
}
2470
2471 #ifndef shift_assemble
2472 static void shift_assemble(int i,struct regstat *i_regs)
2473 {
2474   signed char s,t,shift;
2475   if (rt1[i] == 0)
2476     return;
2477   assert(opcode2[i]<=0x07); // SLLV/SRLV/SRAV
2478   t = get_reg(i_regs->regmap, rt1[i]);
2479   s = get_reg(i_regs->regmap, rs1[i]);
2480   shift = get_reg(i_regs->regmap, rs2[i]);
2481   if (t < 0)
2482     return;
2483
2484   if(rs1[i]==0)
2485     emit_zeroreg(t);
2486   else if(rs2[i]==0) {
2487     assert(s>=0);
2488     if(s!=t) emit_mov(s,t);
2489   }
2490   else {
2491     host_tempreg_acquire();
2492     emit_andimm(shift,31,HOST_TEMPREG);
2493     switch(opcode2[i]) {
2494     case 4: // SLLV
2495       emit_shl(s,HOST_TEMPREG,t);
2496       break;
2497     case 6: // SRLV
2498       emit_shr(s,HOST_TEMPREG,t);
2499       break;
2500     case 7: // SRAV
2501       emit_sar(s,HOST_TEMPREG,t);
2502       break;
2503     default:
2504       assert(0);
2505     }
2506     host_tempreg_release();
2507   }
2508 }
2509
2510 #endif
2511
// Coarse classification of guest addresses, used to pick the memory
// fast path in emit_fastpath_cmp_jump().
enum {
  MTYPE_8000 = 0, // default: 8000xxxx RAM (or needs the generic check)
  MTYPE_8020,     // 80200000+ RAM mirror
  MTYPE_0000,     // 0000xxxx RAM mirror
  MTYPE_A000,     // a000xxxx RAM mirror
  MTYPE_1F80,     // 1f800000 scratchpad / hardware registers
};
2519
2520 static int get_ptr_mem_type(u_int a)
2521 {
2522   if(a < 0x00200000) {
2523     if(a<0x1000&&((start>>20)==0xbfc||(start>>24)==0xa0))
2524       // return wrong, must use memhandler for BIOS self-test to pass
2525       // 007 does similar stuff from a00 mirror, weird stuff
2526       return MTYPE_8000;
2527     return MTYPE_0000;
2528   }
2529   if(0x1f800000 <= a && a < 0x1f801000)
2530     return MTYPE_1F80;
2531   if(0x80200000 <= a && a < 0x80800000)
2532     return MTYPE_8020;
2533   if(0xa0000000 <= a && a < 0xa0200000)
2534     return MTYPE_A000;
2535   return MTYPE_8000;
2536 }
2537
// Emit the fast-path address check for a load/store.  Uses the
// speculated value of the base register (smrv) when available to pick a
// region-specific check; emits the generic RAM-size compare otherwise.
// Returns the branch location to be patched to the slow-path stub, or
// NULL if no branch was emitted.  May leave HOST_TEMPREG acquired and
// set *addr_reg_override to it; the caller is expected to release it.
static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override)
{
  void *jaddr = NULL;
  int type=0;
  int mr=rs1[i];
  if(((smrv_strong|smrv_weak)>>mr)&1) {
    // we have a speculated value for the base register
    type=get_ptr_mem_type(smrv[mr]);
    //printf("set %08x @%08x r%d %d\n", smrv[mr], start+i*4, mr, type);
  }
  else {
    // use the mirror we are running on
    type=get_ptr_mem_type(start);
    //printf("set nospec   @%08x r%d %d\n", start+i*4, mr, type);
  }

  // For the RAM mirrors, rewrite the address into the canonical
  // 8000xxxx range, then fall through to the generic check (type=0).
  if(type==MTYPE_8020) { // RAM 80200000+ mirror
    host_tempreg_acquire();
    emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_0000) { // RAM 0 mirror
    host_tempreg_acquire();
    emit_orimm(addr,0x80000000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_A000) { // RAM A mirror
    host_tempreg_acquire();
    emit_andimm(addr,~0x20000000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_1F80) { // scratchpad
    if (psxH == (void *)0x1f800000) {
      // scratchpad is mapped at its native address: range-check it
      host_tempreg_acquire();
      emit_xorimm(addr,0x1f800000,HOST_TEMPREG);
      emit_cmpimm(HOST_TEMPREG,0x1000);
      host_tempreg_release();
      jaddr=out;
      emit_jc(0);
    }
    else {
      // do the usual RAM check, jump will go to the right handler
      type=0;
    }
  }

  if(type==0)
  {
    // generic check: branch to the stub unless the address is in RAM
    emit_cmpimm(addr,RAM_SIZE);
    jaddr=out;
    #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
    // Hint to branch predictor that the branch is unlikely to be taken
    if(rs1[i]>=28)
      emit_jno_unlikely(0);
    else
    #endif
      emit_jno(0);
    if(ram_offset!=0) {
      // RAM is not mapped at address 0: apply the host-side offset
      host_tempreg_acquire();
      emit_addimm(addr,ram_offset,HOST_TEMPREG);
      addr=*addr_reg_override=HOST_TEMPREG;
    }
  }

  return jaddr;
}
2606
2607 // return memhandler, or get directly accessable address and return 0
2608 static void *get_direct_memhandler(void *table, u_int addr,
2609   enum stub_type type, uintptr_t *addr_host)
2610 {
2611   uintptr_t l1, l2 = 0;
2612   l1 = ((uintptr_t *)table)[addr>>12];
2613   if ((l1 & (1ul << (sizeof(l1)*8-1))) == 0) {
2614     uintptr_t v = l1 << 1;
2615     *addr_host = v + addr;
2616     return NULL;
2617   }
2618   else {
2619     l1 <<= 1;
2620     if (type == LOADB_STUB || type == LOADBU_STUB || type == STOREB_STUB)
2621       l2 = ((uintptr_t *)l1)[0x1000/4 + 0x1000/2 + (addr&0xfff)];
2622     else if (type == LOADH_STUB || type == LOADHU_STUB || type == STOREH_STUB)
2623       l2=((uintptr_t *)l1)[0x1000/4 + (addr&0xfff)/2];
2624     else
2625       l2=((uintptr_t *)l1)[(addr&0xfff)/4];
2626     if ((l2 & (1<<31)) == 0) {
2627       uintptr_t v = l2 << 1;
2628       *addr_host = v + (addr&0xfff);
2629       return NULL;
2630     }
2631     return (void *)(l2 << 1);
2632   }
2633 }
2634
// Assemble a load (LB/LH/LW/LBU/LHU).  Emits a RAM fast path plus a
// slow-path stub for I/O; constant addresses either access RAM directly
// or are inlined as a read stub.  Loads to $zero and loads whose target
// is unallocated are still performed (the read may have side effects,
// e.g. hardware FIFOs) into a scratch register.
static void load_assemble(int i,struct regstat *i_regs)
{
  int s,tl,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rt1[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  offset=imm[i];
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1;  // address is a known constant?
    if (c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  // FIXME: Even if the load is a NOP, we should check for pagefaults...
  if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
    ||rt1[i]==0) {
      // could be FIFO, must perform the read
      // ||dummy read
      assem_debug("(forced read)\n");
      tl=get_reg(i_regs->regmap,-1);
      assert(tl>=0);
  }
  if(offset||s<0||c) addr=tl;
  else addr=s;
  //if(tl<0) tl=get_reg(i_regs->regmap,-1);
 if(tl>=0) {
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
  reglist&=~(1<<tl);
  if(!c) {
    #ifdef R29_HACK
    // Strmnnrmn's speed hack
    if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
    #endif
    {
      // non-constant address: emit the RAM range check
      jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
    }
  }
  else if(ram_offset&&memtarget) {
    // constant RAM address but RAM is offset on the host side
    host_tempreg_acquire();
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastio_reg_override=HOST_TEMPREG;
  }
  int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
  if (opcode[i]==0x20) { // LB
    if(!c||memtarget) {
      if(!dummy) {
        {
          int x=0,a=tl;
          if(!c) a=addr;
          if(fastio_reg_override>=0) a=fastio_reg_override;

          emit_movsbl_indexed(x,a,tl);
        }
      }
      if(jaddr)
        add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x21) { // LH
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_movswl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x23) { // LW
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_readword_indexed(0,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x24) { // LBU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;

        emit_movzbl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x25) { // LHU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_movzwl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x27) { // LWU
    assert(0);
  }
  if (opcode[i]==0x37) { // LD
    assert(0);
  }
 }
 // release the tempreg acquired by the fastpath/ram_offset paths above
 if (fastio_reg_override == HOST_TEMPREG)
   host_tempreg_release();
}
2773
2774 #ifndef loadlr_assemble
// Assemble an unaligned load (LWL/LWR): reads the aligned word, then
// shifts and merges it into the existing target register contents.
// temp holds the bit-shift amount (addr low bits * 8), temp2 the
// aligned address and then the loaded word.  LDL/LDR assert (no 64-bit
// on R3000).
static void loadlr_assemble(int i,struct regstat *i_regs)
{
  int s,tl,temp,temp2,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rt1[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,-1);
  temp2=get_reg(i_regs->regmap,FTEMP);
  addr=get_reg(i_regs->regmap,AGEN1+(i&1));
  assert(addr<0);
  offset=imm[i];
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  reglist|=1<<temp;
  // NOTE(review): c is still 0 here (it is computed just below), so the
  // "||c" term never fires; appears harmless since addr is only used in
  // the !c path — confirm against upstream before changing
  if(offset||s<0||c) addr=temp2;
  else addr=s;
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1;  // address is a known constant?
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  if(!c) {
    // temp = low address bits * 8 (shift amount); temp2 = aligned address
    emit_shlimm(addr,3,temp);
    if (opcode[i]==0x22||opcode[i]==0x26) {
      emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR
    }else{
      emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR
    }
    jaddr=emit_fastpath_cmp_jump(i,temp2,&fastio_reg_override);
  }
  else {
    if(ram_offset&&memtarget) {
      host_tempreg_acquire();
      emit_addimm(temp2,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    // constant address: the shift amount is known at compile time
    if (opcode[i]==0x22||opcode[i]==0x26) {
      emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR
    }else{
      emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR
    }
  }
  if (opcode[i]==0x22||opcode[i]==0x26) { // LWL/LWR
    if(!c||memtarget) {
      int a=temp2;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_readword_indexed(0,a,temp2);
      if(fastio_reg_override==HOST_TEMPREG) host_tempreg_release();
      if(jaddr) add_stub_r(LOADW_STUB,jaddr,out,i,temp2,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj[i],reglist);
    if(rt1[i]) {
      assert(tl>=0);
      // merge: shift the loaded word into position, mask the kept bytes
      // of the old value with a shifted ~0, then OR them together
      emit_andimm(temp,24,temp);
      if (opcode[i]==0x22) // LWL
        emit_xorimm(temp,24,temp);
      host_tempreg_acquire();
      emit_movimm(-1,HOST_TEMPREG);
      if (opcode[i]==0x26) {
        emit_shr(temp2,temp,temp2);
        emit_bic_lsr(tl,HOST_TEMPREG,temp,tl);
      }else{
        emit_shl(temp2,temp,temp2);
        emit_bic_lsl(tl,HOST_TEMPREG,temp,tl);
      }
      host_tempreg_release();
      emit_or(temp2,tl,tl);
    }
    //emit_storereg(rt1[i],tl); // DEBUG
  }
  if (opcode[i]==0x1A||opcode[i]==0x1B) { // LDL/LDR
    assert(0);
  }
}
2856 #endif
2857
2858 void store_assemble(int i,struct regstat *i_regs)
2859 {
2860   int s,tl;
2861   int addr,temp;
2862   int offset;
2863   void *jaddr=0;
2864   enum stub_type type;
2865   int memtarget=0,c=0;
2866   int agr=AGEN1+(i&1);
2867   int fastio_reg_override=-1;
2868   u_int hr,reglist=0;
2869   tl=get_reg(i_regs->regmap,rs2[i]);
2870   s=get_reg(i_regs->regmap,rs1[i]);
2871   temp=get_reg(i_regs->regmap,agr);
2872   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2873   offset=imm[i];
2874   if(s>=0) {
2875     c=(i_regs->wasconst>>s)&1;
2876     if(c) {
2877       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2878     }
2879   }
2880   assert(tl>=0);
2881   assert(temp>=0);
2882   for(hr=0;hr<HOST_REGS;hr++) {
2883     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2884   }
2885   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2886   if(offset||s<0||c) addr=temp;
2887   else addr=s;
2888   if(!c) {
2889     jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
2890   }
2891   else if(ram_offset&&memtarget) {
2892     host_tempreg_acquire();
2893     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2894     fastio_reg_override=HOST_TEMPREG;
2895   }
2896
2897   if (opcode[i]==0x28) { // SB
2898     if(!c||memtarget) {
2899       int x=0,a=temp;
2900       if(!c) a=addr;
2901       if(fastio_reg_override>=0) a=fastio_reg_override;
2902       emit_writebyte_indexed(tl,x,a);
2903     }
2904     type=STOREB_STUB;
2905   }
2906   if (opcode[i]==0x29) { // SH
2907     if(!c||memtarget) {
2908       int x=0,a=temp;
2909       if(!c) a=addr;
2910       if(fastio_reg_override>=0) a=fastio_reg_override;
2911       emit_writehword_indexed(tl,x,a);
2912     }
2913     type=STOREH_STUB;
2914   }
2915   if (opcode[i]==0x2B) { // SW
2916     if(!c||memtarget) {
2917       int a=addr;
2918       if(fastio_reg_override>=0) a=fastio_reg_override;
2919       emit_writeword_indexed(tl,0,a);
2920     }
2921     type=STOREW_STUB;
2922   }
2923   if (opcode[i]==0x3F) { // SD
2924     assert(0);
2925     type=STORED_STUB;
2926   }
2927   if(fastio_reg_override==HOST_TEMPREG)
2928     host_tempreg_release();
2929   if(jaddr) {
2930     // PCSX store handlers don't check invcode again
2931     reglist|=1<<addr;
2932     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2933     jaddr=0;
2934   }
2935   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2936     if(!c||memtarget) {
2937       #ifdef DESTRUCTIVE_SHIFT
2938       // The x86 shift operation is 'destructive'; it overwrites the
2939       // source register, so we need to make a copy first and use that.
2940       addr=temp;
2941       #endif
2942       #if defined(HOST_IMM8)
2943       int ir=get_reg(i_regs->regmap,INVCP);
2944       assert(ir>=0);
2945       emit_cmpmem_indexedsr12_reg(ir,addr,1);
2946       #else
2947       emit_cmpmem_indexedsr12_imm(invalid_code,addr,1);
2948       #endif
2949       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2950       emit_callne(invalidate_addr_reg[addr]);
2951       #else
2952       void *jaddr2 = out;
2953       emit_jne(0);
2954       add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
2955       #endif
2956     }
2957   }
2958   u_int addr_val=constmap[i][s]+offset;
2959   if(jaddr) {
2960     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2961   } else if(c&&!memtarget) {
2962     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
2963   }
2964   // basic current block modification detection..
2965   // not looking back as that should be in mips cache already
2966   // (see Spyro2 title->attract mode)
2967   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
2968     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
2969     assert(i_regs->regmap==regs[i].regmap); // not delay slot
2970     if(i_regs->regmap==regs[i].regmap) {
2971       load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
2972       wb_dirtys(regs[i].regmap_entry,regs[i].wasdirty);
2973       emit_movimm(start+i*4+4,0);
2974       emit_writeword(0,&pcaddr);
2975       emit_addimm(HOST_CCREG,2,HOST_CCREG);
2976       emit_far_call(get_addr_ht);
2977       emit_jmpreg(0);
2978     }
2979   }
2980 }
2981
// Assemble SWL/SWR (store word left/right): writes the 1..4 bytes of
// rs2 that fall inside the aligned word containing rs1+imm.  Emits a
// 4-way dispatch on the low two address bits, a slow-path stub for
// non-RAM targets, and a self-modifying-code check afterwards.
static void storelr_assemble(int i,struct regstat *i_regs)
{
  int s,tl;        // host regs holding base (rs1) and store data (rs2)
  int temp;        // scratch host reg used for the effective address
  int offset;
  void *jaddr=0;   // jump to patch for the STORELR_STUB slow path
  void *case1, *case2, *case3;   // forward jumps into alignment cases 1..3
  void *done0, *done1, *done2;   // jumps from each case to the common exit
  int memtarget=0,c=0;           // c: address is a compile-time constant
  int agr=AGEN1+(i&1);           // address-generation temp for this slot
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rs2[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    c=(i_regs->isconst>>s)&1;
    if(c) {
      // constant address known at compile time: does it land in RAM?
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  // collect live host registers so the stub can save/restore them
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  assert(temp>=0);
  if(!c) {
    // dynamic address: range-check against RAM, branch to stub if outside
    emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
    if(!offset&&s!=temp) emit_mov(s,temp);
    jaddr=out;
    emit_jno(0);
  }
  else
  {
    // constant non-RAM address (or base is $zero): always take the stub
    if(!memtarget||!rs1[i]) {
      jaddr=out;
      emit_jmp(0);
    }
  }
  if(ram_offset)
    emit_addimm_no_flags(ram_offset,temp);

  if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
    assert(0);
  }

  // dispatch on the low two address bits; xor 3 accounts for the
  // little-endian byte numbering used by the cases below
  emit_xorimm(temp,3,temp);
  emit_testimm(temp,2);
  case2=out;
  emit_jne(0);
  emit_testimm(temp,1);
  case1=out;
  emit_jne(0);
  // 0
  if (opcode[i]==0x2A) { // SWL
    emit_writeword_indexed(tl,0,temp);
  }
  else if (opcode[i]==0x2E) { // SWR
    emit_writebyte_indexed(tl,3,temp);
  }
  else
    assert(0);
  done0=out;
  emit_jmp(0);
  // 1
  set_jump_target(case1, out);
  if (opcode[i]==0x2A) { // SWL
    // Write 3 msb into three least significant bytes
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writebyte_indexed(tl,1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl); // restore tl to its original rotation
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  done1=out;
  emit_jmp(0);
  // 2
  set_jump_target(case2, out);
  emit_testimm(temp,1);
  case3=out;
  emit_jne(0);
  if (opcode[i]==0x2A) { // SWL
    // Write two msb into two least significant bytes
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writehword_indexed(tl,-2,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(rs2[i]) emit_rorimm(tl,24,tl);
  }
  done2=out;
  emit_jmp(0);
  // 3
  set_jump_target(case3, out);
  if (opcode[i]==0x2A) { // SWL
    // Write msb into least significant byte
    if(rs2[i]) emit_rorimm(tl,24,tl);
    emit_writebyte_indexed(tl,-3,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  set_jump_target(done0, out);
  set_jump_target(done1, out);
  set_jump_target(done2, out);
  if(!c||!memtarget)
    add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
  // self-modifying-code check: if the stored-to address might hold
  // already-compiled code, call the invalidation stub
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
    emit_addimm_no_flags(-ram_offset,temp); // undo ram_offset for the check
    #if defined(HOST_IMM8)
    // host can't encode the invalid_code base as an immediate; use INVCP reg
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,temp,1);
    #else
    emit_cmpmem_indexedsr12_imm(invalid_code,temp,1);
    #endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[temp]);
    #else
    void *jaddr2 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
    #endif
  }
}
3118
// Assemble coprocessor 0 operations: MFC0, MTC0 and RFE.
// MTC0 to Count/Compare-related or interrupt-related registers
// (copr 9/11/12/13) must sync the cycle counter with the C side, and
// writes to Status/Cause (12/13) may make an interrupt pending, which
// is taken immediately here.
static void cop0_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]==0) // MFC0
  {
    signed char t=get_reg(i_regs->regmap,rt1[i]);
    u_int copr=(source[i]>>11)&0x1f;
    //assert(t>=0); // Why does this happen?  OOT is weird
    if(t>=0&&rt1[i]!=0) {
      emit_readword(&reg_cop0[copr],t);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    signed char s=get_reg(i_regs->regmap,rs1[i]);
    char copr=(source[i]>>11)&0x1f;
    assert(s>=0);
    wb_register(rs1[i],i_regs->regmap,i_regs->dirty);
    if(copr==9||copr==11||copr==12||copr==13) {
      // materialize the current cycle count into Count before the call:
      // Count = last_count + CCREG + cycles consumed so far in this block
      emit_readword(&last_count,HOST_TEMPREG);
      emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc
      emit_add(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
      emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
      emit_writeword(HOST_CCREG,&Count);
    }
    // What a mess.  The status register (12) can enable interrupts,
    // so needs a special case to handle a pending interrupt.
    // The interrupt must be taken immediately, because a subsequent
    // instruction might disable interrupts again.
    if(copr==12||copr==13) {
      if (is_delayslot) {
        // burn cycles to cause cc_interrupt, which will
        // reschedule next_interupt. Relies on CCREG from above.
        assem_debug("MTC0 DS %d\n", copr);
        emit_writeword(HOST_CCREG,&last_count);
        emit_movimm(0,HOST_CCREG);
        emit_storereg(CCREG,HOST_CCREG);
        emit_loadreg(rs1[i],1);     // arg1 = value to write
        emit_movimm(copr,0);        // arg0 = cop0 register number
        emit_far_call(pcsx_mtc0_ds);
        emit_loadreg(rs1[i],s);     // restore clobbered host reg
        return;
      }
      // record resume PC and clear pending_exception so the C handler
      // can flag a newly pending interrupt
      emit_movimm(start+i*4+4,HOST_TEMPREG);
      emit_writeword(HOST_TEMPREG,&pcaddr);
      emit_movimm(0,HOST_TEMPREG);
      emit_writeword(HOST_TEMPREG,&pending_exception);
    }
    // set up args (r0 = copr, r1 = value) and call the C-side handler
    if(s==HOST_CCREG)
      emit_loadreg(rs1[i],1);
    else if(s!=1)
      emit_mov(s,1);
    emit_movimm(copr,0);
    emit_far_call(pcsx_mtc0);
    if(copr==9||copr==11||copr==12||copr==13) {
      // resynchronize CCREG/last_count from the possibly updated
      // Count/next_interupt values
      emit_readword(&Count,HOST_CCREG);
      emit_readword(&next_interupt,HOST_TEMPREG);
      emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
      emit_sub(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
      emit_writeword(HOST_TEMPREG,&last_count);
      emit_storereg(CCREG,HOST_CCREG);
    }
    if(copr==12||copr==13) {
      assert(!is_delayslot);
      // if the write made an exception pending, leave the block via
      // get_addr_ht at the recorded pcaddr
      emit_readword(&pending_exception,14);
      emit_test(14,14);
      void *jaddr = out;
      emit_jeq(0);
      emit_readword(&pcaddr, 0);
      emit_addimm(HOST_CCREG,2,HOST_CCREG);
      emit_far_call(get_addr_ht);
      emit_jmpreg(0);
      set_jump_target(jaddr, out);
    }
    emit_loadreg(rs1[i],s); // reload rs1, clobbered by the call
  }
  else
  {
    assert(opcode2[i]==0x10);
    //if((source[i]&0x3f)==0x10) // RFE
    {
      // RFE: pop the interrupt/kernel-mode stack in Status
      // (shift bits 5..2 down into bits 3..0, keep the rest)
      emit_readword(&Status,0);
      emit_andimm(0,0x3c,1);
      emit_andimm(0,~0xf,0);
      emit_orrshr_imm(1,2,0);
      emit_writeword(0,&Status);
    }
  }
}
3207
3208 static void cop1_unusable(int i,struct regstat *i_regs)
3209 {
3210   // XXX: should just just do the exception instead
3211   //if(!cop1_usable)
3212   {
3213     void *jaddr=out;
3214     emit_jmp(0);
3215     add_stub_r(FP_STUB,jaddr,out,i,0,i_regs,is_delayslot,0);
3216   }
3217 }
3218
// COP1 arithmetic/move instruction: always unusable on the PSX.
static void cop1_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3223
// COP1 load/store (LWC1/SWC1): always unusable on the PSX.
static void c1ls_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3228
// FP_STUB
// Out-of-line tail for cop1_unusable: write back dirty registers,
// load the cycle count and faulting PC, then jump to the C-side
// coprocessor-unusable exception handler.
static void do_cop1stub(int n)
{
  literal_pool(256);
  assem_debug("do_cop1stub %x\n",start+stubs[n].a*4);
  set_jump_target(stubs[n].addr, out);
  int i=stubs[n].a;                 // index of the faulting instruction
//  int rs=stubs[n].b;
  struct regstat *i_regs=(struct regstat *)stubs[n].c;
  int ds=stubs[n].d;                // nonzero if it was in a delay slot
  if(!ds) {
    load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
    //if(i_regs!=&regs[i]) printf("oops: regs[i]=%x i_regs=%x",(int)&regs[i],(int)i_regs);
  }
  //else {printf("fp exception in delay slot\n");}
  wb_dirtys(i_regs->regmap_entry,i_regs->wasdirty);
  if(regs[i].regmap_entry[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
  // for a delay-slot fault, report the branch's PC (i-ds)
  emit_movimm(start+(i-ds)*4,EAX); // Get PC
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
  emit_far_jump(ds?fp_exception_ds:fp_exception);
}
3250
// Emit code reading GTE data register 'copr' into host register tl,
// applying the read-side conversions the GTE performs: some registers
// read back sign-extended to 16 bits, some zero-extended, and a few
// have special behavior.  'temp' is a scratch host reg (may be -1 when
// the chosen case does not need it).
static void cop2_get_dreg(u_int copr,signed char tl,signed char temp)
{
  switch (copr) {
    case 1:
    case 3:
    case 5:
    case 8:
    case 9:
    case 10:
    case 11:
      // 16-bit signed registers: reads return the sign-extended value
      emit_readword(&reg_cop2d[copr],tl);
      emit_signextend16(tl,tl);
      emit_writeword(tl,&reg_cop2d[copr]); // hmh
      break;
    case 7:
    case 16:
    case 17:
    case 18:
    case 19:
      // 16-bit unsigned registers: reads return the low halfword
      emit_readword(&reg_cop2d[copr],tl);
      emit_andimm(tl,0xffff,tl);
      emit_writeword(tl,&reg_cop2d[copr]);
      break;
    case 15:
      // SXYP reads mirror SXY2
      emit_readword(&reg_cop2d[14],tl); // SXY2
      emit_writeword(tl,&reg_cop2d[copr]);
      break;
    case 28:
    case 29:
      // IRGB/ORGB: computed from IR1-IR3 by a dedicated helper
      c2op_mfc2_29_assemble(tl,temp);
      break;
    default:
      // plain 32-bit register, no conversion
      emit_readword(&reg_cop2d[copr],tl);
      break;
  }
}
3287
// Emit code writing host register sl into GTE data register 'copr',
// applying the write-side side effects: SXYP pushes the screen-XY FIFO,
// IRGB expands a packed color into IR1-IR3, and LZCS computes the
// leading-zero/sign count into LZCR.  'temp' is a scratch host reg.
static void cop2_put_dreg(u_int copr,signed char sl,signed char temp)
{
  switch (copr) {
    case 15:
      // SXYP write pushes the FIFO: SXY0<=SXY1, SXY1<=SXY2, SXY2<=value
      emit_readword(&reg_cop2d[13],temp);  // SXY1
      emit_writeword(sl,&reg_cop2d[copr]);
      emit_writeword(temp,&reg_cop2d[12]); // SXY0
      emit_readword(&reg_cop2d[14],temp);  // SXY2
      emit_writeword(sl,&reg_cop2d[14]);
      emit_writeword(temp,&reg_cop2d[13]); // SXY1
      break;
    case 28:
      // IRGB write unpacks 5:5:5 color into IR1/IR2/IR3 (regs 9-11)
      emit_andimm(sl,0x001f,temp);
      emit_shlimm(temp,7,temp);
      emit_writeword(temp,&reg_cop2d[9]);
      emit_andimm(sl,0x03e0,temp);
      emit_shlimm(temp,2,temp);
      emit_writeword(temp,&reg_cop2d[10]);
      emit_andimm(sl,0x7c00,temp);
      emit_shrimm(temp,3,temp);
      emit_writeword(temp,&reg_cop2d[11]);
      emit_writeword(sl,&reg_cop2d[28]);
      break;
    case 30:
      // LZCS write: LZCR (reg 31) = leading-zero count of the value,
      // counting leading ones for negative values (xor with sign first)
      emit_xorsar_imm(sl,sl,31,temp);
#if defined(HAVE_ARMV5) || defined(__aarch64__)
      emit_clz(temp,temp);
#else
      // no CLZ instruction: count by shifting left until the sign bit sets
      emit_movs(temp,HOST_TEMPREG);
      emit_movimm(0,temp);
      emit_jeq((int)out+4*4);
      emit_addpl_imm(temp,1,temp);
      emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG);
      emit_jns((int)out-2*4);
#endif
      emit_writeword(sl,&reg_cop2d[30]);
      emit_writeword(temp,&reg_cop2d[31]);
      break;
    case 31:
      // LZCR is read-only
      break;
    default:
      emit_writeword(sl,&reg_cop2d[copr]);
      break;
  }
}
3333
// Assemble LWC2/SWC2 (GTE data register load/store through memory).
// Shares the fast-path/slow-path structure of the scalar load/store
// assemblers: RAM accesses go inline, everything else via a stub.
static void c2ls_assemble(int i,struct regstat *i_regs)
{
  int s,tl;                // host regs: base (rs1) and data (FTEMP)
  int ar;                  // host reg holding the effective address
  int offset;
  int memtarget=0,c=0;     // c: address is a compile-time constant
  void *jaddr2=NULL;       // patch point for the slow-path stub
  enum stub_type type;
  int agr=AGEN1+(i&1);     // address-generation temp for this slot
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  u_int copr=(source[i]>>16)&0x1f;   // GTE data register number
  s=get_reg(i_regs->regmap,rs1[i]);
  tl=get_reg(i_regs->regmap,FTEMP);
  offset=imm[i];
  assert(rs1[i]>0);
  assert(tl>=0);

  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG)
    reglist&=~(1<<HOST_CCREG);

  // get the address
  if (opcode[i]==0x3a) { // SWC2
    ar=get_reg(i_regs->regmap,agr);
    if(ar<0) ar=get_reg(i_regs->regmap,-1);
    reglist|=1<<ar;
  } else { // LWC2
    ar=tl;     // loaded value overwrites the address reg
  }
  if(s>=0) c=(i_regs->wasconst>>s)&1;
  memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
  if (!offset&&!c&&s>=0) ar=s;   // use the base reg directly when possible
  assert(ar>=0);

  if (opcode[i]==0x3a) { // SWC2
    // fetch the GTE register value (with read conversions) into tl
    cop2_get_dreg(copr,tl,-1);
    type=STOREW_STUB;
  }
  else
    type=LOADW_STUB;

  if(c&&!memtarget) {
    // constant non-RAM address: always take the stub
    jaddr2=out;
    emit_jmp(0); // inline_readstub/inline_writestub?
  }
  else {
    if(!c) {
      // dynamic address: range check, fall to stub when outside RAM
      jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
    }
    else if(ram_offset&&memtarget) {
      host_tempreg_acquire();
      emit_addimm(ar,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    if (opcode[i]==0x32) { // LWC2
      int a=ar;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_readword_indexed(0,a,tl);
    }
    if (opcode[i]==0x3a) { // SWC2
      #ifdef DESTRUCTIVE_SHIFT
      if(!offset&&!c&&s>=0) emit_mov(s,ar);
      #endif
      int a=ar;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
  }
  if(fastio_reg_override==HOST_TEMPREG)
    host_tempreg_release();
  if(jaddr2)
    add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
  // self-modifying-code check for stores (same pattern as store_assemble)
  if(opcode[i]==0x3a) // SWC2
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
#if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,ar,1);
#else
    emit_cmpmem_indexedsr12_imm(invalid_code,ar,1);
#endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[ar]);
    #else
    void *jaddr3 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
    #endif
  }
  if (opcode[i]==0x32) { // LWC2
    // store the loaded word into the GTE register (with write side effects)
    host_tempreg_acquire();
    cop2_put_dreg(copr,tl,HOST_TEMPREG);
    host_tempreg_release();
  }
}
3432
3433 static void cop2_assemble(int i,struct regstat *i_regs)
3434 {
3435   u_int copr=(source[i]>>11)&0x1f;
3436   signed char temp=get_reg(i_regs->regmap,-1);
3437   if (opcode2[i]==0) { // MFC2
3438     signed char tl=get_reg(i_regs->regmap,rt1[i]);
3439     if(tl>=0&&rt1[i]!=0)
3440       cop2_get_dreg(copr,tl,temp);
3441   }
3442   else if (opcode2[i]==4) { // MTC2
3443     signed char sl=get_reg(i_regs->regmap,rs1[i]);
3444     cop2_put_dreg(copr,sl,temp);
3445   }
3446   else if (opcode2[i]==2) // CFC2
3447   {
3448     signed char tl=get_reg(i_regs->regmap,rt1[i]);
3449     if(tl>=0&&rt1[i]!=0)
3450       emit_readword(&reg_cop2c[copr],tl);
3451   }
3452   else if (opcode2[i]==6) // CTC2
3453   {
3454     signed char sl=get_reg(i_regs->regmap,rs1[i]);
3455     switch(copr) {
3456       case 4:
3457       case 12:
3458       case 20:
3459       case 26:
3460       case 27:
3461       case 29:
3462       case 30:
3463         emit_signextend16(sl,temp);
3464         break;
3465       case 31:
3466         c2op_ctc2_31_assemble(sl,temp);
3467         break;
3468       default:
3469         temp=sl;
3470         break;
3471     }
3472     emit_writeword(temp,&reg_cop2c[copr]);
3473     assert(sl>=0);
3474   }
3475 }
3476
// Out-of-line slow path for SWL/SWR taken when the address is outside
// RAM: saves live registers and calls the C-side SWL/SWR memory handler
// with the cycle count adjusted across the call.
static void do_unalignedwritestub(int n)
{
  assem_debug("do_unalignedwritestub %x\n",start+stubs[n].a*4);
  literal_pool(256);
  set_jump_target(stubs[n].addr, out);

  int i=stubs[n].a;                              // instruction index
  struct regstat *i_regs=(struct regstat *)stubs[n].c;
  int addr=stubs[n].b;                           // host reg with address
  u_int reglist=stubs[n].e;                      // live host registers
  signed char *i_regmap=i_regs->regmap;
  int temp2=get_reg(i_regmap,FTEMP);
  int rt;
  rt=get_reg(i_regmap,rs2[i]);                   // host reg with store data
  assert(rt>=0);
  assert(addr>=0);
  assert(opcode[i]==0x2a||opcode[i]==0x2e); // SWL/SWR only implemented
  reglist|=(1<<addr);
  reglist&=~(1<<temp2);

#if 1
  // don't bother with it and call write handler
  save_regs(reglist);
  pass_args(addr,rt);
  int cc=get_reg(i_regmap,CCREG);
  if(cc<0)
    emit_loadreg(CCREG,2);
  // pass the cycle count up to and including this instruction...
  emit_addimm(cc<0?2:cc,CLOCK_ADJUST((int)stubs[n].d+1),2);
  emit_far_call((opcode[i]==0x2a?jump_handle_swl:jump_handle_swr));
  // ...and subtract it back out of the (possibly updated) count on return
  emit_addimm(0,-CLOCK_ADJUST((int)stubs[n].d+1),cc<0?2:cc);
  if(cc<0)
    emit_storereg(CCREG,2);
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
#else
  // (disabled) old inline read-modify-write implementation
  emit_andimm(addr,0xfffffffc,temp2);
  emit_writeword(temp2,&address);

  save_regs(reglist);
  emit_shrimm(addr,16,1);
  int cc=get_reg(i_regmap,CCREG);
  if(cc<0) {
    emit_loadreg(CCREG,2);
  }
  emit_movimm((u_int)readmem,0);
  emit_addimm(cc<0?2:cc,2*stubs[n].d+2,2);
  emit_call((int)&indirect_jump_indexed);
  restore_regs(reglist);

  emit_readword(&readmem_dword,temp2);
  int temp=addr; //hmh
  emit_shlimm(addr,3,temp);
  emit_andimm(temp,24,temp);
  if (opcode[i]==0x2a) // SWL
    emit_xorimm(temp,24,temp);
  emit_movimm(-1,HOST_TEMPREG);
  if (opcode[i]==0x2a) { // SWL
    emit_bic_lsr(temp2,HOST_TEMPREG,temp,temp2);
    emit_orrshr(rt,temp,temp2);
  }else{
    emit_bic_lsl(temp2,HOST_TEMPREG,temp,temp2);
    emit_orrshl(rt,temp,temp2);
  }
  emit_readword(&address,addr);
  emit_writeword(temp2,&word);
  //save_regs(reglist); // don't need to, no state changes
  emit_shrimm(addr,16,1);
  emit_movimm((u_int)writemem,0);
  //emit_call((int)&indirect_jump_indexed);
  emit_mov(15,14);
  emit_readword_dualindexedx4(0,1,15);
  emit_readword(&Count,HOST_TEMPREG);
  emit_readword(&next_interupt,2);
  emit_addimm(HOST_TEMPREG,-2*stubs[n].d-2,HOST_TEMPREG);
  emit_writeword(2,&last_count);
  emit_sub(HOST_TEMPREG,2,cc<0?HOST_TEMPREG:cc);
  if(cc<0) {
    emit_storereg(CCREG,HOST_TEMPREG);
  }
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
#endif
}
3560
#ifndef multdiv_assemble
// Fallback when the target backend does not provide its own
// MULT/MULTU/DIV/DIVU assembler: unsupported, abort at runtime.
void multdiv_assemble(int i,struct regstat *i_regs)
{
  printf("Need multdiv_assemble for this architecture.\n");
  abort();
}
#endif
3568
3569 static void mov_assemble(int i,struct regstat *i_regs)
3570 {
3571   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3572   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3573   if(rt1[i]) {
3574     signed char sl,tl;
3575     tl=get_reg(i_regs->regmap,rt1[i]);
3576     //assert(tl>=0);
3577     if(tl>=0) {
3578       sl=get_reg(i_regs->regmap,rs1[i]);
3579       if(sl>=0) emit_mov(sl,tl);
3580       else emit_loadreg(rs1[i],tl);
3581     }
3582   }
3583 }
3584
// call interpreter, exception handler, things that change pc/regs/cycles ...
// Synchronizes psxRegs.pc and psxRegs.cycle from the JIT state, calls
// 'func', then leaves the block via jump_to_new_pc to resume at
// whatever PC the handler set.  Requires CCREG to be live in HOST_CCREG
// and must not be used in a delay slot.
static void call_c_cpu_handler(int i, const struct regstat *i_regs, u_int pc, void *func)
{
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  (void)ccreg;

  emit_movimm(pc,3); // Get PC
  emit_readword(&last_count,2);
  emit_writeword(3,&psxRegs.pc);
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
  emit_add(2,HOST_CCREG,2);                 // cycle = last_count + cc
  emit_writeword(2,&psxRegs.cycle);
  emit_far_call(func);
  emit_far_jump(jump_to_new_pc);
}
3602
// SYSCALL: raise exception 0x20 through the C-side psxException handler
// at this instruction's PC.
static void syscall_assemble(int i,struct regstat *i_regs)
{
  emit_movimm(0x20,0); // cause code
  emit_movimm(0,1);    // not in delay slot
  call_c_cpu_handler(i,i_regs,start+i*4,psxException);
}
3609
3610 static void hlecall_assemble(int i,struct regstat *i_regs)
3611 {
3612   void *hlefunc = psxNULL;
3613   uint32_t hleCode = source[i] & 0x03ffffff;
3614   if (hleCode < ARRAY_SIZE(psxHLEt))
3615     hlefunc = psxHLEt[hleCode];
3616
3617   call_c_cpu_handler(i,i_regs,start+i*4+4,hlefunc);
3618 }
3619
// Fall back to the interpreter (execI) for this instruction.
static void intcall_assemble(int i,struct regstat *i_regs)
{
  call_c_cpu_handler(i,i_regs,start+i*4,execI);
}
3624
3625 static void speculate_mov(int rs,int rt)
3626 {
3627   if(rt!=0) {
3628     smrv_strong_next|=1<<rt;
3629     smrv[rt]=smrv[rs];
3630   }
3631 }
3632
3633 static void speculate_mov_weak(int rs,int rt)
3634 {
3635   if(rt!=0) {
3636     smrv_weak_next|=1<<rt;
3637     smrv[rt]=smrv[rs];
3638   }
3639 }
3640
// Track speculated guest register values (smrv) across the block so
// memory access code can guess target regions.  "strong" speculation
// comes from known constants/moves; "weak" comes from the initial
// register snapshot.  Instructions with unpredictable results clear
// the speculation for their destination.
static void speculate_register_values(int i)
{
  if(i==0) {
    // seed from the current CPU state at block entry
    memcpy(smrv,psxRegs.GPR.r,sizeof(smrv));
    // gp,sp are likely to stay the same throughout the block
    smrv_strong_next=(1<<28)|(1<<29)|(1<<30);
    smrv_weak_next=~smrv_strong_next;
    //printf(" llr %08x\n", smrv[4]);
  }
  smrv_strong=smrv_strong_next;
  smrv_weak=smrv_weak_next;
  switch(itype[i]) {
    case ALU:
      // result inherits speculation from whichever source is speculated
      if     ((smrv_strong>>rs1[i])&1) speculate_mov(rs1[i],rt1[i]);
      else if((smrv_strong>>rs2[i])&1) speculate_mov(rs2[i],rt1[i]);
      else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]);
      else if((smrv_weak>>rs2[i])&1) speculate_mov_weak(rs2[i],rt1[i]);
      else {
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
    case SHIFTIMM:
      smrv_strong_next&=~(1<<rt1[i]);
      smrv_weak_next&=~(1<<rt1[i]);
      // fallthrough
    case IMM16:
      if(rt1[i]&&is_const(&regs[i],rt1[i])) {
        // exact constant known: record it as a strong speculation
        int value,hr=get_reg(regs[i].regmap,rt1[i]);
        if(hr>=0) {
          if(get_final_value(hr,i,&value))
               smrv[rt1[i]]=value;
          else smrv[rt1[i]]=constmap[i][hr];
          smrv_strong_next|=1<<rt1[i];
        }
      }
      else {
        if     ((smrv_strong>>rs1[i])&1) speculate_mov(rs1[i],rt1[i]);
        else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]);
      }
      break;
    case LOAD:
      if(start<0x2000&&(rt1[i]==26||(smrv[rt1[i]]>>24)==0xa0)) {
        // special case for BIOS
        smrv[rt1[i]]=0xa0000000;
        smrv_strong_next|=1<<rt1[i];
        break;
      }
      // fallthrough
    case SHIFT:
    case LOADLR:
    case MOV:
      // result unpredictable: clear speculation for the destination
      smrv_strong_next&=~(1<<rt1[i]);
      smrv_weak_next&=~(1<<rt1[i]);
      break;
    case COP0:
    case COP2:
      if(opcode2[i]==0||opcode2[i]==2) { // MFC/CFC
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
    case C2LS:
      if (opcode[i]==0x32) { // LWC2
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
  }
#if 0
  int r=4;
  printf("x %08x %08x %d %d c %08x %08x\n",smrv[r],start+i*4,
    ((smrv_strong>>r)&1),(smrv_weak>>r)&1,regs[i].isconst,regs[i].wasconst);
#endif
}
3716
// Assemble instruction i as a branch delay slot: same per-type dispatch
// as the main assembler, but with is_delayslot set so the handlers can
// adjust their behavior (e.g. exception PC reporting).
static void ds_assemble(int i,struct regstat *i_regs)
{
  speculate_register_values(i);
  is_delayslot=1;
  switch(itype[i]) {
    case ALU:
      alu_assemble(i,i_regs);break;
    case IMM16:
      imm16_assemble(i,i_regs);break;
    case SHIFT:
      shift_assemble(i,i_regs);break;
    case SHIFTIMM:
      shiftimm_assemble(i,i_regs);break;
    case LOAD:
      load_assemble(i,i_regs);break;
    case LOADLR:
      loadlr_assemble(i,i_regs);break;
    case STORE:
      store_assemble(i,i_regs);break;
    case STORELR:
      storelr_assemble(i,i_regs);break;
    case COP0:
      cop0_assemble(i,i_regs);break;
    case COP1:
      cop1_assemble(i,i_regs);break;
    case C1LS:
      c1ls_assemble(i,i_regs);break;
    case COP2:
      cop2_assemble(i,i_regs);break;
    case C2LS:
      c2ls_assemble(i,i_regs);break;
    case C2OP:
      c2op_assemble(i,i_regs);break;
    case MULTDIV:
      multdiv_assemble(i,i_regs);break;
    case MOV:
      mov_assemble(i,i_regs);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      // branches/calls cannot themselves sit in a delay slot
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  is_delayslot=0;
}
3766
3767 // Is the branch target a valid internal jump?
3768 static int internal_branch(int addr)
3769 {
3770   if(addr&1) return 0; // Indirect (register) jump
3771   if(addr>=start && addr<start+slen*4-4)
3772   {
3773     return 1;
3774   }
3775   return 0;
3776 }
3777
3778 static void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t u)
3779 {
3780   int hr;
3781   for(hr=0;hr<HOST_REGS;hr++) {
3782     if(hr!=EXCLUDE_REG) {
3783       if(pre[hr]!=entry[hr]) {
3784         if(pre[hr]>=0) {
3785           if((dirty>>hr)&1) {
3786             if(get_reg(entry,pre[hr])<0) {
3787               assert(pre[hr]<64);
3788               if(!((u>>pre[hr])&1))
3789                 emit_storereg(pre[hr],hr);
3790             }
3791           }
3792         }
3793       }
3794     }
3795   }
3796   // Move from one register to another (no writeback)
3797   for(hr=0;hr<HOST_REGS;hr++) {
3798     if(hr!=EXCLUDE_REG) {
3799       if(pre[hr]!=entry[hr]) {
3800         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3801           int nr;
3802           if((nr=get_reg(entry,pre[hr]))>=0) {
3803             emit_mov(hr,nr);
3804           }
3805         }
3806       }
3807     }
3808   }
3809 }
3810
3811 // Load the specified registers
3812 // This only loads the registers given as arguments because
3813 // we don't want to load things that will be overwritten
3814 static void load_regs(signed char entry[],signed char regmap[],int rs1,int rs2)
3815 {
3816   int hr;
3817   // Load 32-bit regs
3818   for(hr=0;hr<HOST_REGS;hr++) {
3819     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3820       if(entry[hr]!=regmap[hr]) {
3821         if(regmap[hr]==rs1||regmap[hr]==rs2)
3822         {
3823           if(regmap[hr]==0) {
3824             emit_zeroreg(hr);
3825           }
3826           else
3827           {
3828             emit_loadreg(regmap[hr],hr);
3829           }
3830         }
3831       }
3832     }
3833   }
3834 }
3835
3836 // Load registers prior to the start of a loop
3837 // so that they are not loaded within the loop
3838 static void loop_preload(signed char pre[],signed char entry[])
3839 {
3840   int hr;
3841   for(hr=0;hr<HOST_REGS;hr++) {
3842     if(hr!=EXCLUDE_REG) {
3843       if(pre[hr]!=entry[hr]) {
3844         if(entry[hr]>=0) {
3845           if(get_reg(pre,entry[hr])<0) {
3846             assem_debug("loop preload:\n");
3847             //printf("loop preload: %d\n",hr);
3848             if(entry[hr]==0) {
3849               emit_zeroreg(hr);
3850             }
3851             else if(entry[hr]<TEMPREG)
3852             {
3853               emit_loadreg(entry[hr],hr);
3854             }
3855             else if(entry[hr]-64<TEMPREG)
3856             {
3857               emit_loadreg(entry[hr],hr);
3858             }
3859           }
3860         }
3861       }
3862     }
3863   }
3864 }
3865
3866 // Generate address for load/store instruction
3867 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
// Generate address for load/store instruction
// goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
// Picks the destination host register per instruction type, then emits
// the base+offset computation (or an immediate when the address is a
// known constant).  Also preloads the next instruction's constant
// address when its AGEN register is already allocated.
void address_generation(int i,struct regstat *i_regs,signed char entry[])
{
  if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
    int ra=-1;                 // host reg to receive the address
    int agr=AGEN1+(i&1);       // address-generation temp for this slot
    if(itype[i]==LOAD) {
      // loads compute the address directly into the destination reg
      ra=get_reg(i_regs->regmap,rt1[i]);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
      assert(ra>=0);
    }
    if(itype[i]==LOADLR) {
      ra=get_reg(i_regs->regmap,FTEMP);
    }
    if(itype[i]==STORE||itype[i]==STORELR) {
      ra=get_reg(i_regs->regmap,agr);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
    }
    if(itype[i]==C1LS||itype[i]==C2LS) {
      if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
        ra=get_reg(i_regs->regmap,FTEMP);
      else { // SWC1/SDC1/SWC2/SDC2
        ra=get_reg(i_regs->regmap,agr);
        if(ra<0) ra=get_reg(i_regs->regmap,-1);
      }
    }
    int rs=get_reg(i_regs->regmap,rs1[i]);
    if(ra>=0) {
      int offset=imm[i];
      int c=(i_regs->wasconst>>rs)&1;   // base reg is a known constant
      if(rs1[i]==0) {
        // Using r0 as a base address
        if(!entry||entry[ra]!=agr) {
          // LWL/LWR and LDL/LDR use the word/doubleword-aligned address
          if (opcode[i]==0x22||opcode[i]==0x26) {
            emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
          }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
            emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
          }else{
            emit_movimm(offset,ra);
          }
        } // else did it in the previous cycle
      }
      else if(rs<0) {
        // base not in a host register: load it from memory
        if(!entry||entry[ra]!=rs1[i])
          emit_loadreg(rs1[i],ra);
        //if(!entry||entry[ra]!=rs1[i])
        //  printf("poor load scheduling!\n");
      }
      else if(c) {
        // constant base: materialize base+offset as an immediate
        if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
          if(!entry||entry[ra]!=agr) {
            if (opcode[i]==0x22||opcode[i]==0x26) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
            }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
            }else{
              emit_movimm(constmap[i][rs]+offset,ra);
              regs[i].loadedconst|=1<<ra;
            }
          } // else did it in the previous cycle
        } // else load_consts already did it
      }
      // non-constant base with a nonzero offset: add it at runtime
      if(offset&&!c&&rs1[i]) {
        if(rs>=0) {
          emit_addimm(rs,offset,ra);
        }else{
          emit_addimm(ra,offset,ra);
        }
      }
    }
  }
  // Preload constants for next instruction
  if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
    int agr,ra;
    // Actual address
    agr=AGEN1+((i+1)&1);
    ra=get_reg(i_regs->regmap,agr);
    if(ra>=0) {
      int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
      int offset=imm[i+1];
      int c=(regs[i+1].wasconst>>rs)&1;
      if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(constmap[i+1][rs]+offset,ra);
          regs[i+1].loadedconst|=1<<ra;
        }
      }
      else if(rs1[i+1]==0) {
        // Using r0 as a base address
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(offset,ra);
        }
      }
    }
  }
}
3971
// Determine the constant value host register hr will finally hold over
// its constant live range starting at instruction i.  Follows the chain
// while the same guest reg stays constant in the same host reg and no
// branch target intervenes.  Returns 1 if the value in *value can be
// loaded immediately, 0 if only the intermediate constant is available
// (and the guest reg is still needed afterwards).
static int get_final_value(int hr, int i, int *value)
{
  int reg=regs[i].regmap[hr];
  // walk forward while the mapping and constness persist
  while(i<slen-1) {
    if(regs[i+1].regmap[hr]!=reg) break;
    if(!((regs[i+1].isconst>>hr)&1)) break;
    if(bt[i+1]) break;   // stop at branch targets
    i++;
  }
  if(i<slen-1) {
    if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
      *value=constmap[i][hr];
      return 1;
    }
    if(!bt[i+1]) {
      if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
        // Load in delay slot, out-of-order execution
        if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
        {
          // Precompute load address
          *value=constmap[i][hr]+imm[i+2];
          return 1;
        }
      }
      if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
      {
        // Precompute load address
        *value=constmap[i][hr]+imm[i+1];
        //printf("c=%x imm=%lx\n",(long)constmap[i][hr],imm[i+1]);
        return 1;
      }
    }
  }
  *value=constmap[i][hr];
  //printf("c=%lx\n",(long)constmap[i][hr]);
  if(i==slen-1) return 1;
  assert(reg < 64);
  // otherwise the value only matters if the guest reg is still needed
  return !((unneeded_reg[i+1]>>reg)&1);
}
4011
// Load registers with known constants
// Emits movimm/zeroreg so every host register marked const in regs[i]
// actually holds its value, skipping those already materialized on the
// path leading here (tracked in the loadedconst bitmap).
static void load_consts(signed char pre[],signed char regmap[],int i)
{
  int hr,hr2;
  // propagate loaded constant flags
  if(i==0||bt[i])
    regs[i].loadedconst=0; // block entry / branch target: nothing loaded yet
  else {
    for(hr=0;hr<HOST_REGS;hr++) {
      // A constant stays loaded when the same MIPS reg kept the same host
      // reg across the previous instruction and was loaded there already
      if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
         &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
      {
        regs[i].loadedconst|=1<<hr;
      }
    }
  }
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(!((regs[i].loadedconst>>hr)&1)) {
        assert(regmap[hr]<64);
        if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
          int value,similar=0;
          if(get_final_value(hr,i,&value)) {
            // see if some other register has similar value
            for(hr2=0;hr2<HOST_REGS;hr2++) {
              if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
                if(is_similar_value(value,constmap[i][hr2])) {
                  similar=1;
                  break;
                }
              }
            }
            if(similar) {
              // Derive the constant from the similar one (shorter encoding)
              int value2;
              if(get_final_value(hr2,i,&value2)) // is this needed?
                emit_movimm_from(value2,hr2,value,hr);
              else
                emit_movimm(value,hr);
            }
            else if(value==0) {
              emit_zeroreg(hr);
            }
            else {
              emit_movimm(value,hr);
            }
          }
          // Mark it loaded even when get_final_value said to skip the load
          regs[i].loadedconst|=1<<hr;
        }
      }
    }
  }
}
4066
4067 void load_all_consts(signed char regmap[], u_int dirty, int i)
4068 {
4069   int hr;
4070   // Load 32-bit regs
4071   for(hr=0;hr<HOST_REGS;hr++) {
4072     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4073       assert(regmap[hr] < 64);
4074       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
4075         int value=constmap[i][hr];
4076         if(value==0) {
4077           emit_zeroreg(hr);
4078         }
4079         else {
4080           emit_movimm(value,hr);
4081         }
4082       }
4083     }
4084   }
4085 }
4086
4087 // Write out all dirty registers (except cycle count)
4088 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty)
4089 {
4090   int hr;
4091   for(hr=0;hr<HOST_REGS;hr++) {
4092     if(hr!=EXCLUDE_REG) {
4093       if(i_regmap[hr]>0) {
4094         if(i_regmap[hr]!=CCREG) {
4095           if((i_dirty>>hr)&1) {
4096             assert(i_regmap[hr]<64);
4097             emit_storereg(i_regmap[hr],hr);
4098           }
4099         }
4100       }
4101     }
4102   }
4103 }
4104
4105 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4106 // This writes the registers not written by store_regs_bt
4107 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr)
4108 {
4109   int hr;
4110   int t=(addr-start)>>2;
4111   for(hr=0;hr<HOST_REGS;hr++) {
4112     if(hr!=EXCLUDE_REG) {
4113       if(i_regmap[hr]>0) {
4114         if(i_regmap[hr]!=CCREG) {
4115           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1)) {
4116             if((i_dirty>>hr)&1) {
4117               assert(i_regmap[hr]<64);
4118               emit_storereg(i_regmap[hr],hr);
4119             }
4120           }
4121         }
4122       }
4123     }
4124   }
4125 }
4126
4127 // Load all registers (except cycle count)
4128 void load_all_regs(signed char i_regmap[])
4129 {
4130   int hr;
4131   for(hr=0;hr<HOST_REGS;hr++) {
4132     if(hr!=EXCLUDE_REG) {
4133       if(i_regmap[hr]==0) {
4134         emit_zeroreg(hr);
4135       }
4136       else
4137       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4138       {
4139         emit_loadreg(i_regmap[hr],hr);
4140       }
4141     }
4142   }
4143 }
4144
4145 // Load all current registers also needed by next instruction
4146 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4147 {
4148   int hr;
4149   for(hr=0;hr<HOST_REGS;hr++) {
4150     if(hr!=EXCLUDE_REG) {
4151       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4152         if(i_regmap[hr]==0) {
4153           emit_zeroreg(hr);
4154         }
4155         else
4156         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4157         {
4158           emit_loadreg(i_regmap[hr],hr);
4159         }
4160       }
4161     }
4162   }
4163 }
4164
4165 // Load all regs, storing cycle count if necessary
4166 void load_regs_entry(int t)
4167 {
4168   int hr;
4169   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4170   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4171   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4172     emit_storereg(CCREG,HOST_CCREG);
4173   }
4174   // Load 32-bit regs
4175   for(hr=0;hr<HOST_REGS;hr++) {
4176     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4177       if(regs[t].regmap_entry[hr]==0) {
4178         emit_zeroreg(hr);
4179       }
4180       else if(regs[t].regmap_entry[hr]!=CCREG)
4181       {
4182         emit_loadreg(regs[t].regmap_entry[hr],hr);
4183       }
4184     }
4185   }
4186 }
4187
4188 // Store dirty registers prior to branch
4189 void store_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4190 {
4191   if(internal_branch(addr))
4192   {
4193     int t=(addr-start)>>2;
4194     int hr;
4195     for(hr=0;hr<HOST_REGS;hr++) {
4196       if(hr!=EXCLUDE_REG) {
4197         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4198           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1)) {
4199             if((i_dirty>>hr)&1) {
4200               assert(i_regmap[hr]<64);
4201               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4202                 emit_storereg(i_regmap[hr],hr);
4203             }
4204           }
4205         }
4206       }
4207     }
4208   }
4209   else
4210   {
4211     // Branch out of this block, write out all dirty regs
4212     wb_dirtys(i_regmap,i_dirty);
4213   }
4214 }
4215
4216 // Load all needed registers for branch target
4217 static void load_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4218 {
4219   //if(addr>=start && addr<(start+slen*4))
4220   if(internal_branch(addr))
4221   {
4222     int t=(addr-start)>>2;
4223     int hr;
4224     // Store the cycle count before loading something else
4225     if(i_regmap[HOST_CCREG]!=CCREG) {
4226       assert(i_regmap[HOST_CCREG]==-1);
4227     }
4228     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4229       emit_storereg(CCREG,HOST_CCREG);
4230     }
4231     // Load 32-bit regs
4232     for(hr=0;hr<HOST_REGS;hr++) {
4233       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4234         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4235           if(regs[t].regmap_entry[hr]==0) {
4236             emit_zeroreg(hr);
4237           }
4238           else if(regs[t].regmap_entry[hr]!=CCREG)
4239           {
4240             emit_loadreg(regs[t].regmap_entry[hr],hr);
4241           }
4242         }
4243       }
4244     }
4245   }
4246 }
4247
// Check whether the register state at a branch matches the allocation
// expected at the branch target, i.e. whether we can jump straight to the
// target's entry point without writeback/reload fixup code.
// Returns 1 on a match, 0 when fixup (store_regs_bt/load_regs_bt) is needed.
static int match_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
{
  if(addr>=start && addr<start+slen*4-4)
  {
    // Internal branch: compare against the target's entry allocation
    int t=(addr-start)>>2;
    int hr;
    // Target must expect the cycle count in HOST_CCREG
    if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]!=regs[t].regmap_entry[hr])
        {
          // Different register in this slot: mismatch if the target
          // actually needs a real register loaded here
          if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
          {
            return 0;
          }
          else
          if((i_dirty>>hr)&1)
          {
            // Our dirty value would be lost unless the target doesn't need it
            if(i_regmap[hr]<TEMPREG)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
                return 0;
            }
            else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
            {
              assert(0);
            }
          }
        }
        else // Same register but is it 32-bit or dirty?
        if(i_regmap[hr]>=0)
        {
          if(!((regs[t].dirty>>hr)&1))
          {
            // Target expects it clean; ours is dirty and still needed -> no match
            if((i_dirty>>hr)&1)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
              {
                //printf("%x: dirty no match\n",addr);
                return 0;
              }
            }
          }
        }
      }
    }
    // Delay slots are not valid branch targets
    //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP)) return 0;
    // Delay slots require additional processing, so do not match
    if(is_ds[t]) return 0;
  }
  else
  {
    // External branch: everything must be written back, so any dirty
    // register (other than the cycle count) prevents a direct match
    int hr;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]>=0)
        {
          if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
          {
            if((i_dirty>>hr)&1)
            {
              return 0;
            }
          }
        }
      }
    }
  }
  return 1;
}
4323
#ifdef DRC_DBG
// Debug aid: emit a call to do_insn_cmp before instruction i so the
// dynarec's state can be compared against the interpreter's.
// Compiles to nothing (empty macro below) unless DRC_DBG is defined.
static void drc_dbg_emit_do_cmp(int i)
{
  extern void do_insn_cmp();
  //extern int cycle;
  u_int hr,reglist=0;

  assem_debug("//do_insn_cmp %08x\n", start+i*4);
  // Save every allocated host register around the call
  for (hr = 0; hr < HOST_REGS; hr++)
    if(regs[i].regmap[hr]>=0) reglist|=1<<hr;
  save_regs(reglist);
  // write out changed consts to match the interpreter
  if (i > 0 && !bt[i]) {
    for (hr = 0; hr < HOST_REGS; hr++) {
      int reg = regs[i-1].regmap[hr];
      if (hr == EXCLUDE_REG || reg < 0)
        continue;
      if (!((regs[i-1].isconst >> hr) & 1))
        continue;
      // Skip consts that were already written for the previous instruction
      if (i > 1 && reg == regs[i-2].regmap[hr] && constmap[i-1][hr] == constmap[i-2][hr])
        continue;
      emit_movimm(constmap[i-1][hr],0);
      emit_storereg(reg, 0);
    }
  }
  // Record the PC so the comparison knows which instruction this is
  emit_movimm(start+i*4,0);
  emit_writeword(0,&pcaddr);
  emit_far_call(do_insn_cmp);
  //emit_readword(&cycle,0);
  //emit_addimm(0,2,0);
  //emit_writeword(0,&cycle);
  (void)get_reg2;
  restore_regs(reglist);
  assem_debug("\\\\do_insn_cmp\n");
}
#else
#define drc_dbg_emit_do_cmp(x)
#endif
4362
// Used when a branch jumps into the delay slot of another branch
// Assembles a standalone copy of the delay-slot instruction at ba[i],
// then branches to the instruction after it (which must be internal).
static void ds_assemble_entry(int i)
{
  int t=(ba[i]-start)>>2;
  if (!instr_addr[t])
    instr_addr[t] = out;
  assem_debug("Assemble delay slot at %x\n",ba[i]);
  assem_debug("<->\n");
  drc_dbg_emit_do_cmp(t);
  // Write back the cycle count if the entry has it but the slot doesn't keep it
  if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty);
  load_regs(regs[t].regmap_entry,regs[t].regmap,rs1[t],rs2[t]);
  address_generation(t,&regs[t],regs[t].regmap_entry);
  // Stores also need the invalid-code pointer loaded
  if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
    load_regs(regs[t].regmap_entry,regs[t].regmap,INVCP,INVCP);
  is_delayslot=0;
  // Dispatch to the per-type assembler for the delay-slot instruction
  switch(itype[t]) {
    case ALU:
      alu_assemble(t,&regs[t]);break;
    case IMM16:
      imm16_assemble(t,&regs[t]);break;
    case SHIFT:
      shift_assemble(t,&regs[t]);break;
    case SHIFTIMM:
      shiftimm_assemble(t,&regs[t]);break;
    case LOAD:
      load_assemble(t,&regs[t]);break;
    case LOADLR:
      loadlr_assemble(t,&regs[t]);break;
    case STORE:
      store_assemble(t,&regs[t]);break;
    case STORELR:
      storelr_assemble(t,&regs[t]);break;
    case COP0:
      cop0_assemble(t,&regs[t]);break;
    case COP1:
      cop1_assemble(t,&regs[t]);break;
    case C1LS:
      c1ls_assemble(t,&regs[t]);break;
    case COP2:
      cop2_assemble(t,&regs[t]);break;
    case C2LS:
      c2ls_assemble(t,&regs[t]);break;
    case C2OP:
      c2op_assemble(t,&regs[t]);break;
    case MULTDIV:
      multdiv_assemble(t,&regs[t]);break;
    case MOV:
      mov_assemble(t,&regs[t]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Fix up register state and jump to the instruction after the delay slot
  store_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
  load_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
  if(internal_branch(ba[i]+4))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  assert(internal_branch(ba[i]+4));
  add_to_linker(out,ba[i]+4,internal_branch(ba[i]+4));
  emit_jmp(0);
}
4432
// Emit an external jump at addr to target, resolved through dyna_linker
static void emit_extjump(void *addr, u_int target)
{
  emit_extjump2(addr, target, dyna_linker);
}
4437
// Same as emit_extjump, but resolved through the delay-slot linker
static void emit_extjump_ds(void *addr, u_int target)
{
  emit_extjump2(addr, target, dyna_linker_ds);
}
4442
// Load 2 immediates optimizing for small code size
static void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2)
{
  emit_movimm(imm1,rt1);
  // emit_movimm_from presumably derives imm2 from the value now in rt1
  // when that is cheaper than a full immediate load — see its definition
  emit_movimm_from(imm1,rt1,imm2,rt2);
}
4449
// Emit the cycle-count check for a branch at instruction i: update the
// counter in HOST_CCREG and create a CC_STUB that handles the case where
// the count expired (calls cc_interrupt).  *adj is set to the cycle
// adjustment already accounted for at an internal branch target, else 0.
void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
{
  int count;
  void *jaddr;
  void *idle=NULL;
  int t=0;
  if(itype[i]==RJUMP)
  {
    *adj=0;
  }
  //if(ba[i]>=start && ba[i]<(start+slen*4))
  if(internal_branch(ba[i]))
  {
    t=(ba[i]-start)>>2;
    if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
    else *adj=ccadj[t];
  }
  else
  {
    *adj=0;
  }
  count=ccadj[i];
  // A taken branch to itself with a nop delay slot is an idle loop
  if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
    // Idle loop
    if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
    idle=out;
    //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
    emit_andimm(HOST_CCREG,3,HOST_CCREG);
    jaddr=out;
    emit_jmp(0);
  }
  else if(*adj==0||invert) {
    int cycles=CLOCK_ADJUST(count+2);
    // faster loop HACK
#if 0
    if (t&&*adj) {
      int rel=t-i;
      if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
        cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
    }
#endif
    // Add cycles now; branch to the stub if the counter went non-negative
    emit_addimm_and_set_flags(cycles,HOST_CCREG);
    jaddr=out;
    emit_jns(0);
  }
  else
  {
    // Compare only; the target applies the adjustment itself
    emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
    jaddr=out;
    emit_jns(0);
  }
  add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
}
4503
4504 static void do_ccstub(int n)
4505 {
4506   literal_pool(256);
4507   assem_debug("do_ccstub %x\n",start+(u_int)stubs[n].b*4);
4508   set_jump_target(stubs[n].addr, out);
4509   int i=stubs[n].b;
4510   if(stubs[n].d==NULLDS) {
4511     // Delay slot instruction is nullified ("likely" branch)
4512     wb_dirtys(regs[i].regmap,regs[i].dirty);
4513   }
4514   else if(stubs[n].d!=TAKEN) {
4515     wb_dirtys(branch_regs[i].regmap,branch_regs[i].dirty);
4516   }
4517   else {
4518     if(internal_branch(ba[i]))
4519       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4520   }
4521   if(stubs[n].c!=-1)
4522   {
4523     // Save PC as return address
4524     emit_movimm(stubs[n].c,EAX);
4525     emit_writeword(EAX,&pcaddr);
4526   }
4527   else
4528   {
4529     // Return address depends on which way the branch goes
4530     if(itype[i]==CJUMP||itype[i]==SJUMP)
4531     {
4532       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4533       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4534       if(rs1[i]==0)
4535       {
4536         s1l=s2l;
4537         s2l=-1;
4538       }
4539       else if(rs2[i]==0)
4540       {
4541         s2l=-1;
4542       }
4543       assert(s1l>=0);
4544       #ifdef DESTRUCTIVE_WRITEBACK
4545       if(rs1[i]) {
4546         if((branch_regs[i].dirty>>s1l)&&1)
4547           emit_loadreg(rs1[i],s1l);
4548       }
4549       else {
4550         if((branch_regs[i].dirty>>s1l)&1)
4551           emit_loadreg(rs2[i],s1l);
4552       }
4553       if(s2l>=0)
4554         if((branch_regs[i].dirty>>s2l)&1)
4555           emit_loadreg(rs2[i],s2l);
4556       #endif
4557       int hr=0;
4558       int addr=-1,alt=-1,ntaddr=-1;
4559       while(hr<HOST_REGS)
4560       {
4561         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4562            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4563            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4564         {
4565           addr=hr++;break;
4566         }
4567         hr++;
4568       }
4569       while(hr<HOST_REGS)
4570       {
4571         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4572            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4573            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4574         {
4575           alt=hr++;break;
4576         }
4577         hr++;
4578       }
4579       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4580       {
4581         while(hr<HOST_REGS)
4582         {
4583           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4584              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4585              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4586           {
4587             ntaddr=hr;break;
4588           }
4589           hr++;
4590         }
4591         assert(hr<HOST_REGS);
4592       }
4593       if((opcode[i]&0x2f)==4) // BEQ
4594       {
4595         #ifdef HAVE_CMOV_IMM
4596         if(s2l>=0) emit_cmp(s1l,s2l);
4597         else emit_test(s1l,s1l);
4598         emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4599         #else
4600         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4601         if(s2l>=0) emit_cmp(s1l,s2l);
4602         else emit_test(s1l,s1l);
4603         emit_cmovne_reg(alt,addr);
4604         #endif
4605       }
4606       if((opcode[i]&0x2f)==5) // BNE
4607       {
4608         #ifdef HAVE_CMOV_IMM
4609         if(s2l>=0) emit_cmp(s1l,s2l);
4610         else emit_test(s1l,s1l);
4611         emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4612         #else
4613         emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4614         if(s2l>=0) emit_cmp(s1l,s2l);
4615         else emit_test(s1l,s1l);
4616         emit_cmovne_reg(alt,addr);
4617         #endif
4618       }
4619       if((opcode[i]&0x2f)==6) // BLEZ
4620       {
4621         //emit_movimm(ba[i],alt);
4622         //emit_movimm(start+i*4+8,addr);
4623         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4624         emit_cmpimm(s1l,1);
4625         emit_cmovl_reg(alt,addr);
4626       }
4627       if((opcode[i]&0x2f)==7) // BGTZ
4628       {
4629         //emit_movimm(ba[i],addr);
4630         //emit_movimm(start+i*4+8,ntaddr);
4631         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4632         emit_cmpimm(s1l,1);
4633         emit_cmovl_reg(ntaddr,addr);
4634       }
4635       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4636       {
4637         //emit_movimm(ba[i],alt);
4638         //emit_movimm(start+i*4+8,addr);
4639         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4640         emit_test(s1l,s1l);
4641         emit_cmovs_reg(alt,addr);
4642       }
4643       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4644       {
4645         //emit_movimm(ba[i],addr);
4646         //emit_movimm(start+i*4+8,alt);
4647         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4648         emit_test(s1l,s1l);
4649         emit_cmovs_reg(alt,addr);
4650       }
4651       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4652         if(source[i]&0x10000) // BC1T
4653         {
4654           //emit_movimm(ba[i],alt);
4655           //emit_movimm(start+i*4+8,addr);
4656           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4657           emit_testimm(s1l,0x800000);
4658           emit_cmovne_reg(alt,addr);
4659         }
4660         else // BC1F
4661         {
4662           //emit_movimm(ba[i],addr);
4663           //emit_movimm(start+i*4+8,alt);
4664           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4665           emit_testimm(s1l,0x800000);
4666           emit_cmovne_reg(alt,addr);
4667         }
4668       }
4669       emit_writeword(addr,&pcaddr);
4670     }
4671     else
4672     if(itype[i]==RJUMP)
4673     {
4674       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4675       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4676         r=get_reg(branch_regs[i].regmap,RTEMP);
4677       }
4678       emit_writeword(r,&pcaddr);
4679     }
4680     else {SysPrintf("Unknown branch type in do_ccstub\n");abort();}
4681   }
4682   // Update cycle count
4683   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4684   if(stubs[n].a) emit_addimm(HOST_CCREG,CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4685   emit_far_call(cc_interrupt);
4686   if(stubs[n].a) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4687   if(stubs[n].d==TAKEN) {
4688     if(internal_branch(ba[i]))
4689       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4690     else if(itype[i]==RJUMP) {
4691       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4692         emit_readword(&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4693       else
4694         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4695     }
4696   }else if(stubs[n].d==NOTTAKEN) {
4697     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4698     else load_all_regs(branch_regs[i].regmap);
4699   }else if(stubs[n].d==NULLDS) {
4700     // Delay slot instruction is nullified ("likely" branch)
4701     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4702     else load_all_regs(regs[i].regmap);
4703   }else{
4704     load_all_regs(branch_regs[i].regmap);
4705   }
4706   if (stubs[n].retaddr)
4707     emit_jmp(stubs[n].retaddr);
4708   else
4709     do_jump_vaddr(stubs[n].e);
4710 }
4711
4712 static void add_to_linker(void *addr, u_int target, int ext)
4713 {
4714   assert(linkcount < ARRAY_SIZE(link_addr));
4715   link_addr[linkcount].addr = addr;
4716   link_addr[linkcount].target = target;
4717   link_addr[linkcount].ext = ext;
4718   linkcount++;
4719 }
4720
// Write the return address (PC+8) into the link register ($31) for a
// JAL at instruction i, optionally via the mini hash table insert path.
static void ujump_assemble_write_ra(int i)
{
  int rt;
  unsigned int return_address;
  rt=get_reg(branch_regs[i].regmap,31);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  //assert(rt>=0);
  return_address=start+i*4+8;
  if(rt>=0) {
    #ifdef USE_MINI_HT
    if(internal_branch(return_address)&&rt1[i+1]!=31) {
      int temp=-1; // note: must be ds-safe
      #ifdef HOST_TEMPREG
      temp=HOST_TEMPREG;
      #endif
      if(temp>=0) do_miniht_insert(return_address,rt,temp);
      else emit_movimm(return_address,rt);
    }
    else
    #endif
    {
      #ifdef REG_PREFETCH
      if(temp>=0)
      {
        if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
      }
      #endif
      emit_movimm(return_address,rt); // PC into link register
      #ifdef IMM_PREFETCH
      emit_prefetch(hash_table_get(return_address));
      #endif
    }
  }
}
4755
// Assemble an unconditional jump (J/JAL) at instruction i, including its
// delay slot, register writeback and the branch to ba[i].
static void ujump_assemble(int i,struct regstat *i_regs)
{
  int ra_done=0;
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  int temp=get_reg(branch_regs[i].regmap,PTEMP);
  if(rt1[i]==31&&temp>=0)
  {
    signed char *i_regmap=i_regs->regmap;
    int return_address=start+i*4+8;
    if(get_reg(branch_regs[i].regmap,31)>0)
    if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  // Write $ra before the delay slot if the slot reads it
  if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    ujump_assemble_write_ra(i); // writeback ra for DS
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<rt1[i]);
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
  if(!ra_done&&rt1[i]==31)
    ujump_assemble_write_ra(i);
  int cc,adj;
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
  // Apply any remaining cycle adjustment not covered at the target
  if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
  load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  if(internal_branch(ba[i]))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  // Jumping into a delay slot needs a specially assembled entry
  if(internal_branch(ba[i])&&is_ds[(ba[i]-start)>>2]) {
    ds_assemble_entry(i);
  }
  else {
    add_to_linker(out,ba[i],internal_branch(ba[i]));
    emit_jmp(0);
  }
}
4804
// Write the return address (PC+8) into rt1[i] for a JALR at instruction i.
// Must not clash with the delay slot's destination registers.
static void rjump_assemble_write_ra(int i)
{
  int rt,return_address;
  assert(rt1[i+1]!=rt1[i]);
  assert(rt2[i+1]!=rt1[i]);
  rt=get_reg(branch_regs[i].regmap,rt1[i]);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  assert(rt>=0);
  return_address=start+i*4+8;
  #ifdef REG_PREFETCH
  if(temp>=0)
  {
    if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  emit_movimm(return_address,rt); // PC into link register
  #ifdef IMM_PREFETCH
  emit_prefetch(hash_table_get(return_address));
  #endif
}
4825
// Assemble a register jump (JR/JALR) at instruction i, including its delay
// slot, register writeback, cycle-count check and the indirect dispatch.
static void rjump_assemble(int i,struct regstat *i_regs)
{
  int temp;
  int rs,cc;
  int ra_done=0;
  rs=get_reg(branch_regs[i].regmap,rs1[i]);
  assert(rs>=0);
  if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
    // Delay slot abuse, make a copy of the branch address register
    temp=get_reg(branch_regs[i].regmap,RTEMP);
    assert(temp>=0);
    assert(regs[i].regmap[temp]==RTEMP);
    emit_mov(rs,temp);
    rs=temp;
  }
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  if(rt1[i]==31)
  {
    if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
      signed char *i_regmap=i_regs->regmap;
      int return_address=start+i*4+8;
      if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
    }
  }
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    int rh=get_reg(regs[i].regmap,RHASH);
    if(rh>=0) do_preload_rhash(rh);
  }
  #endif
  // Write the link register before the delay slot if the slot reads it
  if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    rjump_assemble_write_ra(i);
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<rt1[i]);
  bc_unneeded&=~(1LL<<rs1[i]); // the jump target register is still needed
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],CCREG);
  if(!ra_done&&rt1[i]!=0)
    rjump_assemble_write_ra(i);
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  (void)cc;
  #ifdef USE_MINI_HT
  int rh=get_reg(branch_regs[i].regmap,RHASH);
  int ht=get_reg(branch_regs[i].regmap,RHTBL);
  if(rs1[i]==31) {
    if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
    do_preload_rhtbl(ht);
    do_rhash(rs,rh);
  }
  #endif
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
  #ifdef DESTRUCTIVE_WRITEBACK
  if((branch_regs[i].dirty>>rs)&1) {
    if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
      emit_loadreg(rs1[i],rs);
    }
  }
  #endif
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_load(ht,rh);
  }
  #endif
  //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
  //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
  //assert(adj==0);
  emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  add_stub(CC_STUB,out,NULL,0,i,-1,TAKEN,rs);
  if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
    // special case for RFE
    emit_jmp(0);
  else
    emit_jns(0);
  //load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_jump(rs,rh,ht);
  }
  else
  #endif
  {
    do_jump_vaddr(rs);
  }
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
  #endif
}
4922
// Assemble a MIPS conditional branch comparing two registers or a register
// against zero: BEQ/BNE/BLEZ/BGTZ (opcode 4-7) and, on the in-order path,
// their "likely" variants.  Emits the compare, the (possibly inverted)
// conditional jump, the delay-slot instruction, cycle-count bookkeeping and
// register writeback for both the taken and not-taken paths.
//   i      - index of the branch instruction within the current block
//   i_regs - register allocation state at instruction i
static void cjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  // Does the register allocation at the branch target match ours?  If so,
  // the taken path can jump straight there; otherwise we invert the branch
  // and fix up registers inline before jumping.
  match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  assem_debug("match=%d\n",match);
  int s1l,s2l;
  int unconditional=0,nop=0;
  int invert=0;
  int internal=internal_branch(ba[i]); // target inside this compiled block?
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  // Backward branches are inverted here too (A8 branch predictor workaround).
  if(i>(ba[i]-start)>>2) invert=1;
  #endif
  #ifdef __aarch64__
  invert=1; // because of near cond. branches
  #endif

  // Host registers holding the two source operands; out-of-order execution
  // (delay slot assembled first) uses the post-delay-slot allocation.
  if(ooo[i]) {
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
    s2l=get_reg(branch_regs[i].regmap,rs2[i]);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
    s2l=get_reg(i_regmap,rs2[i]);
  }
  if(rs1[i]==0&&rs2[i]==0)
  {
    // r0==r0: BEQ-type is always taken, BNE-type (odd opcode) never is.
    if(opcode[i]&1) nop=1;
    else unconditional=1;
    //assert(opcode[i]!=5);
    //assert(opcode[i]!=7);
    //assert(opcode[i]!=0x15);
    //assert(opcode[i]!=0x17);
  }
  else if(rs1[i]==0)
  {
    // Comparing against r0: use a single-operand test on the other source.
    s1l=s2l;
    s2l=-1;
  }
  else if(rs2[i]==0)
  {
    s2l=-1;
  }

  if(ooo[i]) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    uint64_t bc_unneeded=branch_regs[i].u;
    // The branch still needs its own sources (and r0) even if the delay
    // slot marked them unneeded.
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
    load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],rs2[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    //assem_debug("cycle count (adj)\n");
    if(unconditional) {
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      // Skip emitting the jump for a pure idle loop (branch-to-self with a
      // NOP delay slot).
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          // Target is a delay slot: emit an inline copy of it instead of
          // linking to it.
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nop) {
      // Never-taken branch: only account for cycles and test for interrupt.
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);

      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      assert(s1l>=0);
      // Emit compare + conditional jump.  When inverting, jump over the
      // taken path on the opposite condition; otherwise jump (via the
      // linker) to the target on the branch condition itself.
      if(opcode[i]==4) // BEQ
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        if(invert){
          nottaken=out;
          emit_jne(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jeq(0);
        }
      }
      if(opcode[i]==5) // BNE
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        if(invert){
          nottaken=out;
          emit_jeq(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jne(0);
        }
      }
      if(opcode[i]==6) // BLEZ
      {
        // s1<=0  <=>  s1<1 (signed compare against 1)
        emit_cmpimm(s1l,1);
        if(invert){
          nottaken=out;
          emit_jge(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jl(0);
        }
      }
      if(opcode[i]==7) // BGTZ
      {
        emit_cmpimm(s1l,1);
        if(invert){
          nottaken=out;
          emit_jl(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jge(0);
        }
      }
      if(invert) {
        // Taken path: undo the cycle pre-adjustment, reconcile registers
        // with the target's expected allocation, then jump.
        if(taken) set_jump_target(taken, out);
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken, out);
      }

      if(nottaken1) set_jump_target(nottaken1, out);
      if(adj) {
        // Fall-through path: re-apply the cycles that do_cc pre-subtracted.
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //if(likely[i]) printf("IOL\n");
    //else
    //printf("IOE\n");
    void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
    if(!unconditional&&!nop) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      assert(s1l>=0);
      // Test the condition before the delay slot; the not-taken jump skips
      // past the taken path (and, for "likely" branches, the delay slot).
      // opcode&0x2f folds the "likely" opcodes (0x14-0x17) onto 4-7.
      if((opcode[i]&0x2f)==4) // BEQ
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        nottaken=out;
        emit_jne(DJT_2);
      }
      if((opcode[i]&0x2f)==5) // BNE
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        nottaken=out;
        emit_jeq(DJT_2);
      }
      if((opcode[i]&0x2f)==6) // BLEZ
      {
        emit_cmpimm(s1l,1);
        nottaken=out;
        emit_jge(DJT_2);
      }
      if((opcode[i]&0x2f)==7) // BGTZ
      {
        emit_cmpimm(s1l,1);
        nottaken=out;
        emit_jl(DJT_2);
      }
    } // if(!unconditional)
    int adj;
    uint64_t ds_unneeded=branch_regs[i].u;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded|=1;
    // branch taken
    if(!nop) {
      if(taken) set_jump_target(taken, out);
      assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    if(!unconditional) {
      if(nottaken1) set_jump_target(nottaken1, out);
      set_jump_target(nottaken, out);
      assem_debug("2:\n");
      if(!likely[i]) {
        // Ordinary branch: the delay slot executes on this path too.
        // ("Likely" branches nullify it when not taken.)
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
        load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        // NULLDS stub restarts after a nullified delay slot for "likely"
        // branches; NOTTAKEN for ordinary branches.
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}
5214
// Assemble a MIPS REGIMM branch: one source register compared against zero,
// i.e. BLTZ/BGEZ and the linking/"likely" variants (selected by opcode2).
// The AL (and-link) forms store the return address into $31 even when the
// branch is not taken.  Structure closely mirrors cjump_assemble: emit the
// sign test, the (possibly inverted) conditional jump, the delay slot,
// cycle-count bookkeeping and register writeback for both outcomes.
//   i      - index of the branch instruction within the current block
//   i_regs - register allocation state at instruction i
static void sjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  // Can we jump straight to the target, or must we invert the branch and
  // reconcile the register allocation inline first?
  match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  assem_debug("smatch=%d\n",match);
  int s1l;
  int unconditional=0,nevertaken=0;
  int invert=0;
  int internal=internal_branch(ba[i]); // target inside this compiled block?
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;
  #endif
  #ifdef __aarch64__
  invert=1; // because of near cond. branches
  #endif

  //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
  //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)

  // Host register holding the source operand; out-of-order execution uses
  // the post-delay-slot allocation.
  if(ooo[i]) {
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
  }
  if(rs1[i]==0)
  {
    // r0 is never < 0: BGEZ-type (odd opcode2) always taken, BLTZ-type never.
    if(opcode2[i]&1) unconditional=1;
    else nevertaken=1;
    // These are never taken (r0 is never less than zero)
    //assert(opcode2[i]!=0);
    //assert(opcode2[i]!=2);
    //assert(opcode2[i]!=0x10);
    //assert(opcode2[i]!=0x12);
  }

  if(ooo[i]) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    uint64_t bc_unneeded=branch_regs[i].u;
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
    load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],rs1[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
    if(rt1[i]==31) {
      // BxxZAL: write the return address into the link register.
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        if(!nevertaken) emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    assem_debug("cycle count (adj)\n");
    if(unconditional) {
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      // Skip emitting the jump for a pure idle loop (branch-to-self with a
      // NOP delay slot).
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          // Target is a delay slot: inline a copy of it rather than linking.
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nevertaken) {
      // Only account for cycles and test for a pending interrupt.
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      void *nottaken = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      {
        assert(s1l>=0);
        // Branch on the sign flag of the source register.
        if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_jns(DJT_1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_js(0);
          }
        }
        if((opcode2[i]&0xf)==1) // BGEZ/BLTZAL
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_js(DJT_1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_jns(0);
          }
        }
      }

      if(invert) {
        // Taken path: undo the cycle pre-adjustment, reconcile registers
        // with the target's expected allocation, then jump.
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken, out);
      }

      if(adj) {
        // Fall-through path: re-apply the cycles that do_cc pre-subtracted.
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //printf("IOE\n");
    void *nottaken = NULL;
    if(rt1[i]==31) {
      // BxxZAL: write the return address into the link register.
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    if(!unconditional) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
        assert(s1l>=0);
        // opcode2&0x0d folds the "likely"/link variants onto the base
        // BLTZ(0)/BGEZ(1) conditions.
        if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_jns(DJT_1);
        }
        if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_js(DJT_1);
        }
    } // if(!unconditional)
    int adj;
    uint64_t ds_unneeded=branch_regs[i].u;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded|=1;
    // branch taken
    if(!nevertaken) {
      //assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    if(!unconditional) {
      set_jump_target(nottaken, out);
      assem_debug("1:\n");
      if(!likely[i]) {
        // Ordinary branch: the delay slot executes on this path too.
        // ("Likely" branches nullify it when not taken.)
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
        load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        // NULLDS stub restarts after a nullified delay slot for "likely"
        // branches; NOTTAKEN for ordinary branches.
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}
5481
// Assemble a branch whose delay slot lies beyond the end of this block
// (itype SPAN — the instruction pair spans a page boundary; the delay slot
// itself is compiled separately by pagespan_ds below).  Instead of emitting
// taken/not-taken code paths, this computes the branch destination into a
// scavenged host register (or conditional-move pair), stashes it in
// HOST_BTREG, and exits the block to the delay-slot entry at start+i*4+4.
//   i      - index of the branch instruction within the current block
//   i_regs - register allocation state at instruction i
static void pagespan_assemble(int i,struct regstat *i_regs)
{
  int s1l=get_reg(i_regs->regmap,rs1[i]);
  int s2l=get_reg(i_regs->regmap,rs2[i]);
  void *taken = NULL;
  void *nottaken = NULL;
  int unconditional=0;
  if(rs1[i]==0)
  {
    // Comparing against r0: use a single-operand test on the other source.
    s1l=s2l;
    s2l=-1;
  }
  else if(rs2[i]==0)
  {
    s2l=-1;
  }
  // Scavenge scratch host registers that don't hold the branch sources:
  // addr (destination), alt (alternate destination for cmov), and for
  // BLEZ/BGTZ a third one, ntaddr.
  int hr=0;
  int addr=-1,alt=-1,ntaddr=-1;
  if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
  else {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        addr=hr++;break;
      }
      hr++;
    }
  }
  while(hr<HOST_REGS)
  {
    if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
       (i_regs->regmap[hr]&63)!=rs1[i] &&
       (i_regs->regmap[hr]&63)!=rs2[i] )
    {
      alt=hr++;break;
    }
    hr++;
  }
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
  {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        ntaddr=hr;break;
      }
      hr++;
    }
  }
  assert(hr<HOST_REGS);
  if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
    load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
  }
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  if(opcode[i]==2) // J
  {
    unconditional=1;
  }
  if(opcode[i]==3) // JAL
  {
    // TODO: mini_ht
    int rt=get_reg(i_regs->regmap,31);
    emit_movimm(start+i*4+8,rt);
    unconditional=1;
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    // Register-indirect: destination comes straight from the source reg.
    emit_mov(s1l,addr);
    if(opcode2[i]==9) // JALR
    {
      int rt=get_reg(i_regs->regmap,rt1[i]);
      emit_movimm(start+i*4+8,rt);
    }
  }
  // For the conditional forms below, select between the branch target
  // ba[i] and the fall-through address start+i*4+8 via conditional moves.
  if((opcode[i]&0x3f)==4) // BEQ
  {
    if(rs1[i]==rs2[i])
    {
      unconditional=1;
    }
    else
    #ifdef HAVE_CMOV_IMM
    if(1) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
    }
    else
    #endif
    {
      assert(s1l>=0);
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==5) // BNE
  {
    #ifdef HAVE_CMOV_IMM
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
    #else
    assert(s1l>=0);
    emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    emit_cmovne_reg(alt,addr);
    #endif
  }
  // "Likely" forms branch over the exit path instead (delay slot is
  // nullified when not taken); the jump target is patched below.
  if((opcode[i]&0x3f)==0x14) // BEQL
  {
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    if(nottaken) set_jump_target(nottaken, out);
    nottaken=out;
    emit_jne(0);
  }
  if((opcode[i]&0x3f)==0x15) // BNEL
  {
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    nottaken=out;
    emit_jeq(0);
    if(taken) set_jump_target(taken, out);
  }
  if((opcode[i]&0x3f)==6) // BLEZ
  {
    // s1<=0  <=>  s1<1 (signed compare against 1)
    emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
    emit_cmpimm(s1l,1);
    emit_cmovl_reg(alt,addr);
  }
  if((opcode[i]&0x3f)==7) // BGTZ
  {
    emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
    emit_cmpimm(s1l,1);
    emit_cmovl_reg(ntaddr,addr);
  }
  if((opcode[i]&0x3f)==0x16) // BLEZL
  {
    assert((opcode[i]&0x3f)!=0x16);
  }
  if((opcode[i]&0x3f)==0x17) // BGTZL
  {
    assert((opcode[i]&0x3f)!=0x17);
  }
  assert(opcode[i]!=1); // BLTZ/BGEZ

  //FIXME: Check CSREG
  if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
    // COP1 branches: test the FP condition bit (0x800000) in s1l.
    if((source[i]&0x30000)==0) // BC1F
    {
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x10000) // BC1T
    {
      emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x20000) // BC1FL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jne(0);
    }
    if((source[i]&0x30000)==0x30000) // BC1TL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jeq(0);
    }
  }

  assert(i_regs->regmap[HOST_CCREG]==CCREG);
  wb_dirtys(regs[i].regmap,regs[i].dirty);
  // Leave the computed branch target in HOST_BTREG for the delay-slot
  // code (pagespan_ds) to pick up.
  if(likely[i]||unconditional)
  {
    emit_movimm(ba[i],HOST_BTREG);
  }
  else if(addr!=HOST_BTREG)
  {
    emit_mov(addr,HOST_BTREG);
  }
  void *branch_addr=out;
  emit_jmp(0);
  // Exit to the delay-slot instruction; +5 = +4 for the next instruction
  // with the low bit set — presumably marking a delay-slot entry (matches
  // vaddr=start+1 in pagespan_ds) — TODO confirm.
  int target_addr=start+i*4+5;
  void *stub=out;
  void *compiled_target_addr=check_addr(target_addr);
  emit_extjump_ds(branch_addr, target_addr);
  if(compiled_target_addr) {
    // Already compiled: link directly and register the link for later
    // invalidation.
    set_jump_target(branch_addr, compiled_target_addr);
    add_link(target_addr,stub);
  }
  else set_jump_target(branch_addr, stub);
  if(likely[i]) {
    // Not-taken path
    set_jump_target(nottaken, out);
    wb_dirtys(regs[i].regmap,regs[i].dirty);
    void *branch_addr=out;
    emit_jmp(0);
    // Nullified delay slot: continue at the instruction after it.
    int target_addr=start+i*4+8;
    void *stub=out;
    void *compiled_target_addr=check_addr(target_addr);
    emit_extjump_ds(branch_addr, target_addr);
    if(compiled_target_addr) {
      set_jump_target(branch_addr, compiled_target_addr);
      add_link(target_addr,stub);
    }
    else set_jump_target(branch_addr, stub);
  }
}
5702
// Assemble the delay slot for the above (pagespan_assemble).  Called when a
// block begins with the delay slot of a page-spanning branch: compiles
// instruction 0 of the block as that delay slot (dispatching on its itype),
// then jumps to the branch target that pagespan_assemble left in
// HOST_BTREG / the branch_target variable.  The entry is registered in the
// jump tables under vaddr=start+1 (low bit set, presumably flagging a
// delay-slot entry — matches the start+i*4+5 target in pagespan_assemble).
static void pagespan_ds()
{
  assem_debug("initial delay slot:\n");
  u_int vaddr=start+1;
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  ll_add(jump_dirty+vpage,vaddr,(void *)out);
  do_dirty_stub_ds();
  ll_add(jump_in+page,vaddr,(void *)out);
  assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty);
  if(regs[0].regmap[HOST_BTREG]!=BTREG)
    // Branch target won't survive in a register: spill it to memory.
    emit_writeword(HOST_BTREG,&branch_target);
  load_regs(regs[0].regmap_entry,regs[0].regmap,rs1[0],rs2[0]);
  address_generation(0,&regs[0],regs[0].regmap_entry);
  if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
    // Stores need the invalid-code pointer for self-modifying-code checks.
    load_regs(regs[0].regmap_entry,regs[0].regmap,INVCP,INVCP);
  is_delayslot=0;
  // Emit the delay-slot instruction itself.
  switch(itype[0]) {
    case ALU:
      alu_assemble(0,&regs[0]);break;
    case IMM16:
      imm16_assemble(0,&regs[0]);break;
    case SHIFT:
      shift_assemble(0,&regs[0]);break;
    case SHIFTIMM:
      shiftimm_assemble(0,&regs[0]);break;
    case LOAD:
      load_assemble(0,&regs[0]);break;
    case LOADLR:
      loadlr_assemble(0,&regs[0]);break;
    case STORE:
      store_assemble(0,&regs[0]);break;
    case STORELR:
      storelr_assemble(0,&regs[0]);break;
    case COP0:
      cop0_assemble(0,&regs[0]);break;
    case COP1:
      cop1_assemble(0,&regs[0]);break;
    case C1LS:
      c1ls_assemble(0,&regs[0]);break;
    case COP2:
      cop2_assemble(0,&regs[0]);break;
    case C2LS:
      c2ls_assemble(0,&regs[0]);break;
    case C2OP:
      c2op_assemble(0,&regs[0]);break;
    case MULTDIV:
      multdiv_assemble(0,&regs[0]);break;
    case MOV:
      mov_assemble(0,&regs[0]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Recover the branch target, reloading it from memory if BTREG was not
  // preserved in a host register across the delay slot.
  int btaddr=get_reg(regs[0].regmap,BTREG);
  if(btaddr<0) {
    btaddr=get_reg(regs[0].regmap,-1);
    emit_readword(&branch_target,btaddr);
  }
  assert(btaddr!=HOST_CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
#ifdef HOST_IMM8
  // Host can't compare against a large immediate directly; stage it in a
  // temp register.
  host_tempreg_acquire();
  emit_movimm(start+4,HOST_TEMPREG);
  emit_cmp(btaddr,HOST_TEMPREG);
  host_tempreg_release();
#else
  emit_cmpimm(btaddr,start+4);
#endif
  void *branch = out;
  emit_jeq(0);
  // Target differs from the fall-through address: dispatch indirectly.
  store_regs_bt(regs[0].regmap,regs[0].dirty,-1);
  do_jump_vaddr(btaddr);
  set_jump_target(branch, out);
  // Fall-through case: continue with the instruction after the delay slot.
  store_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
  load_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
}
5789
// Basic liveness analysis for MIPS registers
// Walks instructions [istart..iend] backwards.  For each index i it fills
// unneeded_reg[i], a bitmask where bit k set means MIPS register k holds a
// dead value (it will be overwritten before being read again), and
// gte_unneeded[i] with the equivalent mask for GTE (COP2) registers.
// Bit 0 (r0) is always set since r0 is hardwired to zero.
// r is the recursion depth, used to bound re-analysis of backward-branch
// targets (see the r<2 check below).
void unneeded_registers(int istart,int iend,int r)
{
  int i;
  uint64_t u,gte_u,b,gte_b;
  uint64_t temp_u,temp_gte_u=0;
  uint64_t gte_u_unknown=0;
  // With the GTE hack enabled, treat all GTE regs as unneeded at block exits
  if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
    gte_u_unknown=~0ll;
  if(iend==slen-1) {
    // End of the compiled block: nothing known live, only r0 unneeded
    u=1;
    gte_u=gte_u_unknown;
  }else{
    //u=unneeded_reg[iend+1];
    u=1;
    gte_u=gte_unneeded[iend+1];
  }

  for (i=iend;i>=istart;i--)
  {
    //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
    {
      // If subroutine call, flag return address as a possible branch target
      if(rt1[i]==31 && i<slen-2) bt[i+2]=1;

      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, flush all regs
        u=1;
        gte_u=gte_u_unknown;
        branch_unneeded_reg[i]=u;
        // Merge in delay slot
        u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
        u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
        u|=1;
        gte_u|=gte_rt[i+1];
        gte_u&=~gte_rs[i+1];
        // If branch is "likely" (and conditional)
        // then we skip the delay slot on the fall-thru path
        if(likely[i]) {
          if(i<slen-1) {
            u&=unneeded_reg[i+2];
            gte_u&=gte_unneeded[i+2];
          }
          else
          {
            u=1;
            gte_u=gte_u_unknown;
          }
        }
      }
      else
      {
        // Internal branch, flag target
        bt[(ba[i]-start)>>2]=1;
        if(ba[i]<=start+i*4) {
          // Backward branch
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            temp_u=1;
            temp_gte_u=0;
          } else {
            // Conditional branch (not taken case)
            // NOTE(review): temp_u is assigned here, but temp_gte_u only
            // ANDs into whatever it held from a previous iteration — the
            // asymmetry looks suspicious; confirm it is intentional.
            temp_u=unneeded_reg[i+2];
            temp_gte_u&=gte_unneeded[i+2];
          }
          // Merge in delay slot
          temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
          temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
          temp_u|=1;
          temp_gte_u|=gte_rt[i+1];
          temp_gte_u&=~gte_rs[i+1];
          // If branch is "likely" (and conditional)
          // then we skip the delay slot on the fall-thru path
          if(likely[i]) {
            if(i<slen-1) {
              temp_u&=unneeded_reg[i+2];
              temp_gte_u&=gte_unneeded[i+2];
            }
            else
            {
              temp_u=1;
              temp_gte_u=gte_u_unknown;
            }
          }
          // Merge in the branch instruction itself (writes then reads)
          temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
          temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
          temp_u|=1;
          temp_gte_u|=gte_rt[i];
          temp_gte_u&=~gte_rs[i];
          unneeded_reg[i]=temp_u;
          gte_unneeded[i]=temp_gte_u;
          // Only go three levels deep.  This recursion can take an
          // excessive amount of time if there are a lot of nested loops.
          if(r<2) {
            unneeded_registers((ba[i]-start)>>2,i-1,r+1);
          }else{
            unneeded_reg[(ba[i]-start)>>2]=1;
            gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
          }
        } /*else*/ if(1) {
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            u=unneeded_reg[(ba[i]-start)>>2];
            gte_u=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=u;
            // Merge in delay slot
            u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
            u|=1;
            gte_u|=gte_rt[i+1];
            gte_u&=~gte_rs[i+1];
          } else {
            // Conditional branch
            b=unneeded_reg[(ba[i]-start)>>2];
            gte_b=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=b;
            // Branch delay slot
            b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
            b|=1;
            gte_b|=gte_rt[i+1];
            gte_b&=~gte_rs[i+1];
            // If branch is "likely" then we skip the
            // delay slot on the fall-thru path
            if(likely[i]) {
              u=b;
              gte_u=gte_b;
              if(i<slen-1) {
                u&=unneeded_reg[i+2];
                gte_u&=gte_unneeded[i+2];
              }
            } else {
              // Both paths possible: a register is unneeded only if it is
              // unneeded on the taken AND the fall-through path
              u&=b;
              gte_u&=gte_b;
            }
            if(i<slen-1) {
              branch_unneeded_reg[i]&=unneeded_reg[i+2];
            } else {
              branch_unneeded_reg[i]=1;
            }
          }
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      u=1;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      u=1;
    }
    //u=1; // DEBUG
    // Written registers are unneeded
    u|=1LL<<rt1[i];
    u|=1LL<<rt2[i];
    gte_u|=gte_rt[i];
    // Accessed registers are needed
    u&=~(1LL<<rs1[i]);
    u&=~(1LL<<rs2[i]);
    gte_u&=~gte_rs[i];
    if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
      gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
    // Source-target dependencies
    // R0 is always unneeded
    u|=1;
    // Save it
    unneeded_reg[i]=u;
    gte_unneeded[i]=gte_u;
    /*
    printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
    printf("U:");
    int r;
    for(r=1;r<=CCREG;r++) {
      if((unneeded_reg[i]>>r)&1) {
        if(r==HIREG) printf(" HI");
        else if(r==LOREG) printf(" LO");
        else printf(" r%d",r);
      }
    }
    printf("\n");
    */
  }
}
5980
// Write back dirty registers as soon as we will no longer modify them,
// so that we don't end up with lots of writes at the branches.
//
// Backward dataflow pass over instructions [istart..iend].  Per instruction
// it computes two host-register bitmasks: will_dirty[] (register will be
// dirtied again soon, so keep it dirty and defer the writeback) and
// wont_dirty[] (register will not be dirtied again, so it may be written
// back early).  When wr is nonzero the results are applied to the
// regs[]/branch_regs[] dirty/wasdirty state; wr==0 is used when following
// backward-branch targets to limit the amount of recursive work.
void clean_registers(int istart,int iend,int wr)
{
  int i;
  int r;
  u_int will_dirty_i,will_dirty_next,temp_will_dirty;
  u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
  if(iend==slen-1) {
    // End of block: nothing is known to be dirtied afterwards
    will_dirty_i=will_dirty_next=0;
    wont_dirty_i=wont_dirty_next=0;
  }else{
    // Seed from the already-computed state of the following instruction
    will_dirty_i=will_dirty_next=will_dirty[iend+1];
    wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
  }
  for (i=iend;i>=istart;i--)
  {
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
    {
      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, flush all regs
        if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
        {
          // Unconditional branch
          will_dirty_i=0;
          wont_dirty_i=0;
          // Merge in delay slot (will dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
              if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
              if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
              if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
              if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
            }
          }
        }
        else
        {
          // Conditional branch
          will_dirty_i=0;
          wont_dirty_i=wont_dirty_next;
          // Merge in delay slot (will dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if(!likely[i]) {
                // Might not dirty if likely branch is not taken
                if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              }
            }
          }
        }
        // Merge in delay slot (wont dirty)
        for(r=0;r<HOST_REGS;r++) {
          if(r!=EXCLUDE_REG) {
            if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
            if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
            if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
          }
        }
        if(wr) {
          #ifndef DESTRUCTIVE_WRITEBACK
          branch_regs[i].dirty&=wont_dirty_i;
          #endif
          branch_regs[i].dirty|=will_dirty_i;
        }
      }
      else
      {
        // Internal branch
        if(ba[i]<=start+i*4) {
          // Backward branch
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            temp_will_dirty=0;
            temp_wont_dirty=0;
            // Merge in delay slot (will dirty)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
              }
            }
          } else {
            // Conditional branch (not taken case)
            temp_will_dirty=will_dirty_next;
            temp_wont_dirty=wont_dirty_next;
            // Merge in delay slot (will dirty)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(!likely[i]) {
                  // Will not dirty if likely branch is not taken
                  if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                  if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
                  if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                  if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                  if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                }
              }
            }
          }
          // Merge in delay slot (wont dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
              if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
              if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
            }
          }
          // Deal with changed mappings
          if(i<iend) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]!=regmap_pre[i][r]) {
                  temp_will_dirty&=~(1<<r);
                  temp_wont_dirty&=~(1<<r);
                  if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
                    temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
                    temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
                  } else {
                    temp_will_dirty|=1<<r;
                    temp_wont_dirty|=1<<r;
                  }
                }
              }
            }
          }
          if(wr) {
            will_dirty[i]=temp_will_dirty;
            wont_dirty[i]=temp_wont_dirty;
            clean_registers((ba[i]-start)>>2,i-1,0);
          }else{
            // Limit recursion.  It can take an excessive amount
            // of time if there are a lot of nested loops.
            will_dirty[(ba[i]-start)>>2]=0;
            wont_dirty[(ba[i]-start)>>2]=-1;
          }
        }
        /*else*/ if(1)
        {
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            will_dirty_i=0;
            wont_dirty_i=0;
          //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                if(branch_regs[i].regmap[r]>=0) {
                  will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
                  wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
                }
              }
            }
          //}
            // Merge in delay slot
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              }
            }
          } else {
            // Conditional branch
            will_dirty_i=will_dirty_next;
            wont_dirty_i=wont_dirty_next;
          //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                signed char target_reg=branch_regs[i].regmap[r];
                if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                else if(target_reg>=0) {
                  will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                  wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                }
                // Treat delay slot as part of branch too
                /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                else
                {
                  will_dirty[i+1]&=~(1<<r);
                }*/
              }
            }
          //}
            // Merge in delay slot
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(!likely[i]) {
                  // Might not dirty if likely branch is not taken
                  if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                  if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                  if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                  if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                  if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                }
              }
            }
          }
          // Merge in delay slot (won't dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
              if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
              if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
            }
          }
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            branch_regs[i].dirty&=wont_dirty_i;
            #endif
            branch_regs[i].dirty|=will_dirty_i;
          }
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      will_dirty_i=0;
      wont_dirty_i=0;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      will_dirty_i=0;
      wont_dirty_i=0;
    }
    // Remember the state seen by the (later) instruction i+1
    will_dirty_next=will_dirty_i;
    wont_dirty_next=wont_dirty_i;
    // Account for this instruction's own register writes
    for(r=0;r<HOST_REGS;r++) {
      if(r!=EXCLUDE_REG) {
        if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
        if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
        if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
        if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
        if(i>istart) {
          if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP)
          {
            // Don't store a register immediately after writing it,
            // may prevent dual-issue.
            if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
          }
        }
      }
    }
    // Save it
    will_dirty[i]=will_dirty_i;
    wont_dirty[i]=wont_dirty_i;
    // Mark registers that won't be dirtied as not dirty
    if(wr) {
      /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
      for(r=0;r<HOST_REGS;r++) {
        if((will_dirty_i>>r)&1) {
          printf(" r%d",r);
        }
      }
      printf("\n");*/

      //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP)) {
        regs[i].dirty|=will_dirty_i;
        #ifndef DESTRUCTIVE_WRITEBACK
        regs[i].dirty&=wont_dirty_i;
        if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
        {
          if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
                  regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
                }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
              }
            }
          }
        }
        else
        {
          if(i<iend) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
                  regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
                }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
              }
            }
          }
        }
        #endif
      //}
    }
    // Deal with changed mappings
    temp_will_dirty=will_dirty_i;
    temp_wont_dirty=wont_dirty_i;
    for(r=0;r<HOST_REGS;r++) {
      if(r!=EXCLUDE_REG) {
        int nr;
        if(regs[i].regmap[r]==regmap_pre[i][r]) {
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            regs[i].wasdirty&=wont_dirty_i|~(1<<r);
            #endif
            regs[i].wasdirty|=will_dirty_i&(1<<r);
          }
        }
        else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
          // Register moved to a different register
          will_dirty_i&=~(1<<r);
          wont_dirty_i&=~(1<<r);
          will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
          wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            regs[i].wasdirty&=wont_dirty_i|~(1<<r);
            #endif
            regs[i].wasdirty|=will_dirty_i&(1<<r);
          }
        }
        else {
          will_dirty_i&=~(1<<r);
          wont_dirty_i&=~(1<<r);
          if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
            will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
            wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
          } else {
            wont_dirty_i|=1<<r;
            /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
          }
        }
      }
    }
  }
}
6414
#ifdef DISASM
  /* disassembly */
// Print a one-line disassembly of decoded instruction i (index into the
// decoder arrays itype[]/opcode[]/rs1[]/rt1[]/imm[]/...), prefixed with
// '*' when the instruction is a known branch target (bt[i]).
void disassemble_inst(int i)
{
    if (bt[i]) printf("*"); else printf(" ");
    switch(itype[i]) {
      case UJUMP:
        printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
      case CJUMP:
        printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
      case SJUMP:
        printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
      case RJUMP:
        if (opcode[i]==0x9&&rt1[i]!=31)
          printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
        else
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        break;
      case SPAN:
        printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
      case IMM16:
        if(opcode[i]==0xf) //LUI
          printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
        else
          printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case LOAD:
      case LOADLR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case STORE:
      case STORELR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
        break;
      case ALU:
      case SHIFT:
        printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
        break;
      case MULTDIV:
        printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
        break;
      case SHIFTIMM:
        printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case MOV:
        if((opcode2[i]&0x1d)==0x10)
          printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
        else if((opcode2[i]&0x1d)==0x11)
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        else
          printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP0:
        if(opcode2[i]==0)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
        else if(opcode2[i]==4)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP1:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP2:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case C1LS:
        printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case C2LS:
        printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case INTCALL:
        printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
        break;
      default:
        //printf (" %s %8x\n",insn[i],source[i]);
        printf (" %x: %s\n",start+i*4,insn[i]);
    }
}
#else
// No-op stub when the disassembler is compiled out
static void disassemble_inst(int i) {}
#endif // DISASM
6505
6506 #define DRC_TEST_VAL 0x74657374
6507
6508 static void new_dynarec_test(void)
6509 {
6510   int (*testfunc)(void);
6511   void *beginning;
6512   int ret[2];
6513   size_t i;
6514
6515   // check structure linkage
6516   if ((u_char *)rcnts - (u_char *)&psxRegs != sizeof(psxRegs))
6517   {
6518     SysPrintf("linkage_arm* miscompilation/breakage detected.\n");
6519   }
6520
6521   SysPrintf("testing if we can run recompiled code...\n");
6522   ((volatile u_int *)out)[0]++; // make cache dirty
6523
6524   for (i = 0; i < ARRAY_SIZE(ret); i++) {
6525     out = ndrc->translation_cache;
6526     beginning = start_block();
6527     emit_movimm(DRC_TEST_VAL + i, 0); // test
6528     emit_ret();
6529     literal_pool(0);
6530     end_block(beginning);
6531     testfunc = beginning;
6532     ret[i] = testfunc();
6533   }
6534
6535   if (ret[0] == DRC_TEST_VAL && ret[1] == DRC_TEST_VAL + 1)
6536     SysPrintf("test passed.\n");
6537   else
6538     SysPrintf("test failed, will likely crash soon (r=%08x %08x)\n", ret[0], ret[1]);
6539   out = ndrc->translation_cache;
6540 }
6541
6542 // clear the state completely, instead of just marking
6543 // things invalid like invalidate_all_pages() does
6544 void new_dynarec_clear_full(void)
6545 {
6546   int n;
6547   out = ndrc->translation_cache;
6548   memset(invalid_code,1,sizeof(invalid_code));
6549   memset(hash_table,0xff,sizeof(hash_table));
6550   memset(mini_ht,-1,sizeof(mini_ht));
6551   memset(restore_candidate,0,sizeof(restore_candidate));
6552   memset(shadow,0,sizeof(shadow));
6553   copy=shadow;
6554   expirep=16384; // Expiry pointer, +2 blocks
6555   pending_exception=0;
6556   literalcount=0;
6557   stop_after_jal=0;
6558   inv_code_start=inv_code_end=~0;
6559   // TLB
6560   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6561   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6562   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6563 }
6564
// One-time recompiler initialization: obtain executable memory for the
// translation cache (platform-dependent), reset all recompiler state,
// run the self-test, and compute the RAM offset used by generated code.
void new_dynarec_init(void)
{
  SysPrintf("Init new dynarec\n");

#ifdef BASE_ADDR_DYNAMIC
  #ifdef VITA
  // Vita: executable memory must come from a kernel-allocated VM block
  sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
  if (sceBlock < 0)
    SysPrintf("sceKernelAllocMemBlockForVM failed\n");
  int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&ndrc);
  if (ret < 0)
    SysPrintf("sceKernelGetMemBlockBase failed\n");
  #else
  uintptr_t desired_addr = 0;
  #ifdef __ELF__
  // hint mmap to place the cache just past the executable image,
  // rounded up to a 16MB boundary -- presumably to keep generated code
  // within direct-branch range of the host code; TODO confirm
  extern char _end;
  desired_addr = ((uintptr_t)&_end + 0xffffff) & ~0xffffffl;
  #endif
  ndrc = mmap((void *)desired_addr, sizeof(*ndrc),
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (ndrc == MAP_FAILED) {
    SysPrintf("mmap() failed: %s\n", strerror(errno));
    abort();  // can't run without a translation cache
  }
  #endif
#else
  #ifndef NO_WRITE_EXEC
  // not all systems allow execute in data segment by default
  if (mprotect(ndrc, sizeof(ndrc->translation_cache) + sizeof(ndrc->tramp.ops),
               PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    SysPrintf("mprotect() failed: %s\n", strerror(errno));
  #endif
#endif
  out = ndrc->translation_cache;
  cycle_multiplier=200;  // default timing multiplier
  new_dynarec_clear_full();
#ifdef HOST_IMM8
  // Copy this into local area so we don't have to put it in every literal pool
  invc_ptr=invalid_code;
#endif
  arch_init();
  new_dynarec_test();
#ifndef RAM_FIXED
  // delta added to PSX addresses to reach host rdram
  ram_offset=(uintptr_t)rdram-0x80000000;
#endif
  if (ram_offset!=0)
    SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
}
6614
// Tear down the recompiler: release the translation cache memory
// (platform-dependent) and free all per-page block lists.
void new_dynarec_cleanup(void)
{
  int n;
#ifdef BASE_ADDR_DYNAMIC
  #ifdef VITA
  sceKernelFreeMemBlock(sceBlock);
  sceBlock = -1;
  #else
  if (munmap(ndrc, sizeof(*ndrc)) < 0)
    SysPrintf("munmap() failed\n");
  #endif
#endif
  // free the linked lists populated during compilation
  for(n=0;n<4096;n++) ll_clear(jump_in+n);
  for(n=0;n<4096;n++) ll_clear(jump_out+n);
  for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
  #ifdef ROM_COPY
  if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
  #endif
}
6634
6635 static u_int *get_source_start(u_int addr, u_int *limit)
6636 {
6637   if (!(new_dynarec_hacks & NDHACK_OVERRIDE_CYCLE_M))
6638     cycle_multiplier_override = 0;
6639
6640   if (addr < 0x00200000 ||
6641     (0xa0000000 <= addr && addr < 0xa0200000))
6642   {
6643     // used for BIOS calls mostly?
6644     *limit = (addr&0xa0000000)|0x00200000;
6645     return (u_int *)(rdram + (addr&0x1fffff));
6646   }
6647   else if (!Config.HLE && (
6648     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
6649     (0xbfc00000 <= addr && addr < 0xbfc80000)))
6650   {
6651     // BIOS. The multiplier should be much higher as it's uncached 8bit mem,
6652     // but timings in PCSX are too tied to the interpreter's BIAS
6653     if (!(new_dynarec_hacks & NDHACK_OVERRIDE_CYCLE_M))
6654       cycle_multiplier_override = 200;
6655
6656     *limit = (addr & 0xfff00000) | 0x80000;
6657     return (u_int *)((u_char *)psxR + (addr&0x7ffff));
6658   }
6659   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
6660     *limit = (addr & 0x80600000) + 0x00200000;
6661     return (u_int *)(rdram + (addr&0x1fffff));
6662   }
6663   return NULL;
6664 }
6665
6666 static u_int scan_for_ret(u_int addr)
6667 {
6668   u_int limit = 0;
6669   u_int *mem;
6670
6671   mem = get_source_start(addr, &limit);
6672   if (mem == NULL)
6673     return addr;
6674
6675   if (limit > addr + 0x1000)
6676     limit = addr + 0x1000;
6677   for (; addr < limit; addr += 4, mem++) {
6678     if (*mem == 0x03e00008) // jr $ra
6679       return addr + 8;
6680   }
6681   return addr;
6682 }
6683
// One compiled-block record stored into savestates so the blocks can be
// speculatively precompiled again on load.
struct savestate_block {
  uint32_t addr;     // block start address (PSX virtual address)
  uint32_t regflags; // bitmask of GPRs speculated to hold scratchpad pointers
};

// qsort comparator: order savestate blocks by ascending address.
// NOTE: the previous 'p1->addr - p2->addr' wrapped for address pairs more
// than 2^31 apart (e.g. a low KUSEG mirror vs. a 0xbfc00000 BIOS block),
// yielding an inconsistent ordering; compare explicitly instead.
static int addr_cmp(const void *p1_, const void *p2_)
{
  const struct savestate_block *p1 = p1_, *p2 = p2_;
  return (p1->addr > p2->addr) - (p1->addr < p2->addr);
}
6694
6695 int new_dynarec_save_blocks(void *save, int size)
6696 {
6697   struct savestate_block *blocks = save;
6698   int maxcount = size / sizeof(blocks[0]);
6699   struct savestate_block tmp_blocks[1024];
6700   struct ll_entry *head;
6701   int p, s, d, o, bcnt;
6702   u_int addr;
6703
6704   o = 0;
6705   for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
6706     bcnt = 0;
6707     for (head = jump_in[p]; head != NULL; head = head->next) {
6708       tmp_blocks[bcnt].addr = head->vaddr;
6709       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
6710       bcnt++;
6711     }
6712     if (bcnt < 1)
6713       continue;
6714     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
6715
6716     addr = tmp_blocks[0].addr;
6717     for (s = d = 0; s < bcnt; s++) {
6718       if (tmp_blocks[s].addr < addr)
6719         continue;
6720       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
6721         tmp_blocks[d++] = tmp_blocks[s];
6722       addr = scan_for_ret(tmp_blocks[s].addr);
6723     }
6724
6725     if (o + d > maxcount)
6726       d = maxcount - o;
6727     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
6728     o += d;
6729   }
6730
6731   return o * sizeof(blocks[0]);
6732 }
6733
6734 void new_dynarec_load_blocks(const void *save, int size)
6735 {
6736   const struct savestate_block *blocks = save;
6737   int count = size / sizeof(blocks[0]);
6738   u_int regs_save[32];
6739   uint32_t f;
6740   int i, b;
6741
6742   get_addr(psxRegs.pc);
6743
6744   // change GPRs for speculation to at least partially work..
6745   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
6746   for (i = 1; i < 32; i++)
6747     psxRegs.GPR.r[i] = 0x80000000;
6748
6749   for (b = 0; b < count; b++) {
6750     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6751       if (f & 1)
6752         psxRegs.GPR.r[i] = 0x1f800000;
6753     }
6754
6755     get_addr(blocks[b].addr);
6756
6757     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6758       if (f & 1)
6759         psxRegs.GPR.r[i] = 0x80000000;
6760     }
6761   }
6762
6763   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
6764 }
6765
6766 int new_recompile_block(u_int addr)
6767 {
6768   u_int pagelimit = 0;
6769   u_int state_rflags = 0;
6770   int i;
6771
6772   assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out);
6773   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
6774   //if(debug)
6775   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
6776
6777   // this is just for speculation
6778   for (i = 1; i < 32; i++) {
6779     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
6780       state_rflags |= 1 << i;
6781   }
6782
6783   start = (u_int)addr&~3;
6784   //assert(((u_int)addr&1)==0); // start-in-delay-slot flag
6785   new_dynarec_did_compile=1;
6786   if (Config.HLE && start == 0x80001000) // hlecall
6787   {
6788     // XXX: is this enough? Maybe check hleSoftCall?
6789     void *beginning=start_block();
6790     u_int page=get_page(start);
6791
6792     invalid_code[start>>12]=0;
6793     emit_movimm(start,0);
6794     emit_writeword(0,&pcaddr);
6795     emit_far_jump(new_dyna_leave);
6796     literal_pool(0);
6797     end_block(beginning);
6798     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
6799     return 0;
6800   }
6801
6802   source = get_source_start(start, &pagelimit);
6803   if (source == NULL) {
6804     SysPrintf("Compile at bogus memory address: %08x\n", addr);
6805     abort();
6806   }
6807
6808   /* Pass 1: disassemble */
6809   /* Pass 2: register dependencies, branch targets */
6810   /* Pass 3: register allocation */
6811   /* Pass 4: branch dependencies */
6812   /* Pass 5: pre-alloc */
6813   /* Pass 6: optimize clean/dirty state */
6814   /* Pass 7: flag 32-bit registers */
6815   /* Pass 8: assembly */
6816   /* Pass 9: linker */
6817   /* Pass 10: garbage collection / free memory */
6818
6819   int j;
6820   int done=0;
6821   unsigned int type,op,op2;
6822
6823   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
6824
6825   /* Pass 1 disassembly */
6826
6827   for(i=0;!done;i++) {
6828     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
6829     minimum_free_regs[i]=0;
6830     opcode[i]=op=source[i]>>26;
6831     switch(op)
6832     {
6833       case 0x00: strcpy(insn[i],"special"); type=NI;
6834         op2=source[i]&0x3f;
6835         switch(op2)
6836         {
6837           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
6838           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
6839           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
6840           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
6841           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
6842           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
6843           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
6844           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
6845           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
6846           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
6847           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
6848           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
6849           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
6850           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
6851           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
6852           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
6853           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
6854           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
6855           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
6856           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
6857           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
6858           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
6859           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
6860           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
6861           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
6862           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
6863           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
6864           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
6865           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
6866           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
6867           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
6868           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
6869           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
6870           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
6871           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
6872 #if 0
6873           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
6874           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
6875           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
6876           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
6877           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
6878           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
6879           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
6880           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
6881           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
6882           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
6883           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
6884           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
6885           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
6886           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
6887           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
6888           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
6889           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
6890 #endif
6891         }
6892         break;
6893       case 0x01: strcpy(insn[i],"regimm"); type=NI;
6894         op2=(source[i]>>16)&0x1f;
6895         switch(op2)
6896         {
6897           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
6898           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
6899           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
6900           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
6901           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
6902           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
6903           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
6904           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
6905           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
6906           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
6907           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
6908           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
6909           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
6910           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
6911         }
6912         break;
6913       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
6914       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
6915       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
6916       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
6917       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
6918       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
6919       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
6920       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
6921       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
6922       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
6923       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
6924       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
6925       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
6926       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
6927       case 0x10: strcpy(insn[i],"cop0"); type=NI;
6928         op2=(source[i]>>21)&0x1f;
6929         switch(op2)
6930         {
6931           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
6932           case 0x02: strcpy(insn[i],"CFC0"); type=COP0; break;
6933           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
6934           case 0x06: strcpy(insn[i],"CTC0"); type=COP0; break;
6935           case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
6936         }
6937         break;
6938       case 0x11: strcpy(insn[i],"cop1"); type=COP1;
6939         op2=(source[i]>>21)&0x1f;
6940         break;
6941 #if 0
6942       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
6943       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
6944       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
6945       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
6946       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
6947       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
6948       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
6949       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
6950 #endif
6951       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
6952       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
6953       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
6954       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
6955       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
6956       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
6957       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
6958 #if 0
6959       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
6960 #endif
6961       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
6962       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
6963       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
6964       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
6965 #if 0
6966       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
6967       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
6968 #endif
6969       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
6970       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
6971       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
6972       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
6973 #if 0
6974       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
6975       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
6976       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
6977 #endif
6978       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
6979       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
6980 #if 0
6981       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
6982       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
6983       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
6984 #endif
6985       case 0x12: strcpy(insn[i],"COP2"); type=NI;
6986         op2=(source[i]>>21)&0x1f;
6987         //if (op2 & 0x10)
6988         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
6989           if (gte_handlers[source[i]&0x3f]!=NULL) {
6990             if (gte_regnames[source[i]&0x3f]!=NULL)
6991               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
6992             else
6993               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
6994             type=C2OP;
6995           }
6996         }
6997         else switch(op2)
6998         {
6999           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7000           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7001           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7002           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7003         }
7004         break;
7005       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7006       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7007       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7008       default: strcpy(insn[i],"???"); type=NI;
7009         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7010         break;
7011     }
7012     itype[i]=type;
7013     opcode2[i]=op2;
7014     /* Get registers/immediates */
7015     lt1[i]=0;
7016     dep1[i]=0;
7017     dep2[i]=0;
7018     gte_rs[i]=gte_rt[i]=0;
7019     switch(type) {
7020       case LOAD:
7021         rs1[i]=(source[i]>>21)&0x1f;
7022         rs2[i]=0;
7023         rt1[i]=(source[i]>>16)&0x1f;
7024         rt2[i]=0;
7025         imm[i]=(short)source[i];
7026         break;
7027       case STORE:
7028       case STORELR:
7029         rs1[i]=(source[i]>>21)&0x1f;
7030         rs2[i]=(source[i]>>16)&0x1f;
7031         rt1[i]=0;
7032         rt2[i]=0;
7033         imm[i]=(short)source[i];
7034         break;
7035       case LOADLR:
7036         // LWL/LWR only load part of the register,
7037         // therefore the target register must be treated as a source too
7038         rs1[i]=(source[i]>>21)&0x1f;
7039         rs2[i]=(source[i]>>16)&0x1f;
7040         rt1[i]=(source[i]>>16)&0x1f;
7041         rt2[i]=0;
7042         imm[i]=(short)source[i];
7043         if(op==0x26) dep1[i]=rt1[i]; // LWR
7044         break;
7045       case IMM16:
7046         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7047         else rs1[i]=(source[i]>>21)&0x1f;
7048         rs2[i]=0;
7049         rt1[i]=(source[i]>>16)&0x1f;
7050         rt2[i]=0;
7051         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7052           imm[i]=(unsigned short)source[i];
7053         }else{
7054           imm[i]=(short)source[i];
7055         }
7056         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7057         break;
7058       case UJUMP:
7059         rs1[i]=0;
7060         rs2[i]=0;
7061         rt1[i]=0;
7062         rt2[i]=0;
7063         // The JAL instruction writes to r31.
7064         if (op&1) {
7065           rt1[i]=31;
7066         }
7067         rs2[i]=CCREG;
7068         break;
7069       case RJUMP:
7070         rs1[i]=(source[i]>>21)&0x1f;
7071         rs2[i]=0;
7072         rt1[i]=0;
7073         rt2[i]=0;
7074         // The JALR instruction writes to rd.
7075         if (op2&1) {
7076           rt1[i]=(source[i]>>11)&0x1f;
7077         }
7078         rs2[i]=CCREG;
7079         break;
7080       case CJUMP:
7081         rs1[i]=(source[i]>>21)&0x1f;
7082         rs2[i]=(source[i]>>16)&0x1f;
7083         rt1[i]=0;
7084         rt2[i]=0;
7085         if(op&2) { // BGTZ/BLEZ
7086           rs2[i]=0;
7087         }
7088         likely[i]=op>>4;
7089         break;
7090       case SJUMP:
7091         rs1[i]=(source[i]>>21)&0x1f;
7092         rs2[i]=CCREG;
7093         rt1[i]=0;
7094         rt2[i]=0;
7095         if(op2&0x10) { // BxxAL
7096           rt1[i]=31;
7097           // NOTE: If the branch is not taken, r31 is still overwritten
7098         }
7099         likely[i]=(op2&2)>>1;
7100         break;
7101       case ALU:
7102         rs1[i]=(source[i]>>21)&0x1f; // source
7103         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7104         rt1[i]=(source[i]>>11)&0x1f; // destination
7105         rt2[i]=0;
7106         if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7107           dep1[i]=rs1[i];dep2[i]=rs2[i];
7108         }
7109         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7110           dep1[i]=rs1[i];dep2[i]=rs2[i];
7111         }
7112         break;
7113       case MULTDIV:
7114         rs1[i]=(source[i]>>21)&0x1f; // source
7115         rs2[i]=(source[i]>>16)&0x1f; // divisor
7116         rt1[i]=HIREG;
7117         rt2[i]=LOREG;
7118         break;
7119       case MOV:
7120         rs1[i]=0;
7121         rs2[i]=0;
7122         rt1[i]=0;
7123         rt2[i]=0;
7124         if(op2==0x10) rs1[i]=HIREG; // MFHI
7125         if(op2==0x11) rt1[i]=HIREG; // MTHI
7126         if(op2==0x12) rs1[i]=LOREG; // MFLO
7127         if(op2==0x13) rt1[i]=LOREG; // MTLO
7128         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7129         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7130         dep1[i]=rs1[i];
7131         break;
7132       case SHIFT:
7133         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7134         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7135         rt1[i]=(source[i]>>11)&0x1f; // destination
7136         rt2[i]=0;
7137         break;
7138       case SHIFTIMM:
7139         rs1[i]=(source[i]>>16)&0x1f;
7140         rs2[i]=0;
7141         rt1[i]=(source[i]>>11)&0x1f;
7142         rt2[i]=0;
7143         imm[i]=(source[i]>>6)&0x1f;
7144         // DSxx32 instructions
7145         if(op2>=0x3c) imm[i]|=0x20;
7146         break;
7147       case COP0:
7148         rs1[i]=0;
7149         rs2[i]=0;
7150         rt1[i]=0;
7151         rt2[i]=0;
7152         if(op2==0||op2==2) rt1[i]=(source[i]>>16)&0x1F; // MFC0/CFC0
7153         if(op2==4||op2==6) rs1[i]=(source[i]>>16)&0x1F; // MTC0/CTC0
7154         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7155         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7156         break;
7157       case COP1:
7158         rs1[i]=0;
7159         rs2[i]=0;
7160         rt1[i]=0;
7161         rt2[i]=0;
7162         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7163         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7164         rs2[i]=CSREG;
7165         break;
7166       case COP2:
7167         rs1[i]=0;
7168         rs2[i]=0;
7169         rt1[i]=0;
7170         rt2[i]=0;
7171         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7172         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7173         rs2[i]=CSREG;
7174         int gr=(source[i]>>11)&0x1F;
7175         switch(op2)
7176         {
7177           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7178           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7179           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7180           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7181         }
7182         break;
7183       case C1LS:
7184         rs1[i]=(source[i]>>21)&0x1F;
7185         rs2[i]=CSREG;
7186         rt1[i]=0;
7187         rt2[i]=0;
7188         imm[i]=(short)source[i];
7189         break;
7190       case C2LS:
7191         rs1[i]=(source[i]>>21)&0x1F;
7192         rs2[i]=0;
7193         rt1[i]=0;
7194         rt2[i]=0;
7195         imm[i]=(short)source[i];
7196         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7197         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7198         break;
7199       case C2OP:
7200         rs1[i]=0;
7201         rs2[i]=0;
7202         rt1[i]=0;
7203         rt2[i]=0;
7204         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7205         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7206         gte_rt[i]|=1ll<<63; // every op changes flags
7207         if((source[i]&0x3f)==GTE_MVMVA) {
7208           int v = (source[i] >> 15) & 3;
7209           gte_rs[i]&=~0xe3fll;
7210           if(v==3) gte_rs[i]|=0xe00ll;
7211           else gte_rs[i]|=3ll<<(v*2);
7212         }
7213         break;
7214       case SYSCALL:
7215       case HLECALL:
7216       case INTCALL:
7217         rs1[i]=CCREG;
7218         rs2[i]=0;
7219         rt1[i]=0;
7220         rt2[i]=0;
7221         break;
7222       default:
7223         rs1[i]=0;
7224         rs2[i]=0;
7225         rt1[i]=0;
7226         rt2[i]=0;
7227     }
7228     /* Calculate branch target addresses */
7229     if(type==UJUMP)
7230       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7231     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7232       ba[i]=start+i*4+8; // Ignore never taken branch
7233     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7234       ba[i]=start+i*4+8; // Ignore never taken branch
7235     else if(type==CJUMP||type==SJUMP)
7236       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7237     else ba[i]=-1;
7238     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)) {
7239       int do_in_intrp=0;
7240       // branch in delay slot?
7241       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP) {
7242         // don't handle first branch and call interpreter if it's hit
7243         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7244         do_in_intrp=1;
7245       }
7246       // basic load delay detection
7247       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7248         int t=(ba[i-1]-start)/4;
7249         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7250           // jump target wants DS result - potential load delay effect
7251           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7252           do_in_intrp=1;
7253           bt[t+1]=1; // expected return from interpreter
7254         }
7255         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7256               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7257           // v0 overwrite like this is a sign of trouble, bail out
7258           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7259           do_in_intrp=1;
7260         }
7261       }
7262       if(do_in_intrp) {
7263         rs1[i-1]=CCREG;
7264         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7265         ba[i-1]=-1;
7266         itype[i-1]=INTCALL;
7267         done=2;
7268         i--; // don't compile the DS
7269       }
7270     }
7271     /* Is this the end of the block? */
7272     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7273       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7274         done=2;
7275       }
7276       else {
7277         if(stop_after_jal) done=1;
7278         // Stop on BREAK
7279         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7280       }
7281       // Don't recompile stuff that's already compiled
7282       if(check_addr(start+i*4+4)) done=1;
7283       // Don't get too close to the limit
7284       if(i>MAXBLOCK/2) done=1;
7285     }
7286     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7287     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7288     if(done==2) {
7289       // Does the block continue due to a branch?
7290       for(j=i-1;j>=0;j--)
7291       {
7292         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7293         if(ba[j]==start+i*4+4) done=j=0;
7294         if(ba[j]==start+i*4+8) done=j=0;
7295       }
7296     }
7297     //assert(i<MAXBLOCK-1);
7298     if(start+i*4==pagelimit-4) done=1;
7299     assert(start+i*4<pagelimit);
7300     if (i==MAXBLOCK-1) done=1;
7301     // Stop if we're compiling junk
7302     if(itype[i]==NI&&opcode[i]==0x11) {
7303       done=stop_after_jal=1;
7304       SysPrintf("Disabled speculative precompilation\n");
7305     }
7306   }
7307   slen=i;
7308   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP) {
7309     if(start+i*4==pagelimit) {
7310       itype[i-1]=SPAN;
7311     }
7312   }
7313   assert(slen>0);
7314
7315   /* Pass 2 - Register dependencies and branch targets */
7316
7317   unneeded_registers(0,slen-1,0);
7318
7319   /* Pass 3 - Register allocation */
7320
7321   struct regstat current; // Current register allocations/status
7322   current.dirty=0;
7323   current.u=unneeded_reg[0];
7324   clear_all_regs(current.regmap);
7325   alloc_reg(&current,0,CCREG);
7326   dirty_reg(&current,CCREG);
7327   current.isconst=0;
7328   current.wasconst=0;
7329   current.waswritten=0;
7330   int ds=0;
7331   int cc=0;
7332   int hr=-1;
7333
7334   if((u_int)addr&1) {
7335     // First instruction is delay slot
7336     cc=-1;
7337     bt[1]=1;
7338     ds=1;
7339     unneeded_reg[0]=1;
7340     current.regmap[HOST_BTREG]=BTREG;
7341   }
7342
  // Main allocation loop: walk the block in order, allocating registers
  // per instruction type, recording the entry (branch-target) regmap for
  // each instruction in regs[i], and counting cycles between branches.
  for(i=0;i<slen;i++)
  {
    if(bt[i])
    {
      int hr;
      for(hr=0;hr<HOST_REGS;hr++)
      {
        // Is this really necessary?
        if(current.regmap[hr]==0) current.regmap[hr]=-1;
      }
      // Branch target: state may arrive via multiple paths, so drop
      // constant/recently-written tracking here.
      current.isconst=0;
      current.waswritten=0;
    }

    memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
    regs[i].wasconst=current.isconst;
    regs[i].wasdirty=current.dirty;
    regs[i].loadedconst=0;
    // Build the unneeded-register mask for this point.  Bit 0 ($zero) is
    // always unneeded; branches must also keep the delay slot's sources.
    if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP) {
      if(i+1<slen) {
        current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
        current.u|=1;
      } else {
        current.u=1;
      }
    } else {
      if(i+1<slen) {
        current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
        current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
        current.u|=1;
      } else { SysPrintf("oops, branch at end of block with no delay slot\n");abort(); }
    }
    is_ds[i]=ds;
    if(ds) {
      ds=0; // Skip delay slot, already allocated as part of branch
      // ...but we need to alloc it in case something jumps here
      if(i+1<slen) {
        current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
      }else{
        current.u=branch_unneeded_reg[i-1];
      }
      current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
      current.u|=1;
      struct regstat temp;
      memcpy(&temp,&current,sizeof(current));
      temp.wasdirty=temp.dirty;
      // TODO: Take into account unconditional branches, as below
      delayslot_alloc(&temp,i);
      memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
      regs[i].wasdirty=temp.wasdirty;
      regs[i].dirty=temp.dirty;
      regs[i].isconst=0;
      regs[i].wasconst=0;
      current.isconst=0;
      // Create entry (branch target) regmap
      for(hr=0;hr<HOST_REGS;hr++)
      {
        int r=temp.regmap[hr];
        if(r>=0) {
          if(r!=regmap_pre[i][hr]) {
            regs[i].regmap_entry[hr]=-1;
          }
          else
          {
              assert(r < 64);
              if((current.u>>r)&1) {
                regs[i].regmap_entry[hr]=-1;
                regs[i].regmap[hr]=-1;
                //Don't clear regs in the delay slot as the branch might need them
                //current.regmap[hr]=-1;
              }else
                regs[i].regmap_entry[hr]=r;
          }
        } else {
          // First instruction expects CCREG to be allocated
          if(i==0&&hr==HOST_CCREG)
            regs[i].regmap_entry[hr]=CCREG;
          else
            regs[i].regmap_entry[hr]=-1;
        }
      }
    }
    else { // Not delay slot
      // Per-instruction-type allocation.  Branch cases also allocate the
      // following delay slot (out of order, ooo[i]=1) when it is safe.
      switch(itype[i]) {
        case UJUMP:
          //current.isconst=0; // DEBUG
          //current.wasconst=0; // DEBUG
          //regs[i].wasconst=0; // DEBUG
          clear_const(&current,rt1[i]);
          alloc_cc(&current,i);
          dirty_reg(&current,CCREG);
          if (rt1[i]==31) {
            alloc_reg(&current,i,31);
            dirty_reg(&current,31);
            //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
            //assert(rt1[i+1]!=rt1[i]);
            #ifdef REG_PREFETCH
            alloc_reg(&current,i,PTEMP);
            #endif
          }
          ooo[i]=1;
          delayslot_alloc(&current,i+1);
          //current.isconst=0; // DEBUG
          ds=1;
          //printf("i=%d, isconst=%x\n",i,current.isconst);
          break;
        case RJUMP:
          //current.isconst=0;
          //current.wasconst=0;
          //regs[i].wasconst=0;
          clear_const(&current,rs1[i]);
          clear_const(&current,rt1[i]);
          alloc_cc(&current,i);
          dirty_reg(&current,CCREG);
          if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
            alloc_reg(&current,i,rs1[i]);
            if (rt1[i]!=0) {
              alloc_reg(&current,i,rt1[i]);
              dirty_reg(&current,rt1[i]);
              assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
              assert(rt1[i+1]!=rt1[i]);
              #ifdef REG_PREFETCH
              alloc_reg(&current,i,PTEMP);
              #endif
            }
            #ifdef USE_MINI_HT
            if(rs1[i]==31) { // JALR
              alloc_reg(&current,i,RHASH);
              alloc_reg(&current,i,RHTBL);
            }
            #endif
            delayslot_alloc(&current,i+1);
          } else {
            // The delay slot overwrites our source register,
            // allocate a temporary register to hold the old value.
            current.isconst=0;
            current.wasconst=0;
            regs[i].wasconst=0;
            delayslot_alloc(&current,i+1);
            current.isconst=0;
            alloc_reg(&current,i,RTEMP);
          }
          //current.isconst=0; // DEBUG
          ooo[i]=1;
          ds=1;
          break;
        case CJUMP:
          //current.isconst=0;
          //current.wasconst=0;
          //regs[i].wasconst=0;
          clear_const(&current,rs1[i]);
          clear_const(&current,rs2[i]);
          if((opcode[i]&0x3E)==4) // BEQ/BNE
          {
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            if(rs1[i]) alloc_reg(&current,i,rs1[i]);
            if(rs2[i]) alloc_reg(&current,i,rs2[i]);
            if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
               (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
              // The delay slot overwrites one of our conditions.
              // Allocate the branch condition registers instead.
              current.isconst=0;
              current.wasconst=0;
              regs[i].wasconst=0;
              if(rs1[i]) alloc_reg(&current,i,rs1[i]);
              if(rs2[i]) alloc_reg(&current,i,rs2[i]);
            }
            else
            {
              ooo[i]=1;
              delayslot_alloc(&current,i+1);
            }
          }
          else
          if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
          {
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
            if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
              // The delay slot overwrites one of our conditions.
              // Allocate the branch condition registers instead.
              current.isconst=0;
              current.wasconst=0;
              regs[i].wasconst=0;
              if(rs1[i]) alloc_reg(&current,i,rs1[i]);
            }
            else
            {
              ooo[i]=1;
              delayslot_alloc(&current,i+1);
            }
          }
          else
          // Don't alloc the delay slot yet because we might not execute it
          if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
          {
            current.isconst=0;
            current.wasconst=0;
            regs[i].wasconst=0;
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
            alloc_reg(&current,i,rs2[i]);
          }
          else
          if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
          {
            current.isconst=0;
            current.wasconst=0;
            regs[i].wasconst=0;
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
          }
          ds=1;
          //current.isconst=0;
          break;
        case SJUMP:
          //current.isconst=0;
          //current.wasconst=0;
          //regs[i].wasconst=0;
          clear_const(&current,rs1[i]);
          clear_const(&current,rt1[i]);
          //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
          if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
          {
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
            if (rt1[i]==31) { // BLTZAL/BGEZAL
              alloc_reg(&current,i,31);
              dirty_reg(&current,31);
              //#ifdef REG_PREFETCH
              //alloc_reg(&current,i,PTEMP);
              //#endif
            }
            if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
               ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
              // Allocate the branch condition registers instead.
              current.isconst=0;
              current.wasconst=0;
              regs[i].wasconst=0;
              if(rs1[i]) alloc_reg(&current,i,rs1[i]);
            }
            else
            {
              ooo[i]=1;
              delayslot_alloc(&current,i+1);
            }
          }
          else
          // Don't alloc the delay slot yet because we might not execute it
          if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
          {
            current.isconst=0;
            current.wasconst=0;
            regs[i].wasconst=0;
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
          }
          ds=1;
          //current.isconst=0;
          break;
        case IMM16:
          imm16_alloc(&current,i);
          break;
        case LOAD:
        case LOADLR:
          load_alloc(&current,i);
          break;
        case STORE:
        case STORELR:
          store_alloc(&current,i);
          break;
        case ALU:
          alu_alloc(&current,i);
          break;
        case SHIFT:
          shift_alloc(&current,i);
          break;
        case MULTDIV:
          multdiv_alloc(&current,i);
          break;
        case SHIFTIMM:
          shiftimm_alloc(&current,i);
          break;
        case MOV:
          mov_alloc(&current,i);
          break;
        case COP0:
          cop0_alloc(&current,i);
          break;
        case COP1:
        case COP2:
          cop12_alloc(&current,i);
          break;
        case C1LS:
          c1ls_alloc(&current,i);
          break;
        case C2LS:
          c2ls_alloc(&current,i);
          break;
        case C2OP:
          c2op_alloc(&current,i);
          break;
        case SYSCALL:
        case HLECALL:
        case INTCALL:
          syscall_alloc(&current,i);
          break;
        case SPAN:
          pagespan_alloc(&current,i);
          break;
      }

      // Create entry (branch target) regmap
      for(hr=0;hr<HOST_REGS;hr++)
      {
        int r,or;
        r=current.regmap[hr];
        if(r>=0) {
          if(r!=regmap_pre[i][hr]) {
            // TODO: delay slot (?)
            or=get_reg(regmap_pre[i],r); // Get old mapping for this register
            if(or<0||(r&63)>=TEMPREG){
              regs[i].regmap_entry[hr]=-1;
            }
            else
            {
              // Just move it to a different register
              regs[i].regmap_entry[hr]=r;
              // If it was dirty before, it's still dirty
              if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
            }
          }
          else
          {
            // Unneeded
            if(r==0){
              regs[i].regmap_entry[hr]=0;
            }
            else
            {
              assert(r<64);
              if((current.u>>r)&1) {
                regs[i].regmap_entry[hr]=-1;
                //regs[i].regmap[hr]=-1;
                current.regmap[hr]=-1;
              }else
                regs[i].regmap_entry[hr]=r;
            }
          }
        } else {
          // Branches expect CCREG to be allocated at the target
          if(regmap_pre[i][hr]==CCREG)
            regs[i].regmap_entry[hr]=CCREG;
          else
            regs[i].regmap_entry[hr]=-1;
        }
      }
      memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
    }

    // Track registers recently used as the base of a store with a small
    // (<0x800) offset; the bit is cleared when the register is overwritten
    // or used as a base with a large offset.
    if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
      current.waswritten|=1<<rs1[i-1];
    current.waswritten&=~(1<<rt1[i]);
    current.waswritten&=~(1<<rt2[i]);
    if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
      current.waswritten&=~(1<<rs1[i]);

    /* Branch post-alloc */
    // When the previous instruction was a branch, finalize branch_regs[i-1]
    // (the register state on the taken path) now that the delay slot has
    // been processed.
    if(i>0)
    {
      current.wasdirty=current.dirty;
      switch(itype[i-1]) {
        case UJUMP:
          memcpy(&branch_regs[i-1],&current,sizeof(current));
          branch_regs[i-1].isconst=0;
          branch_regs[i-1].wasconst=0;
          branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
          alloc_cc(&branch_regs[i-1],i-1);
          dirty_reg(&branch_regs[i-1],CCREG);
          if(rt1[i-1]==31) { // JAL
            alloc_reg(&branch_regs[i-1],i-1,31);
            dirty_reg(&branch_regs[i-1],31);
          }
          memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
          break;
        case RJUMP:
          memcpy(&branch_regs[i-1],&current,sizeof(current));
          branch_regs[i-1].isconst=0;
          branch_regs[i-1].wasconst=0;
          branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
          alloc_cc(&branch_regs[i-1],i-1);
          dirty_reg(&branch_regs[i-1],CCREG);
          alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
          if(rt1[i-1]!=0) { // JALR
            alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
            dirty_reg(&branch_regs[i-1],rt1[i-1]);
          }
          #ifdef USE_MINI_HT
          if(rs1[i-1]==31) { // JALR
            alloc_reg(&branch_regs[i-1],i-1,RHASH);
            alloc_reg(&branch_regs[i-1],i-1,RHTBL);
          }
          #endif
          memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
          break;
        case CJUMP:
          if((opcode[i-1]&0x3E)==4) // BEQ/BNE
          {
            alloc_cc(&current,i-1);
            dirty_reg(&current,CCREG);
            if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
               (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
              // The delay slot overwrote one of our conditions
              // Delay slot goes after the test (in order)
              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
              current.u|=1;
              delayslot_alloc(&current,i);
              current.isconst=0;
            }
            else
            {
              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
              // Alloc the branch condition registers
              if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
              if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
            }
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].isconst=0;
            branch_regs[i-1].wasconst=0;
            memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
            memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
          }
          else
          if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
          {
            alloc_cc(&current,i-1);
            dirty_reg(&current,CCREG);
            if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
              // The delay slot overwrote the branch condition
              // Delay slot goes after the test (in order)
              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
              current.u|=1;
              delayslot_alloc(&current,i);
              current.isconst=0;
            }
            else
            {
              current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
              // Alloc the branch condition register
              alloc_reg(&current,i-1,rs1[i-1]);
            }
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].isconst=0;
            branch_regs[i-1].wasconst=0;
            memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
            memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
          }
          else
          // Alloc the delay slot in case the branch is taken
          if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
          {
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
            alloc_cc(&branch_regs[i-1],i);
            dirty_reg(&branch_regs[i-1],CCREG);
            delayslot_alloc(&branch_regs[i-1],i);
            branch_regs[i-1].isconst=0;
            alloc_reg(&current,i,CCREG); // Not taken path
            dirty_reg(&current,CCREG);
            memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          }
          else
          if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
          {
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
            alloc_cc(&branch_regs[i-1],i);
            dirty_reg(&branch_regs[i-1],CCREG);
            delayslot_alloc(&branch_regs[i-1],i);
            branch_regs[i-1].isconst=0;
            alloc_reg(&current,i,CCREG); // Not taken path
            dirty_reg(&current,CCREG);
            memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          }
          break;
        case SJUMP:
          //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
          if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
          {
            alloc_cc(&current,i-1);
            dirty_reg(&current,CCREG);
            if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
              // The delay slot overwrote the branch condition
              // Delay slot goes after the test (in order)
              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
              current.u|=1;
              delayslot_alloc(&current,i);
              current.isconst=0;
            }
            else
            {
              current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
              // Alloc the branch condition register
              alloc_reg(&current,i-1,rs1[i-1]);
            }
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].isconst=0;
            branch_regs[i-1].wasconst=0;
            memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
            memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
          }
          else
          // Alloc the delay slot in case the branch is taken
          if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
          {
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
            alloc_cc(&branch_regs[i-1],i);
            dirty_reg(&branch_regs[i-1],CCREG);
            delayslot_alloc(&branch_regs[i-1],i);
            branch_regs[i-1].isconst=0;
            alloc_reg(&current,i,CCREG); // Not taken path
            dirty_reg(&current,CCREG);
            memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          }
          // FIXME: BLTZAL/BGEZAL
          if(opcode2[i-1]&0x10) { // BxxZAL
            alloc_reg(&branch_regs[i-1],i-1,31);
            dirty_reg(&branch_regs[i-1],31);
          }
          break;
      }

      // After an unconditional control transfer (J/JR or an instruction
      // whose encoding top half is 0x1000, i.e. an always-taken branch),
      // the fall-through state is unreachable and must be rebuilt.
      if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
      {
        if(rt1[i-1]==31) // JAL/JALR
        {
          // Subroutine call will return here, don't alloc any registers
          current.dirty=0;
          clear_all_regs(current.regmap);
          alloc_reg(&current,i,CCREG);
          dirty_reg(&current,CCREG);
        }
        else if(i+1<slen)
        {
          // Internal branch will jump here, match registers to caller
          current.dirty=0;
          clear_all_regs(current.regmap);
          alloc_reg(&current,i,CCREG);
          dirty_reg(&current,CCREG);
          for(j=i-1;j>=0;j--)
          {
            if(ba[j]==start+i*4+4) {
              memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
              current.dirty=branch_regs[j].dirty;
              break;
            }
          }
          // If several earlier branches target this point, keep only the
          // register assignments they all agree on.
          while(j>=0) {
            if(ba[j]==start+i*4+4) {
              for(hr=0;hr<HOST_REGS;hr++) {
                if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
                  current.regmap[hr]=-1;
                }
                current.dirty&=branch_regs[j].dirty;
              }
            }
            j--;
          }
        }
      }
    }

    // Count cycles in between branches
    ccadj[i]=cc;
    if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
    {
      cc=0;
    }
#if !defined(DRC_DBG)
    else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
    {
      // GTE runs in parallel until accessed, divide by 2 for a rough guess
      cc+=gte_cycletab[source[i]&0x3f]/2;
    }
    else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues
    {
      cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
    }
    else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
    {
      // timing hack: penalize runs of 3+ consecutive stores
      // (presumably helps game-specific timing — see commit message)
      cc+=4;
    }
    else if(itype[i]==C2LS)
    {
      cc+=4;
    }
#endif
    else
    {
      cc++;
    }

    if(!is_ds[i]) {
      regs[i].dirty=current.dirty;
      regs[i].isconst=current.isconst;
      memcpy(constmap[i],current_constmap,sizeof(constmap[i]));
    }
    // A value loaded into a different host register than before is no
    // longer a known constant at this point.
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
        if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
          regs[i].wasconst&=~(1<<hr);
        }
      }
    }
    if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
    regs[i].waswritten=current.waswritten;
  }
7969
  /* Pass 4 - Cull unused host registers */

  // Backwards liveness scan: `nr` is a bitmask of host registers still
  // needed at the current point.  Registers not in `nr` are deallocated
  // from regs[i]/branch_regs[i] so they need not be loaded or stored.
  uint64_t nr=0;

  for (i=slen-1;i>=0;i--)
  {
    int hr;
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
    {
      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, don't need anything
        nr=0;
      }
      else
      {
        // Internal branch
        // Need whatever matches the target
        nr=0;
        int t=(ba[i]-start)>>2;
        for(hr=0;hr<HOST_REGS;hr++)
        {
          if(regs[i].regmap_entry[hr]>=0) {
            if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
          }
        }
      }
      // Conditional branch may need registers for following instructions
      if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
      {
        if(i<slen-2) {
          nr|=needed_reg[i+2];
          for(hr=0;hr<HOST_REGS;hr++)
          {
            if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
            //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
          }
        }
      }
      // Don't need stuff which is overwritten
      //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
      //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
      // Merge in delay slot
      for(hr=0;hr<HOST_REGS;hr++)
      {
        if(!likely[i]) {
          // These are overwritten unless the branch is "likely"
          // and the delay slot is nullified if not taken
          if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
          if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
        }
        if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
        if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
        if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
        if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
        if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
          if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
          if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      nr=0;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      nr=0;
    }
    else // Non-branch
    {
      if(i<slen-1) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
          if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
          if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
          if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
        }
      }
    }
    for(hr=0;hr<HOST_REGS;hr++)
    {
      // Overwritten registers are not needed
      if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
      if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
      if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
      // Source registers are needed
      if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
      if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
      if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
      if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
      if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
        if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
        if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
      }
      // Don't store a register immediately after writing it,
      // may prevent dual-issue.
      // But do so if this is a branch target, otherwise we
      // might have to load the register before the branch.
      if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
        if((regmap_pre[i][hr]>0&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1))) {
          if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
          if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
        }
        if((regs[i].regmap_entry[hr]>0&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) {
          if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
          if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
        }
      }
    }
    // Cycle count is needed at branches.  Assume it is needed at the target too.
    if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==SPAN) {
      if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
      if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
    }
    // Save it
    needed_reg[i]=nr;

    // Deallocate unneeded registers
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(!((nr>>hr)&1)) {
        if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
        if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
           (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
           (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
        {
          if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
          {
            if(likely[i]) {
              regs[i].regmap[hr]=-1;
              regs[i].isconst&=~(1<<hr);
              if(i<slen-2) {
                regmap_pre[i+2][hr]=-1;
                regs[i+2].wasconst&=~(1<<hr);
              }
            }
          }
        }
        if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
        {
          // Branch: also cull from the taken-path state, but keep any
          // register the branch or its delay slot still uses.
          int map=0,temp=0;
          if(itype[i+1]==STORE || itype[i+1]==STORELR ||
             (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
            map=INVCP;
          }
          if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
             itype[i+1]==C1LS || itype[i+1]==C2LS)
            temp=FTEMP;
          if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
             (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
             (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
             regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
             (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
             regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
             regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
             regs[i].regmap[hr]!=map )
          {
            regs[i].regmap[hr]=-1;
            regs[i].isconst&=~(1<<hr);
            if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
               (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
               (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
               branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
               (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
               branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
               branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
               branch_regs[i].regmap[hr]!=map)
            {
              branch_regs[i].regmap[hr]=-1;
              branch_regs[i].regmap_entry[hr]=-1;
              if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
              {
                if(!likely[i]&&i<slen-2) {
                  regmap_pre[i+2][hr]=-1;
                  regs[i+2].wasconst&=~(1<<hr);
                }
              }
            }
          }
        }
        else
        {
          // Non-branch
          if(i>0)
          {
            int map=-1,temp=-1;
            if(itype[i]==STORE || itype[i]==STORELR ||
                      (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
              map=INVCP;
            }
            if(itype[i]==LOADLR || itype[i]==STORELR ||
               itype[i]==C1LS || itype[i]==C2LS)
              temp=FTEMP;
            if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
               regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
               (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
               (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
            {
              if(i<slen-1&&!is_ds[i]) {
                assert(regs[i].regmap[hr]<64);
                if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]>0)
                if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
                {
                  SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
                  assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
                }
                regmap_pre[i+1][hr]=-1;
                if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
                regs[i+1].wasconst&=~(1<<hr);
              }
              regs[i].regmap[hr]=-1;
              regs[i].isconst&=~(1<<hr);
            }
          }
        }
      } // if needed
    } // for hr
  }
8191
8192   /* Pass 5 - Pre-allocate registers */
8193
8194   // If a register is allocated during a loop, try to allocate it for the
8195   // entire loop, if possible.  This avoids loading/storing registers
8196   // inside of the loop.
8197
8198   signed char f_regmap[HOST_REGS];
8199   clear_all_regs(f_regmap);
8200   for(i=0;i<slen-1;i++)
8201   {
8202     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
8203     {
8204       if(ba[i]>=start && ba[i]<(start+i*4))
8205       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
8206       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
8207       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
8208       ||itype[i+1]==SHIFT||itype[i+1]==COP1
8209       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
8210       {
8211         int t=(ba[i]-start)>>2;
8212         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP)) // loop_preload can't handle jumps into delay slots
8213         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
8214         for(hr=0;hr<HOST_REGS;hr++)
8215         {
8216           if(regs[i].regmap[hr]>=0) {
8217             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8218               // dealloc old register
8219               int n;
8220               for(n=0;n<HOST_REGS;n++)
8221               {
8222                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8223               }
8224               // and alloc new one
8225               f_regmap[hr]=regs[i].regmap[hr];
8226             }
8227           }
8228           if(branch_regs[i].regmap[hr]>=0) {
8229             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
8230               // dealloc old register
8231               int n;
8232               for(n=0;n<HOST_REGS;n++)
8233               {
8234                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
8235               }
8236               // and alloc new one
8237               f_regmap[hr]=branch_regs[i].regmap[hr];
8238             }
8239           }
8240           if(ooo[i]) {
8241             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
8242               f_regmap[hr]=branch_regs[i].regmap[hr];
8243           }else{
8244             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
8245               f_regmap[hr]=branch_regs[i].regmap[hr];
8246           }
8247           // Avoid dirty->clean transition
8248           #ifdef DESTRUCTIVE_WRITEBACK
8249           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
8250           #endif
8251           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
8252           // case above, however it's always a good idea.  We can't hoist the
8253           // load if the register was already allocated, so there's no point
8254           // wasting time analyzing most of these cases.  It only "succeeds"
8255           // when the mapping was different and the load can be replaced with
8256           // a mov, which is of negligible benefit.  So such cases are
8257           // skipped below.
8258           if(f_regmap[hr]>0) {
8259             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
8260               int r=f_regmap[hr];
8261               for(j=t;j<=i;j++)
8262               {
8263                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
8264                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
8265                 assert(r < 64);
8266                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
8267                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
8268                   int k;
8269                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
8270                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
8271                     if(r>63) {
8272                       if(get_reg(regs[i].regmap,r&63)<0) break;
8273                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
8274                     }
8275                     k=i;
8276                     while(k>1&&regs[k-1].regmap[hr]==-1) {
8277                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
8278                         //printf("no free regs for store %x\n",start+(k-1)*4);
8279                         break;
8280                       }
8281                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
8282                         //printf("no-match due to different register\n");
8283                         break;
8284                       }
8285                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP) {
8286                         //printf("no-match due to branch\n");
8287                         break;
8288                       }
8289                       // call/ret fast path assumes no registers allocated
8290                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
8291                         break;
8292                       }
8293                       assert(r < 64);
8294                       k--;
8295                     }
8296                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
8297                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
8298                       while(k<i) {
8299                         regs[k].regmap_entry[hr]=f_regmap[hr];
8300                         regs[k].regmap[hr]=f_regmap[hr];
8301                         regmap_pre[k+1][hr]=f_regmap[hr];
8302                         regs[k].wasdirty&=~(1<<hr);
8303                         regs[k].dirty&=~(1<<hr);
8304                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
8305                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
8306                         regs[k].wasconst&=~(1<<hr);
8307                         regs[k].isconst&=~(1<<hr);
8308                         k++;
8309                       }
8310                     }
8311                     else {
8312                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
8313                       break;
8314                     }
8315                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
8316                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
8317                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
8318                       regs[i].regmap_entry[hr]=f_regmap[hr];
8319                       regs[i].regmap[hr]=f_regmap[hr];
8320                       regs[i].wasdirty&=~(1<<hr);
8321                       regs[i].dirty&=~(1<<hr);
8322                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
8323                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
8324                       regs[i].wasconst&=~(1<<hr);
8325                       regs[i].isconst&=~(1<<hr);
8326                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
8327                       branch_regs[i].wasdirty&=~(1<<hr);
8328                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
8329                       branch_regs[i].regmap[hr]=f_regmap[hr];
8330                       branch_regs[i].dirty&=~(1<<hr);
8331                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
8332                       branch_regs[i].wasconst&=~(1<<hr);
8333                       branch_regs[i].isconst&=~(1<<hr);
8334                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
8335                         regmap_pre[i+2][hr]=f_regmap[hr];
8336                         regs[i+2].wasdirty&=~(1<<hr);
8337                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
8338                       }
8339                     }
8340                   }
8341                   for(k=t;k<j;k++) {
8342                     // Alloc register clean at beginning of loop,
8343                     // but may dirty it in pass 6
8344                     regs[k].regmap_entry[hr]=f_regmap[hr];
8345                     regs[k].regmap[hr]=f_regmap[hr];
8346                     regs[k].dirty&=~(1<<hr);
8347                     regs[k].wasconst&=~(1<<hr);
8348                     regs[k].isconst&=~(1<<hr);
8349                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP) {
8350                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
8351                       branch_regs[k].regmap[hr]=f_regmap[hr];
8352                       branch_regs[k].dirty&=~(1<<hr);
8353                       branch_regs[k].wasconst&=~(1<<hr);
8354                       branch_regs[k].isconst&=~(1<<hr);
8355                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
8356                         regmap_pre[k+2][hr]=f_regmap[hr];
8357                         regs[k+2].wasdirty&=~(1<<hr);
8358                       }
8359                     }
8360                     else
8361                     {
8362                       regmap_pre[k+1][hr]=f_regmap[hr];
8363                       regs[k+1].wasdirty&=~(1<<hr);
8364                     }
8365                   }
8366                   if(regs[j].regmap[hr]==f_regmap[hr])
8367                     regs[j].regmap_entry[hr]=f_regmap[hr];
8368                   break;
8369                 }
8370                 if(j==i) break;
8371                 if(regs[j].regmap[hr]>=0)
8372                   break;
8373                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
8374                   //printf("no-match due to different register\n");
8375                   break;
8376                 }
8377                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
8378                 {
8379                   // Stop on unconditional branch
8380                   break;
8381                 }
8382                 if(itype[j]==CJUMP||itype[j]==SJUMP)
8383                 {
8384                   if(ooo[j]) {
8385                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
8386                       break;
8387                   }else{
8388                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
8389                       break;
8390                   }
8391                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
8392                     //printf("no-match due to different register (branch)\n");
8393                     break;
8394                   }
8395                 }
8396                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
8397                   //printf("No free regs for store %x\n",start+j*4);
8398                   break;
8399                 }
8400                 assert(f_regmap[hr]<64);
8401               }
8402             }
8403           }
8404         }
8405       }
8406     }else{
8407       // Non branch or undetermined branch target
8408       for(hr=0;hr<HOST_REGS;hr++)
8409       {
8410         if(hr!=EXCLUDE_REG) {
8411           if(regs[i].regmap[hr]>=0) {
8412             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8413               // dealloc old register
8414               int n;
8415               for(n=0;n<HOST_REGS;n++)
8416               {
8417                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8418               }
8419               // and alloc new one
8420               f_regmap[hr]=regs[i].regmap[hr];
8421             }
8422           }
8423         }
8424       }
8425       // Try to restore cycle count at branch targets
8426       if(bt[i]) {
8427         for(j=i;j<slen-1;j++) {
8428           if(regs[j].regmap[HOST_CCREG]!=-1) break;
8429           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
8430             //printf("no free regs for store %x\n",start+j*4);
8431             break;
8432           }
8433         }
8434         if(regs[j].regmap[HOST_CCREG]==CCREG) {
8435           int k=i;
8436           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
8437           while(k<j) {
8438             regs[k].regmap_entry[HOST_CCREG]=CCREG;
8439             regs[k].regmap[HOST_CCREG]=CCREG;
8440             regmap_pre[k+1][HOST_CCREG]=CCREG;
8441             regs[k+1].wasdirty|=1<<HOST_CCREG;
8442             regs[k].dirty|=1<<HOST_CCREG;
8443             regs[k].wasconst&=~(1<<HOST_CCREG);
8444             regs[k].isconst&=~(1<<HOST_CCREG);
8445             k++;
8446           }
8447           regs[j].regmap_entry[HOST_CCREG]=CCREG;
8448         }
8449         // Work backwards from the branch target
8450         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
8451         {
8452           //printf("Extend backwards\n");
8453           int k;
8454           k=i;
8455           while(regs[k-1].regmap[HOST_CCREG]==-1) {
8456             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
8457               //printf("no free regs for store %x\n",start+(k-1)*4);
8458               break;
8459             }
8460             k--;
8461           }
8462           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
8463             //printf("Extend CC, %x ->\n",start+k*4);
8464             while(k<=i) {
8465               regs[k].regmap_entry[HOST_CCREG]=CCREG;
8466               regs[k].regmap[HOST_CCREG]=CCREG;
8467               regmap_pre[k+1][HOST_CCREG]=CCREG;
8468               regs[k+1].wasdirty|=1<<HOST_CCREG;
8469               regs[k].dirty|=1<<HOST_CCREG;
8470               regs[k].wasconst&=~(1<<HOST_CCREG);
8471               regs[k].isconst&=~(1<<HOST_CCREG);
8472               k++;
8473             }
8474           }
8475           else {
8476             //printf("Fail Extend CC, %x ->\n",start+k*4);
8477           }
8478         }
8479       }
8480       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
8481          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
8482          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1)
8483       {
8484         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
8485       }
8486     }
8487   }
8488
8489   // This allocates registers (if possible) one instruction prior
8490   // to use, which can avoid a load-use penalty on certain CPUs.
8491   for(i=0;i<slen-1;i++)
8492   {
8493     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP))
8494     {
8495       if(!bt[i+1])
8496       {
8497         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
8498            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
8499         {
8500           if(rs1[i+1]) {
8501             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
8502             {
8503               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8504               {
8505                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
8506                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
8507                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
8508                 regs[i].isconst&=~(1<<hr);
8509                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8510                 constmap[i][hr]=constmap[i+1][hr];
8511                 regs[i+1].wasdirty&=~(1<<hr);
8512                 regs[i].dirty&=~(1<<hr);
8513               }
8514             }
8515           }
8516           if(rs2[i+1]) {
8517             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
8518             {
8519               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8520               {
8521                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
8522                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
8523                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
8524                 regs[i].isconst&=~(1<<hr);
8525                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8526                 constmap[i][hr]=constmap[i+1][hr];
8527                 regs[i+1].wasdirty&=~(1<<hr);
8528                 regs[i].dirty&=~(1<<hr);
8529               }
8530             }
8531           }
8532           // Preload target address for load instruction (non-constant)
8533           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8534             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
8535             {
8536               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8537               {
8538                 regs[i].regmap[hr]=rs1[i+1];
8539                 regmap_pre[i+1][hr]=rs1[i+1];
8540                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8541                 regs[i].isconst&=~(1<<hr);
8542                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8543                 constmap[i][hr]=constmap[i+1][hr];
8544                 regs[i+1].wasdirty&=~(1<<hr);
8545                 regs[i].dirty&=~(1<<hr);
8546               }
8547             }
8548           }
8549           // Load source into target register
8550           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8551             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
8552             {
8553               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8554               {
8555                 regs[i].regmap[hr]=rs1[i+1];
8556                 regmap_pre[i+1][hr]=rs1[i+1];
8557                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8558                 regs[i].isconst&=~(1<<hr);
8559                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8560                 constmap[i][hr]=constmap[i+1][hr];
8561                 regs[i+1].wasdirty&=~(1<<hr);
8562                 regs[i].dirty&=~(1<<hr);
8563               }
8564             }
8565           }
8566           // Address for store instruction (non-constant)
8567           if(itype[i+1]==STORE||itype[i+1]==STORELR
8568              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
8569             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8570               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
8571               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
8572               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
8573               assert(hr>=0);
8574               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8575               {
8576                 regs[i].regmap[hr]=rs1[i+1];
8577                 regmap_pre[i+1][hr]=rs1[i+1];
8578                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8579                 regs[i].isconst&=~(1<<hr);
8580                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8581                 constmap[i][hr]=constmap[i+1][hr];
8582                 regs[i+1].wasdirty&=~(1<<hr);
8583                 regs[i].dirty&=~(1<<hr);
8584               }
8585             }
8586           }
8587           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
8588             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8589               int nr;
8590               hr=get_reg(regs[i+1].regmap,FTEMP);
8591               assert(hr>=0);
8592               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8593               {
8594                 regs[i].regmap[hr]=rs1[i+1];
8595                 regmap_pre[i+1][hr]=rs1[i+1];
8596                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8597                 regs[i].isconst&=~(1<<hr);
8598                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8599                 constmap[i][hr]=constmap[i+1][hr];
8600                 regs[i+1].wasdirty&=~(1<<hr);
8601                 regs[i].dirty&=~(1<<hr);
8602               }
8603               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
8604               {
8605                 // move it to another register
8606                 regs[i+1].regmap[hr]=-1;
8607                 regmap_pre[i+2][hr]=-1;
8608                 regs[i+1].regmap[nr]=FTEMP;
8609                 regmap_pre[i+2][nr]=FTEMP;
8610                 regs[i].regmap[nr]=rs1[i+1];
8611                 regmap_pre[i+1][nr]=rs1[i+1];
8612                 regs[i+1].regmap_entry[nr]=rs1[i+1];
8613                 regs[i].isconst&=~(1<<nr);
8614                 regs[i+1].isconst&=~(1<<nr);
8615                 regs[i].dirty&=~(1<<nr);
8616                 regs[i+1].wasdirty&=~(1<<nr);
8617                 regs[i+1].dirty&=~(1<<nr);
8618                 regs[i+2].wasdirty&=~(1<<nr);
8619               }
8620             }
8621           }
8622           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
8623             if(itype[i+1]==LOAD)
8624               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
8625             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
8626               hr=get_reg(regs[i+1].regmap,FTEMP);
8627             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
8628               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
8629               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
8630             }
8631             if(hr>=0&&regs[i].regmap[hr]<0) {
8632               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
8633               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
8634                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
8635                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
8636                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
8637                 regs[i].isconst&=~(1<<hr);
8638                 regs[i+1].wasdirty&=~(1<<hr);
8639                 regs[i].dirty&=~(1<<hr);
8640               }
8641             }
8642           }
8643         }
8644       }
8645     }
8646   }
8647
8648   /* Pass 6 - Optimize clean/dirty state */
8649   clean_registers(0,slen-1,1);
8650
8651   /* Pass 7 - Identify 32-bit registers */
8652   for (i=slen-1;i>=0;i--)
8653   {
8654     if(itype[i]==CJUMP||itype[i]==SJUMP)
8655     {
8656       // Conditional branch
8657       if((source[i]>>16)!=0x1000&&i<slen-2) {
8658         // Mark this address as a branch target since it may be called
8659         // upon return from interrupt
8660         bt[i+2]=1;
8661       }
8662     }
8663   }
8664
8665   if(itype[slen-1]==SPAN) {
8666     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
8667   }
8668
8669 #ifdef DISASM
8670   /* Debug/disassembly */
8671   for(i=0;i<slen;i++)
8672   {
8673     printf("U:");
8674     int r;
8675     for(r=1;r<=CCREG;r++) {
8676       if((unneeded_reg[i]>>r)&1) {
8677         if(r==HIREG) printf(" HI");
8678         else if(r==LOREG) printf(" LO");
8679         else printf(" r%d",r);
8680       }
8681     }
8682     printf("\n");
8683     #if defined(__i386__) || defined(__x86_64__)
8684     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
8685     #endif
8686     #ifdef __arm__
8687     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
8688     #endif
8689     #if defined(__i386__) || defined(__x86_64__)
8690     printf("needs: ");
8691     if(needed_reg[i]&1) printf("eax ");
8692     if((needed_reg[i]>>1)&1) printf("ecx ");
8693     if((needed_reg[i]>>2)&1) printf("edx ");
8694     if((needed_reg[i]>>3)&1) printf("ebx ");
8695     if((needed_reg[i]>>5)&1) printf("ebp ");
8696     if((needed_reg[i]>>6)&1) printf("esi ");
8697     if((needed_reg[i]>>7)&1) printf("edi ");
8698     printf("\n");
8699     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
8700     printf("dirty: ");
8701     if(regs[i].wasdirty&1) printf("eax ");
8702     if((regs[i].wasdirty>>1)&1) printf("ecx ");
8703     if((regs[i].wasdirty>>2)&1) printf("edx ");
8704     if((regs[i].wasdirty>>3)&1) printf("ebx ");
8705     if((regs[i].wasdirty>>5)&1) printf("ebp ");
8706     if((regs[i].wasdirty>>6)&1) printf("esi ");
8707     if((regs[i].wasdirty>>7)&1) printf("edi ");
8708     #endif
8709     #ifdef __arm__
8710     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
8711     printf("dirty: ");
8712     if(regs[i].wasdirty&1) printf("r0 ");
8713     if((regs[i].wasdirty>>1)&1) printf("r1 ");
8714     if((regs[i].wasdirty>>2)&1) printf("r2 ");
8715     if((regs[i].wasdirty>>3)&1) printf("r3 ");
8716     if((regs[i].wasdirty>>4)&1) printf("r4 ");
8717     if((regs[i].wasdirty>>5)&1) printf("r5 ");
8718     if((regs[i].wasdirty>>6)&1) printf("r6 ");
8719     if((regs[i].wasdirty>>7)&1) printf("r7 ");
8720     if((regs[i].wasdirty>>8)&1) printf("r8 ");
8721     if((regs[i].wasdirty>>9)&1) printf("r9 ");
8722     if((regs[i].wasdirty>>10)&1) printf("r10 ");
8723     if((regs[i].wasdirty>>12)&1) printf("r12 ");
8724     #endif
8725     printf("\n");
8726     disassemble_inst(i);
8727     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
8728     #if defined(__i386__) || defined(__x86_64__)
8729     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
8730     if(regs[i].dirty&1) printf("eax ");
8731     if((regs[i].dirty>>1)&1) printf("ecx ");
8732     if((regs[i].dirty>>2)&1) printf("edx ");
8733     if((regs[i].dirty>>3)&1) printf("ebx ");
8734     if((regs[i].dirty>>5)&1) printf("ebp ");
8735     if((regs[i].dirty>>6)&1) printf("esi ");
8736     if((regs[i].dirty>>7)&1) printf("edi ");
8737     #endif
8738     #ifdef __arm__
8739     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
8740     if(regs[i].dirty&1) printf("r0 ");
8741     if((regs[i].dirty>>1)&1) printf("r1 ");
8742     if((regs[i].dirty>>2)&1) printf("r2 ");
8743     if((regs[i].dirty>>3)&1) printf("r3 ");
8744     if((regs[i].dirty>>4)&1) printf("r4 ");
8745     if((regs[i].dirty>>5)&1) printf("r5 ");
8746     if((regs[i].dirty>>6)&1) printf("r6 ");
8747     if((regs[i].dirty>>7)&1) printf("r7 ");
8748     if((regs[i].dirty>>8)&1) printf("r8 ");
8749     if((regs[i].dirty>>9)&1) printf("r9 ");
8750     if((regs[i].dirty>>10)&1) printf("r10 ");
8751     if((regs[i].dirty>>12)&1) printf("r12 ");
8752     #endif
8753     printf("\n");
8754     if(regs[i].isconst) {
8755       printf("constants: ");
8756       #if defined(__i386__) || defined(__x86_64__)
8757       if(regs[i].isconst&1) printf("eax=%x ",(u_int)constmap[i][0]);
8758       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(u_int)constmap[i][1]);
8759       if((regs[i].isconst>>2)&1) printf("edx=%x ",(u_int)constmap[i][2]);
8760       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(u_int)constmap[i][3]);
8761       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(u_int)constmap[i][5]);
8762       if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]);
8763       if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]);
8764       #endif
8765       #if defined(__arm__) || defined(__aarch64__)
8766       int r;
8767       for (r = 0; r < ARRAY_SIZE(constmap[i]); r++)
8768         if ((regs[i].isconst >> r) & 1)
8769           printf(" r%d=%x", r, (u_int)constmap[i][r]);
8770       #endif
8771       printf("\n");
8772     }
8773     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
8774       #if defined(__i386__) || defined(__x86_64__)
8775       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
8776       if(branch_regs[i].dirty&1) printf("eax ");
8777       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
8778       if((branch_regs[i].dirty>>2)&1) printf("edx ");
8779       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
8780       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
8781       if((branch_regs[i].dirty>>6)&1) printf("esi ");
8782       if((branch_regs[i].dirty>>7)&1) printf("edi ");
8783       #endif
8784       #ifdef __arm__
8785       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
8786       if(branch_regs[i].dirty&1) printf("r0 ");
8787       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
8788       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
8789       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
8790       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
8791       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
8792       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
8793       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
8794       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
8795       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
8796       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
8797       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
8798       #endif
8799     }
8800   }
8801 #endif // DISASM
8802
8803   /* Pass 8 - Assembly */
8804   linkcount=0;stubcount=0;
8805   ds=0;is_delayslot=0;
8806   u_int dirty_pre=0;
8807   void *beginning=start_block();
8808   if((u_int)addr&1) {
8809     ds=1;
8810     pagespan_ds();
8811   }
8812   void *instr_addr0_override = NULL;
8813
8814   if (start == 0x80030000) {
8815     // nasty hack for the fastbios thing
8816     // override block entry to this code
8817     instr_addr0_override = out;
8818     emit_movimm(start,0);
8819     // abuse io address var as a flag that we
8820     // have already returned here once
8821     emit_readword(&address,1);
8822     emit_writeword(0,&pcaddr);
8823     emit_writeword(0,&address);
8824     emit_cmp(0,1);
8825     #ifdef __aarch64__
8826     emit_jeq(out + 4*2);
8827     emit_far_jump(new_dyna_leave);
8828     #else
8829     emit_jne(new_dyna_leave);
8830     #endif
8831   }
8832   for(i=0;i<slen;i++)
8833   {
8834     //if(ds) printf("ds: ");
8835     disassemble_inst(i);
8836     if(ds) {
8837       ds=0; // Skip delay slot
8838       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
8839       instr_addr[i] = NULL;
8840     } else {
8841       speculate_register_values(i);
8842       #ifndef DESTRUCTIVE_WRITEBACK
8843       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
8844       {
8845         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,unneeded_reg[i]);
8846       }
8847       if((itype[i]==CJUMP||itype[i]==SJUMP)&&!likely[i]) {
8848         dirty_pre=branch_regs[i].dirty;
8849       }else{
8850         dirty_pre=regs[i].dirty;
8851       }
8852       #endif
8853       // write back
8854       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
8855       {
8856         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,unneeded_reg[i]);
8857         loop_preload(regmap_pre[i],regs[i].regmap_entry);
8858       }
8859       // branch target entry point
8860       instr_addr[i] = out;
8861       assem_debug("<->\n");
8862       drc_dbg_emit_do_cmp(i);
8863
8864       // load regs
8865       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
8866         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty);
8867       load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i],rs2[i]);
8868       address_generation(i,&regs[i],regs[i].regmap_entry);
8869       load_consts(regmap_pre[i],regs[i].regmap,i);
8870       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
8871       {
8872         // Load the delay slot registers if necessary
8873         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
8874           load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1]);
8875         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
8876           load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1]);
8877         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
8878           load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
8879       }
8880       else if(i+1<slen)
8881       {
8882         // Preload registers for following instruction
8883         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
8884           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
8885             load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1]);
8886         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
8887           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
8888             load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1]);
8889       }
8890       // TODO: if(is_ooo(i)) address_generation(i+1);
8891       if(itype[i]==CJUMP)
8892         load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
8893       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
8894         load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
8895       // assemble
8896       switch(itype[i]) {
8897         case ALU:
8898           alu_assemble(i,&regs[i]);break;
8899         case IMM16:
8900           imm16_assemble(i,&regs[i]);break;
8901         case SHIFT:
8902           shift_assemble(i,&regs[i]);break;
8903         case SHIFTIMM:
8904           shiftimm_assemble(i,&regs[i]);break;
8905         case LOAD:
8906           load_assemble(i,&regs[i]);break;
8907         case LOADLR:
8908           loadlr_assemble(i,&regs[i]);break;
8909         case STORE:
8910           store_assemble(i,&regs[i]);break;
8911         case STORELR:
8912           storelr_assemble(i,&regs[i]);break;
8913         case COP0:
8914           cop0_assemble(i,&regs[i]);break;
8915         case COP1:
8916           cop1_assemble(i,&regs[i]);break;
8917         case C1LS:
8918           c1ls_assemble(i,&regs[i]);break;
8919         case COP2:
8920           cop2_assemble(i,&regs[i]);break;
8921         case C2LS:
8922           c2ls_assemble(i,&regs[i]);break;
8923         case C2OP:
8924           c2op_assemble(i,&regs[i]);break;
8925         case MULTDIV:
8926           multdiv_assemble(i,&regs[i]);break;
8927         case MOV:
8928           mov_assemble(i,&regs[i]);break;
8929         case SYSCALL:
8930           syscall_assemble(i,&regs[i]);break;
8931         case HLECALL:
8932           hlecall_assemble(i,&regs[i]);break;
8933         case INTCALL:
8934           intcall_assemble(i,&regs[i]);break;
8935         case UJUMP:
8936           ujump_assemble(i,&regs[i]);ds=1;break;
8937         case RJUMP:
8938           rjump_assemble(i,&regs[i]);ds=1;break;
8939         case CJUMP:
8940           cjump_assemble(i,&regs[i]);ds=1;break;
8941         case SJUMP:
8942           sjump_assemble(i,&regs[i]);ds=1;break;
8943         case SPAN:
8944           pagespan_assemble(i,&regs[i]);break;
8945       }
8946       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
8947         literal_pool(1024);
8948       else
8949         literal_pool_jumpover(256);
8950     }
8951   }
8952   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
8953   // If the block did not end with an unconditional branch,
8954   // add a jump to the next instruction.
8955   if(i>1) {
8956     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
8957       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP);
8958       assert(i==slen);
8959       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP) {
8960         store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
8961         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
8962           emit_loadreg(CCREG,HOST_CCREG);
8963         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
8964       }
8965       else if(!likely[i-2])
8966       {
8967         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].dirty,start+i*4);
8968         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
8969       }
8970       else
8971       {
8972         store_regs_bt(regs[i-2].regmap,regs[i-2].dirty,start+i*4);
8973         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
8974       }
8975       add_to_linker(out,start+i*4,0);
8976       emit_jmp(0);
8977     }
8978   }
8979   else
8980   {
8981     assert(i>0);
8982     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP);
8983     store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
8984     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
8985       emit_loadreg(CCREG,HOST_CCREG);
8986     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
8987     add_to_linker(out,start+i*4,0);
8988     emit_jmp(0);
8989   }
8990
8991   // TODO: delay slot stubs?
8992   // Stubs
8993   for(i=0;i<stubcount;i++)
8994   {
8995     switch(stubs[i].type)
8996     {
8997       case LOADB_STUB:
8998       case LOADH_STUB:
8999       case LOADW_STUB:
9000       case LOADD_STUB:
9001       case LOADBU_STUB:
9002       case LOADHU_STUB:
9003         do_readstub(i);break;
9004       case STOREB_STUB:
9005       case STOREH_STUB:
9006       case STOREW_STUB:
9007       case STORED_STUB:
9008         do_writestub(i);break;
9009       case CC_STUB:
9010         do_ccstub(i);break;
9011       case INVCODE_STUB:
9012         do_invstub(i);break;
9013       case FP_STUB:
9014         do_cop1stub(i);break;
9015       case STORELR_STUB:
9016         do_unalignedwritestub(i);break;
9017     }
9018   }
9019
9020   if (instr_addr0_override)
9021     instr_addr[0] = instr_addr0_override;
9022
9023   /* Pass 9 - Linker */
9024   for(i=0;i<linkcount;i++)
9025   {
9026     assem_debug("%p -> %8x\n",link_addr[i].addr,link_addr[i].target);
9027     literal_pool(64);
9028     if (!link_addr[i].ext)
9029     {
9030       void *stub = out;
9031       void *addr = check_addr(link_addr[i].target);
9032       emit_extjump(link_addr[i].addr, link_addr[i].target);
9033       if (addr) {
9034         set_jump_target(link_addr[i].addr, addr);
9035         add_link(link_addr[i].target,stub);
9036       }
9037       else
9038         set_jump_target(link_addr[i].addr, stub);
9039     }
9040     else
9041     {
9042       // Internal branch
9043       int target=(link_addr[i].target-start)>>2;
9044       assert(target>=0&&target<slen);
9045       assert(instr_addr[target]);
9046       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9047       //set_jump_target_fillslot(link_addr[i].addr,instr_addr[target],link_addr[i].ext>>1);
9048       //#else
9049       set_jump_target(link_addr[i].addr, instr_addr[target]);
9050       //#endif
9051     }
9052   }
9053   // External Branch Targets (jump_in)
9054   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
9055   for(i=0;i<slen;i++)
9056   {
9057     if(bt[i]||i==0)
9058     {
9059       if(instr_addr[i]) // TODO - delay slots (=null)
9060       {
9061         u_int vaddr=start+i*4;
9062         u_int page=get_page(vaddr);
9063         u_int vpage=get_vpage(vaddr);
9064         literal_pool(256);
9065         {
9066           assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
9067           assem_debug("jump_in: %x\n",start+i*4);
9068           ll_add(jump_dirty+vpage,vaddr,out);
9069           void *entry_point = do_dirty_stub(i);
9070           ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
9071           // If there was an existing entry in the hash table,
9072           // replace it with the new address.
9073           // Don't add new entries.  We'll insert the
9074           // ones that actually get used in check_addr().
9075           struct ht_entry *ht_bin = hash_table_get(vaddr);
9076           if (ht_bin->vaddr[0] == vaddr)
9077             ht_bin->tcaddr[0] = entry_point;
9078           if (ht_bin->vaddr[1] == vaddr)
9079             ht_bin->tcaddr[1] = entry_point;
9080         }
9081       }
9082     }
9083   }
9084   // Write out the literal pool if necessary
9085   literal_pool(0);
9086   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9087   // Align code
9088   if(((u_int)out)&7) emit_addnop(13);
9089   #endif
9090   assert(out - (u_char *)beginning < MAX_OUTPUT_BLOCK_SIZE);
9091   //printf("shadow buffer: %p-%p\n",copy,(u_char *)copy+slen*4);
9092   memcpy(copy,source,slen*4);
9093   copy+=slen*4;
9094
9095   end_block(beginning);
9096
9097   // If we're within 256K of the end of the buffer,
9098   // start over from the beginning. (Is 256K enough?)
9099   if (out > ndrc->translation_cache + sizeof(ndrc->translation_cache) - MAX_OUTPUT_BLOCK_SIZE)
9100     out = ndrc->translation_cache;
9101
9102   // Trap writes to any of the pages we compiled
9103   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
9104     invalid_code[i]=0;
9105   }
9106   inv_code_start=inv_code_end=~0;
9107
9108   // for PCSX we need to mark all mirrors too
9109   if(get_page(start)<(RAM_SIZE>>12))
9110     for(i=start>>12;i<=(start+slen*4)>>12;i++)
9111       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
9112       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
9113       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
9114
9115   /* Pass 10 - Free memory by expiring oldest blocks */
9116
9117   int end=(((out-ndrc->translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535;
9118   while(expirep!=end)
9119   {
9120     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
9121     uintptr_t base=(uintptr_t)ndrc->translation_cache+((expirep>>13)<<shift); // Base address of this block
9122     inv_debug("EXP: Phase %d\n",expirep);
9123     switch((expirep>>11)&3)
9124     {
9125       case 0:
9126         // Clear jump_in and jump_dirty
9127         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
9128         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
9129         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
9130         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
9131         break;
9132       case 1:
9133         // Clear pointers
9134         ll_kill_pointers(jump_out[expirep&2047],base,shift);
9135         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
9136         break;
9137       case 2:
9138         // Clear hash table
9139         for(i=0;i<32;i++) {
9140           struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
9141           if (((uintptr_t)ht_bin->tcaddr[1]>>shift) == (base>>shift) ||
9142              (((uintptr_t)ht_bin->tcaddr[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
9143             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
9144             ht_bin->vaddr[1] = -1;
9145             ht_bin->tcaddr[1] = NULL;
9146           }
9147           if (((uintptr_t)ht_bin->tcaddr[0]>>shift) == (base>>shift) ||
9148              (((uintptr_t)ht_bin->tcaddr[0]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
9149             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
9150             ht_bin->vaddr[0] = ht_bin->vaddr[1];
9151             ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
9152             ht_bin->vaddr[1] = -1;
9153             ht_bin->tcaddr[1] = NULL;
9154           }
9155         }
9156         break;
9157       case 3:
9158         // Clear jump_out
9159         if((expirep&2047)==0)
9160           do_clear_cache();
9161         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
9162         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
9163         break;
9164     }
9165     expirep=(expirep+1)&65535;
9166   }
9167   return 0;
9168 }
9169
9170 // vim:shiftwidth=2:expandtab