ceba0e744c54677086cc243f95ded5bb6d00eead
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h"
39 #include "../psxinterpreter.h"
40 #include "emu_if.h" //emulator interface
41
42 #define noinline __attribute__((noinline,noclone))
43 #ifndef ARRAY_SIZE
44 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
45 #endif
46
47 //#define DISASM
48 //#define assem_debug printf
49 //#define inv_debug printf
50 #define assem_debug(...)
51 #define inv_debug(...)
52
53 #ifdef __i386__
54 #include "assem_x86.h"
55 #endif
56 #ifdef __x86_64__
57 #include "assem_x64.h"
58 #endif
59 #ifdef __arm__
60 #include "assem_arm.h"
61 #endif
62 #ifdef __aarch64__
63 #include "assem_arm64.h"
64 #endif
65
66 #define MAXBLOCK 4096
67 #define MAX_OUTPUT_BLOCK_SIZE 262144
68
// Dynarec memory layout: translation cache followed by a small trampoline
// area used to reach call targets outside direct branch range.
struct ndrc_mem
{
  u_char translation_cache[1 << TARGET_SIZE_2];
  struct
  {
    // ops[i] is the trampoline code slot paired with target pointer f[i]
    struct tramp_insns ops[2048 / sizeof(struct tramp_insns)];
    const void *f[2048 / sizeof(void *)];
  } tramp;
};
78
79 #ifdef BASE_ADDR_DYNAMIC
80 static struct ndrc_mem *ndrc;
81 #else
82 static struct ndrc_mem ndrc_ __attribute__((aligned(4096)));
83 static struct ndrc_mem *ndrc = &ndrc_;
84 #endif
85
// stubs
// Kinds of out-of-line slow-path stubs emitted alongside compiled code;
// the value selects how a stub is assembled/patched later.
enum stub_type {
  CC_STUB = 1,       // cycle count check
  FP_STUB = 2,       // FPU exception
  LOADB_STUB = 3,    // memory access fallbacks, per width/signedness
  LOADH_STUB = 4,
  LOADW_STUB = 5,
  LOADD_STUB = 6,
  LOADBU_STUB = 7,
  LOADHU_STUB = 8,
  STOREB_STUB = 9,
  STOREH_STUB = 10,
  STOREW_STUB = 11,
  STORED_STUB = 12,
  STORELR_STUB = 13, // unaligned store
  INVCODE_STUB = 14, // invalid-code check
};
103
// Register-allocation state tracked per instruction.
struct regstat
{
  signed char regmap_entry[HOST_REGS]; // guest reg held by each host reg on entry (-1 = free)
  signed char regmap[HOST_REGS];       // current guest reg per host reg (-1 = free)
  uint64_t wasdirty;             // per-host-reg dirty bits before this insn
  uint64_t dirty;                // per-host-reg dirty bits (bit hr set by dirty_reg)
  uint64_t u;                    // presumably unneeded-guest-reg mask — TODO confirm vs unneeded_reg[]
  u_int wasconst;                // per-host-reg const bits before this insn
  u_int isconst;                 // per-host-reg: holds a known constant (see set_const)
  u_int loadedconst;             // host regs that have constants loaded
  u_int waswritten;              // MIPS regs that were used as store base before
};
116
// note: asm depends on this layout
// Linked-list node mapping a guest virtual address to translated code.
struct ll_entry
{
  u_int vaddr;          // guest virtual address of the block
  u_int reg_sv_flags;   // register save flags (set via ll_add_flags)
  void *addr;           // translated code address
  struct ll_entry *next;
};
125
// Two-way hash bin; slot 0 is the most recently inserted entry
// (hash_table_add demotes slot 0 to slot 1).
struct ht_entry
{
  u_int vaddr[2];
  void *tcaddr[2];
};
131
// Pending slow-path stub recorded during assembly (see add_stub);
// a..e are generic arguments whose meaning depends on `type`.
struct code_stub
{
  enum stub_type type;
  void *addr;     // where the stub code goes
  void *retaddr;  // where to return after the stub
  u_int a;
  uintptr_t b;
  uintptr_t c;
  u_int d;
  u_int e;
};
143
// A branch site to be patched to its target once addresses are known.
struct link_entry
{
  void *addr;   // location of the branch in the translation cache
  u_int target; // guest target address
  u_int ext;    // extra flag passed via add_to_linker
};
150
151   // used by asm:
152   u_char *out;
153   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
154   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
155   struct ll_entry *jump_dirty[4096];
156
157   static struct ll_entry *jump_out[4096];
158   static u_int start;
159   static u_int *source;
160   static char insn[MAXBLOCK][10];
161   static u_char itype[MAXBLOCK];
162   static u_char opcode[MAXBLOCK];
163   static u_char opcode2[MAXBLOCK];
164   static u_char bt[MAXBLOCK];
165   static u_char rs1[MAXBLOCK];
166   static u_char rs2[MAXBLOCK];
167   static u_char rt1[MAXBLOCK];
168   static u_char rt2[MAXBLOCK];
169   static u_char dep1[MAXBLOCK];
170   static u_char dep2[MAXBLOCK];
171   static u_char lt1[MAXBLOCK];
172   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
173   static uint64_t gte_rt[MAXBLOCK];
174   static uint64_t gte_unneeded[MAXBLOCK];
175   static u_int smrv[32]; // speculated MIPS register values
  static u_int smrv_strong; // mask of regs that are likely to have correct values
177   static u_int smrv_weak; // same, but somewhat less likely
178   static u_int smrv_strong_next; // same, but after current insn executes
179   static u_int smrv_weak_next;
180   static int imm[MAXBLOCK];
181   static u_int ba[MAXBLOCK];
182   static char likely[MAXBLOCK];
183   static char is_ds[MAXBLOCK];
184   static char ooo[MAXBLOCK];
185   static uint64_t unneeded_reg[MAXBLOCK];
186   static uint64_t branch_unneeded_reg[MAXBLOCK];
187   static signed char regmap_pre[MAXBLOCK][HOST_REGS]; // pre-instruction i?
188   static uint64_t current_constmap[HOST_REGS];
189   static uint64_t constmap[MAXBLOCK][HOST_REGS];
190   static struct regstat regs[MAXBLOCK];
191   static struct regstat branch_regs[MAXBLOCK];
192   static signed char minimum_free_regs[MAXBLOCK];
193   static u_int needed_reg[MAXBLOCK];
194   static u_int wont_dirty[MAXBLOCK];
195   static u_int will_dirty[MAXBLOCK];
196   static int ccadj[MAXBLOCK];
197   static int slen;
198   static void *instr_addr[MAXBLOCK];
199   static struct link_entry link_addr[MAXBLOCK];
200   static int linkcount;
201   static struct code_stub stubs[MAXBLOCK*3];
202   static int stubcount;
203   static u_int literals[1024][2];
204   static int literalcount;
205   static int is_delayslot;
206   static char shadow[1048576]  __attribute__((aligned(16)));
207   static void *copy;
208   static int expirep;
209   static u_int stop_after_jal;
210 #ifndef RAM_FIXED
211   static uintptr_t ram_offset;
212 #else
213   static const uintptr_t ram_offset=0;
214 #endif
215
216   int new_dynarec_hacks;
217   int new_dynarec_did_compile;
218
219   extern int cycle_count; // ... until end of the timeslice, counts -N -> 0
220   extern int last_count;  // last absolute target, often = next_interupt
221   extern int pcaddr;
222   extern int pending_exception;
223   extern int branch_target;
224   extern uintptr_t mini_ht[32][2];
225   extern u_char restore_candidate[512];
226
227   /* registers that may be allocated */
228   /* 1-31 gpr */
229 #define LOREG 32 // lo
230 #define HIREG 33 // hi
231 //#define FSREG 34 // FPU status (FCSR)
232 #define CSREG 35 // Coprocessor status
233 #define CCREG 36 // Cycle count
234 #define INVCP 37 // Pointer to invalid_code
235 //#define MMREG 38 // Pointer to memory_map
236 //#define ROREG 39 // ram offset (if rdram!=0x80000000)
237 #define TEMPREG 40
238 #define FTEMP 40 // FPU temporary register
239 #define PTEMP 41 // Prefetch temporary register
240 //#define TLREG 42 // TLB mapping offset
241 #define RHASH 43 // Return address hash
242 #define RHTBL 44 // Return address hash table address
243 #define RTEMP 45 // JR/JALR address register
244 #define MAXREG 45
245 #define AGEN1 46 // Address generation temporary register
246 //#define AGEN2 47 // Address generation temporary register
247 //#define MGEN1 48 // Maptable address generation temporary register
248 //#define MGEN2 49 // Maptable address generation temporary register
249 #define BTREG 50 // Branch target temporary register
250
251   /* instruction types */
252 #define NOP 0     // No operation
253 #define LOAD 1    // Load
254 #define STORE 2   // Store
255 #define LOADLR 3  // Unaligned load
256 #define STORELR 4 // Unaligned store
257 #define MOV 5     // Move
258 #define ALU 6     // Arithmetic/logic
259 #define MULTDIV 7 // Multiply/divide
260 #define SHIFT 8   // Shift by register
261 #define SHIFTIMM 9// Shift by immediate
262 #define IMM16 10  // 16-bit immediate
263 #define RJUMP 11  // Unconditional jump to register
264 #define UJUMP 12  // Unconditional jump
265 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
266 #define SJUMP 14  // Conditional branch (regimm format)
267 #define COP0 15   // Coprocessor 0
268 #define COP1 16   // Coprocessor 1
269 #define C1LS 17   // Coprocessor 1 load/store
270 //#define FJUMP 18  // Conditional branch (floating point)
271 //#define FLOAT 19  // Floating point unit
272 //#define FCONV 20  // Convert integer to float
273 //#define FCOMP 21  // Floating point compare (sets FSREG)
274 #define SYSCALL 22// SYSCALL
275 #define OTHER 23  // Other
276 #define SPAN 24   // Branch/delay slot spans 2 pages
277 #define NI 25     // Not implemented
278 #define HLECALL 26// PCSX fake opcodes for HLE
279 #define COP2 27   // Coprocessor 2 move
280 #define C2LS 28   // Coprocessor 2 load/store
281 #define C2OP 29   // Coprocessor 2 operation
282 #define INTCALL 30// Call interpreter to handle rare corner cases
283
284   /* branch codes */
285 #define TAKEN 1
286 #define NOTTAKEN 2
287 #define NULLDS 3
288
289 #define DJT_1 (void *)1l // no function, just a label in assem_debug log
290 #define DJT_2 (void *)2l
291
292 // asm linkage
293 int new_recompile_block(u_int addr);
294 void *get_addr_ht(u_int vaddr);
295 void invalidate_block(u_int block);
296 void invalidate_addr(u_int addr);
297 void remove_hash(int vaddr);
298 void dyna_linker();
299 void dyna_linker_ds();
300 void verify_code();
301 void verify_code_ds();
302 void cc_interrupt();
303 void fp_exception();
304 void fp_exception_ds();
305 void jump_to_new_pc();
306 void new_dyna_leave();
307
308 // Needed by assembler
309 static void wb_register(signed char r,signed char regmap[],uint64_t dirty);
310 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty);
311 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr);
312 static void load_all_regs(signed char i_regmap[]);
313 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
314 static void load_regs_entry(int t);
315 static void load_all_consts(signed char regmap[],u_int dirty,int i);
316
317 static int verify_dirty(const u_int *ptr);
318 static int get_final_value(int hr, int i, int *value);
319 static void add_stub(enum stub_type type, void *addr, void *retaddr,
320   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
321 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
322   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist);
323 static void add_to_linker(void *addr, u_int target, int ext);
324 static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override);
325 static void *get_direct_memhandler(void *table, u_int addr,
326   enum stub_type type, uintptr_t *addr_host);
327 static void pass_args(int a0, int a1);
328 static void emit_far_jump(const void *f);
329 static void emit_far_call(const void *f);
330
// Switch the code region between writable (is_x=0) and executable (is_x=1)
// on platforms that forbid simultaneous W+X mappings; no-op elsewhere.
static void mprotect_w_x(void *start, void *end, int is_x)
{
#ifdef NO_WRITE_EXEC
  #if defined(VITA)
  // *Open* enables write on all memory that was
  // allocated by sceKernelAllocMemBlockForVM()?
  if (is_x)
    sceKernelCloseVMDomain();
  else
    sceKernelOpenVMDomain();
  #else
  // round start down to a page boundary; mprotect needs aligned addresses
  u_long mstart = (u_long)start & ~4095ul;
  u_long mend = (u_long)end;
  if (mprotect((void *)mstart, mend - mstart,
               PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
    SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
  #endif
#endif
}
350
// Make [start,end) of the translation cache writable before emitting code.
static void start_tcache_write(void *start, void *end)
{
  mprotect_w_x(start, end, 0);
}
355
// Finish writing [start,end): flush/invalidate the instruction cache on
// targets that need it, then flip the region back to executable.
static void end_tcache_write(void *start, void *end)
{
#if defined(__arm__) || defined(__aarch64__)
  size_t len = (char *)end - (char *)start;
  #if   defined(__BLACKBERRY_QNX__)
  msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
  #elif defined(__MACH__)
  sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
  #elif defined(VITA)
  sceKernelSyncVMDomain(sceBlock, start, len);
  #elif defined(_3DS)
  ctr_flush_invalidate_cache();
  #elif defined(__aarch64__)
  // as of 2021, __clear_cache() is still broken on arm64
  // so here is a custom one :(
  clear_cache_arm64(start, end);
  #else
  __clear_cache(start, end);
  #endif
  (void)len; // silence unused warning on paths that ignore it
#endif

  mprotect_w_x(start, end, 1);
}
380
// Open a new output block at the current emit position: make up to
// MAX_OUTPUT_BLOCK_SIZE writable (clamped to the end of the cache).
// Returns the block's start address.
static void *start_block(void)
{
  u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
  if (end > ndrc->translation_cache + sizeof(ndrc->translation_cache))
    end = ndrc->translation_cache + sizeof(ndrc->translation_cache);
  start_tcache_write(out, end);
  return out;
}
389
// Close a block opened by start_block(); `out` has advanced to the end
// of the emitted code.
static void end_block(void *start)
{
  end_tcache_write(start, out);
}
394
395 // also takes care of w^x mappings when patching code
396 static u_int needs_clear_cache[1<<(TARGET_SIZE_2-17)];
397
// Mark the 4K page containing `target` as needing an icache flush, and
// make it writable the first time it is marked (one bit per page,
// 32 pages = 128K per needs_clear_cache word).
static void mark_clear_cache(void *target)
{
  uintptr_t offset = (u_char *)target - ndrc->translation_cache;
  u_int mask = 1u << ((offset >> 12) & 31);
  if (!(needs_clear_cache[offset >> 17] & mask)) {
    char *start = (char *)((uintptr_t)target & ~4095l);
    start_tcache_write(start, start + 4095);
    needs_clear_cache[offset >> 17] |= mask;
  }
}
408
// Clearing the cache is rather slow on ARM Linux, so mark the areas
// that need to be cleared, and then only clear these areas once.
static void do_clear_cache(void)
{
  int i, j;
  for (i = 0; i < (1<<(TARGET_SIZE_2-17)); i++)
  {
    u_int bitmap = needs_clear_cache[i];
    if (!bitmap)
      continue;
    for (j = 0; j < 32; j++)
    {
      u_char *start, *end;
      if (!(bitmap & (1<<j)))
        continue;

      start = ndrc->translation_cache + i*131072 + j*4096;
      end = start + 4095;
      // coalesce runs of consecutive marked pages into one flush;
      // note the inner loop advances j past the run
      for (j++; j < 32; j++) {
        if (!(bitmap & (1<<j)))
          break;
        end += 4096;
      }
      end_tcache_write(start, end);
    }
    needs_clear_cache[i] = 0;
  }
}
437
438 //#define DEBUG_CYCLE_COUNT 1
439
440 #define NO_CYCLE_PENALTY_THR 12
441
442 int cycle_multiplier; // 100 for 1.0
443
444 static int CLOCK_ADJUST(int x)
445 {
446   int s=(x>>31)|1;
447   return (x * cycle_multiplier + s * 50) / 100;
448 }
449
// Map a guest virtual address to a lookup-table page index.
// KSEG bits are stripped, RAM mirrors are folded onto the same pages,
// and everything above the first 8MB of pages aliases into 2048..4095.
static u_int get_page(u_int vaddr)
{
  u_int phys = vaddr & ~0xe0000000;
  if (phys < 0x1000000)
    phys &= ~0x0e00000; // RAM mirrors
  u_int page = phys >> 12;
  return (page > 2048) ? 2048 + (page & 2047) : page;
}
459
// no virtual mem in PCSX
// so the "virtual" page is just the physical page
static u_int get_vpage(u_int vaddr)
{
  return get_page(vaddr);
}
465
466 static struct ht_entry *hash_table_get(u_int vaddr)
467 {
468   return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
469 }
470
471 static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
472 {
473   ht_bin->vaddr[1] = ht_bin->vaddr[0];
474   ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
475   ht_bin->vaddr[0] = vaddr;
476   ht_bin->tcaddr[0] = tcaddr;
477 }
478
// some messy ari64's code, seems to rely on unsigned 32bit overflow
// Returns nonzero if tcaddr is far enough behind the emit pointer `out`
// (in cache-relative terms) that the expiry sweep won't reach it soon.
static int doesnt_expire_soon(void *tcaddr)
{
  u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
  return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
}
485
// Get address from virtual address
// This is called from the recompiled JR/JALR instructions
// Looks up compiled code for vaddr, restoring a clean "dirty" block or
// recompiling if necessary; on total failure raises a guest exception.
void noinline *get_addr(u_int vaddr)
{
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  struct ll_entry *head;
  //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
  //printf("TRACE: count=%d next=%d (get_addr match %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
      return head->addr;
    }
    head=head->next;
  }
  // not in jump_in; try to revalidate a block invalidated by a write
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr))
      if (verify_dirty(head->addr)) {
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        invalid_code[vaddr>>12]=0;
        inv_code_start=inv_code_end=~0;
        if(vpage<2048) {
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        struct ht_entry *ht_bin = hash_table_get(vaddr);
        if (ht_bin->vaddr[0] == vaddr)
          ht_bin->tcaddr[0] = head->addr; // Replace existing entry
        else
          hash_table_add(ht_bin, vaddr, head->addr);

        return head->addr;
      }
    }
    head=head->next;
  }
  //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault exception
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}
// Look up address in hash table first
// Fast path for block dispatch; falls back to the full get_addr() scan.
void *get_addr_ht(u_int vaddr)
{
  //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
  const struct ht_entry *ht_bin = hash_table_get(vaddr);
  if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
  if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
  return get_addr(vaddr);
}
549
550 void clear_all_regs(signed char regmap[])
551 {
552   int hr;
553   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
554 }
555
556 static signed char get_reg(const signed char regmap[],int r)
557 {
558   int hr;
559   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
560   return -1;
561 }
562
563 // Find a register that is available for two consecutive cycles
564 static signed char get_reg2(signed char regmap1[], const signed char regmap2[], int r)
565 {
566   int hr;
567   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
568   return -1;
569 }
570
571 int count_free_regs(signed char regmap[])
572 {
573   int count=0;
574   int hr;
575   for(hr=0;hr<HOST_REGS;hr++)
576   {
577     if(hr!=EXCLUDE_REG) {
578       if(regmap[hr]<0) count++;
579     }
580   }
581   return count;
582 }
583
584 void dirty_reg(struct regstat *cur,signed char reg)
585 {
586   int hr;
587   if(!reg) return;
588   for (hr=0;hr<HOST_REGS;hr++) {
589     if((cur->regmap[hr]&63)==reg) {
590       cur->dirty|=1<<hr;
591     }
592   }
593 }
594
595 void set_const(struct regstat *cur,signed char reg,uint64_t value)
596 {
597   int hr;
598   if(!reg) return;
599   for (hr=0;hr<HOST_REGS;hr++) {
600     if(cur->regmap[hr]==reg) {
601       cur->isconst|=1<<hr;
602       current_constmap[hr]=value;
603     }
604   }
605 }
606
607 void clear_const(struct regstat *cur,signed char reg)
608 {
609   int hr;
610   if(!reg) return;
611   for (hr=0;hr<HOST_REGS;hr++) {
612     if((cur->regmap[hr]&63)==reg) {
613       cur->isconst&=~(1<<hr);
614     }
615   }
616 }
617
618 int is_const(struct regstat *cur,signed char reg)
619 {
620   int hr;
621   if(reg<0) return 0;
622   if(!reg) return 1;
623   for (hr=0;hr<HOST_REGS;hr++) {
624     if((cur->regmap[hr]&63)==reg) {
625       return (cur->isconst>>hr)&1;
626     }
627   }
628   return 0;
629 }
630 uint64_t get_const(struct regstat *cur,signed char reg)
631 {
632   int hr;
633   if(!reg) return 0;
634   for (hr=0;hr<HOST_REGS;hr++) {
635     if(cur->regmap[hr]==reg) {
636       return current_constmap[hr];
637     }
638   }
639   SysPrintf("Unknown constant in r%d\n",reg);
640   abort();
641 }
642
// Least soon needed registers
// Look at the next ten instructions and see which registers
// will be used.  Try not to reallocate these.
// hsn[guest reg] is set to the distance (in insns) until next use.
void lsn(u_char hsn[], int i, int *preferred_reg)
{
  int j;
  int b=-1;
  // find how far ahead we may scan (stop at block end or a jump)
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
  }
  // walk backwards so nearer uses overwrite farther ones
  for(;j>=0;j--)
  {
    if(rs1[i+j]) hsn[rs1[i+j]]=j;
    if(rs2[i+j]) hsn[rs2[i+j]]=j;
    if(rt1[i+j]) hsn[rt1[i+j]]=j;
    if(rt2[i+j]) hsn[rt2[i+j]]=j;
    if(itype[i+j]==STORE || itype[i+j]==STORELR) {
      // Stores can allocate zero
      hsn[rs1[i+j]]=j;
      hsn[rs2[i+j]]=j;
    }
    // On some architectures stores need invc_ptr
    #if defined(HOST_IMM8)
    if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
      hsn[INVCP]=j;
    }
    #endif
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
    {
      hsn[CCREG]=j;
      b=j;
    }
  }
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
        if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
        //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
        //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
      }
    }
    // TODO: preferred register based on backward branch
  }
  // Delay slot should preferably not overwrite branch conditions or cycle count
  if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)) {
    if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
    if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
    hsn[CCREG]=1;
    // ...or hash tables
    hsn[RHASH]=1;
    hsn[RHTBL]=1;
  }
  // Coprocessor load/store needs FTEMP, even if not declared
  if(itype[i]==C1LS||itype[i]==C2LS) {
    hsn[FTEMP]=0;
  }
  // Load L/R also uses FTEMP as a temporary register
  if(itype[i]==LOADLR) {
    hsn[FTEMP]=0;
  }
  // Also SWL/SWR/SDL/SDR
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
    hsn[FTEMP]=0;
  }
  // Don't remove the miniht registers
  if(itype[i]==UJUMP||itype[i]==RJUMP)
  {
    hsn[RHASH]=0;
    hsn[RHTBL]=0;
  }
}
731
// We only want to allocate registers if we're going to use them again soon
// Returns 1 if guest reg r is read within the next ~9 instructions.
int needed_again(int r, int i)
{
  int j;
  int b=-1;
  int rn=10; // distance to next use; 10 = "not needed"

  if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
  {
    if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
      return 0; // Don't need any registers if exiting the block
  }
  // find scan horizon (block end, jump, or syscall-like insn)
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
    if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
    {
      break;
    }
  }
  // walk backwards; a later "unneeded" marking cancels earlier uses
  for(;j>=1;j--)
  {
    if(rs1[i+j]==r) rn=j;
    if(rs2[i+j]==r) rn=j;
    if((unneeded_reg[i+j]>>r)&1) rn=10;
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
    {
      b=j;
    }
  }
  /*
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int o=rn;
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(!((unneeded_reg[t+j]>>r)&1)) {
          if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
          if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
        }
        else rn=o;
      }
    }
  }*/
  if(rn<10) return 1;
  (void)b;
  return 0;
}
794
// Try to match register allocations at the end of a loop with those
// at the beginning
// Returns the host reg holding guest reg r at a backward-branch target,
// or hr unchanged if no better choice is found.
int loop_reg(int i, int r, int hr)
{
  int j,k;
  // scan horizon: stop at block end or an unconditional jump
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
  }
  k=0;
  if(i>0){
    // include the preceding branch (we may be in its delay slot)
    if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)
      k--;
  }
  for(;k<j;k++)
  {
    assert(r < 64);
    if((unneeded_reg[i+k]>>r)&1) return hr;
    if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP))
    {
      // backward branch within this block: prefer the allocation at its target
      if(ba[i+k]>=start && ba[i+k]<(start+i*4))
      {
        int t=(ba[i+k]-start)>>2;
        int reg=get_reg(regs[t].regmap_entry,r);
        if(reg>=0) return reg;
        //reg=get_reg(regs[t+1].regmap_entry,r);
        //if(reg>=0) return reg;
      }
    }
  }
  return hr;
}
836
837
838 // Allocate every register, preserving source/target regs
839 void alloc_all(struct regstat *cur,int i)
840 {
841   int hr;
842
843   for(hr=0;hr<HOST_REGS;hr++) {
844     if(hr!=EXCLUDE_REG) {
845       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
846          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
847       {
848         cur->regmap[hr]=-1;
849         cur->dirty&=~(1<<hr);
850       }
851       // Don't need zeros
852       if((cur->regmap[hr]&63)==0)
853       {
854         cur->regmap[hr]=-1;
855         cur->dirty&=~(1<<hr);
856       }
857     }
858   }
859 }
860
#ifndef NDEBUG
// Debug-only check that the single host scratch register is not
// handed out twice at the same time.
static int host_tempreg_in_use;

static void host_tempreg_acquire(void)
{
  assert(!host_tempreg_in_use);
  host_tempreg_in_use = 1;
}

static void host_tempreg_release(void)
{
  host_tempreg_in_use = 0;
}
#else
// release builds: no tracking, calls compile to nothing
static void host_tempreg_acquire(void) {}
static void host_tempreg_release(void) {}
#endif
878
#ifdef DRC_DBG
extern void gen_interupt();
extern void do_insn_cmp();
// address -> printable-name table used to annotate debug output
#define FUNCNAME(f) { f, " " #f }
static const struct {
  void *addr;
  const char *name;
} function_names[] = {
  FUNCNAME(cc_interrupt),
  FUNCNAME(gen_interupt),
  FUNCNAME(get_addr_ht),
  FUNCNAME(get_addr),
  FUNCNAME(jump_handler_read8),
  FUNCNAME(jump_handler_read16),
  FUNCNAME(jump_handler_read32),
  FUNCNAME(jump_handler_write8),
  FUNCNAME(jump_handler_write16),
  FUNCNAME(jump_handler_write32),
  FUNCNAME(invalidate_addr),
  FUNCNAME(jump_to_new_pc),
  FUNCNAME(new_dyna_leave),
  FUNCNAME(pcsx_mtc0),
  FUNCNAME(pcsx_mtc0_ds),
  FUNCNAME(do_insn_cmp),
#ifdef __arm__
  FUNCNAME(verify_code),
#endif
};

// Look up the name of a known function address; "" if unknown.
static const char *func_name(const void *a)
{
  int i;
  for (i = 0; i < sizeof(function_names)/sizeof(function_names[0]); i++)
    if (function_names[i].addr == a)
      return function_names[i].name;
  return "";
}
#else
#define func_name(x) ""
#endif
919
920 #ifdef __i386__
921 #include "assem_x86.c"
922 #endif
923 #ifdef __x86_64__
924 #include "assem_x64.c"
925 #endif
926 #ifdef __arm__
927 #include "assem_arm.c"
928 #endif
929 #ifdef __aarch64__
930 #include "assem_arm64.c"
931 #endif
932
// Return the trampoline code slot that far-jumps to f, allocating a new
// slot on first use; aborts if the fixed-size table is exhausted.
static void *get_trampoline(const void *f)
{
  size_t i;

  // find f's existing slot, or the first empty one
  for (i = 0; i < ARRAY_SIZE(ndrc->tramp.f); i++) {
    if (ndrc->tramp.f[i] == f || ndrc->tramp.f[i] == NULL)
      break;
  }
  if (i == ARRAY_SIZE(ndrc->tramp.f)) {
    SysPrintf("trampoline table is full, last func %p\n", f);
    abort();
  }
  if (ndrc->tramp.f[i] == NULL) {
    // target pointers live in the RX region, so toggle write access
    start_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
    ndrc->tramp.f[i] = f;
    end_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
  }
  return &ndrc->tramp.ops[i];
}
952
// Emit a jump to f, routing through a trampoline when f is outside
// direct branch range.
static void emit_far_jump(const void *f)
{
  if (!can_jump_or_call(f))
    f = get_trampoline(f);
  emit_jmp(f);
}
963
// Emit a call to f, routing through a trampoline when f is outside
// direct call range.
static void emit_far_call(const void *f)
{
  if (!can_jump_or_call(f))
    f = get_trampoline(f);
  emit_call(f);
}
974
975 // Add virtual address mapping to linked list
976 void ll_add(struct ll_entry **head,int vaddr,void *addr)
977 {
978   struct ll_entry *new_entry;
979   new_entry=malloc(sizeof(struct ll_entry));
980   assert(new_entry!=NULL);
981   new_entry->vaddr=vaddr;
982   new_entry->reg_sv_flags=0;
983   new_entry->addr=addr;
984   new_entry->next=*head;
985   *head=new_entry;
986 }
987
// Like ll_add(), but also records register save flags on the new node.
void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
{
  ll_add(head,vaddr,addr);
  (*head)->reg_sv_flags=reg_sv_flags;
}
993
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
void *check_addr(u_int vaddr)
{
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  size_t i;
  // fast path: hash table, accepting only clean, non-expiring code
  for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
    if (ht_bin->vaddr[i] == vaddr)
      if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
        if (isclean(ht_bin->tcaddr[i]))
          return ht_bin->tcaddr[i];
  }
  // slow path: scan the per-page block list
  u_int page=get_page(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while (head != NULL) {
    if (head->vaddr == vaddr) {
      if (doesnt_expire_soon(head->addr)) {
        // Update existing entry with current address
        if (ht_bin->vaddr[0] == vaddr) {
          ht_bin->tcaddr[0] = head->addr;
          return head->addr;
        }
        if (ht_bin->vaddr[1] == vaddr) {
          ht_bin->tcaddr[1] = head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        if (ht_bin->vaddr[0] == -1) {
          ht_bin->vaddr[0] = vaddr;
          ht_bin->tcaddr[0] = head->addr;
        }
        else if (ht_bin->vaddr[1] == -1) {
          ht_bin->vaddr[1] = vaddr;
          ht_bin->tcaddr[1] = head->addr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  return 0;
}
1039
1040 void remove_hash(int vaddr)
1041 {
1042   //printf("remove hash: %x\n",vaddr);
1043   struct ht_entry *ht_bin = hash_table_get(vaddr);
1044   if (ht_bin->vaddr[1] == vaddr) {
1045     ht_bin->vaddr[1] = -1;
1046     ht_bin->tcaddr[1] = NULL;
1047   }
1048   if (ht_bin->vaddr[0] == vaddr) {
1049     ht_bin->vaddr[0] = ht_bin->vaddr[1];
1050     ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
1051     ht_bin->vaddr[1] = -1;
1052     ht_bin->tcaddr[1] = NULL;
1053   }
1054 }
1055
1056 void ll_remove_matching_addrs(struct ll_entry **head,uintptr_t addr,int shift)
1057 {
1058   struct ll_entry *next;
1059   while(*head) {
1060     if(((uintptr_t)((*head)->addr)>>shift)==(addr>>shift) ||
1061        ((uintptr_t)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1062     {
1063       inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr);
1064       remove_hash((*head)->vaddr);
1065       next=(*head)->next;
1066       free(*head);
1067       *head=next;
1068     }
1069     else
1070     {
1071       head=&((*head)->next);
1072     }
1073   }
1074 }
1075
1076 // Remove all entries from linked list
1077 void ll_clear(struct ll_entry **head)
1078 {
1079   struct ll_entry *cur;
1080   struct ll_entry *next;
1081   if((cur=*head)) {
1082     *head=0;
1083     while(cur) {
1084       next=cur->next;
1085       free(cur);
1086       cur=next;
1087     }
1088   }
1089 }
1090
// Dereference the pointers and remove if it matches
// For each jump_out node, resolve where its emitted branch currently
// points (get_pointer).  If that target lies in the address window being
// expired, redirect the branch back to the node's own stub code
// (head->addr, emitted by emit_extjump2 — see add_link) so the target
// is re-resolved on next execution.
static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift)
{
  while(head) {
    uintptr_t ptr = (uintptr_t)get_pointer(head->addr);
    inv_debug("EXP: Lookup pointer to %lx at %p (%x)\n",(long)ptr,head->addr,head->vaddr);
    // Match the exact bucket, plus the bucket one MAX_OUTPUT_BLOCK_SIZE
    // below, since a block can spill past its starting bucket.
    if(((ptr>>shift)==(addr>>shift)) ||
       (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
    {
      inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr);
      void *host_addr=find_extjump_insn(head->addr);
      // Mark before patching so the icache gets flushed for this spot.
      mark_clear_cache(host_addr);
      set_jump_target(host_addr, head->addr);
    }
    head=head->next;
  }
}
1108
1109 // This is called when we write to a compiled block (see do_invstub)
1110 static void invalidate_page(u_int page)
1111 {
1112   struct ll_entry *head;
1113   struct ll_entry *next;
1114   head=jump_in[page];
1115   jump_in[page]=0;
1116   while(head!=NULL) {
1117     inv_debug("INVALIDATE: %x\n",head->vaddr);
1118     remove_hash(head->vaddr);
1119     next=head->next;
1120     free(head);
1121     head=next;
1122   }
1123   head=jump_out[page];
1124   jump_out[page]=0;
1125   while(head!=NULL) {
1126     inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr);
1127     void *host_addr=find_extjump_insn(head->addr);
1128     mark_clear_cache(host_addr);
1129     set_jump_target(host_addr, head->addr);
1130     next=head->next;
1131     free(head);
1132     head=next;
1133   }
1134 }
1135
// Invalidate the pages in [first,last] that the 4K block at 'block'
// spans, then mark the block's RAM page writable again (invalid_code).
static void invalidate_block_range(u_int block, u_int first, u_int last)
{
  u_int page=get_page(block<<12);
  //printf("first=%d last=%d\n",first,last);
  invalidate_page(page);
  assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
  assert(last<page+5);
  // Invalidate the adjacent pages if a block crosses a 4K boundary
  while(first<page) {
    invalidate_page(first);
    first++;
  }
  // NOTE(review): this loop excludes 'last' itself.  Callers disagree on
  // whether 'last' is inclusive (invalidate_block: page of end-1) or
  // derived from an exclusive end (invalidate_addr: addr_max) — confirm
  // the intended bound before changing.
  for(first=page+1;first<last;first++) {
    invalidate_page(first);
  }
  do_clear_cache();

  // Don't trap writes
  invalid_code[block]=1;

  #ifdef USE_MINI_HT
  memset(mini_ht,-1,sizeof(mini_ht));
  #endif
}
1160
// Invalidate the 4K guest page 'block', widening the invalidated range
// to cover any dirty compiled block that spills into adjacent pages.
void invalidate_block(u_int block)
{
  u_int page=get_page(block<<12);
  u_int vpage=get_vpage(block<<12);
  inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
  //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
  u_int first,last;
  first=last=page;
  struct ll_entry *head;
  head=jump_dirty[vpage];
  //printf("page=%d vpage=%d\n",page,vpage);
  // Grow [first,last] to the union of source ranges of all dirty blocks
  // compiled from this page.
  while(head!=NULL) {
    if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
      u_char *start, *end;
      get_bounds(head->addr, &start, &end);
      //printf("start: %p end: %p\n", start, end);
      if (page < 2048 && start >= rdram && end < rdram+RAM_SIZE) {
        // Only widen if this block's source bytes overlap our page.
        if (((start-rdram)>>12) <= page && ((end-1-rdram)>>12) >= page) {
          if ((((start-rdram)>>12)&2047) < first) first = ((start-rdram)>>12)&2047;
          if ((((end-1-rdram)>>12)&2047) > last)  last = ((end-1-rdram)>>12)&2047;
        }
      }
    }
    head=head->next;
  }
  invalidate_block_range(block,first,last);
}
1188
// Invalidate compiled code overlapping a single written RAM address.
// On a hit, the covering block range is invalidated.  On a miss, the
// global [inv_code_start, inv_code_end] window is shrunk/grown so the
// caller can skip this function entirely for nearby addresses.
void invalidate_addr(u_int addr)
{
  //static int rhits;
  // this check is done by the caller
  //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
  u_int page=get_vpage(addr);
  if(page<2048) { // RAM
    struct ll_entry *head;
    u_int addr_min=~0, addr_max=0;    // union of hit block source ranges
    u_int mask=RAM_SIZE-1;
    u_int addr_main=0x80000000|(addr&mask);  // canonical KSEG0 mirror of addr
    int pg1;
    // Start with the whole 4K page as the known-code-free window and
    // shrink it below as blocks are found.
    inv_code_start=addr_main&~0xfff;
    inv_code_end=addr_main|0xfff;
    pg1=page;
    if (pg1>0) {
      // must check previous page too because of spans..
      pg1--;
      inv_code_start-=0x1000;
    }
    for(;pg1<=page;pg1++) {
      for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
        u_char *start_h, *end_h;
        u_int start, end;
        get_bounds(head->addr, &start_h, &end_h);
        start = (uintptr_t)start_h - ram_offset;
        end = (uintptr_t)end_h - ram_offset;
        if(start<=addr_main&&addr_main<end) {
          // Block covers the written address: remember its full extent.
          if(start<addr_min) addr_min=start;
          if(end>addr_max) addr_max=end;
        }
        else if(addr_main<start) {
          // Block lies above the address: clamp the window's upper edge.
          if(start<inv_code_end)
            inv_code_end=start-1;
        }
        else {
          // Block lies below the address: clamp the window's lower edge.
          if(end>inv_code_start)
            inv_code_start=end;
        }
      }
    }
    if (addr_min!=~0) {
      inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
      inv_code_start=inv_code_end=~0;
      invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
      return;
    }
    else {
      // Miss: publish the window in the caller's address region so the
      // fast-path check (commented at top) succeeds next time.
      inv_code_start=(addr&~mask)|(inv_code_start&mask);
      inv_code_end=(addr&~mask)|(inv_code_end&mask);
      inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
      return;
    }
  }
  // Not a RAM vpage: fall back to whole-block invalidation.
  invalidate_block(addr>>12);
}
1245
1246 // This is called when loading a save state.
1247 // Anything could have changed, so invalidate everything.
1248 void invalidate_all_pages(void)
1249 {
1250   u_int page;
1251   for(page=0;page<4096;page++)
1252     invalidate_page(page);
1253   for(page=0;page<1048576;page++)
1254     if(!invalid_code[page]) {
1255       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1256       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1257     }
1258   #ifdef USE_MINI_HT
1259   memset(mini_ht,-1,sizeof(mini_ht));
1260   #endif
1261   do_clear_cache();
1262 }
1263
// Emit the out-of-line stub run when a store hits compiled code:
// save live regs, call invalidate_addr(), restore, and jump back.
static void do_invstub(int n)
{
  literal_pool(20);
  u_int reglist=stubs[n].a;
  set_jump_target(stubs[n].addr, out);
  save_regs(reglist);
  // stubs[n].b is the host register holding the written address (0 if
  // it is already in place); move it into register 0 for the call.
  if(stubs[n].b!=0) emit_mov(stubs[n].b,0);
  emit_far_call(invalidate_addr);
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
}
1275
// Add an entry to jump_out after making a link
// src should point to code by emit_extjump2()
void add_link(u_int vaddr,void *src)
{
  u_int page=get_page(vaddr);
  inv_debug("add_link: %p -> %x (%d)\n",src,vaddr,page);
  check_extjump2(src);
  // Record the patched jump so invalidate_page()/ll_kill_pointers()
  // can later unlink it when the target page is invalidated.
  ll_add(jump_out+page,vaddr,src);
  //void *ptr=get_pointer(src);
  //inv_debug("add_link: Pointer is to %p\n",ptr);
}
1287
// If a code block was found to be unmodified (bit was set in
// restore_candidate) and it remains unmodified (bit is clear
// in invalid_code) then move the entries for that 4K page from
// the dirty list to the clean list.
void clean_blocks(u_int page)
{
  struct ll_entry *head;
  inv_debug("INV: clean_blocks page=%d\n",page);
  head=jump_dirty[page];
  while(head!=NULL) {
    if(!invalid_code[head->vaddr>>12]) {
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr)) {
        if(verify_dirty(head->addr)) {
          u_char *start, *end;
          //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr);
          u_int i;
          u_int inv=0;
          get_bounds(head->addr, &start, &end);
          if (start - rdram < RAM_SIZE) {
            // Every RAM page the block's source spans must still be valid.
            for (i = (start-rdram+0x80000000)>>12; i <= (end-1-rdram+0x80000000)>>12; i++) {
              inv|=invalid_code[i];
            }
          }
          else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
            // Source outside RAM: never restore.
            inv=1;
          }
          if(!inv) {
            void *clean_addr = get_clean_addr(head->addr);
            if (doesnt_expire_soon(clean_addr)) {
              u_int ppage=page;
              inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr);
              //printf("page=%x, addr=%x\n",page,head->vaddr);
              //assert(head->vaddr>>12==(page|0x80000));
              // Re-register the clean entry point and refresh any hash
              // table slots that still point at the dirty version.
              ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
              struct ht_entry *ht_bin = hash_table_get(head->vaddr);
              if (ht_bin->vaddr[0] == head->vaddr)
                ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
              if (ht_bin->vaddr[1] == head->vaddr)
                ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
            }
          }
        }
      }
    }
    head=head->next;
  }
}
1336
1337 /* Register allocation */
1338
// Note: registers are allocated clean (unmodified state)
// if you intend to modify the register, you must call dirty_reg().
//
// Allocate a host register for guest register 'reg' at instruction 'i'.
// Strategy, in order: already mapped -> preferred host reg -> any free
// host reg (avoiding ones the previous instruction used) -> evict the
// mapping we hope is needed furthest in the future.
static void alloc_reg(struct regstat *cur,int i,signed char reg)
{
  int r,hr;
  // Default preference: low 3 bits of the guest reg number, with fixed
  // overrides for the cycle counter and for temporaries.
  int preferred_reg = (reg&7);
  if(reg==CCREG) preferred_reg=HOST_CCREG;
  if(reg==PTEMP||reg==FTEMP) preferred_reg=12;

  // Don't allocate unused registers
  if((cur->u>>reg)&1) return;

  // see if it's already allocated
  for(hr=0;hr<HOST_REGS;hr++)
  {
    if(cur->regmap[hr]==reg) return;
  }

  // Keep the same mapping if the register was already allocated in a loop
  preferred_reg = loop_reg(i,reg,preferred_reg);

  // Try to allocate the preferred register
  if(cur->regmap[preferred_reg]==-1) {
    cur->regmap[preferred_reg]=reg;
    cur->dirty&=~(1<<preferred_reg);
    cur->isconst&=~(1<<preferred_reg);
    return;
  }
  // Preferred register is occupied; steal it if its current occupant
  // is on the unneeded list.
  r=cur->regmap[preferred_reg];
  assert(r < 64);
  if((cur->u>>r)&1) {
    cur->regmap[preferred_reg]=reg;
    cur->dirty&=~(1<<preferred_reg);
    cur->isconst&=~(1<<preferred_reg);
    return;
  }

  // Clear any unneeded registers
  // We try to keep the mapping consistent, if possible, because it
  // makes branches easier (especially loops).  So we try to allocate
  // first (see above) before removing old mappings.  If this is not
  // possible then go ahead and clear out the registers that are no
  // longer needed.
  for(hr=0;hr<HOST_REGS;hr++)
  {
    r=cur->regmap[hr];
    if(r>=0) {
      assert(r < 64);
      if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;}
    }
  }
  // Try to allocate any available register, but prefer
  // registers that have not been used recently.
  if(i>0) {
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
        // Avoid host regs the previous instruction's operands lived in.
        if(regs[i-1].regmap[hr]!=rs1[i-1]&&regs[i-1].regmap[hr]!=rs2[i-1]&&regs[i-1].regmap[hr]!=rt1[i-1]&&regs[i-1].regmap[hr]!=rt2[i-1]) {
          cur->regmap[hr]=reg;
          cur->dirty&=~(1<<hr);
          cur->isconst&=~(1<<hr);
          return;
        }
      }
    }
  }
  // Try to allocate any available register
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
      cur->regmap[hr]=reg;
      cur->dirty&=~(1<<hr);
      cur->isconst&=~(1<<hr);
      return;
    }
  }

  // Ok, now we have to evict someone
  // Pick a register we hopefully won't need soon
  // hsn[] scores each guest reg; 10 is the default (presumably "not
  // needed within the lookahead", filled in by lsn() — see its def).
  // Higher scores are evicted first (j counts down from 10).
  u_char hsn[MAXREG+1];
  memset(hsn,10,sizeof(hsn));
  int j;
  lsn(hsn,i,&preferred_reg);
  //printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",cur->regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]);
  //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
  if(i>0) {
    // Don't evict the cycle count at entry points, otherwise the entry
    // stub will have to write it.
    if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
    if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP)) hsn[CCREG]=2;
    for(j=10;j>=3;j--)
    {
      // Alloc preferred register if available
      if(hsn[r=cur->regmap[preferred_reg]&63]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          // Evict both parts of a 64-bit register
          if((cur->regmap[hr]&63)==r) {
            cur->regmap[hr]=-1;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
          }
        }
        cur->regmap[preferred_reg]=reg;
        return;
      }
      // Otherwise evict any guest reg with this score, as long as it is
      // not an operand of the previous instruction.
      for(r=1;r<=MAXREG;r++)
      {
        if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) {
          for(hr=0;hr<HOST_REGS;hr++) {
            if(hr!=HOST_CCREG||j<hsn[CCREG]) {
              if(cur->regmap[hr]==r) {
                cur->regmap[hr]=reg;
                cur->dirty&=~(1<<hr);
                cur->isconst&=~(1<<hr);
                return;
              }
            }
          }
        }
      }
    }
  }
  // Last resort: evict anything, highest score first.
  for(j=10;j>=0;j--)
  {
    for(r=1;r<=MAXREG;r++)
    {
      if(hsn[r]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(cur->regmap[hr]==r) {
            cur->regmap[hr]=reg;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
            return;
          }
        }
      }
    }
  }
  SysPrintf("This shouldn't happen (alloc_reg)");abort();
}
1477
// Allocate a temporary register.  This is done without regard to
// dirty status or whether the register we request is on the unneeded list
// Note: This will only allocate one register, even if called multiple times
static void alloc_reg_temp(struct regstat *cur,int i,signed char reg)
{
  int r,hr;
  int preferred_reg = -1;  // temporaries have no preferred host reg

  // see if it's already allocated
  for(hr=0;hr<HOST_REGS;hr++)
  {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==reg) return;
  }

  // Try to allocate any available register
  // (scanned top-down, opposite of alloc_reg, so temps and real
  // allocations tend to land in different host regs)
  for(hr=HOST_REGS-1;hr>=0;hr--) {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
      cur->regmap[hr]=reg;
      cur->dirty&=~(1<<hr);
      cur->isconst&=~(1<<hr);
      return;
    }
  }

  // Find an unneeded register
  for(hr=HOST_REGS-1;hr>=0;hr--)
  {
    r=cur->regmap[hr];
    if(r>=0) {
      assert(r < 64);
      if((cur->u>>r)&1) {
        if(i==0||((unneeded_reg[i-1]>>r)&1)) {
          cur->regmap[hr]=reg;
          cur->dirty&=~(1<<hr);
          cur->isconst&=~(1<<hr);
          return;
        }
      }
    }
  }

  // Ok, now we have to evict someone
  // Pick a register we hopefully won't need soon
  // TODO: we might want to follow unconditional jumps here
  // TODO: get rid of dupe code and make this into a function
  // hsn[] scores are filled by lsn(); 10 is the default and highest
  // scores are evicted first (j counts down).
  u_char hsn[MAXREG+1];
  memset(hsn,10,sizeof(hsn));
  int j;
  lsn(hsn,i,&preferred_reg);
  //printf("hsn: %d %d %d %d %d %d %d\n",hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
  if(i>0) {
    // Don't evict the cycle count at entry points, otherwise the entry
    // stub will have to write it.
    if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
    if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP)) hsn[CCREG]=2;
    for(j=10;j>=3;j--)
    {
      for(r=1;r<=MAXREG;r++)
      {
        // Skip operands of the previous instruction.
        if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) {
          for(hr=0;hr<HOST_REGS;hr++) {
            if(hr!=HOST_CCREG||hsn[CCREG]>2) {
              if(cur->regmap[hr]==r) {
                cur->regmap[hr]=reg;
                cur->dirty&=~(1<<hr);
                cur->isconst&=~(1<<hr);
                return;
              }
            }
          }
        }
      }
    }
  }
  // Last resort: evict anything, highest score first.
  for(j=10;j>=0;j--)
  {
    for(r=1;r<=MAXREG;r++)
    {
      if(hsn[r]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(cur->regmap[hr]==r) {
            cur->regmap[hr]=reg;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
            return;
          }
        }
      }
    }
  }
  SysPrintf("This shouldn't happen");abort();
}
1570
1571 static void mov_alloc(struct regstat *current,int i)
1572 {
1573   // Note: Don't need to actually alloc the source registers
1574   //alloc_reg(current,i,rs1[i]);
1575   alloc_reg(current,i,rt1[i]);
1576
1577   clear_const(current,rs1[i]);
1578   clear_const(current,rt1[i]);
1579   dirty_reg(current,rt1[i]);
1580 }
1581
// Register allocation for shift-by-immediate ops, with constant
// propagation when the source value is known.
static void shiftimm_alloc(struct regstat *current,int i)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
      else lt1[i]=rs1[i];
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
      if(is_const(current,rs1[i])) {
        // Fold the shift at compile time.
        int v=get_const(current,rs1[i]);
        if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
        if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
        // NOTE(review): SRA relies on the host compiler doing an
        // arithmetic right shift of signed int (implementation-defined).
        if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
      }
      else clear_const(current,rt1[i]);
    }
  }
  else
  {
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }

  // 64-bit shifts do not exist on the R3000A; these should never decode.
  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    assert(0);
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    assert(0);
  }
}
1623
1624 static void shift_alloc(struct regstat *current,int i)
1625 {
1626   if(rt1[i]) {
1627     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1628     {
1629       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1630       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1631       alloc_reg(current,i,rt1[i]);
1632       if(rt1[i]==rs2[i]) {
1633         alloc_reg_temp(current,i,-1);
1634         minimum_free_regs[i]=1;
1635       }
1636     } else { // DSLLV/DSRLV/DSRAV
1637       assert(0);
1638     }
1639     clear_const(current,rs1[i]);
1640     clear_const(current,rs2[i]);
1641     clear_const(current,rt1[i]);
1642     dirty_reg(current,rt1[i]);
1643   }
1644 }
1645
1646 static void alu_alloc(struct regstat *current,int i)
1647 {
1648   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1649     if(rt1[i]) {
1650       if(rs1[i]&&rs2[i]) {
1651         alloc_reg(current,i,rs1[i]);
1652         alloc_reg(current,i,rs2[i]);
1653       }
1654       else {
1655         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1656         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1657       }
1658       alloc_reg(current,i,rt1[i]);
1659     }
1660   }
1661   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1662     if(rt1[i]) {
1663       alloc_reg(current,i,rs1[i]);
1664       alloc_reg(current,i,rs2[i]);
1665       alloc_reg(current,i,rt1[i]);
1666     }
1667   }
1668   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1669     if(rt1[i]) {
1670       if(rs1[i]&&rs2[i]) {
1671         alloc_reg(current,i,rs1[i]);
1672         alloc_reg(current,i,rs2[i]);
1673       }
1674       else
1675       {
1676         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1677         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1678       }
1679       alloc_reg(current,i,rt1[i]);
1680     }
1681   }
1682   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1683     assert(0);
1684   }
1685   clear_const(current,rs1[i]);
1686   clear_const(current,rs2[i]);
1687   clear_const(current,rt1[i]);
1688   dirty_reg(current,rt1[i]);
1689 }
1690
// Register allocation for I-type (16-bit immediate) ops, with constant
// propagation where the operation and source value allow it.
static void imm16_alloc(struct regstat *current,int i)
{
  if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  else lt1[i]=rs1[i];
  if(rt1[i]) alloc_reg(current,i,rt1[i]);
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    assert(0);
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    // Comparison results are not const-propagated.
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
      if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
      if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
    }
    else clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      set_const(current,rt1[i],v+imm[i]);
    }
    else clear_const(current,rt1[i]);
  }
  else {
    // LUI result is always a known constant (sign-extend, then <<16).
    set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
  }
  dirty_reg(current,rt1[i]);
}
1724
// Register allocation for loads (LB/LH/LW/LWL/LWR/...).
static void load_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
  if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  if(rt1[i]&&!((current->u>>rt1[i])&1)) {
    // Real load: destination is live.
    alloc_reg(current,i,rt1[i]);
    assert(get_reg(current->regmap,rt1[i])>=0);
    if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
    {
      assert(0);
    }
    else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      assert(0);
    }
    dirty_reg(current,rt1[i]);
    // LWL/LWR need a temporary register for the old value
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP);
      alloc_reg_temp(current,i,-1);
      minimum_free_regs[i]=1;
    }
  }
  else
  {
    // Load to r0 or unneeded register (dummy load)
    // but we still need a register to calculate the address
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
    }
    alloc_reg_temp(current,i,-1);
    minimum_free_regs[i]=1;
    if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      assert(0);
    }
  }
}
1767
// Register allocation for stores (SB/SH/SW/SWL/SWR).
void store_alloc(struct regstat *current,int i)
{
  clear_const(current,rs2[i]);
  if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rs2[i]);
  if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
    assert(0);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  // (note: this 'else' pairs with the 64-bit assert above)
  else alloc_reg(current,i,INVCP);
  #endif
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
    alloc_reg(current,i,FTEMP);
  }
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1788
// Register allocation for COP1 load/store (LWC1/SWC1).
void c1ls_alloc(struct regstat *current,int i)
{
  //clear_const(current,rs1[i]); // FIXME
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,CSREG); // Status
  alloc_reg(current,i,FTEMP);
  if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
    assert(0);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  // NOTE(review): unlike c2ls_alloc(), this does not set
  // minimum_free_regs[i]=1 for the temp — confirm whether intentional.
}
1807
// Register allocation for COP2/GTE load/store (LWC2/SWC2).
void c2ls_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,FTEMP);
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1822
#ifndef multdiv_alloc
// Register allocation for MULT/MULTU/DIV/DIVU: results live in HI/LO.
// (A target may provide its own multdiv_alloc via the macro guard.)
void multdiv_alloc(struct regstat *current,int i)
{
  //  case 0x18: MULT
  //  case 0x19: MULTU
  //  case 0x1A: DIV
  //  case 0x1B: DIVU
  //  case 0x1C: DMULT
  //  case 0x1D: DMULTU
  //  case 0x1E: DDIV
  //  case 0x1F: DDIVU
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  if(rs1[i]&&rs2[i])
  {
    if((opcode2[i]&4)==0) // 32-bit
    {
      // Force HI/LO to be considered needed, then allocate them plus
      // both sources.
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      alloc_reg(current,i,HIREG);
      alloc_reg(current,i,LOREG);
      alloc_reg(current,i,rs1[i]);
      alloc_reg(current,i,rs2[i]);
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
    }
    else // 64-bit
    {
      assert(0);
    }
  }
  else
  {
    // Multiply by zero is zero.
    // MIPS does not have a divide by zero exception.
    // The result is undefined, we return zero.
    alloc_reg(current,i,HIREG);
    alloc_reg(current,i,LOREG);
    dirty_reg(current,HIREG);
    dirty_reg(current,LOREG);
  }
}
#endif
1866
// Register allocation for COP0 ops (MFC0/MTC0/RFE-class).
// All of these allocate every host register, since the handler may
// need the complete guest state.
void cop0_alloc(struct regstat *current,int i)
{
  if(opcode2[i]==0) // MFC0
  {
    if(rt1[i]) {
      clear_const(current,rt1[i]);
      alloc_all(current,i);
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
      alloc_all(current,i);
    }
    else {
      // Source is r0: make r0 allocatable so a zero can be passed.
      alloc_all(current,i); // FIXME: Keep r0
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
  }
  else
  {
    // TLBR/TLBWI/TLBWR/TLBP/ERET
    assert(opcode2[i]==0x10);
    alloc_all(current,i);
  }
  minimum_free_regs[i]=HOST_REGS;
}
1899
// Register allocation for COP1/COP2 register moves (MFCz/CFCz/MTCz/CTCz).
static void cop12_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  if(opcode2[i]<3) // MFC1/CFC1
  {
    if(rt1[i]){
      clear_const(current,rt1[i]);
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
    alloc_reg_temp(current,i,-1);
  }
  else if(opcode2[i]>3) // MTC1/CTC1
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
    }
    else {
      // Source is r0: make r0 allocatable so a zero can be passed.
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
    alloc_reg_temp(current,i,-1);
  }
  minimum_free_regs[i]=1;
}
1926
// Register allocation for a GTE (COP2) operation: one scratch register.
void c2op_alloc(struct regstat *current,int i)
{
  alloc_reg_temp(current,i,-1);
}
1931
// Register allocation for SYSCALL: the cycle count plus every host
// register is allocated so full guest state can be written back.
void syscall_alloc(struct regstat *current,int i)
{
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  alloc_all(current,i);
  minimum_free_regs[i]=HOST_REGS;
  current->isconst=0;
}
1940
// Register allocation for the instruction sitting in a branch delay
// slot: dispatch to the per-type allocator.  Instruction types not
// listed (e.g. NOPs) need no allocation and fall through.
void delayslot_alloc(struct regstat *current,int i)
{
  switch(itype[i]) {
    // A branch inside a delay slot is not supported; disable the
    // speculative precompilation that produced it.
    case UJUMP:
    case CJUMP:
    case SJUMP:
    case RJUMP:
    case SYSCALL:
    case HLECALL:
    case SPAN:
      assem_debug("jump in the delay slot.  this shouldn't happen.\n");//abort();
      SysPrintf("Disabled speculative precompilation\n");
      stop_after_jal=1;
      break;
    case IMM16:
      imm16_alloc(current,i);
      break;
    case LOAD:
    case LOADLR:
      load_alloc(current,i);
      break;
    case STORE:
    case STORELR:
      store_alloc(current,i);
      break;
    case ALU:
      alu_alloc(current,i);
      break;
    case SHIFT:
      shift_alloc(current,i);
      break;
    case MULTDIV:
      multdiv_alloc(current,i);
      break;
    case SHIFTIMM:
      shiftimm_alloc(current,i);
      break;
    case MOV:
      mov_alloc(current,i);
      break;
    case COP0:
      cop0_alloc(current,i);
      break;
    case COP1:
    case COP2:
      cop12_alloc(current,i);
      break;
    case C1LS:
      c1ls_alloc(current,i);
      break;
    case C2LS:
      c2ls_alloc(current,i);
      break;
    case C2OP:
      c2op_alloc(current,i);
      break;
  }
}
1999
2000 // Special case where a branch and delay slot span two pages in virtual memory
2001 static void pagespan_alloc(struct regstat *current,int i)
2002 {
2003   current->isconst=0;
2004   current->wasconst=0;
2005   regs[i].wasconst=0;
2006   minimum_free_regs[i]=HOST_REGS;
2007   alloc_all(current,i);
2008   alloc_cc(current,i);
2009   dirty_reg(current,CCREG);
2010   if(opcode[i]==3) // JAL
2011   {
2012     alloc_reg(current,i,31);
2013     dirty_reg(current,31);
2014   }
2015   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
2016   {
2017     alloc_reg(current,i,rs1[i]);
2018     if (rt1[i]!=0) {
2019       alloc_reg(current,i,rt1[i]);
2020       dirty_reg(current,rt1[i]);
2021     }
2022   }
2023   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
2024   {
2025     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2026     if(rs2[i]) alloc_reg(current,i,rs2[i]);
2027   }
2028   else
2029   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
2030   {
2031     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2032   }
2033   //else ...
2034 }
2035
2036 static void add_stub(enum stub_type type, void *addr, void *retaddr,
2037   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
2038 {
2039   assert(stubcount < ARRAY_SIZE(stubs));
2040   stubs[stubcount].type = type;
2041   stubs[stubcount].addr = addr;
2042   stubs[stubcount].retaddr = retaddr;
2043   stubs[stubcount].a = a;
2044   stubs[stubcount].b = b;
2045   stubs[stubcount].c = c;
2046   stubs[stubcount].d = d;
2047   stubs[stubcount].e = e;
2048   stubcount++;
2049 }
2050
2051 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
2052   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist)
2053 {
2054   add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
2055 }
2056
2057 // Write out a single register
2058 static void wb_register(signed char r,signed char regmap[],uint64_t dirty)
2059 {
2060   int hr;
2061   for(hr=0;hr<HOST_REGS;hr++) {
2062     if(hr!=EXCLUDE_REG) {
2063       if((regmap[hr]&63)==r) {
2064         if((dirty>>hr)&1) {
2065           assert(regmap[hr]<64);
2066           emit_storereg(r,hr);
2067         }
2068       }
2069     }
2070   }
2071 }
2072
2073 static void wb_valid(signed char pre[],signed char entry[],u_int dirty_pre,u_int dirty,uint64_t u)
2074 {
2075   //if(dirty_pre==dirty) return;
2076   int hr,reg;
2077   for(hr=0;hr<HOST_REGS;hr++) {
2078     if(hr!=EXCLUDE_REG) {
2079       reg=pre[hr];
2080       if(((~u)>>(reg&63))&1) {
2081         if(reg>0) {
2082           if(((dirty_pre&~dirty)>>hr)&1) {
2083             if(reg>0&&reg<34) {
2084               emit_storereg(reg,hr);
2085             }
2086             else if(reg>=64) {
2087               assert(0);
2088             }
2089           }
2090         }
2091       }
2092     }
2093   }
2094 }
2095
2096 // trashes r2
// Shuffle values held in host regs a0/a1 into argument registers 0 and
// 1 before a C call.  A negative value means "no source register" (the
// argument is already in place or not needed).  The move order matters:
// reg 0 must not be clobbered while it still holds a1's value.
static void pass_args(int a0, int a1)
{
  if(a0==1&&a1==0) {
    // must swap
    emit_mov(a0,2); emit_mov(a1,1); emit_mov(2,0);
  }
  else if(a0!=0&&a1==0) {
    // a1 currently lives in reg 0: move it out before reg 0 is overwritten
    emit_mov(a1,1);
    if (a0>=0) emit_mov(a0,0);
  }
  else {
    // no ordering conflict: plain moves, skipping regs already in place
    if(a0>=0&&a0!=0) emit_mov(a0,0);
    if(a1>=0&&a1!=1) emit_mov(a1,1);
  }
}
2112
// Emit host code for a 3-register ALU instruction (SPECIAL opcodes):
// ADD/ADDU/SUB/SUBU, SLT/SLTU, AND/OR/XOR/NOR.  Writes to r0 are
// skipped; operands that are guest r0 are folded to constants.
// s1/s2 < 0 means the source is not currently in a host register.
static void alu_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      signed char s1,s2,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      if(t>=0) {
        s1=get_reg(i_regs->regmap,rs1[i]);
        s2=get_reg(i_regs->regmap,rs2[i]);
        if(rs1[i]&&rs2[i]) {
          // both operands live; bit 1 of opcode2 distinguishes SUB from ADD
          assert(s1>=0);
          assert(s2>=0);
          if(opcode2[i]&2) emit_sub(s1,s2,t);
          else emit_add(s1,s2,t);
        }
        else if(rs1[i]) {
          // rs2 is r0: rt = rs1 (SUB with 0 subtrahend is also a copy)
          if(s1>=0) emit_mov(s1,t);
          else emit_loadreg(rs1[i],t);
        }
        else if(rs2[i]) {
          // rs1 is r0: rt = rs2 for ADD, rt = -rs2 for SUB
          if(s2>=0) {
            if(opcode2[i]&2) emit_neg(s2,t);
            else emit_mov(s2,t);
          }
          else {
            emit_loadreg(rs2[i],t);
            if(opcode2[i]&2) emit_neg(t,t);
          }
        }
        else emit_zeroreg(t); // r0 op r0 = 0
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    assert(0); // 64-bit ops don't exist on the psx R3000A
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      signed char s1l,s2l,t;
      {
        t=get_reg(i_regs->regmap,rt1[i]);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs2[i]==0) // rx<r0
          {
            if(opcode2[i]==0x2a&&rs1[i]!=0) { // SLT
              // signed x<0 is just the sign bit
              assert(s1l>=0);
              emit_shrimm(s1l,31,t);
            }
            else // SLTU (unsigned can not be less than zero, 0<0)
              emit_zeroreg(t);
          }
          else if(rs1[i]==0) // r0<rx
          {
            assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_gz32(s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz32(s2l,t);
          }
          else{
            // general case: both operands live
            assert(s1l>=0);assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_if_less32(s1l,s2l,t);
            else // SLTU
              emit_set_if_carry32(s1l,s2l,t);
          }
        }
      }
    }
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      signed char s1l,s2l,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      {
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs1[i]&&rs2[i]) {
            assert(s1l>=0);
            assert(s2l>=0);
            if(opcode2[i]==0x24) { // AND
              emit_and(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x25) { // OR
              emit_or(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_not(tl,tl);
            }
          }
          else
          {
            // at least one operand is r0: fold it away
            if(opcode2[i]==0x24) { // AND
              emit_zeroreg(tl); // x AND 0 = 0
            } else
            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
              // x OR/XOR 0 = x: copy the live operand (or zero if both r0)
              if(rs1[i]){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
              }
              else emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              // x NOR 0 = ~x; r0 NOR r0 = ~0 = -1
              if(rs1[i]){
                if(s1l>=0) emit_not(s1l,tl);
                else {
                  emit_loadreg(rs1[i],tl);
                  emit_not(tl,tl);
                }
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_not(s2l,tl);
                else {
                  emit_loadreg(rs2[i],tl);
                  emit_not(tl,tl);
                }
              }
              else emit_movimm(-1,tl);
            }
          }
        }
      }
    }
  }
}
2252
// Emit host code for an immediate-operand instruction:
// LUI, ADDI/ADDIU, SLTI/SLTIU, ANDI/ORI/XORI.  Targets already marked
// constant (isconst) need no code; constant sources (wasconst) are
// folded at compile time via constmap.  s/sl < 0 means the source is
// not in a host register and is reloaded into the target register.
void imm16_assemble(int i,struct regstat *i_regs)
{
  if (opcode[i]==0x0f) { // LUI
    if(rt1[i]) {
      signed char t;
      t=get_reg(i_regs->regmap,rt1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(!((i_regs->isconst>>t)&1))
          emit_movimm(imm[i]<<16,t);
      }
    }
  }
  if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      if(rs1[i]) {
        //assert(t>=0);
        //assert(s>=0);
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1)) {
            if(s<0) {
              // source not in a register: reload into t, then add in place
              if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
              emit_addimm(t,imm[i],t);
            }else{
              if(!((i_regs->wasconst>>s)&1))
                emit_addimm(s,imm[i],t);
              else
                // constant fold: source value known at compile time
                emit_movimm(constmap[i][s]+imm[i],t);
            }
          }
        }
      } else {
        // rs1 is r0: result is just the immediate
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1))
            emit_movimm(imm[i],t);
        }
      }
    }
  }
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    // 64-bit variant; kept for completeness (no R3000A encoding)
    if(rt1[i]) {
      signed char sl,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]) {
          assert(sl>=0);
          emit_addimm(sl,imm[i],tl);
        } else {
          emit_movimm(imm[i],tl);
        }
      }
    }
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    if(rt1[i]) {
      //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
      signed char sl,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(rs1[i]>0) {
            if(opcode[i]==0x0a) { // SLTI
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_slti32(t,imm[i],t);
              }else{
                emit_slti32(sl,imm[i],t);
              }
            }
            else { // SLTIU
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_sltiu32(t,imm[i],t);
              }else{
                emit_sltiu32(sl,imm[i],t);
              }
            }
        }else{
          // SLTI(U) with r0 is just stupid,
          // nonetheless examples can be found
          // (brace-less if/else below: the final 'else' binds to the
          // outer opcode test, which is the intended pairing here)
          if(opcode[i]==0x0a) // SLTI
            if(0<imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          else // SLTIU
          {
            if(imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          }
        }
      }
    }
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(rt1[i]) {
      signed char sl,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
        if(opcode[i]==0x0c) //ANDI
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
              emit_andimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_andimm(sl,imm[i],tl);
              else
                emit_movimm(constmap[i][sl]&imm[i],tl);
            }
          }
          else
            emit_zeroreg(tl); // r0 AND imm = 0
        }
        else
        {
          // ORI/XORI: reload the source once if needed, then apply
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
            }
            if(opcode[i]==0x0d) { // ORI
              if(sl<0) {
                emit_orimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_orimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]|imm[i],tl);
              }
            }
            if(opcode[i]==0x0e) { // XORI
              if(sl<0) {
                emit_xorimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_xorimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]^imm[i],tl);
              }
            }
          }
          else {
            // r0 OR/XOR imm = imm
            emit_movimm(imm[i],tl);
          }
        }
      }
    }
  }
}
2407
// Emit host code for shift-by-immediate instructions (SLL/SRL/SRA).
// The 64-bit doubleword variants are unreachable on the psx and assert.
// When the source isn't in a host register (s<0) it is first reloaded
// into the target register and the shift is done in place.
void shiftimm_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0&&!((i_regs->isconst>>t)&1)){
        if(rs1[i]==0)
        {
          emit_zeroreg(t); // shifting r0 always yields 0
        }
        else
        {
          if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
          if(imm[i]) {
            // 's<0?t:s' picks the reloaded copy when the source had no reg
            if(opcode2[i]==0) // SLL
            {
              emit_shlimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==2) // SRL
            {
              emit_shrimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==3) // SRA
            {
              emit_sarimm(s<0?t:s,imm[i],t);
            }
          }else{
            // Shift by zero
            if(s>=0 && s!=t) emit_mov(s,t);
          }
        }
      }
      //emit_storereg(rt1[i],t); //DEBUG
    }
  }
  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    assert(0);
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    assert(0);
  }
}
2464
2465 #ifndef shift_assemble
2466 static void shift_assemble(int i,struct regstat *i_regs)
2467 {
2468   signed char s,t,shift;
2469   if (rt1[i] == 0)
2470     return;
2471   assert(opcode2[i]<=0x07); // SLLV/SRLV/SRAV
2472   t = get_reg(i_regs->regmap, rt1[i]);
2473   s = get_reg(i_regs->regmap, rs1[i]);
2474   shift = get_reg(i_regs->regmap, rs2[i]);
2475   if (t < 0)
2476     return;
2477
2478   if(rs1[i]==0)
2479     emit_zeroreg(t);
2480   else if(rs2[i]==0) {
2481     assert(s>=0);
2482     if(s!=t) emit_mov(s,t);
2483   }
2484   else {
2485     host_tempreg_acquire();
2486     emit_andimm(shift,31,HOST_TEMPREG);
2487     switch(opcode2[i]) {
2488     case 4: // SLLV
2489       emit_shl(s,HOST_TEMPREG,t);
2490       break;
2491     case 6: // SRLV
2492       emit_shr(s,HOST_TEMPREG,t);
2493       break;
2494     case 7: // SRAV
2495       emit_sar(s,HOST_TEMPREG,t);
2496       break;
2497     default:
2498       assert(0);
2499     }
2500     host_tempreg_release();
2501   }
2502 }
2503
2504 #endif
2505
// Coarse classification of a guest address used by the fastpath code
// below to pick the cheapest address-translation sequence.
enum {
  MTYPE_8000 = 0, // KSEG0 RAM at 0x80000000 (default / fallthrough)
  MTYPE_8020,     // RAM mirror at 0x80200000..0x807fffff
  MTYPE_0000,     // KUSEG RAM mirror at address 0
  MTYPE_A000,     // KSEG1 RAM mirror at 0xa0000000
  MTYPE_1F80,     // scratchpad / I/O region at 0x1f800000
};
2513
2514 static int get_ptr_mem_type(u_int a)
2515 {
2516   if(a < 0x00200000) {
2517     if(a<0x1000&&((start>>20)==0xbfc||(start>>24)==0xa0))
2518       // return wrong, must use memhandler for BIOS self-test to pass
2519       // 007 does similar stuff from a00 mirror, weird stuff
2520       return MTYPE_8000;
2521     return MTYPE_0000;
2522   }
2523   if(0x1f800000 <= a && a < 0x1f801000)
2524     return MTYPE_1F80;
2525   if(0x80200000 <= a && a < 0x80800000)
2526     return MTYPE_8020;
2527   if(0xa0000000 <= a && a < 0xa0200000)
2528     return MTYPE_A000;
2529   return MTYPE_8000;
2530 }
2531
// Emit the inline "is this direct RAM?" check for a load/store address
// in host register 'addr'.  Returns the branch location to patch for
// the slow (stub) path, or NULL when no branch was emitted.  When the
// address needs translating to the canonical mirror, the translated
// copy is placed in HOST_TEMPREG and *addr_reg_override is updated
// (caller is responsible for the matching host_tempreg_release).
static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override)
{
  void *jaddr = NULL;
  int type=0;
  int mr=rs1[i];
  // pick region from the speculated base-register value if available,
  // otherwise from the mirror this block itself runs in
  if(((smrv_strong|smrv_weak)>>mr)&1) {
    type=get_ptr_mem_type(smrv[mr]);
    //printf("set %08x @%08x r%d %d\n", smrv[mr], start+i*4, mr, type);
  }
  else {
    // use the mirror we are running on
    type=get_ptr_mem_type(start);
    //printf("set nospec   @%08x r%d %d\n", start+i*4, mr, type);
  }

  // Known RAM mirrors: normalize the address into KSEG0 form, then fall
  // through to the normal RAM range check (type reset to 0)
  if(type==MTYPE_8020) { // RAM 80200000+ mirror
    host_tempreg_acquire();
    emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_0000) { // RAM 0 mirror
    host_tempreg_acquire();
    emit_orimm(addr,0x80000000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_A000) { // RAM A mirror
    host_tempreg_acquire();
    emit_andimm(addr,~0x20000000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_1F80) { // scratchpad
    if (psxH == (void *)0x1f800000) {
      // scratchpad is mapped 1:1: range-check directly against it
      host_tempreg_acquire();
      emit_xorimm(addr,0x1f800000,HOST_TEMPREG);
      emit_cmpimm(HOST_TEMPREG,0x1000);
      host_tempreg_release();
      jaddr=out;
      emit_jc(0);
    }
    else {
      // do the usual RAM check, jump will go to the right handler
      type=0;
    }
  }

  if(type==0)
  {
    // generic path: branch to the stub when the address is outside RAM
    emit_cmpimm(addr,RAM_SIZE);
    jaddr=out;
    #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
    // Hint to branch predictor that the branch is unlikely to be taken
    if(rs1[i]>=28)
      emit_jno_unlikely(0);
    else
    #endif
      emit_jno(0);
    if(ram_offset!=0) {
      // RAM is not mapped at its guest address: add the host-side delta
      host_tempreg_acquire();
      emit_addimm(addr,ram_offset,HOST_TEMPREG);
      addr=*addr_reg_override=HOST_TEMPREG;
    }
  }

  return jaddr;
}
2600
2601 // return memhandler, or get directly accessable address and return 0
2602 static void *get_direct_memhandler(void *table, u_int addr,
2603   enum stub_type type, uintptr_t *addr_host)
2604 {
2605   uintptr_t l1, l2 = 0;
2606   l1 = ((uintptr_t *)table)[addr>>12];
2607   if ((l1 & (1ul << (sizeof(l1)*8-1))) == 0) {
2608     uintptr_t v = l1 << 1;
2609     *addr_host = v + addr;
2610     return NULL;
2611   }
2612   else {
2613     l1 <<= 1;
2614     if (type == LOADB_STUB || type == LOADBU_STUB || type == STOREB_STUB)
2615       l2 = ((uintptr_t *)l1)[0x1000/4 + 0x1000/2 + (addr&0xfff)];
2616     else if (type == LOADH_STUB || type == LOADHU_STUB || type == STOREH_STUB)
2617       l2=((uintptr_t *)l1)[0x1000/4 + (addr&0xfff)/2];
2618     else
2619       l2=((uintptr_t *)l1)[(addr&0xfff)/4];
2620     if ((l2 & (1<<31)) == 0) {
2621       uintptr_t v = l2 << 1;
2622       *addr_host = v + (addr&0xfff);
2623       return NULL;
2624     }
2625     return (void *)(l2 << 1);
2626   }
2627 }
2628
// Emit host code for an aligned load (LB/LH/LW/LBU/LHU).  Fast path
// reads RAM directly; out-of-range addresses branch to a stub added
// via add_stub_r.  Loads from a known constant address outside RAM are
// inlined as a handler call (inline_readstub).  Loads to r0 (or to an
// unmapped target) are still performed into a scratch register because
// the read itself may have side effects (I/O, FIFO).
static void load_assemble(int i,struct regstat *i_regs)
{
  int s,tl,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rt1[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  offset=imm[i];
  // build the live-host-register list for the stub
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1;  // address is known at compile time
    if (c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  // FIXME: Even if the load is a NOP, we should check for pagefaults...
  if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
    ||rt1[i]==0) {
      // could be FIFO, must perform the read
      // ||dummy read
      assem_debug("(forced read)\n");
      tl=get_reg(i_regs->regmap,-1);  // use the scratch register
      assert(tl>=0);
  }
  // 'addr' = register holding the effective address (tl doubles as the
  // address register when it must be computed)
  if(offset||s<0||c) addr=tl;
  else addr=s;
  //if(tl<0) tl=get_reg(i_regs->regmap,-1);
 if(tl>=0) {
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
  reglist&=~(1<<tl);
  if(!c) {
    #ifdef R29_HACK
    // Strmnnrmn's speed hack
    if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
    #endif
    {
      jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
    }
  }
  else if(ram_offset&&memtarget) {
    // constant RAM address with relocated RAM: apply host delta
    host_tempreg_acquire();
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastio_reg_override=HOST_TEMPREG;
  }
  int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
  if (opcode[i]==0x20) { // LB
    if(!c||memtarget) {
      if(!dummy) {
        {
          int x=0,a=tl;
          if(!c) a=addr;
          if(fastio_reg_override>=0) a=fastio_reg_override;

          emit_movsbl_indexed(x,a,tl);
        }
      }
      if(jaddr)
        add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x21) { // LH
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_movswl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x23) { // LW
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_readword_indexed(0,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x24) { // LBU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;

        emit_movzbl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x25) { // LHU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_movzwl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x27) { // LWU
    assert(0);
  }
  if (opcode[i]==0x37) { // LD
    assert(0);
  }
 }
 if (fastio_reg_override == HOST_TEMPREG)
   host_tempreg_release();
}
2767
2768 #ifndef loadlr_assemble
// Emit host code for unaligned loads (LWL/LWR).  The word containing
// the address is read, then shifted/merged into the target register so
// only the bytes the instruction covers are replaced.  temp holds the
// byte-offset-derived shift amount, temp2 the aligned address/data.
static void loadlr_assemble(int i,struct regstat *i_regs)
{
  int s,tl,temp,temp2,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rt1[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,-1);
  temp2=get_reg(i_regs->regmap,FTEMP);
  addr=get_reg(i_regs->regmap,AGEN1+(i&1));
  assert(addr<0);
  offset=imm[i];
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  reglist|=1<<temp;
  // NOTE(review): 'c' is still 0 at this point (assigned just below),
  // so this condition is effectively offset||s<0 — confirm intended
  if(offset||s<0||c) addr=temp2;
  else addr=s;
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1;
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  if(!c) {
    // temp = addr*8 (bit shift amount before masking), temp2 = aligned addr
    emit_shlimm(addr,3,temp);
    if (opcode[i]==0x22||opcode[i]==0x26) {
      emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR
    }else{
      emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR
    }
    jaddr=emit_fastpath_cmp_jump(i,temp2,&fastio_reg_override);
  }
  else {
    if(ram_offset&&memtarget) {
      host_tempreg_acquire();
      emit_addimm(temp2,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    // constant address: shift amount is known at compile time
    if (opcode[i]==0x22||opcode[i]==0x26) {
      emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR
    }else{
      emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR
    }
  }
  if (opcode[i]==0x22||opcode[i]==0x26) { // LWL/LWR
    if(!c||memtarget) {
      int a=temp2;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_readword_indexed(0,a,temp2);
      if(fastio_reg_override==HOST_TEMPREG) host_tempreg_release();
      if(jaddr) add_stub_r(LOADW_STUB,jaddr,out,i,temp2,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj[i],reglist);
    if(rt1[i]) {
      // merge the loaded word into tl, keeping the bytes the
      // instruction doesn't cover (mask built from -1 shifted by temp)
      assert(tl>=0);
      emit_andimm(temp,24,temp);
      if (opcode[i]==0x22) // LWL
        emit_xorimm(temp,24,temp);
      host_tempreg_acquire();
      emit_movimm(-1,HOST_TEMPREG);
      if (opcode[i]==0x26) {
        emit_shr(temp2,temp,temp2);
        emit_bic_lsr(tl,HOST_TEMPREG,temp,tl);
      }else{
        emit_shl(temp2,temp,temp2);
        emit_bic_lsl(tl,HOST_TEMPREG,temp,tl);
      }
      host_tempreg_release();
      emit_or(temp2,tl,tl);
    }
    //emit_storereg(rt1[i],tl); // DEBUG
  }
  if (opcode[i]==0x1A||opcode[i]==0x1B) { // LDL/LDR
    assert(0);
  }
}
2850 #endif
2851
// Emit host code for an aligned store (SB/SH/SW).  Fast path writes
// RAM directly; out-of-range addresses branch to a stub.  After the
// store, unless disabled by NDHACK_NO_SMC_CHECK, the invalid_code map
// is checked so self-modifying code invalidates translated blocks.
void store_assemble(int i,struct regstat *i_regs)
{
  int s,tl;
  int addr,temp;
  int offset;
  void *jaddr=0;
  enum stub_type type;
  int memtarget=0,c=0;
  int agr=AGEN1+(i&1);
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rs2[i]);   // value to store
  s=get_reg(i_regs->regmap,rs1[i]);    // base address register
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1;  // address known at compile time
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  assert(temp>=0);
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(offset||s<0||c) addr=temp;
  else addr=s;
  if(!c) {
    jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
  }
  else if(ram_offset&&memtarget) {
    host_tempreg_acquire();
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastio_reg_override=HOST_TEMPREG;
  }

  if (opcode[i]==0x28) { // SB
    if(!c||memtarget) {
      int x=0,a=temp;
      if(!c) a=addr;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writebyte_indexed(tl,x,a);
    }
    type=STOREB_STUB;
  }
  if (opcode[i]==0x29) { // SH
    if(!c||memtarget) {
      int x=0,a=temp;
      if(!c) a=addr;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writehword_indexed(tl,x,a);
    }
    type=STOREH_STUB;
  }
  if (opcode[i]==0x2B) { // SW
    if(!c||memtarget) {
      int a=addr;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
    type=STOREW_STUB;
  }
  if (opcode[i]==0x3F) { // SD
    assert(0);
    type=STORED_STUB;
  }
  if(fastio_reg_override==HOST_TEMPREG)
    host_tempreg_release();
  if(jaddr) {
    // PCSX store handlers don't check invcode again
    reglist|=1<<addr;
    add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    jaddr=0;
  }
  // self-modifying-code check: compare invalid_code[addr>>12] and call
  // the invalidator when the written page holds translated code
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
    if(!c||memtarget) {
      #ifdef DESTRUCTIVE_SHIFT
      // The x86 shift operation is 'destructive'; it overwrites the
      // source register, so we need to make a copy first and use that.
      addr=temp;
      #endif
      #if defined(HOST_IMM8)
      int ir=get_reg(i_regs->regmap,INVCP);
      assert(ir>=0);
      emit_cmpmem_indexedsr12_reg(ir,addr,1);
      #else
      emit_cmpmem_indexedsr12_imm(invalid_code,addr,1);
      #endif
      #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
      emit_callne(invalidate_addr_reg[addr]);
      #else
      void *jaddr2 = out;
      emit_jne(0);
      add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
      #endif
    }
  }
  // NOTE(review): constmap[i][s] is read here even when s<0/!c (the
  // value is only *used* when c is set below) — confirm s is never
  // negative when c&&!memtarget / the block-hit check can fire
  u_int addr_val=constmap[i][s]+offset;
  if(jaddr) {
    // NOTE(review): jaddr was cleared above after add_stub_r, so this
    // branch appears unreachable — possibly leftover from refactoring
    add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
  } else if(c&&!memtarget) {
    inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
  }
  // basic current block modification detection..
  // not looking back as that should be in mips cache already
  // (see Spyro2 title->attract mode)
  if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
    SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
    assert(i_regs->regmap==regs[i].regmap); // not delay slot
    if(i_regs->regmap==regs[i].regmap) {
      // flush state and restart execution after this store
      load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
      wb_dirtys(regs[i].regmap_entry,regs[i].wasdirty);
      emit_movimm(start+i*4+4,0);
      emit_writeword(0,&pcaddr);
      emit_addimm(HOST_CCREG,2,HOST_CCREG);
      emit_far_call(get_addr_ht);
      emit_jmpreg(0);
    }
  }
}
2975
// Assemble SWL (opcode 0x2a) / SWR (0x2e): MIPS unaligned word stores.
// Depending on addr&3 these write 1-3 bytes of the source register; the
// fast path dispatches on the low two address bits (cases 0-3 below) and
// uses rotates to line the source bytes up before storing.  SDL/SDR
// (64-bit variants) do not exist on the PSX and assert.
static void storelr_assemble(int i,struct regstat *i_regs)
{
  int s,tl;
  int temp;
  int offset;
  void *jaddr=0;
  void *case1, *case2, *case3;
  void *done0, *done1, *done2;
  int memtarget=0,c=0;
  int agr=AGEN1+(i&1);
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rs2[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    c=(i_regs->isconst>>s)&1;
    if(c) {
      // Constant base register: decide at translation time whether the
      // effective address lands in RAM.
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  assert(temp>=0);
  if(!c) {
    // Dynamic address: range-check against RAM size, branch to the
    // STORELR_STUB slow path on a miss.
    emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
    if(!offset&&s!=temp) emit_mov(s,temp);
    jaddr=out;
    emit_jno(0);
  }
  else
  {
    if(!memtarget||!rs1[i]) {
      // Known non-RAM target: unconditionally take the slow path.
      jaddr=out;
      emit_jmp(0);
    }
  }
  if(ram_offset)
    emit_addimm_no_flags(ram_offset,temp);

  if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
    assert(0);
  }

  // Dispatch on addr&3.  temp is XORed with 3 first, so the bit tests
  // below are inverted relative to the raw address bits.
  emit_xorimm(temp,3,temp);
  emit_testimm(temp,2);
  case2=out;
  emit_jne(0);
  emit_testimm(temp,1);
  case1=out;
  emit_jne(0);
  // 0
  if (opcode[i]==0x2A) { // SWL
    // addr&3==0: SWL writes the entire word
    emit_writeword_indexed(tl,0,temp);
  }
  else if (opcode[i]==0x2E) { // SWR
    // addr&3==0: SWR writes only the low byte
    emit_writebyte_indexed(tl,3,temp);
  }
  else
    assert(0);
  done0=out;
  emit_jmp(0);
  // 1
  set_jump_target(case1, out);
  if (opcode[i]==0x2A) { // SWL
    // Write 3 msb into three least significant bytes
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writebyte_indexed(tl,1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  done1=out;
  emit_jmp(0);
  // 2
  set_jump_target(case2, out);
  emit_testimm(temp,1);
  case3=out;
  emit_jne(0);
  if (opcode[i]==0x2A) { // SWL
    // Write two msb into two least significant bytes
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writehword_indexed(tl,-2,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(rs2[i]) emit_rorimm(tl,24,tl);
  }
  done2=out;
  emit_jmp(0);
  // 3
  set_jump_target(case3, out);
  if (opcode[i]==0x2A) { // SWL
    // Write msb into least significant byte
    if(rs2[i]) emit_rorimm(tl,24,tl);
    emit_writebyte_indexed(tl,-3,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  else if (opcode[i]==0x2E) { // SWR
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  set_jump_target(done0, out);
  set_jump_target(done1, out);
  set_jump_target(done2, out);
  if(!c||!memtarget)
    add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
  // Self-modifying-code check: if the written address maps onto compiled
  // code (per invalid_code), call out to invalidate the affected block.
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
    emit_addimm_no_flags(-ram_offset,temp);
    #if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,temp,1);
    #else
    emit_cmpmem_indexedsr12_imm(invalid_code,temp,1);
    #endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[temp]);
    #else
    void *jaddr2 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
    #endif
  }
}
3112
// Assemble COP0 (system control coprocessor) instructions:
// MFC0 (read COP0 reg), MTC0 (write COP0 reg), and RFE.
// MTC0 of the cycle/interrupt-related registers (9,11,12,13) must keep
// the emulated Count in sync and may need to take a pending interrupt
// immediately, hence the call-outs to pcsx_mtc0/pcsx_mtc0_ds.
static void cop0_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]==0) // MFC0
  {
    signed char t=get_reg(i_regs->regmap,rt1[i]);
    u_int copr=(source[i]>>11)&0x1f;
    //assert(t>=0); // Why does this happen?  OOT is weird
    // Skip the read if the target is $zero or not allocated to a host reg.
    if(t>=0&&rt1[i]!=0) {
      emit_readword(&reg_cop0[copr],t);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    signed char s=get_reg(i_regs->regmap,rs1[i]);
    char copr=(source[i]>>11)&0x1f;
    assert(s>=0);
    // Flush the source guest register so the C helper sees its value.
    wb_register(rs1[i],i_regs->regmap,i_regs->dirty);
    if(copr==9||copr==11||copr==12||copr==13) {
      // Materialize the up-to-date cycle count into Count before the call.
      emit_readword(&last_count,HOST_TEMPREG);
      emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc
      emit_add(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
      emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
      emit_writeword(HOST_CCREG,&Count);
    }
    // What a mess.  The status register (12) can enable interrupts,
    // so needs a special case to handle a pending interrupt.
    // The interrupt must be taken immediately, because a subsequent
    // instruction might disable interrupts again.
    if(copr==12||copr==13) {
      if (is_delayslot) {
        // burn cycles to cause cc_interrupt, which will
        // reschedule next_interupt. Relies on CCREG from above.
        assem_debug("MTC0 DS %d\n", copr);
        emit_writeword(HOST_CCREG,&last_count);
        emit_movimm(0,HOST_CCREG);
        emit_storereg(CCREG,HOST_CCREG);
        emit_loadreg(rs1[i],1);
        emit_movimm(copr,0);
        emit_far_call(pcsx_mtc0_ds);
        emit_loadreg(rs1[i],s);
        return;
      }
      emit_movimm(start+i*4+4,HOST_TEMPREG);
      emit_writeword(HOST_TEMPREG,&pcaddr);
      emit_movimm(0,HOST_TEMPREG);
      emit_writeword(HOST_TEMPREG,&pending_exception);
    }
    // Pass the value in host reg 1 and the COP0 register number in reg 0.
    if(s==HOST_CCREG)
      emit_loadreg(rs1[i],1);
    else if(s!=1)
      emit_mov(s,1);
    emit_movimm(copr,0);
    emit_far_call(pcsx_mtc0);
    if(copr==9||copr==11||copr==12||copr==13) {
      // Re-derive the cycle counter: CCREG = Count - next_interupt - adj.
      emit_readword(&Count,HOST_CCREG);
      emit_readword(&next_interupt,HOST_TEMPREG);
      emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
      emit_sub(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
      emit_writeword(HOST_TEMPREG,&last_count);
      emit_storereg(CCREG,HOST_CCREG);
    }
    if(copr==12||copr==13) {
      assert(!is_delayslot);
      // If pcsx_mtc0 flagged a pending exception, leave the block now
      // via get_addr_ht(pcaddr).
      emit_readword(&pending_exception,14);
      emit_test(14,14);
      void *jaddr = out;
      emit_jeq(0);
      emit_readword(&pcaddr, 0);
      emit_addimm(HOST_CCREG,2,HOST_CCREG);
      emit_far_call(get_addr_ht);
      emit_jmpreg(0);
      set_jump_target(jaddr, out);
    }
    emit_loadreg(rs1[i],s);
  }
  else
  {
    assert(opcode2[i]==0x10);
    //if((source[i]&0x3f)==0x10) // RFE
    {
      // RFE: pop the interrupt/kernel mode stack in Status:
      // bits 5:2 shift right into bits 3:0.
      emit_readword(&Status,0);
      emit_andimm(0,0x3c,1);
      emit_andimm(0,~0xf,0);
      emit_orrshr_imm(1,2,0);
      emit_writeword(0,&Status);
    }
  }
}
3201
3202 static void cop1_unusable(int i,struct regstat *i_regs)
3203 {
3204   // XXX: should just just do the exception instead
3205   //if(!cop1_usable)
3206   {
3207     void *jaddr=out;
3208     emit_jmp(0);
3209     add_stub_r(FP_STUB,jaddr,out,i,0,i_regs,is_delayslot,0);
3210   }
3211 }
3212
// COP1 arithmetic/transfer ops: always unusable on the PSX.
static void cop1_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3217
// COP1 loads/stores (LWC1/SWC1 etc.): always unusable on the PSX.
static void c1ls_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3222
3223 // FP_STUB
// FP_STUB
// Out-of-line stub for a COP1-unusable exception: write back live
// registers, load the faulting PC into EAX, charge the cycles, and
// jump to the C exception handler.
static void do_cop1stub(int n)
{
  literal_pool(256);
  assem_debug("do_cop1stub %x\n",start+stubs[n].a*4);
  set_jump_target(stubs[n].addr, out);
  int i=stubs[n].a;
//  int rs=stubs[n].b;
  struct regstat *i_regs=(struct regstat *)stubs[n].c;
  int ds=stubs[n].d; // nonzero when the faulting op was in a delay slot
  if(!ds) {
    load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
    //if(i_regs!=&regs[i]) printf("oops: regs[i]=%x i_regs=%x",(int)&regs[i],(int)i_regs);
  }
  //else {printf("fp exception in delay slot\n");}
  wb_dirtys(i_regs->regmap_entry,i_regs->wasdirty);
  if(regs[i].regmap_entry[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
  emit_movimm(start+(i-ds)*4,EAX); // Get PC
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
  emit_far_jump(ds?fp_exception_ds:fp_exception);
}
3244
// Emit code to read GTE data register 'copr' into host register tl.
// Several GTE registers have special read behavior (sign-extension,
// masking, mirroring), which is replicated here; 'temp' is a scratch
// host register used only by the MFC2 28/29 helper.
static void cop2_get_dreg(u_int copr,signed char tl,signed char temp)
{
  switch (copr) {
    // 16-bit signed registers: read, sign-extend, and write the
    // normalized value back to the register file.
    case 1:
    case 3:
    case 5:
    case 8:
    case 9:
    case 10:
    case 11:
      emit_readword(&reg_cop2d[copr],tl);
      emit_signextend16(tl,tl);
      emit_writeword(tl,&reg_cop2d[copr]); // hmh
      break;
    // 16-bit unsigned registers: mask to the low halfword.
    case 7:
    case 16:
    case 17:
    case 18:
    case 19:
      emit_readword(&reg_cop2d[copr],tl);
      emit_andimm(tl,0xffff,tl);
      emit_writeword(tl,&reg_cop2d[copr]);
      break;
    case 15:
      // SXYP mirrors SXY2.
      emit_readword(&reg_cop2d[14],tl); // SXY2
      emit_writeword(tl,&reg_cop2d[copr]);
      break;
    case 28:
    case 29:
      // IRGB/ORGB: recomputed from IR1-IR3 by an arch-specific helper.
      c2op_mfc2_29_assemble(tl,temp);
      break;
    default:
      emit_readword(&reg_cop2d[copr],tl);
      break;
  }
}
3281
// Emit code to write host register sl into GTE data register 'copr',
// replicating the GTE's special write side effects; 'temp' is a scratch
// host register.
static void cop2_put_dreg(u_int copr,signed char sl,signed char temp)
{
  switch (copr) {
    case 15:
      // Writing SXYP pushes the screen XY FIFO: SXY0<-SXY1, SXY1<-SXY2,
      // SXY2<-new value.
      emit_readword(&reg_cop2d[13],temp);  // SXY1
      emit_writeword(sl,&reg_cop2d[copr]);
      emit_writeword(temp,&reg_cop2d[12]); // SXY0
      emit_readword(&reg_cop2d[14],temp);  // SXY2
      emit_writeword(sl,&reg_cop2d[14]);
      emit_writeword(temp,&reg_cop2d[13]); // SXY1
      break;
    case 28:
      // Writing IRGB expands the 5:5:5 color into IR1/IR2/IR3.
      emit_andimm(sl,0x001f,temp);
      emit_shlimm(temp,7,temp);
      emit_writeword(temp,&reg_cop2d[9]);
      emit_andimm(sl,0x03e0,temp);
      emit_shlimm(temp,2,temp);
      emit_writeword(temp,&reg_cop2d[10]);
      emit_andimm(sl,0x7c00,temp);
      emit_shrimm(temp,3,temp);
      emit_writeword(temp,&reg_cop2d[11]);
      emit_writeword(sl,&reg_cop2d[28]);
      break;
    case 30:
      // Writing LZCS also computes LZCR: count leading bits equal to the
      // sign bit.  XOR with the sign-extended value turns that into a
      // plain count-leading-zeros.
      emit_xorsar_imm(sl,sl,31,temp);
#if defined(HAVE_ARMV5) || defined(__aarch64__)
      emit_clz(temp,temp);
#else
      // No CLZ instruction: emit a small shift-and-count loop.  The
      // emit_jeq/emit_jns targets are raw offsets relative to 'out',
      // skipping over / looping back across the fixed-size instructions
      // emitted between them.
      emit_movs(temp,HOST_TEMPREG);
      emit_movimm(0,temp);
      emit_jeq((int)out+4*4);
      emit_addpl_imm(temp,1,temp);
      emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG);
      emit_jns((int)out-2*4);
#endif
      emit_writeword(sl,&reg_cop2d[30]);
      emit_writeword(temp,&reg_cop2d[31]);
      break;
    case 31:
      // LZCR is read-only.
      break;
    default:
      emit_writeword(sl,&reg_cop2d[copr]);
      break;
  }
}
3327
// Assemble LWC2/SWC2: word load/store between memory and a GTE data
// register.  The memory access goes through FTEMP; the GTE register
// transfer itself is done by cop2_get_dreg/cop2_put_dreg.
static void c2ls_assemble(int i,struct regstat *i_regs)
{
  int s,tl;
  int ar;
  int offset;
  int memtarget=0,c=0;
  void *jaddr2=NULL;
  enum stub_type type;
  int agr=AGEN1+(i&1);
  int fastio_reg_override=-1;
  u_int hr,reglist=0;
  u_int copr=(source[i]>>16)&0x1f; // GTE data register number
  s=get_reg(i_regs->regmap,rs1[i]);
  tl=get_reg(i_regs->regmap,FTEMP);
  offset=imm[i];
  assert(rs1[i]>0);
  assert(tl>=0);

  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG)
    reglist&=~(1<<HOST_CCREG);

  // get the address
  if (opcode[i]==0x3a) { // SWC2
    ar=get_reg(i_regs->regmap,agr);
    if(ar<0) ar=get_reg(i_regs->regmap,-1);
    reglist|=1<<ar;
  } else { // LWC2
    ar=tl;
  }
  if(s>=0) c=(i_regs->wasconst>>s)&1;
  memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
  if (!offset&&!c&&s>=0) ar=s;
  assert(ar>=0);

  if (opcode[i]==0x3a) { // SWC2
    // Fetch the GTE register value into FTEMP before the store.
    cop2_get_dreg(copr,tl,-1);
    type=STOREW_STUB;
  }
  else
    type=LOADW_STUB;

  if(c&&!memtarget) {
    // Constant non-RAM address: always go through the stub.
    jaddr2=out;
    emit_jmp(0); // inline_readstub/inline_writestub?
  }
  else {
    if(!c) {
      // Dynamic address: range check, stub on miss.
      jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
    }
    else if(ram_offset&&memtarget) {
      host_tempreg_acquire();
      emit_addimm(ar,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    if (opcode[i]==0x32) { // LWC2
      int a=ar;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_readword_indexed(0,a,tl);
    }
    if (opcode[i]==0x3a) { // SWC2
      #ifdef DESTRUCTIVE_SHIFT
      if(!offset&&!c&&s>=0) emit_mov(s,ar);
      #endif
      int a=ar;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
  }
  if(fastio_reg_override==HOST_TEMPREG)
    host_tempreg_release();
  if(jaddr2)
    add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
  // Self-modifying-code check for SWC2: invalidate any compiled block
  // covering the written address.
  if(opcode[i]==0x3a) // SWC2
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
#if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,ar,1);
#else
    emit_cmpmem_indexedsr12_imm(invalid_code,ar,1);
#endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[ar]);
    #else
    void *jaddr3 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
    #endif
  }
  if (opcode[i]==0x32) { // LWC2
    // Store the loaded value into the GTE register (with side effects).
    host_tempreg_acquire();
    cop2_put_dreg(copr,tl,HOST_TEMPREG);
    host_tempreg_release();
  }
}
3426
3427 static void cop2_assemble(int i,struct regstat *i_regs)
3428 {
3429   u_int copr=(source[i]>>11)&0x1f;
3430   signed char temp=get_reg(i_regs->regmap,-1);
3431   if (opcode2[i]==0) { // MFC2
3432     signed char tl=get_reg(i_regs->regmap,rt1[i]);
3433     if(tl>=0&&rt1[i]!=0)
3434       cop2_get_dreg(copr,tl,temp);
3435   }
3436   else if (opcode2[i]==4) { // MTC2
3437     signed char sl=get_reg(i_regs->regmap,rs1[i]);
3438     cop2_put_dreg(copr,sl,temp);
3439   }
3440   else if (opcode2[i]==2) // CFC2
3441   {
3442     signed char tl=get_reg(i_regs->regmap,rt1[i]);
3443     if(tl>=0&&rt1[i]!=0)
3444       emit_readword(&reg_cop2c[copr],tl);
3445   }
3446   else if (opcode2[i]==6) // CTC2
3447   {
3448     signed char sl=get_reg(i_regs->regmap,rs1[i]);
3449     switch(copr) {
3450       case 4:
3451       case 12:
3452       case 20:
3453       case 26:
3454       case 27:
3455       case 29:
3456       case 30:
3457         emit_signextend16(sl,temp);
3458         break;
3459       case 31:
3460         c2op_ctc2_31_assemble(sl,temp);
3461         break;
3462       default:
3463         temp=sl;
3464         break;
3465     }
3466     emit_writeword(temp,&reg_cop2c[copr]);
3467     assert(sl>=0);
3468   }
3469 }
3470
// Out-of-line stub for SWL/SWR that missed the fast RAM path: save live
// registers and call the C write handler (jump_handle_swl/swr).  The
// "#else" branch below is a disabled legacy inline implementation kept
// for reference.
static void do_unalignedwritestub(int n)
{
  assem_debug("do_unalignedwritestub %x\n",start+stubs[n].a*4);
  literal_pool(256);
  set_jump_target(stubs[n].addr, out);

  int i=stubs[n].a;
  struct regstat *i_regs=(struct regstat *)stubs[n].c;
  int addr=stubs[n].b;
  u_int reglist=stubs[n].e;
  signed char *i_regmap=i_regs->regmap;
  int temp2=get_reg(i_regmap,FTEMP);
  int rt;
  rt=get_reg(i_regmap,rs2[i]);
  assert(rt>=0);
  assert(addr>=0);
  assert(opcode[i]==0x2a||opcode[i]==0x2e); // SWL/SWR only implemented
  reglist|=(1<<addr);
  reglist&=~(1<<temp2);

#if 1
  // don't bother with it and call write handler
  save_regs(reglist);
  pass_args(addr,rt);
  int cc=get_reg(i_regmap,CCREG);
  if(cc<0)
    emit_loadreg(CCREG,2);
  // Charge the cycles before the call, refund after (the handler may
  // consume them on an exception path).
  emit_addimm(cc<0?2:cc,CLOCK_ADJUST((int)stubs[n].d+1),2);
  emit_far_call((opcode[i]==0x2a?jump_handle_swl:jump_handle_swr));
  emit_addimm(0,-CLOCK_ADJUST((int)stubs[n].d+1),cc<0?2:cc);
  if(cc<0)
    emit_storereg(CCREG,2);
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
#else
  emit_andimm(addr,0xfffffffc,temp2);
  emit_writeword(temp2,&address);

  save_regs(reglist);
  emit_shrimm(addr,16,1);
  int cc=get_reg(i_regmap,CCREG);
  if(cc<0) {
    emit_loadreg(CCREG,2);
  }
  emit_movimm((u_int)readmem,0);
  emit_addimm(cc<0?2:cc,2*stubs[n].d+2,2);
  emit_call((int)&indirect_jump_indexed);
  restore_regs(reglist);

  emit_readword(&readmem_dword,temp2);
  int temp=addr; //hmh
  emit_shlimm(addr,3,temp);
  emit_andimm(temp,24,temp);
  if (opcode[i]==0x2a) // SWL
    emit_xorimm(temp,24,temp);
  emit_movimm(-1,HOST_TEMPREG);
  if (opcode[i]==0x2a) { // SWL
    emit_bic_lsr(temp2,HOST_TEMPREG,temp,temp2);
    emit_orrshr(rt,temp,temp2);
  }else{
    emit_bic_lsl(temp2,HOST_TEMPREG,temp,temp2);
    emit_orrshl(rt,temp,temp2);
  }
  emit_readword(&address,addr);
  emit_writeword(temp2,&word);
  //save_regs(reglist); // don't need to, no state changes
  emit_shrimm(addr,16,1);
  emit_movimm((u_int)writemem,0);
  //emit_call((int)&indirect_jump_indexed);
  emit_mov(15,14);
  emit_readword_dualindexedx4(0,1,15);
  emit_readword(&Count,HOST_TEMPREG);
  emit_readword(&next_interupt,2);
  emit_addimm(HOST_TEMPREG,-2*stubs[n].d-2,HOST_TEMPREG);
  emit_writeword(2,&last_count);
  emit_sub(HOST_TEMPREG,2,cc<0?HOST_TEMPREG:cc);
  if(cc<0) {
    emit_storereg(CCREG,HOST_TEMPREG);
  }
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
#endif
}
3554
#ifndef multdiv_assemble
// Fallback when no architecture-specific MULT/DIV assembler is provided:
// fail loudly at translation time rather than emitting wrong code.
void multdiv_assemble(int i,struct regstat *i_regs)
{
  printf("Need multdiv_assemble for this architecture.\n");
  abort();
}
#endif
3562
3563 static void mov_assemble(int i,struct regstat *i_regs)
3564 {
3565   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3566   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3567   if(rt1[i]) {
3568     signed char sl,tl;
3569     tl=get_reg(i_regs->regmap,rt1[i]);
3570     //assert(tl>=0);
3571     if(tl>=0) {
3572       sl=get_reg(i_regs->regmap,rs1[i]);
3573       if(sl>=0) emit_mov(sl,tl);
3574       else emit_loadreg(rs1[i],tl);
3575     }
3576   }
3577 }
3578
3579 // call interpreter, exception handler, things that change pc/regs/cycles ...
// call interpreter, exception handler, things that change pc/regs/cycles ...
// Sync pc and cycle count into psxRegs, call the C handler 'func', then
// continue at whatever pc the handler set (via jump_to_new_pc).
static void call_c_cpu_handler(int i, const struct regstat *i_regs, u_int pc, void *func)
{
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  (void)ccreg;

  emit_movimm(pc,3); // Get PC
  emit_readword(&last_count,2);
  emit_writeword(3,&psxRegs.pc);
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
  // psxRegs.cycle = last_count + CCREG
  emit_add(2,HOST_CCREG,2);
  emit_writeword(2,&psxRegs.cycle);
  emit_far_call(func);
  emit_far_jump(jump_to_new_pc);
}
3596
// SYSCALL: raise exception cause 0x20 through the C exception handler.
static void syscall_assemble(int i,struct regstat *i_regs)
{
  emit_movimm(0x20,0); // cause code
  emit_movimm(0,1);    // not in delay slot
  call_c_cpu_handler(i,i_regs,start+i*4,psxException);
}
3603
3604 static void hlecall_assemble(int i,struct regstat *i_regs)
3605 {
3606   void *hlefunc = psxNULL;
3607   uint32_t hleCode = source[i] & 0x03ffffff;
3608   if (hleCode < ARRAY_SIZE(psxHLEt))
3609     hlefunc = psxHLEt[hleCode];
3610
3611   call_c_cpu_handler(i,i_regs,start+i*4+4,hlefunc);
3612 }
3613
// Punt a single instruction to the interpreter (execI) at this pc.
static void intcall_assemble(int i,struct regstat *i_regs)
{
  call_c_cpu_handler(i,i_regs,start+i*4,execI);
}
3618
3619 static void speculate_mov(int rs,int rt)
3620 {
3621   if(rt!=0) {
3622     smrv_strong_next|=1<<rt;
3623     smrv[rt]=smrv[rs];
3624   }
3625 }
3626
3627 static void speculate_mov_weak(int rs,int rt)
3628 {
3629   if(rt!=0) {
3630     smrv_weak_next|=1<<rt;
3631     smrv[rt]=smrv[rs];
3632   }
3633 }
3634
// Track (guessed) guest register values across the block so loads and
// stores can speculate on their target memory region.  smrv[] holds the
// guessed values; the strong/weak bitmasks say how confident the guess
// is for each guest register.
static void speculate_register_values(int i)
{
  if(i==0) {
    // Seed from the real register file at block entry.
    memcpy(smrv,psxRegs.GPR.r,sizeof(smrv));
    // gp,sp are likely to stay the same throughout the block
    smrv_strong_next=(1<<28)|(1<<29)|(1<<30);
    smrv_weak_next=~smrv_strong_next;
    //printf(" llr %08x\n", smrv[4]);
  }
  smrv_strong=smrv_strong_next;
  smrv_weak=smrv_weak_next;
  switch(itype[i]) {
    case ALU:
      // Result is "address-like" if either operand was; otherwise the
      // guess for rt1 is invalidated.
      if     ((smrv_strong>>rs1[i])&1) speculate_mov(rs1[i],rt1[i]);
      else if((smrv_strong>>rs2[i])&1) speculate_mov(rs2[i],rt1[i]);
      else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]);
      else if((smrv_weak>>rs2[i])&1) speculate_mov_weak(rs2[i],rt1[i]);
      else {
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
    case SHIFTIMM:
      smrv_strong_next&=~(1<<rt1[i]);
      smrv_weak_next&=~(1<<rt1[i]);
      // fallthrough
    case IMM16:
      if(rt1[i]&&is_const(&regs[i],rt1[i])) {
        // Constant propagation gives an exact value: record it strongly.
        int value,hr=get_reg(regs[i].regmap,rt1[i]);
        if(hr>=0) {
          if(get_final_value(hr,i,&value))
               smrv[rt1[i]]=value;
          else smrv[rt1[i]]=constmap[i][hr];
          smrv_strong_next|=1<<rt1[i];
        }
      }
      else {
        if     ((smrv_strong>>rs1[i])&1) speculate_mov(rs1[i],rt1[i]);
        else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]);
      }
      break;
    case LOAD:
      if(start<0x2000&&(rt1[i]==26||(smrv[rt1[i]]>>24)==0xa0)) {
        // special case for BIOS
        smrv[rt1[i]]=0xa0000000;
        smrv_strong_next|=1<<rt1[i];
        break;
      }
      // fallthrough
    case SHIFT:
    case LOADLR:
    case MOV:
      // Result unknown: drop both confidence levels for rt1.
      smrv_strong_next&=~(1<<rt1[i]);
      smrv_weak_next&=~(1<<rt1[i]);
      break;
    case COP0:
    case COP2:
      if(opcode2[i]==0||opcode2[i]==2) { // MFC/CFC
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
    case C2LS:
      if (opcode[i]==0x32) { // LWC2
        smrv_strong_next&=~(1<<rt1[i]);
        smrv_weak_next&=~(1<<rt1[i]);
      }
      break;
  }
#if 0
  int r=4;
  printf("x %08x %08x %d %d c %08x %08x\n",smrv[r],start+i*4,
    ((smrv_strong>>r)&1),(smrv_weak>>r)&1,regs[i].isconst,regs[i].wasconst);
#endif
}
3710
// Assemble the instruction in a branch delay slot.  Same dispatch as the
// main assembler loop, but with is_delayslot set so the per-type
// assemblers can adjust exception/pc handling.
static void ds_assemble(int i,struct regstat *i_regs)
{
  speculate_register_values(i);
  is_delayslot=1;
  switch(itype[i]) {
    case ALU:
      alu_assemble(i,i_regs);break;
    case IMM16:
      imm16_assemble(i,i_regs);break;
    case SHIFT:
      shift_assemble(i,i_regs);break;
    case SHIFTIMM:
      shiftimm_assemble(i,i_regs);break;
    case LOAD:
      load_assemble(i,i_regs);break;
    case LOADLR:
      loadlr_assemble(i,i_regs);break;
    case STORE:
      store_assemble(i,i_regs);break;
    case STORELR:
      storelr_assemble(i,i_regs);break;
    case COP0:
      cop0_assemble(i,i_regs);break;
    case COP1:
      cop1_assemble(i,i_regs);break;
    case C1LS:
      c1ls_assemble(i,i_regs);break;
    case COP2:
      cop2_assemble(i,i_regs);break;
    case C2LS:
      c2ls_assemble(i,i_regs);break;
    case C2OP:
      c2op_assemble(i,i_regs);break;
    case MULTDIV:
      multdiv_assemble(i,i_regs);break;
    case MOV:
      mov_assemble(i,i_regs);break;
    // Branches and calls cannot legally appear in a delay slot; nothing
    // is emitted for them here.
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  is_delayslot=0;
}
3760
3761 // Is the branch target a valid internal jump?
3762 static int internal_branch(int addr)
3763 {
3764   if(addr&1) return 0; // Indirect (register) jump
3765   if(addr>=start && addr<start+slen*4-4)
3766   {
3767     return 1;
3768   }
3769   return 0;
3770 }
3771
3772 static void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t u)
3773 {
3774   int hr;
3775   for(hr=0;hr<HOST_REGS;hr++) {
3776     if(hr!=EXCLUDE_REG) {
3777       if(pre[hr]!=entry[hr]) {
3778         if(pre[hr]>=0) {
3779           if((dirty>>hr)&1) {
3780             if(get_reg(entry,pre[hr])<0) {
3781               assert(pre[hr]<64);
3782               if(!((u>>pre[hr])&1))
3783                 emit_storereg(pre[hr],hr);
3784             }
3785           }
3786         }
3787       }
3788     }
3789   }
3790   // Move from one register to another (no writeback)
3791   for(hr=0;hr<HOST_REGS;hr++) {
3792     if(hr!=EXCLUDE_REG) {
3793       if(pre[hr]!=entry[hr]) {
3794         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3795           int nr;
3796           if((nr=get_reg(entry,pre[hr]))>=0) {
3797             emit_mov(hr,nr);
3798           }
3799         }
3800       }
3801     }
3802   }
3803 }
3804
3805 // Load the specified registers
3806 // This only loads the registers given as arguments because
3807 // we don't want to load things that will be overwritten
3808 static void load_regs(signed char entry[],signed char regmap[],int rs1,int rs2)
3809 {
3810   int hr;
3811   // Load 32-bit regs
3812   for(hr=0;hr<HOST_REGS;hr++) {
3813     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3814       if(entry[hr]!=regmap[hr]) {
3815         if(regmap[hr]==rs1||regmap[hr]==rs2)
3816         {
3817           if(regmap[hr]==0) {
3818             emit_zeroreg(hr);
3819           }
3820           else
3821           {
3822             emit_loadreg(regmap[hr],hr);
3823           }
3824         }
3825       }
3826     }
3827   }
3828 }
3829
3830 // Load registers prior to the start of a loop
3831 // so that they are not loaded within the loop
3832 static void loop_preload(signed char pre[],signed char entry[])
3833 {
3834   int hr;
3835   for(hr=0;hr<HOST_REGS;hr++) {
3836     if(hr!=EXCLUDE_REG) {
3837       if(pre[hr]!=entry[hr]) {
3838         if(entry[hr]>=0) {
3839           if(get_reg(pre,entry[hr])<0) {
3840             assem_debug("loop preload:\n");
3841             //printf("loop preload: %d\n",hr);
3842             if(entry[hr]==0) {
3843               emit_zeroreg(hr);
3844             }
3845             else if(entry[hr]<TEMPREG)
3846             {
3847               emit_loadreg(entry[hr],hr);
3848             }
3849             else if(entry[hr]-64<TEMPREG)
3850             {
3851               emit_loadreg(entry[hr],hr);
3852             }
3853           }
3854         }
3855       }
3856     }
3857   }
3858 }
3859
3860 // Generate address for load/store instruction
3861 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
// Generate address for load/store instruction
// goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
// Computes base+offset into the chosen address register, handling the
// r0 base, spilled base, and constant-base cases, and pre-loads the
// constant address for the NEXT instruction when possible.
void address_generation(int i,struct regstat *i_regs,signed char entry[])
{
  if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
    int ra=-1;
    int agr=AGEN1+(i&1);
    // Pick the destination host register for the computed address.
    if(itype[i]==LOAD) {
      ra=get_reg(i_regs->regmap,rt1[i]);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
      assert(ra>=0);
    }
    if(itype[i]==LOADLR) {
      ra=get_reg(i_regs->regmap,FTEMP);
    }
    if(itype[i]==STORE||itype[i]==STORELR) {
      ra=get_reg(i_regs->regmap,agr);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
    }
    if(itype[i]==C1LS||itype[i]==C2LS) {
      if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
        ra=get_reg(i_regs->regmap,FTEMP);
      else { // SWC1/SDC1/SWC2/SDC2
        ra=get_reg(i_regs->regmap,agr);
        if(ra<0) ra=get_reg(i_regs->regmap,-1);
      }
    }
    int rs=get_reg(i_regs->regmap,rs1[i]);
    if(ra>=0) {
      int offset=imm[i];
      int c=(i_regs->wasconst>>rs)&1;
      if(rs1[i]==0) {
        // Using r0 as a base address
        if(!entry||entry[ra]!=agr) {
          if (opcode[i]==0x22||opcode[i]==0x26) {
            emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
          }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
            emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
          }else{
            emit_movimm(offset,ra);
          }
        } // else did it in the previous cycle
      }
      else if(rs<0) {
        // Base register not in a host register: load it from memory.
        if(!entry||entry[ra]!=rs1[i])
          emit_loadreg(rs1[i],ra);
        //if(!entry||entry[ra]!=rs1[i])
        //  printf("poor load scheduling!\n");
      }
      else if(c) {
        // Constant base: materialize the full address as an immediate.
        if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
          if(!entry||entry[ra]!=agr) {
            if (opcode[i]==0x22||opcode[i]==0x26) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
            }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
            }else{
              emit_movimm(constmap[i][rs]+offset,ra);
              regs[i].loadedconst|=1<<ra;
            }
          } // else did it in the previous cycle
        } // else load_consts already did it
      }
      // Non-constant base with a nonzero offset: add it in.
      if(offset&&!c&&rs1[i]) {
        if(rs>=0) {
          emit_addimm(rs,offset,ra);
        }else{
          emit_addimm(ra,offset,ra);
        }
      }
    }
  }
  // Preload constants for next instruction
  if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
    int agr,ra;
    // Actual address
    agr=AGEN1+((i+1)&1);
    ra=get_reg(i_regs->regmap,agr);
    if(ra>=0) {
      int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
      int offset=imm[i+1];
      int c=(regs[i+1].wasconst>>rs)&1;
      if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(constmap[i+1][rs]+offset,ra);
          regs[i+1].loadedconst|=1<<ra;
        }
      }
      else if(rs1[i+1]==0) {
        // Using r0 as a base address
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(offset,ra);
        }
      }
    }
  }
}
3965
// Look ahead from instruction i to find the last constant value that
// host register hr will hold while still mapped to the same guest reg.
// Stores that value in *value.  Returns 1 when the value should be
// loaded (it is used, or we cannot skip it), 0 when the register is
// unneeded afterwards and the load can be omitted.
static int get_final_value(int hr, int i, int *value)
{
  int reg=regs[i].regmap[hr];
  // Walk forward while hr keeps the same guest reg, stays constant, and
  // no branch target interrupts the straight-line run.
  while(i<slen-1) {
    if(regs[i+1].regmap[hr]!=reg) break;
    if(!((regs[i+1].isconst>>hr)&1)) break;
    if(bt[i+1]) break;
    i++;
  }
  if(i<slen-1) {
    if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
      *value=constmap[i][hr];
      return 1;
    }
    if(!bt[i+1]) {
      if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
        // Load in delay slot, out-of-order execution
        if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
        {
          // Precompute load address
          *value=constmap[i][hr]+imm[i+2];
          return 1;
        }
      }
      if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
      {
        // Precompute load address
        *value=constmap[i][hr]+imm[i+1];
        //printf("c=%x imm=%lx\n",(long)constmap[i][hr],imm[i+1]);
        return 1;
      }
    }
  }
  *value=constmap[i][hr];
  //printf("c=%lx\n",(long)constmap[i][hr]);
  if(i==slen-1) return 1;
  assert(reg < 64);
  // Skip the load entirely if the guest register is unneeded downstream.
  return !((unneeded_reg[i+1]>>reg)&1);
}
4005
// Load registers with known constants.
// pre[]    - host register map before this instruction
// regmap[] - host register map for this instruction
// i        - instruction index
// Emits movimm/zeroreg for every host reg that holds a known constant and
// hasn't already been loaded; tracks what was loaded in regs[i].loadedconst
// so the same constant isn't re-emitted on subsequent instructions.
static void load_consts(signed char pre[],signed char regmap[],int i)
{
  int hr,hr2;
  // propagate loaded constant flags
  if(i==0||bt[i])
    regs[i].loadedconst=0;
  else {
    // A constant already loaded by the previous instruction stays loaded
    // as long as the same guest reg remains in the same host reg.
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
         &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
      {
        regs[i].loadedconst|=1<<hr;
      }
    }
  }
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(!((regs[i].loadedconst>>hr)&1)) {
        assert(regmap[hr]<64);
        if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
          int value,similar=0;
          if(get_final_value(hr,i,&value)) {
            // see if some other register has similar value
            for(hr2=0;hr2<HOST_REGS;hr2++) {
              if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
                if(is_similar_value(value,constmap[i][hr2])) {
                  similar=1;
                  break;
                }
              }
            }
            if(similar) {
              // Derive the constant from the similar one (cheaper than a
              // full immediate load on some hosts).
              int value2;
              if(get_final_value(hr2,i,&value2)) // is this needed?
                emit_movimm_from(value2,hr2,value,hr);
              else
                emit_movimm(value,hr);
            }
            else if(value==0) {
              emit_zeroreg(hr);
            }
            else {
              emit_movimm(value,hr);
            }
          }
          // Mark loaded even if get_final_value said the load was skippable,
          // so we don't retry it on following instructions.
          regs[i].loadedconst|=1<<hr;
        }
      }
    }
  }
}
4060
4061 void load_all_consts(signed char regmap[], u_int dirty, int i)
4062 {
4063   int hr;
4064   // Load 32-bit regs
4065   for(hr=0;hr<HOST_REGS;hr++) {
4066     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4067       assert(regmap[hr] < 64);
4068       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
4069         int value=constmap[i][hr];
4070         if(value==0) {
4071           emit_zeroreg(hr);
4072         }
4073         else {
4074           emit_movimm(value,hr);
4075         }
4076       }
4077     }
4078   }
4079 }
4080
4081 // Write out all dirty registers (except cycle count)
4082 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty)
4083 {
4084   int hr;
4085   for(hr=0;hr<HOST_REGS;hr++) {
4086     if(hr!=EXCLUDE_REG) {
4087       if(i_regmap[hr]>0) {
4088         if(i_regmap[hr]!=CCREG) {
4089           if((i_dirty>>hr)&1) {
4090             assert(i_regmap[hr]<64);
4091             emit_storereg(i_regmap[hr],hr);
4092           }
4093         }
4094       }
4095     }
4096   }
4097 }
4098
4099 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4100 // This writes the registers not written by store_regs_bt
4101 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr)
4102 {
4103   int hr;
4104   int t=(addr-start)>>2;
4105   for(hr=0;hr<HOST_REGS;hr++) {
4106     if(hr!=EXCLUDE_REG) {
4107       if(i_regmap[hr]>0) {
4108         if(i_regmap[hr]!=CCREG) {
4109           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1)) {
4110             if((i_dirty>>hr)&1) {
4111               assert(i_regmap[hr]<64);
4112               emit_storereg(i_regmap[hr],hr);
4113             }
4114           }
4115         }
4116       }
4117     }
4118   }
4119 }
4120
4121 // Load all registers (except cycle count)
4122 void load_all_regs(signed char i_regmap[])
4123 {
4124   int hr;
4125   for(hr=0;hr<HOST_REGS;hr++) {
4126     if(hr!=EXCLUDE_REG) {
4127       if(i_regmap[hr]==0) {
4128         emit_zeroreg(hr);
4129       }
4130       else
4131       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4132       {
4133         emit_loadreg(i_regmap[hr],hr);
4134       }
4135     }
4136   }
4137 }
4138
4139 // Load all current registers also needed by next instruction
4140 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4141 {
4142   int hr;
4143   for(hr=0;hr<HOST_REGS;hr++) {
4144     if(hr!=EXCLUDE_REG) {
4145       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4146         if(i_regmap[hr]==0) {
4147           emit_zeroreg(hr);
4148         }
4149         else
4150         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4151         {
4152           emit_loadreg(i_regmap[hr],hr);
4153         }
4154       }
4155     }
4156   }
4157 }
4158
4159 // Load all regs, storing cycle count if necessary
4160 void load_regs_entry(int t)
4161 {
4162   int hr;
4163   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4164   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4165   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4166     emit_storereg(CCREG,HOST_CCREG);
4167   }
4168   // Load 32-bit regs
4169   for(hr=0;hr<HOST_REGS;hr++) {
4170     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4171       if(regs[t].regmap_entry[hr]==0) {
4172         emit_zeroreg(hr);
4173       }
4174       else if(regs[t].regmap_entry[hr]!=CCREG)
4175       {
4176         emit_loadreg(regs[t].regmap_entry[hr],hr);
4177       }
4178     }
4179   }
4180 }
4181
4182 // Store dirty registers prior to branch
4183 void store_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4184 {
4185   if(internal_branch(addr))
4186   {
4187     int t=(addr-start)>>2;
4188     int hr;
4189     for(hr=0;hr<HOST_REGS;hr++) {
4190       if(hr!=EXCLUDE_REG) {
4191         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4192           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1)) {
4193             if((i_dirty>>hr)&1) {
4194               assert(i_regmap[hr]<64);
4195               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4196                 emit_storereg(i_regmap[hr],hr);
4197             }
4198           }
4199         }
4200       }
4201     }
4202   }
4203   else
4204   {
4205     // Branch out of this block, write out all dirty regs
4206     wb_dirtys(i_regmap,i_dirty);
4207   }
4208 }
4209
4210 // Load all needed registers for branch target
4211 static void load_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4212 {
4213   //if(addr>=start && addr<(start+slen*4))
4214   if(internal_branch(addr))
4215   {
4216     int t=(addr-start)>>2;
4217     int hr;
4218     // Store the cycle count before loading something else
4219     if(i_regmap[HOST_CCREG]!=CCREG) {
4220       assert(i_regmap[HOST_CCREG]==-1);
4221     }
4222     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4223       emit_storereg(CCREG,HOST_CCREG);
4224     }
4225     // Load 32-bit regs
4226     for(hr=0;hr<HOST_REGS;hr++) {
4227       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4228         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4229           if(regs[t].regmap_entry[hr]==0) {
4230             emit_zeroreg(hr);
4231           }
4232           else if(regs[t].regmap_entry[hr]!=CCREG)
4233           {
4234             emit_loadreg(regs[t].regmap_entry[hr],hr);
4235           }
4236         }
4237       }
4238     }
4239   }
4240 }
4241
// Decide whether the register state (i_regmap/i_dirty) at a branch site is
// compatible with the expected entry state of the branch target at addr.
// Returns 1 if the branch can jump straight to the target without any
// register shuffling/writeback, 0 otherwise.
static int match_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
{
  if(addr>=start && addr<start+slen*4-4)
  {
    // Target is inside the current block: compare against its entry map.
    int t=(addr-start)>>2;
    int hr;
    // Cycle count must already be in HOST_CCREG at the target.
    if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]!=regs[t].regmap_entry[hr])
        {
          // Different register mapped here than the target expects.
          if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
          {
            return 0;
          }
          else
          if((i_dirty>>hr)&1)
          {
            // We'd be dropping a dirty value; only OK if it's unneeded.
            if(i_regmap[hr]<TEMPREG)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
                return 0;
            }
            else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
            {
              assert(0);
            }
          }
        }
        else // Same register but is it 32-bit or dirty?
        if(i_regmap[hr]>=0)
        {
          if(!((regs[t].dirty>>hr)&1))
          {
            // Target expects the value clean; a dirty value here would be
            // lost unless the register is unneeded at the target.
            if((i_dirty>>hr)&1)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
              {
                //printf("%x: dirty no match\n",addr);
                return 0;
              }
            }
          }
        }
      }
    }
    // Delay slots are not valid branch targets
    //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP)) return 0;
    // Delay slots require additional processing, so do not match
    if(is_ds[t]) return 0;
  }
  else
  {
    // Target is outside the block: everything dirty (except the cycle
    // count in HOST_CCREG) would need a writeback, so require all clean.
    int hr;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]>=0)
        {
          if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
          {
            if((i_dirty>>hr)&1)
            {
              return 0;
            }
          }
        }
      }
    }
  }
  return 1;
}
4317
#ifdef DRC_DBG
// Debug aid: emit a call to do_insn_cmp before instruction i so the
// recompiled state can be compared against the interpreter at runtime.
// All live host registers are saved/restored around the call.
static void drc_dbg_emit_do_cmp(int i)
{
  extern void do_insn_cmp();
  //extern int cycle;
  u_int hr,reglist=0;

  // Build a mask of host registers currently holding guest values.
  for(hr=0;hr<HOST_REGS;hr++)
    if(regs[i].regmap[hr]>=0) reglist|=1<<hr;
  save_regs(reglist);
  // Record the guest PC of the instruction being checked.
  emit_movimm(start+i*4,0);
  emit_writeword(0,&pcaddr);
  emit_far_call(do_insn_cmp);
  //emit_readword(&cycle,0);
  //emit_addimm(0,2,0);
  //emit_writeword(0,&cycle);
  (void)get_reg2;
  restore_regs(reglist);
}
#else
// No-op when the dynarec debug comparison is disabled.
#define drc_dbg_emit_do_cmp(x)
#endif
4340
// Used when a branch jumps into the delay slot of another branch.
// Assembles a standalone copy of the delay-slot instruction at ba[i],
// then branches to the instruction after it (ba[i]+4), which must be
// inside the current block.
static void ds_assemble_entry(int i)
{
  int t=(ba[i]-start)>>2;
  if (!instr_addr[t])
    instr_addr[t] = out;
  assem_debug("Assemble delay slot at %x\n",ba[i]);
  assem_debug("<->\n");
  drc_dbg_emit_do_cmp(t);
  // Spill the cycle count if the delay slot doesn't keep it in HOST_CCREG.
  if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty);
  load_regs(regs[t].regmap_entry,regs[t].regmap,rs1[t],rs2[t]);
  address_generation(t,&regs[t],regs[t].regmap_entry);
  // Stores need INVCP for self-modifying-code invalidation checks.
  if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
    load_regs(regs[t].regmap_entry,regs[t].regmap,INVCP,INVCP);
  is_delayslot=0;
  // Dispatch on instruction type, same as the main assembly loop.
  switch(itype[t]) {
    case ALU:
      alu_assemble(t,&regs[t]);break;
    case IMM16:
      imm16_assemble(t,&regs[t]);break;
    case SHIFT:
      shift_assemble(t,&regs[t]);break;
    case SHIFTIMM:
      shiftimm_assemble(t,&regs[t]);break;
    case LOAD:
      load_assemble(t,&regs[t]);break;
    case LOADLR:
      loadlr_assemble(t,&regs[t]);break;
    case STORE:
      store_assemble(t,&regs[t]);break;
    case STORELR:
      storelr_assemble(t,&regs[t]);break;
    case COP0:
      cop0_assemble(t,&regs[t]);break;
    case COP1:
      cop1_assemble(t,&regs[t]);break;
    case C1LS:
      c1ls_assemble(t,&regs[t]);break;
    case COP2:
      cop2_assemble(t,&regs[t]);break;
    case C2LS:
      c2ls_assemble(t,&regs[t]);break;
    case C2OP:
      c2op_assemble(t,&regs[t]);break;
    case MULTDIV:
      multdiv_assemble(t,&regs[t]);break;
    case MOV:
      mov_assemble(t,&regs[t]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Branch to the instruction following the delay slot.
  store_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
  load_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
  if(internal_branch(ba[i]+4))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  assert(internal_branch(ba[i]+4));
  add_to_linker(out,ba[i]+4,internal_branch(ba[i]+4));
  emit_jmp(0);
}
4410
// Patch the jump at addr to go through dyna_linker for a not-yet-compiled
// target (normal instruction context).
static void emit_extjump(void *addr, u_int target)
{
  emit_extjump2(addr, target, dyna_linker);
}
4415
// Same as emit_extjump, but routes through dyna_linker_ds for jumps whose
// target is a branch delay slot.
static void emit_extjump_ds(void *addr, u_int target)
{
  emit_extjump2(addr, target, dyna_linker_ds);
}
4420
// Load 2 immediates optimizing for small code size.
// The second load may be derived from the first (e.g. add of the
// difference) when that is shorter than a full immediate load.
static void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2)
{
  emit_movimm(imm1,rt1);
  emit_movimm_from(imm1,rt1,imm2,rt2);
}
4427
// Emit the cycle-count check for branch instruction i and register a
// CC_STUB for the out-of-line interrupt path.
// *adj receives the cycle adjustment already accounted for at the branch
// target (or 0/-1 for external targets and delay-slot entries).
// taken/invert describe which way the emitted check branches.
void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
{
  int count;
  void *jaddr;
  void *idle=NULL;
  int t=0;
  if(itype[i]==RJUMP)
  {
    *adj=0;
  }
  //if(ba[i]>=start && ba[i]<(start+slen*4))
  if(internal_branch(ba[i]))
  {
    t=(ba[i]-start)>>2;
    if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
    else *adj=ccadj[t];
  }
  else
  {
    *adj=0;
  }
  count=ccadj[i];
  if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
    // Idle loop: branch-to-self with a nop delay slot.  Burn off cycles
    // quickly instead of executing each iteration.
    if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
    idle=out;
    //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
    emit_andimm(HOST_CCREG,3,HOST_CCREG);
    jaddr=out;
    emit_jmp(0);
  }
  else if(*adj==0||invert) {
    // Add the cycles up front and branch to the stub on overflow (sign set).
    int cycles=CLOCK_ADJUST(count+2);
    // faster loop HACK
    if (t&&*adj) {
      int rel=t-i;
      if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
        cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
    }
    emit_addimm_and_set_flags(cycles,HOST_CCREG);
    jaddr=out;
    emit_jns(0);
  }
  else
  {
    // Target will add its own adjustment; just compare here.
    emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
    jaddr=out;
    emit_jns(0);
  }
  add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
}
4479
4480 static void do_ccstub(int n)
4481 {
4482   literal_pool(256);
4483   assem_debug("do_ccstub %x\n",start+(u_int)stubs[n].b*4);
4484   set_jump_target(stubs[n].addr, out);
4485   int i=stubs[n].b;
4486   if(stubs[n].d==NULLDS) {
4487     // Delay slot instruction is nullified ("likely" branch)
4488     wb_dirtys(regs[i].regmap,regs[i].dirty);
4489   }
4490   else if(stubs[n].d!=TAKEN) {
4491     wb_dirtys(branch_regs[i].regmap,branch_regs[i].dirty);
4492   }
4493   else {
4494     if(internal_branch(ba[i]))
4495       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4496   }
4497   if(stubs[n].c!=-1)
4498   {
4499     // Save PC as return address
4500     emit_movimm(stubs[n].c,EAX);
4501     emit_writeword(EAX,&pcaddr);
4502   }
4503   else
4504   {
4505     // Return address depends on which way the branch goes
4506     if(itype[i]==CJUMP||itype[i]==SJUMP)
4507     {
4508       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4509       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4510       if(rs1[i]==0)
4511       {
4512         s1l=s2l;
4513         s2l=-1;
4514       }
4515       else if(rs2[i]==0)
4516       {
4517         s2l=-1;
4518       }
4519       assert(s1l>=0);
4520       #ifdef DESTRUCTIVE_WRITEBACK
4521       if(rs1[i]) {
4522         if((branch_regs[i].dirty>>s1l)&&1)
4523           emit_loadreg(rs1[i],s1l);
4524       }
4525       else {
4526         if((branch_regs[i].dirty>>s1l)&1)
4527           emit_loadreg(rs2[i],s1l);
4528       }
4529       if(s2l>=0)
4530         if((branch_regs[i].dirty>>s2l)&1)
4531           emit_loadreg(rs2[i],s2l);
4532       #endif
4533       int hr=0;
4534       int addr=-1,alt=-1,ntaddr=-1;
4535       while(hr<HOST_REGS)
4536       {
4537         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4538            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4539            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4540         {
4541           addr=hr++;break;
4542         }
4543         hr++;
4544       }
4545       while(hr<HOST_REGS)
4546       {
4547         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4548            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4549            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4550         {
4551           alt=hr++;break;
4552         }
4553         hr++;
4554       }
4555       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4556       {
4557         while(hr<HOST_REGS)
4558         {
4559           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4560              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4561              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4562           {
4563             ntaddr=hr;break;
4564           }
4565           hr++;
4566         }
4567         assert(hr<HOST_REGS);
4568       }
4569       if((opcode[i]&0x2f)==4) // BEQ
4570       {
4571         #ifdef HAVE_CMOV_IMM
4572         if(s2l>=0) emit_cmp(s1l,s2l);
4573         else emit_test(s1l,s1l);
4574         emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4575         #else
4576         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4577         if(s2l>=0) emit_cmp(s1l,s2l);
4578         else emit_test(s1l,s1l);
4579         emit_cmovne_reg(alt,addr);
4580         #endif
4581       }
4582       if((opcode[i]&0x2f)==5) // BNE
4583       {
4584         #ifdef HAVE_CMOV_IMM
4585         if(s2l>=0) emit_cmp(s1l,s2l);
4586         else emit_test(s1l,s1l);
4587         emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4588         #else
4589         emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4590         if(s2l>=0) emit_cmp(s1l,s2l);
4591         else emit_test(s1l,s1l);
4592         emit_cmovne_reg(alt,addr);
4593         #endif
4594       }
4595       if((opcode[i]&0x2f)==6) // BLEZ
4596       {
4597         //emit_movimm(ba[i],alt);
4598         //emit_movimm(start+i*4+8,addr);
4599         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4600         emit_cmpimm(s1l,1);
4601         emit_cmovl_reg(alt,addr);
4602       }
4603       if((opcode[i]&0x2f)==7) // BGTZ
4604       {
4605         //emit_movimm(ba[i],addr);
4606         //emit_movimm(start+i*4+8,ntaddr);
4607         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4608         emit_cmpimm(s1l,1);
4609         emit_cmovl_reg(ntaddr,addr);
4610       }
4611       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4612       {
4613         //emit_movimm(ba[i],alt);
4614         //emit_movimm(start+i*4+8,addr);
4615         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4616         emit_test(s1l,s1l);
4617         emit_cmovs_reg(alt,addr);
4618       }
4619       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4620       {
4621         //emit_movimm(ba[i],addr);
4622         //emit_movimm(start+i*4+8,alt);
4623         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4624         emit_test(s1l,s1l);
4625         emit_cmovs_reg(alt,addr);
4626       }
4627       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4628         if(source[i]&0x10000) // BC1T
4629         {
4630           //emit_movimm(ba[i],alt);
4631           //emit_movimm(start+i*4+8,addr);
4632           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4633           emit_testimm(s1l,0x800000);
4634           emit_cmovne_reg(alt,addr);
4635         }
4636         else // BC1F
4637         {
4638           //emit_movimm(ba[i],addr);
4639           //emit_movimm(start+i*4+8,alt);
4640           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4641           emit_testimm(s1l,0x800000);
4642           emit_cmovne_reg(alt,addr);
4643         }
4644       }
4645       emit_writeword(addr,&pcaddr);
4646     }
4647     else
4648     if(itype[i]==RJUMP)
4649     {
4650       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4651       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4652         r=get_reg(branch_regs[i].regmap,RTEMP);
4653       }
4654       emit_writeword(r,&pcaddr);
4655     }
4656     else {SysPrintf("Unknown branch type in do_ccstub\n");abort();}
4657   }
4658   // Update cycle count
4659   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4660   if(stubs[n].a) emit_addimm(HOST_CCREG,CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4661   emit_far_call(cc_interrupt);
4662   if(stubs[n].a) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4663   if(stubs[n].d==TAKEN) {
4664     if(internal_branch(ba[i]))
4665       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4666     else if(itype[i]==RJUMP) {
4667       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4668         emit_readword(&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4669       else
4670         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4671     }
4672   }else if(stubs[n].d==NOTTAKEN) {
4673     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4674     else load_all_regs(branch_regs[i].regmap);
4675   }else if(stubs[n].d==NULLDS) {
4676     // Delay slot instruction is nullified ("likely" branch)
4677     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4678     else load_all_regs(regs[i].regmap);
4679   }else{
4680     load_all_regs(branch_regs[i].regmap);
4681   }
4682   if (stubs[n].retaddr)
4683     emit_jmp(stubs[n].retaddr);
4684   else
4685     do_jump_vaddr(stubs[n].e);
4686 }
4687
4688 static void add_to_linker(void *addr, u_int target, int ext)
4689 {
4690   assert(linkcount < ARRAY_SIZE(link_addr));
4691   link_addr[linkcount].addr = addr;
4692   link_addr[linkcount].target = target;
4693   link_addr[linkcount].ext = ext;
4694   linkcount++;
4695 }
4696
// Write the return address (PC of the instruction after the delay slot)
// into the link register ($31) for a JAL at instruction i.
static void ujump_assemble_write_ra(int i)
{
  int rt;
  unsigned int return_address;
  rt=get_reg(branch_regs[i].regmap,31);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  //assert(rt>=0);
  return_address=start+i*4+8;
  if(rt>=0) {
    #ifdef USE_MINI_HT
    // Insert the return address into the mini hash table so the matching
    // JR $31 can dispatch quickly.
    if(internal_branch(return_address)&&rt1[i+1]!=31) {
      int temp=-1; // note: must be ds-safe
      #ifdef HOST_TEMPREG
      temp=HOST_TEMPREG;
      #endif
      if(temp>=0) do_miniht_insert(return_address,rt,temp);
      else emit_movimm(return_address,rt);
    }
    else
    #endif
    {
      #ifdef REG_PREFETCH
      if(temp>=0)
      {
        if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
      }
      #endif
      emit_movimm(return_address,rt); // PC into link register
      #ifdef IMM_PREFETCH
      emit_prefetch(hash_table_get(return_address));
      #endif
    }
  }
}
4731
// Assemble an unconditional jump (J/JAL) at instruction i, including its
// delay slot, the link-register write for JAL, cycle-count check, and the
// final (possibly linker-patched) jump.
static void ujump_assemble(int i,struct regstat *i_regs)
{
  int ra_done=0;
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  int temp=get_reg(branch_regs[i].regmap,PTEMP);
  if(rt1[i]==31&&temp>=0)
  {
    signed char *i_regmap=i_regs->regmap;
    int return_address=start+i*4+8;
    if(get_reg(branch_regs[i].regmap,31)>0)
    if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  // If the delay slot reads $31, write the return address before it runs.
  if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    ujump_assemble_write_ra(i); // writeback ra for DS
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<rt1[i]);
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
  if(!ra_done&&rt1[i]==31)
    ujump_assemble_write_ra(i);
  int cc,adj;
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
  if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
  load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  if(internal_branch(ba[i]))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  // A jump into another branch's delay slot needs a standalone copy of it.
  if(internal_branch(ba[i])&&is_ds[(ba[i]-start)>>2]) {
    ds_assemble_entry(i);
  }
  else {
    add_to_linker(out,ba[i],internal_branch(ba[i]));
    emit_jmp(0);
  }
}
4780
// Write the return address into the link register for a JALR at
// instruction i.  The delay slot must not clobber the link register.
static void rjump_assemble_write_ra(int i)
{
  int rt,return_address;
  assert(rt1[i+1]!=rt1[i]);
  assert(rt2[i+1]!=rt1[i]);
  rt=get_reg(branch_regs[i].regmap,rt1[i]);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  assert(rt>=0);
  return_address=start+i*4+8;
  #ifdef REG_PREFETCH
  if(temp>=0)
  {
    if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  emit_movimm(return_address,rt); // PC into link register
  #ifdef IMM_PREFETCH
  emit_prefetch(hash_table_get(return_address));
  #endif
}
4801
// Assemble a register jump (JR/JALR) at instruction i, including its delay
// slot, the link-register write for JALR, cycle-count check, and the
// indirect dispatch to the target address held in rs.
static void rjump_assemble(int i,struct regstat *i_regs)
{
  int temp;
  int rs,cc;
  int ra_done=0;
  rs=get_reg(branch_regs[i].regmap,rs1[i]);
  assert(rs>=0);
  if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
    // Delay slot abuse, make a copy of the branch address register
    temp=get_reg(branch_regs[i].regmap,RTEMP);
    assert(temp>=0);
    assert(regs[i].regmap[temp]==RTEMP);
    emit_mov(rs,temp);
    rs=temp;
  }
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  if(rt1[i]==31)
  {
    if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
      signed char *i_regmap=i_regs->regmap;
      int return_address=start+i*4+8;
      if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
    }
  }
  #endif
  #ifdef USE_MINI_HT
  // JR $31 is usually a function return; use the mini hash table.
  if(rs1[i]==31) {
    int rh=get_reg(regs[i].regmap,RHASH);
    if(rh>=0) do_preload_rhash(rh);
  }
  #endif
  // If the delay slot reads the link register, write it before the slot.
  if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    rjump_assemble_write_ra(i);
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<rt1[i]);
  bc_unneeded&=~(1LL<<rs1[i]);
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],CCREG);
  if(!ra_done&&rt1[i]!=0)
    rjump_assemble_write_ra(i);
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  (void)cc;
  #ifdef USE_MINI_HT
  int rh=get_reg(branch_regs[i].regmap,RHASH);
  int ht=get_reg(branch_regs[i].regmap,RHTBL);
  if(rs1[i]==31) {
    if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
    do_preload_rhtbl(ht);
    do_rhash(rs,rh);
  }
  #endif
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
  #ifdef DESTRUCTIVE_WRITEBACK
  // The writeback may have clobbered the jump register; reload it.
  if((branch_regs[i].dirty>>rs)&1) {
    if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
      emit_loadreg(rs1[i],rs);
    }
  }
  #endif
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_load(ht,rh);
  }
  #endif
  //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
  //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
  //assert(adj==0);
  emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  add_stub(CC_STUB,out,NULL,0,i,-1,TAKEN,rs);
  if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
    // special case for RFE
    emit_jmp(0);
  else
    emit_jns(0);
  //load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_jump(rs,rh,ht);
  }
  else
  #endif
  {
    do_jump_vaddr(rs);
  }
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
  #endif
}
4898
4899 static void cjump_assemble(int i,struct regstat *i_regs)
4900 {
4901   signed char *i_regmap=i_regs->regmap;
4902   int cc;
4903   int match;
4904   match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4905   assem_debug("match=%d\n",match);
4906   int s1l,s2l;
4907   int unconditional=0,nop=0;
4908   int invert=0;
4909   int internal=internal_branch(ba[i]);
4910   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4911   if(!match) invert=1;
4912   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4913   if(i>(ba[i]-start)>>2) invert=1;
4914   #endif
4915   #ifdef __aarch64__
4916   invert=1; // because of near cond. branches
4917   #endif
4918
4919   if(ooo[i]) {
4920     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4921     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4922   }
4923   else {
4924     s1l=get_reg(i_regmap,rs1[i]);
4925     s2l=get_reg(i_regmap,rs2[i]);
4926   }
4927   if(rs1[i]==0&&rs2[i]==0)
4928   {
4929     if(opcode[i]&1) nop=1;
4930     else unconditional=1;
4931     //assert(opcode[i]!=5);
4932     //assert(opcode[i]!=7);
4933     //assert(opcode[i]!=0x15);
4934     //assert(opcode[i]!=0x17);
4935   }
4936   else if(rs1[i]==0)
4937   {
4938     s1l=s2l;
4939     s2l=-1;
4940   }
4941   else if(rs2[i]==0)
4942   {
4943     s2l=-1;
4944   }
4945
4946   if(ooo[i]) {
4947     // Out of order execution (delay slot first)
4948     //printf("OOOE\n");
4949     address_generation(i+1,i_regs,regs[i].regmap_entry);
4950     ds_assemble(i+1,i_regs);
4951     int adj;
4952     uint64_t bc_unneeded=branch_regs[i].u;
4953     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
4954     bc_unneeded|=1;
4955     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
4956     load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],rs2[i]);
4957     load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
4958     cc=get_reg(branch_regs[i].regmap,CCREG);
4959     assert(cc==HOST_CCREG);
4960     if(unconditional)
4961       store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4962     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
4963     //assem_debug("cycle count (adj)\n");
4964     if(unconditional) {
4965       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4966       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
4967         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4968         load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4969         if(internal)
4970           assem_debug("branch: internal\n");
4971         else
4972           assem_debug("branch: external\n");
4973         if(internal&&is_ds[(ba[i]-start)>>2]) {
4974           ds_assemble_entry(i);
4975         }
4976         else {
4977           add_to_linker(out,ba[i],internal);
4978           emit_jmp(0);
4979         }
4980         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4981         if(((u_int)out)&7) emit_addnop(0);
4982         #endif
4983       }
4984     }
4985     else if(nop) {
4986       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
4987       void *jaddr=out;
4988       emit_jns(0);
4989       add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
4990     }
4991     else {
4992       void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
4993       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
4994       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4995
4996       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4997       assert(s1l>=0);
4998       if(opcode[i]==4) // BEQ
4999       {
5000         if(s2l>=0) emit_cmp(s1l,s2l);
5001         else emit_test(s1l,s1l);
5002         if(invert){
5003           nottaken=out;
5004           emit_jne(DJT_1);
5005         }else{
5006           add_to_linker(out,ba[i],internal);
5007           emit_jeq(0);
5008         }
5009       }
5010       if(opcode[i]==5) // BNE
5011       {
5012         if(s2l>=0) emit_cmp(s1l,s2l);
5013         else emit_test(s1l,s1l);
5014         if(invert){
5015           nottaken=out;
5016           emit_jeq(DJT_1);
5017         }else{
5018           add_to_linker(out,ba[i],internal);
5019           emit_jne(0);
5020         }
5021       }
5022       if(opcode[i]==6) // BLEZ
5023       {
5024         emit_cmpimm(s1l,1);
5025         if(invert){
5026           nottaken=out;
5027           emit_jge(DJT_1);
5028         }else{
5029           add_to_linker(out,ba[i],internal);
5030           emit_jl(0);
5031         }
5032       }
5033       if(opcode[i]==7) // BGTZ
5034       {
5035         emit_cmpimm(s1l,1);
5036         if(invert){
5037           nottaken=out;
5038           emit_jl(DJT_1);
5039         }else{
5040           add_to_linker(out,ba[i],internal);
5041           emit_jge(0);
5042         }
5043       }
5044       if(invert) {
5045         if(taken) set_jump_target(taken, out);
5046         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5047         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5048           if(adj) {
5049             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5050             add_to_linker(out,ba[i],internal);
5051           }else{
5052             emit_addnop(13);
5053             add_to_linker(out,ba[i],internal*2);
5054           }
5055           emit_jmp(0);
5056         }else
5057         #endif
5058         {
5059           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5060           store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5061           load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5062           if(internal)
5063             assem_debug("branch: internal\n");
5064           else
5065             assem_debug("branch: external\n");
5066           if(internal&&is_ds[(ba[i]-start)>>2]) {
5067             ds_assemble_entry(i);
5068           }
5069           else {
5070             add_to_linker(out,ba[i],internal);
5071             emit_jmp(0);
5072           }
5073         }
5074         set_jump_target(nottaken, out);
5075       }
5076
5077       if(nottaken1) set_jump_target(nottaken1, out);
5078       if(adj) {
5079         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5080       }
5081     } // (!unconditional)
5082   } // if(ooo)
5083   else
5084   {
5085     // In-order execution (branch first)
5086     //if(likely[i]) printf("IOL\n");
5087     //else
5088     //printf("IOE\n");
5089     void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
5090     if(!unconditional&&!nop) {
5091       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5092       assert(s1l>=0);
5093       if((opcode[i]&0x2f)==4) // BEQ
5094       {
5095         if(s2l>=0) emit_cmp(s1l,s2l);
5096         else emit_test(s1l,s1l);
5097         nottaken=out;
5098         emit_jne(DJT_2);
5099       }
5100       if((opcode[i]&0x2f)==5) // BNE
5101       {
5102         if(s2l>=0) emit_cmp(s1l,s2l);
5103         else emit_test(s1l,s1l);
5104         nottaken=out;
5105         emit_jeq(DJT_2);
5106       }
5107       if((opcode[i]&0x2f)==6) // BLEZ
5108       {
5109         emit_cmpimm(s1l,1);
5110         nottaken=out;
5111         emit_jge(DJT_2);
5112       }
5113       if((opcode[i]&0x2f)==7) // BGTZ
5114       {
5115         emit_cmpimm(s1l,1);
5116         nottaken=out;
5117         emit_jl(DJT_2);
5118       }
5119     } // if(!unconditional)
5120     int adj;
5121     uint64_t ds_unneeded=branch_regs[i].u;
5122     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5123     ds_unneeded|=1;
5124     // branch taken
5125     if(!nop) {
5126       if(taken) set_jump_target(taken, out);
5127       assem_debug("1:\n");
5128       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
5129       // load regs
5130       load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
5131       address_generation(i+1,&branch_regs[i],0);
5132       load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
5133       ds_assemble(i+1,&branch_regs[i]);
5134       cc=get_reg(branch_regs[i].regmap,CCREG);
5135       if(cc==-1) {
5136         emit_loadreg(CCREG,cc=HOST_CCREG);
5137         // CHECK: Is the following instruction (fall thru) allocated ok?
5138       }
5139       assert(cc==HOST_CCREG);
5140       store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5141       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5142       assem_debug("cycle count (adj)\n");
5143       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5144       load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5145       if(internal)
5146         assem_debug("branch: internal\n");
5147       else
5148         assem_debug("branch: external\n");
5149       if(internal&&is_ds[(ba[i]-start)>>2]) {
5150         ds_assemble_entry(i);
5151       }
5152       else {
5153         add_to_linker(out,ba[i],internal);
5154         emit_jmp(0);
5155       }
5156     }
5157     // branch not taken
5158     if(!unconditional) {
5159       if(nottaken1) set_jump_target(nottaken1, out);
5160       set_jump_target(nottaken, out);
5161       assem_debug("2:\n");
5162       if(!likely[i]) {
5163         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
5164         load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
5165         address_generation(i+1,&branch_regs[i],0);
5166         load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
5167         ds_assemble(i+1,&branch_regs[i]);
5168       }
5169       cc=get_reg(branch_regs[i].regmap,CCREG);
5170       if(cc==-1&&!likely[i]) {
5171         // Cycle count isn't in a register, temporarily load it then write it out
5172         emit_loadreg(CCREG,HOST_CCREG);
5173         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5174         void *jaddr=out;
5175         emit_jns(0);
5176         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5177         emit_storereg(CCREG,HOST_CCREG);
5178       }
5179       else{
5180         cc=get_reg(i_regmap,CCREG);
5181         assert(cc==HOST_CCREG);
5182         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5183         void *jaddr=out;
5184         emit_jns(0);
5185         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5186       }
5187     }
5188   }
5189 }
5190
// Assemble a REGIMM branch on the sign of a single GPR:
// BLTZ/BGEZ/BLTZAL/BGEZAL (and "likely" variants).  The AL forms also
// store the return address into r31 whether or not the branch is taken.
// Structure mirrors cjump_assemble: out-of-order (delay slot before the
// test) or in-order (test first) depending on ooo[i].
//   i      - index of the branch instruction within the current block
//   i_regs - register allocation state at this instruction
static void sjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  assem_debug("smatch=%d\n",match);
  int s1l;
  int unconditional=0,nevertaken=0;
  int invert=0;
  int internal=internal_branch(ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  // Invert the condition when we can't jump straight to the target's
  // register allocation (see cjump_assemble)
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;
  #endif
  #ifdef __aarch64__
  invert=1; // because of near cond. branches
  #endif

  //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
  //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)

  if(ooo[i]) {
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
  }
  if(rs1[i]==0)
  {
    // r0 is always zero: BGEZ-type (odd opcode2) becomes unconditional,
    // BLTZ-type can never be taken
    if(opcode2[i]&1) unconditional=1;
    else nevertaken=1;
    // These are never taken (r0 is never less than zero)
    //assert(opcode2[i]!=0);
    //assert(opcode2[i]!=2);
    //assert(opcode2[i]!=0x10);
    //assert(opcode2[i]!=0x12);
  }

  if(ooo[i]) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    uint64_t bc_unneeded=branch_regs[i].u;
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
    load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],rs1[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
    if(rt1[i]==31) {
      // BLTZAL/BGEZAL: link register is written unconditionally
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        if(!nevertaken) emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    assem_debug("cycle count (adj)\n");
    if(unconditional) {
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      // Skip the jump for an idle loop (branch-to-self with nop delay slot)
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nevertaken) {
      // Only the cycle count / interrupt check remains
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      void *nottaken = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      {
        assert(s1l>=0);
        // Branch on the sign flag of the source register
        if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_jns(DJT_1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_js(0);
          }
        }
        if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_js(DJT_1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_jns(0);
          }
        }
      }

      if(invert) {
        // Fall-through is the taken path: write back and jump to target
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken, out);
      }

      if(adj) {
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //printf("IOE\n");
    void *nottaken = NULL;
    if(rt1[i]==31) {
      // Link register write happens before the test, taken or not
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    if(!unconditional) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
        assert(s1l>=0);
        if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_jns(DJT_1);
        }
        if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_js(DJT_1);
        }
    } // if(!unconditional)
    int adj;
    uint64_t ds_unneeded=branch_regs[i].u;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded|=1;
    // branch taken
    if(!nevertaken) {
      //assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    if(!unconditional) {
      set_jump_target(nottaken, out);
      assem_debug("1:\n");
      // "Likely" branches skip the delay slot when not taken
      if(!likely[i]) {
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
        load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}
5457
// Assemble a branch whose delay slot falls on the next page (block ends at
// a page boundary).  Instead of inlining the delay slot, the chosen branch
// target is materialized into HOST_BTREG and control jumps to the compiled
// delay-slot entry for the next page (see pagespan_ds), which then
// dispatches on the saved target.
//   i      - index of the branch instruction within the current block
//   i_regs - register allocation state at this instruction
static void pagespan_assemble(int i,struct regstat *i_regs)
{
  int s1l=get_reg(i_regs->regmap,rs1[i]);
  int s2l=get_reg(i_regs->regmap,rs2[i]);
  void *taken = NULL;
  void *nottaken = NULL;
  int unconditional=0;
  if(rs1[i]==0)
  {
    // Comparing with r0: fold to a single-operand test
    s1l=s2l;
    s2l=-1;
  }
  else if(rs2[i]==0)
  {
    s2l=-1;
  }
  // Pick scratch host registers that don't hold the branch sources:
  // addr = selected target, alt = alternate target, ntaddr = extra
  // register needed by BLEZ/BGTZ
  int hr=0;
  int addr=-1,alt=-1,ntaddr=-1;
  if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
  else {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        addr=hr++;break;
      }
      hr++;
    }
  }
  while(hr<HOST_REGS)
  {
    if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
       (i_regs->regmap[hr]&63)!=rs1[i] &&
       (i_regs->regmap[hr]&63)!=rs2[i] )
    {
      alt=hr++;break;
    }
    hr++;
  }
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
  {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        ntaddr=hr;break;
      }
      hr++;
    }
  }
  assert(hr<HOST_REGS);
  if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
    load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
  }
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  if(opcode[i]==2) // J
  {
    unconditional=1;
  }
  if(opcode[i]==3) // JAL
  {
    // TODO: mini_ht
    int rt=get_reg(i_regs->regmap,31);
    emit_movimm(start+i*4+8,rt);
    unconditional=1;
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    emit_mov(s1l,addr);
    if(opcode2[i]==9) // JALR
    {
      int rt=get_reg(i_regs->regmap,rt1[i]);
      emit_movimm(start+i*4+8,rt);
    }
  }
  // For conditional branches, use conditional moves to select between the
  // branch target ba[i] and the fall-through address start+i*4+8 in 'addr'
  if((opcode[i]&0x3f)==4) // BEQ
  {
    if(rs1[i]==rs2[i])
    {
      unconditional=1;
    }
    else
    #ifdef HAVE_CMOV_IMM
    if(1) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
    }
    else
    #endif
    {
      assert(s1l>=0);
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==5) // BNE
  {
    #ifdef HAVE_CMOV_IMM
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
    #else
    assert(s1l>=0);
    emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    emit_cmovne_reg(alt,addr);
    #endif
  }
  // "Likely" branches use real jumps since the not-taken path must also
  // skip the delay slot
  if((opcode[i]&0x3f)==0x14) // BEQL
  {
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    if(nottaken) set_jump_target(nottaken, out);
    nottaken=out;
    emit_jne(0);
  }
  if((opcode[i]&0x3f)==0x15) // BNEL
  {
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    nottaken=out;
    emit_jeq(0);
    if(taken) set_jump_target(taken, out);
  }
  if((opcode[i]&0x3f)==6) // BLEZ
  {
    emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
    emit_cmpimm(s1l,1);
    emit_cmovl_reg(alt,addr);
  }
  if((opcode[i]&0x3f)==7) // BGTZ
  {
    emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
    emit_cmpimm(s1l,1);
    emit_cmovl_reg(ntaddr,addr);
  }
  if((opcode[i]&0x3f)==0x16) // BLEZL
  {
    assert((opcode[i]&0x3f)!=0x16);
  }
  if((opcode[i]&0x3f)==0x17) // BGTZL
  {
    assert((opcode[i]&0x3f)!=0x17);
  }
  assert(opcode[i]!=1); // BLTZ/BGEZ

  //FIXME: Check CSREG
  if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
    // Coprocessor 1 condition branches: test bit 23 of the status word
    if((source[i]&0x30000)==0) // BC1F
    {
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x10000) // BC1T
    {
      emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x20000) // BC1FL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jne(0);
    }
    if((source[i]&0x30000)==0x30000) // BC1TL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jeq(0);
    }
  }

  assert(i_regs->regmap[HOST_CCREG]==CCREG);
  wb_dirtys(regs[i].regmap,regs[i].dirty);
  // Put the chosen branch target into HOST_BTREG for pagespan_ds to use
  if(likely[i]||unconditional)
  {
    emit_movimm(ba[i],HOST_BTREG);
  }
  else if(addr!=HOST_BTREG)
  {
    emit_mov(addr,HOST_BTREG);
  }
  void *branch_addr=out;
  emit_jmp(0);
  // +5 = (start+i*4+4)|1: address of the delay slot on the next page with
  // the low bit set to mark a delay-slot entry (matches pagespan_ds's
  // start+1 convention) -- NOTE(review): confirm against get_page users
  int target_addr=start+i*4+5;
  void *stub=out;
  void *compiled_target_addr=check_addr(target_addr);
  emit_extjump_ds(branch_addr, target_addr);
  if(compiled_target_addr) {
    set_jump_target(branch_addr, compiled_target_addr);
    add_link(target_addr,stub);
  }
  else set_jump_target(branch_addr, stub);
  if(likely[i]) {
    // Not-taken path
    set_jump_target(nottaken, out);
    wb_dirtys(regs[i].regmap,regs[i].dirty);
    void *branch_addr=out;
    emit_jmp(0);
    // Not-taken target skips the delay slot entirely (start+i*4+8)
    int target_addr=start+i*4+8;
    void *stub=out;
    void *compiled_target_addr=check_addr(target_addr);
    emit_extjump_ds(branch_addr, target_addr);
    if(compiled_target_addr) {
      set_jump_target(branch_addr, compiled_target_addr);
      add_link(target_addr,stub);
    }
    else set_jump_target(branch_addr, stub);
  }
}
5678
// Assemble the delay slot for the above
// Entry point for a block that begins with the delay slot of a branch on
// the previous page (pagespan_assemble jumps here with the branch target
// already in HOST_BTREG / branch_target).  Assembles instruction 0 as a
// delay slot, then dispatches: fall through to start+4 if that is the
// target, otherwise jump to the saved target address.
static void pagespan_ds()
{
  assem_debug("initial delay slot:\n");
  // Low bit set marks this as a delay-slot entry in the jump tables
  u_int vaddr=start+1;
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  ll_add(jump_dirty+vpage,vaddr,(void *)out);
  do_dirty_stub_ds();
  ll_add(jump_in+page,vaddr,(void *)out);
  assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty);
  // Spill the branch target if HOST_BTREG is repurposed by instruction 0
  if(regs[0].regmap[HOST_BTREG]!=BTREG)
    emit_writeword(HOST_BTREG,&branch_target);
  load_regs(regs[0].regmap_entry,regs[0].regmap,rs1[0],rs2[0]);
  address_generation(0,&regs[0],regs[0].regmap_entry);
  // Stores need INVCP for self-modifying-code invalidation checks
  if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
    load_regs(regs[0].regmap_entry,regs[0].regmap,INVCP,INVCP);
  is_delayslot=0;
  // Assemble instruction 0 according to its decoded type
  switch(itype[0]) {
    case ALU:
      alu_assemble(0,&regs[0]);break;
    case IMM16:
      imm16_assemble(0,&regs[0]);break;
    case SHIFT:
      shift_assemble(0,&regs[0]);break;
    case SHIFTIMM:
      shiftimm_assemble(0,&regs[0]);break;
    case LOAD:
      load_assemble(0,&regs[0]);break;
    case LOADLR:
      loadlr_assemble(0,&regs[0]);break;
    case STORE:
      store_assemble(0,&regs[0]);break;
    case STORELR:
      storelr_assemble(0,&regs[0]);break;
    case COP0:
      cop0_assemble(0,&regs[0]);break;
    case COP1:
      cop1_assemble(0,&regs[0]);break;
    case C1LS:
      c1ls_assemble(0,&regs[0]);break;
    case COP2:
      cop2_assemble(0,&regs[0]);break;
    case C2LS:
      c2ls_assemble(0,&regs[0]);break;
    case C2OP:
      c2op_assemble(0,&regs[0]);break;
    case MULTDIV:
      multdiv_assemble(0,&regs[0]);break;
    case MOV:
      mov_assemble(0,&regs[0]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Recover the branch target, reloading it if it was spilled above
  int btaddr=get_reg(regs[0].regmap,BTREG);
  if(btaddr<0) {
    btaddr=get_reg(regs[0].regmap,-1);
    emit_readword(&branch_target,btaddr);
  }
  assert(btaddr!=HOST_CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
#ifdef HOST_IMM8
  // Host can't encode a large compare immediate: go through a temp reg
  host_tempreg_acquire();
  emit_movimm(start+4,HOST_TEMPREG);
  emit_cmp(btaddr,HOST_TEMPREG);
  host_tempreg_release();
#else
  emit_cmpimm(btaddr,start+4);
#endif
  void *branch = out;
  emit_jeq(0);
  // Target differs from the next instruction: dispatch indirectly
  store_regs_bt(regs[0].regmap,regs[0].dirty,-1);
  do_jump_vaddr(btaddr);
  set_jump_target(branch, out);
  // Target is start+4: fall through into the rest of this block
  store_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
  load_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
}
5765
5766 // Basic liveness analysis for MIPS registers
5767 void unneeded_registers(int istart,int iend,int r)
5768 {
5769   int i;
5770   uint64_t u,gte_u,b,gte_b;
5771   uint64_t temp_u,temp_gte_u=0;
5772   uint64_t gte_u_unknown=0;
5773   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
5774     gte_u_unknown=~0ll;
5775   if(iend==slen-1) {
5776     u=1;
5777     gte_u=gte_u_unknown;
5778   }else{
5779     //u=unneeded_reg[iend+1];
5780     u=1;
5781     gte_u=gte_unneeded[iend+1];
5782   }
5783
5784   for (i=iend;i>=istart;i--)
5785   {
5786     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
5787     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
5788     {
5789       // If subroutine call, flag return address as a possible branch target
5790       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
5791
5792       if(ba[i]<start || ba[i]>=(start+slen*4))
5793       {
5794         // Branch out of this block, flush all regs
5795         u=1;
5796         gte_u=gte_u_unknown;
5797         branch_unneeded_reg[i]=u;
5798         // Merge in delay slot
5799         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5800         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5801         u|=1;
5802         gte_u|=gte_rt[i+1];
5803         gte_u&=~gte_rs[i+1];
5804         // If branch is "likely" (and conditional)
5805         // then we skip the delay slot on the fall-thru path
5806         if(likely[i]) {
5807           if(i<slen-1) {
5808             u&=unneeded_reg[i+2];
5809             gte_u&=gte_unneeded[i+2];
5810           }
5811           else
5812           {
5813             u=1;
5814             gte_u=gte_u_unknown;
5815           }
5816         }
5817       }
5818       else
5819       {
5820         // Internal branch, flag target
5821         bt[(ba[i]-start)>>2]=1;
5822         if(ba[i]<=start+i*4) {
5823           // Backward branch
5824           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
5825           {
5826             // Unconditional branch
5827             temp_u=1;
5828             temp_gte_u=0;
5829           } else {
5830             // Conditional branch (not taken case)
5831             temp_u=unneeded_reg[i+2];
5832             temp_gte_u&=gte_unneeded[i+2];
5833           }
5834           // Merge in delay slot
5835           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5836           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5837           temp_u|=1;
5838           temp_gte_u|=gte_rt[i+1];
5839           temp_gte_u&=~gte_rs[i+1];
5840           // If branch is "likely" (and conditional)
5841           // then we skip the delay slot on the fall-thru path
5842           if(likely[i]) {
5843             if(i<slen-1) {
5844               temp_u&=unneeded_reg[i+2];
5845               temp_gte_u&=gte_unneeded[i+2];
5846             }
5847             else
5848             {
5849               temp_u=1;
5850               temp_gte_u=gte_u_unknown;
5851             }
5852           }
5853           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
5854           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5855           temp_u|=1;
5856           temp_gte_u|=gte_rt[i];
5857           temp_gte_u&=~gte_rs[i];
5858           unneeded_reg[i]=temp_u;
5859           gte_unneeded[i]=temp_gte_u;
5860           // Only go three levels deep.  This recursion can take an
5861           // excessive amount of time if there are a lot of nested loops.
5862           if(r<2) {
5863             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
5864           }else{
5865             unneeded_reg[(ba[i]-start)>>2]=1;
5866             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
5867           }
5868         } /*else*/ if(1) {
5869           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
5870           {
5871             // Unconditional branch
5872             u=unneeded_reg[(ba[i]-start)>>2];
5873             gte_u=gte_unneeded[(ba[i]-start)>>2];
5874             branch_unneeded_reg[i]=u;
5875             // Merge in delay slot
5876             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5877             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5878             u|=1;
5879             gte_u|=gte_rt[i+1];
5880             gte_u&=~gte_rs[i+1];
5881           } else {
5882             // Conditional branch
5883             b=unneeded_reg[(ba[i]-start)>>2];
5884             gte_b=gte_unneeded[(ba[i]-start)>>2];
5885             branch_unneeded_reg[i]=b;
5886             // Branch delay slot
5887             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5888             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5889             b|=1;
5890             gte_b|=gte_rt[i+1];
5891             gte_b&=~gte_rs[i+1];
5892             // If branch is "likely" then we skip the
5893             // delay slot on the fall-thru path
5894             if(likely[i]) {
5895               u=b;
5896               gte_u=gte_b;
5897               if(i<slen-1) {
5898                 u&=unneeded_reg[i+2];
5899                 gte_u&=gte_unneeded[i+2];
5900               }
5901             } else {
5902               u&=b;
5903               gte_u&=gte_b;
5904             }
5905             if(i<slen-1) {
5906               branch_unneeded_reg[i]&=unneeded_reg[i+2];
5907             } else {
5908               branch_unneeded_reg[i]=1;
5909             }
5910           }
5911         }
5912       }
5913     }
5914     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
5915     {
5916       // SYSCALL instruction (software interrupt)
5917       u=1;
5918     }
5919     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
5920     {
5921       // ERET instruction (return from interrupt)
5922       u=1;
5923     }
5924     //u=1; // DEBUG
5925     // Written registers are unneeded
5926     u|=1LL<<rt1[i];
5927     u|=1LL<<rt2[i];
5928     gte_u|=gte_rt[i];
5929     // Accessed registers are needed
5930     u&=~(1LL<<rs1[i]);
5931     u&=~(1LL<<rs2[i]);
5932     gte_u&=~gte_rs[i];
5933     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
5934       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
5935     // Source-target dependencies
5936     // R0 is always unneeded
5937     u|=1;
5938     // Save it
5939     unneeded_reg[i]=u;
5940     gte_unneeded[i]=gte_u;
5941     /*
5942     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
5943     printf("U:");
5944     int r;
5945     for(r=1;r<=CCREG;r++) {
5946       if((unneeded_reg[i]>>r)&1) {
5947         if(r==HIREG) printf(" HI");
5948         else if(r==LOREG) printf(" LO");
5949         else printf(" r%d",r);
5950       }
5951     }
5952     printf("\n");
5953     */
5954   }
5955 }
5956
// Write back dirty registers as soon as we will no longer modify them,
// so that we don't end up with lots of writes at the branches.
//
// Walks instructions [istart,iend] backwards, maintaining two per-host-register
// bitmasks: will_dirty (register is certain to be dirtied again later, so a
// writeback now would be wasted) and wont_dirty (register will not be dirtied
// again, so it can be written back now).  When wr is nonzero the computed masks
// are applied to regs[]/branch_regs[] dirty/wasdirty bits; the recursive calls
// into branch targets pass wr==0 and only fill the will_dirty[]/wont_dirty[]
// arrays.
void clean_registers(int istart,int iend,int wr)
{
  int i;
  int r;
  u_int will_dirty_i,will_dirty_next,temp_will_dirty;
  u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
  // Seed from the instruction following iend (or empty at end of block).
  if(iend==slen-1) {
    will_dirty_i=will_dirty_next=0;
    wont_dirty_i=wont_dirty_next=0;
  }else{
    will_dirty_i=will_dirty_next=will_dirty[iend+1];
    wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
  }
  for (i=iend;i>=istart;i--)
  {
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
    {
      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, flush all regs
        if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
        {
          // Unconditional branch
          will_dirty_i=0;
          wont_dirty_i=0;
          // Merge in delay slot (will dirty)
          // NOTE(review): (regmap&63)>33 appears to exclude mappings beyond
          // the regular GPR/HI/LO range from writeback — confirm against the
          // regmap encoding used elsewhere in this file.
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
              if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
              if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
              if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
              if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
            }
          }
        }
        else
        {
          // Conditional branch
          will_dirty_i=0;
          wont_dirty_i=wont_dirty_next;
          // Merge in delay slot (will dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if(!likely[i]) {
                // Might not dirty if likely branch is not taken
                if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              }
            }
          }
        }
        // Merge in delay slot (wont dirty)
        for(r=0;r<HOST_REGS;r++) {
          if(r!=EXCLUDE_REG) {
            if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
            if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
            if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
          }
        }
        if(wr) {
          #ifndef DESTRUCTIVE_WRITEBACK
          branch_regs[i].dirty&=wont_dirty_i;
          #endif
          branch_regs[i].dirty|=will_dirty_i;
        }
      }
      else
      {
        // Internal branch
        if(ba[i]<=start+i*4) {
          // Backward branch
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            temp_will_dirty=0;
            temp_wont_dirty=0;
            // Merge in delay slot (will dirty)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
              }
            }
          } else {
            // Conditional branch (not taken case)
            temp_will_dirty=will_dirty_next;
            temp_wont_dirty=wont_dirty_next;
            // Merge in delay slot (will dirty)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(!likely[i]) {
                  // Will not dirty if likely branch is not taken
                  if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                  if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
                  if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                  if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                  if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                }
              }
            }
          }
          // Merge in delay slot (wont dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
              if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
              if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
            }
          }
          // Deal with changed mappings
          if(i<iend) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]!=regmap_pre[i][r]) {
                  temp_will_dirty&=~(1<<r);
                  temp_wont_dirty&=~(1<<r);
                  if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
                    temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
                    temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
                  } else {
                    temp_will_dirty|=1<<r;
                    temp_wont_dirty|=1<<r;
                  }
                }
              }
            }
          }
          if(wr) {
            will_dirty[i]=temp_will_dirty;
            wont_dirty[i]=temp_wont_dirty;
            // Recurse into the loop body with wr==0 (analysis only).
            clean_registers((ba[i]-start)>>2,i-1,0);
          }else{
            // Limit recursion.  It can take an excessive amount
            // of time if there are a lot of nested loops.
            will_dirty[(ba[i]-start)>>2]=0;
            wont_dirty[(ba[i]-start)>>2]=-1;
          }
        }
        // Deliberately NOT an else: the fall-through merge below also runs
        // for backward branches (see the disabled /*else*/).
        /*else*/ if(1)
        {
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            will_dirty_i=0;
            wont_dirty_i=0;
          //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                if(branch_regs[i].regmap[r]>=0) {
                  will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
                  wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
                }
              }
            }
          //}
            // Merge in delay slot
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              }
            }
          } else {
            // Conditional branch
            will_dirty_i=will_dirty_next;
            wont_dirty_i=wont_dirty_next;
          //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                signed char target_reg=branch_regs[i].regmap[r];
                // Conditional: intersect (&=) will_dirty with the target's
                // state, but union (|=) wont_dirty — the branch may go
                // either way.
                if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                else if(target_reg>=0) {
                  will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                  wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                }
                // Treat delay slot as part of branch too
                /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                else
                {
                  will_dirty[i+1]&=~(1<<r);
                }*/
              }
            }
          //}
            // Merge in delay slot
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(!likely[i]) {
                  // Might not dirty if likely branch is not taken
                  if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                  if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                  if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                  if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                  if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                }
              }
            }
          }
          // Merge in delay slot (won't dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
              if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
              if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
            }
          }
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            branch_regs[i].dirty&=wont_dirty_i;
            #endif
            branch_regs[i].dirty|=will_dirty_i;
          }
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      will_dirty_i=0;
      wont_dirty_i=0;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      will_dirty_i=0;
      wont_dirty_i=0;
    }
    will_dirty_next=will_dirty_i;
    wont_dirty_next=wont_dirty_i;
    // Account for this instruction's own register writes.
    for(r=0;r<HOST_REGS;r++) {
      if(r!=EXCLUDE_REG) {
        if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
        if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
        if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
        if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
        if(i>istart) {
          if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP)
          {
            // Don't store a register immediately after writing it,
            // may prevent dual-issue.
            if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
          }
        }
      }
    }
    // Save it
    will_dirty[i]=will_dirty_i;
    wont_dirty[i]=wont_dirty_i;
    // Mark registers that won't be dirtied as not dirty
    if(wr) {
      /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
      for(r=0;r<HOST_REGS;r++) {
        if((will_dirty_i>>r)&1) {
          printf(" r%d",r);
        }
      }
      printf("\n");*/

      //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP)) {
        regs[i].dirty|=will_dirty_i;
        #ifndef DESTRUCTIVE_WRITEBACK
        regs[i].dirty&=wont_dirty_i;
        if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
        {
          // Propagate wasdirty past the branch + delay slot (i+2).
          if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
                  regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
                }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
              }
            }
          }
        }
        else
        {
          if(i<iend) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
                  regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
                }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
              }
            }
          }
        }
        #endif
      //}
    }
    // Deal with changed mappings
    temp_will_dirty=will_dirty_i;
    temp_wont_dirty=wont_dirty_i;
    for(r=0;r<HOST_REGS;r++) {
      if(r!=EXCLUDE_REG) {
        int nr;
        if(regs[i].regmap[r]==regmap_pre[i][r]) {
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            regs[i].wasdirty&=wont_dirty_i|~(1<<r);
            #endif
            regs[i].wasdirty|=will_dirty_i&(1<<r);
          }
        }
        else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
          // Register moved to a different register
          will_dirty_i&=~(1<<r);
          wont_dirty_i&=~(1<<r);
          will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
          wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            regs[i].wasdirty&=wont_dirty_i|~(1<<r);
            #endif
            regs[i].wasdirty|=will_dirty_i&(1<<r);
          }
        }
        else {
          will_dirty_i&=~(1<<r);
          wont_dirty_i&=~(1<<r);
          if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
            will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
            wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
          } else {
            wont_dirty_i|=1<<r;
            /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
          }
        }
      }
    }
  }
}
6390
#ifdef DISASM
  /* disassembly */
// Print a human-readable line for decoded instruction i, driven by the
// global decode arrays (itype/opcode/opcode2/rs1/rs2/rt1/imm/ba/insn).
// A leading '*' marks instructions that are known branch targets (bt[i]).
void disassemble_inst(int i)
{
    if (bt[i]) printf("*"); else printf(" ");
    switch(itype[i]) {
      case UJUMP:
        printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
      case CJUMP:
        // NOTE(review): when i==0 this prints *ba (ba[0]) instead of
        // computing the target from the immediate — confirm intentional.
        printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
      case SJUMP:
        printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
      case RJUMP:
        // JALR with a non-$31 link register prints both registers
        if (opcode[i]==0x9&&rt1[i]!=31)
          printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
        else
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        break;
      case SPAN:
        printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
      case IMM16:
        if(opcode[i]==0xf) //LUI
          printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
        else
          printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case LOAD:
      case LOADLR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case STORE:
      case STORELR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
        break;
      case ALU:
      case SHIFT:
        printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
        break;
      case MULTDIV:
        printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
        break;
      case SHIFTIMM:
        printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case MOV:
        if((opcode2[i]&0x1d)==0x10) // MFHI/MFLO: writes rt1
          printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
        else if((opcode2[i]&0x1d)==0x11) // MTHI/MTLO: reads rs1
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        else
          printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP0:
        if(opcode2[i]==0)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
        else if(opcode2[i]==4)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP1:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP2:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case C1LS:
        printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case C2LS:
        printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case INTCALL:
        printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
        break;
      default:
        //printf (" %s %8x\n",insn[i],source[i]);
        printf (" %x: %s\n",start+i*4,insn[i]);
    }
}
#else
// Stub when disassembly support is compiled out.
static void disassemble_inst(int i) {}
#endif // DISASM
6481
6482 #define DRC_TEST_VAL 0x74657374
6483
6484 static void new_dynarec_test(void)
6485 {
6486   int (*testfunc)(void);
6487   void *beginning;
6488   int ret[2];
6489   size_t i;
6490
6491   // check structure linkage
6492   if ((u_char *)rcnts - (u_char *)&psxRegs != sizeof(psxRegs))
6493   {
6494     SysPrintf("linkage_arm* miscompilation/breakage detected.\n");
6495   }
6496
6497   SysPrintf("testing if we can run recompiled code...\n");
6498   ((volatile u_int *)out)[0]++; // make cache dirty
6499
6500   for (i = 0; i < ARRAY_SIZE(ret); i++) {
6501     out = ndrc->translation_cache;
6502     beginning = start_block();
6503     emit_movimm(DRC_TEST_VAL + i, 0); // test
6504     emit_ret();
6505     literal_pool(0);
6506     end_block(beginning);
6507     testfunc = beginning;
6508     ret[i] = testfunc();
6509   }
6510
6511   if (ret[0] == DRC_TEST_VAL && ret[1] == DRC_TEST_VAL + 1)
6512     SysPrintf("test passed.\n");
6513   else
6514     SysPrintf("test failed, will likely crash soon (r=%08x %08x)\n", ret[0], ret[1]);
6515   out = ndrc->translation_cache;
6516 }
6517
6518 // clear the state completely, instead of just marking
6519 // things invalid like invalidate_all_pages() does
6520 void new_dynarec_clear_full(void)
6521 {
6522   int n;
6523   out = ndrc->translation_cache;
6524   memset(invalid_code,1,sizeof(invalid_code));
6525   memset(hash_table,0xff,sizeof(hash_table));
6526   memset(mini_ht,-1,sizeof(mini_ht));
6527   memset(restore_candidate,0,sizeof(restore_candidate));
6528   memset(shadow,0,sizeof(shadow));
6529   copy=shadow;
6530   expirep=16384; // Expiry pointer, +2 blocks
6531   pending_exception=0;
6532   literalcount=0;
6533   stop_after_jal=0;
6534   inv_code_start=inv_code_end=~0;
6535   // TLB
6536   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6537   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6538   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6539 }
6540
// One-time recompiler initialization: obtain an executable translation
// cache (platform-specific), reset all state, and run the self-test.
void new_dynarec_init(void)
{
  SysPrintf("Init new dynarec\n");

#ifdef BASE_ADDR_DYNAMIC
  #ifdef VITA
  // Vita: executable memory must come from a kernel VM block
  sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
  if (sceBlock < 0)
    SysPrintf("sceKernelAllocMemBlockForVM failed\n");
  int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&ndrc);
  if (ret < 0)
    SysPrintf("sceKernelGetMemBlockBase failed\n");
  #else
  uintptr_t desired_addr = 0;
  #ifdef __ELF__
  // try to map just past the end of the executable image,
  // rounded up to a 16MB boundary (0x1000000)
  extern char _end;
  desired_addr = ((uintptr_t)&_end + 0xffffff) & ~0xffffffl;
  #endif
  ndrc = mmap((void *)desired_addr, sizeof(*ndrc),
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (ndrc == MAP_FAILED) {
    SysPrintf("mmap() failed: %s\n", strerror(errno));
    abort();
  }
  #endif
#else
  #ifndef NO_WRITE_EXEC
  // not all systems allow execute in data segment by default
  if (mprotect(ndrc, sizeof(ndrc->translation_cache) + sizeof(ndrc->tramp.ops),
               PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    SysPrintf("mprotect() failed: %s\n", strerror(errno));
  #endif
#endif
  out = ndrc->translation_cache;
  // NOTE(review): 200 appears to be a percent-scale default (2.00x) —
  // confirm against the cycle_multiplier consumers.
  cycle_multiplier=200;
  new_dynarec_clear_full();
#ifdef HOST_IMM8
  // Copy this into local area so we don't have to put it in every literal pool
  invc_ptr=invalid_code;
#endif
  arch_init();
  new_dynarec_test();
#ifndef RAM_FIXED
  ram_offset=(uintptr_t)rdram-0x80000000;
#endif
  if (ram_offset!=0)
    SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
}
6590
/* Release everything new_dynarec_init() acquired: unmap/free the
 * translation cache and free all per-page block lists. */
void new_dynarec_cleanup(void)
{
  int n;
#ifdef BASE_ADDR_DYNAMIC
  #ifdef VITA
  sceKernelFreeMemBlock(sceBlock);
  sceBlock = -1; // mark handle as released
  #else
  if (munmap(ndrc, sizeof(*ndrc)) < 0)
    SysPrintf("munmap() failed\n");
  #endif
#endif
  // free the linked-list entries tracking compiled blocks
  for(n=0;n<4096;n++) ll_clear(jump_in+n);
  for(n=0;n<4096;n++) ll_clear(jump_out+n);
  for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
  #ifdef ROM_COPY
  if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
  #endif
}
6610
6611 static u_int *get_source_start(u_int addr, u_int *limit)
6612 {
6613   if (addr < 0x00200000 ||
6614     (0xa0000000 <= addr && addr < 0xa0200000)) {
6615     // used for BIOS calls mostly?
6616     *limit = (addr&0xa0000000)|0x00200000;
6617     return (u_int *)(rdram + (addr&0x1fffff));
6618   }
6619   else if (!Config.HLE && (
6620     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
6621     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
6622     // BIOS
6623     *limit = (addr & 0xfff00000) | 0x80000;
6624     return (u_int *)((u_char *)psxR + (addr&0x7ffff));
6625   }
6626   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
6627     *limit = (addr & 0x80600000) + 0x00200000;
6628     return (u_int *)(rdram + (addr&0x1fffff));
6629   }
6630   return NULL;
6631 }
6632
6633 static u_int scan_for_ret(u_int addr)
6634 {
6635   u_int limit = 0;
6636   u_int *mem;
6637
6638   mem = get_source_start(addr, &limit);
6639   if (mem == NULL)
6640     return addr;
6641
6642   if (limit > addr + 0x1000)
6643     limit = addr + 0x1000;
6644   for (; addr < limit; addr += 4, mem++) {
6645     if (*mem == 0x03e00008) // jr $ra
6646       return addr + 8;
6647   }
6648   return addr;
6649 }
6650
// One compiled-block record stored in savestates.
struct savestate_block {
  uint32_t addr;     // block start address (PSX virtual address)
  uint32_t regflags; // per-GPR speculation flags saved with the block
};

// qsort comparator ordering savestate blocks by address.
// Note: the previous 'p1->addr - p2->addr' is wrong for uint32_t
// operands — the unsigned difference converted to int yields the
// wrong sign once the addresses differ by more than INT_MAX (RAM at
// 0x0xxxxxxx vs BIOS at 0xbfcxxxxx hits this), corrupting the sort.
static int addr_cmp(const void *p1_, const void *p2_)
{
  const struct savestate_block *p1 = p1_, *p2 = p2_;
  return (p1->addr > p2->addr) - (p1->addr < p2->addr);
}
6661
6662 int new_dynarec_save_blocks(void *save, int size)
6663 {
6664   struct savestate_block *blocks = save;
6665   int maxcount = size / sizeof(blocks[0]);
6666   struct savestate_block tmp_blocks[1024];
6667   struct ll_entry *head;
6668   int p, s, d, o, bcnt;
6669   u_int addr;
6670
6671   o = 0;
6672   for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
6673     bcnt = 0;
6674     for (head = jump_in[p]; head != NULL; head = head->next) {
6675       tmp_blocks[bcnt].addr = head->vaddr;
6676       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
6677       bcnt++;
6678     }
6679     if (bcnt < 1)
6680       continue;
6681     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
6682
6683     addr = tmp_blocks[0].addr;
6684     for (s = d = 0; s < bcnt; s++) {
6685       if (tmp_blocks[s].addr < addr)
6686         continue;
6687       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
6688         tmp_blocks[d++] = tmp_blocks[s];
6689       addr = scan_for_ret(tmp_blocks[s].addr);
6690     }
6691
6692     if (o + d > maxcount)
6693       d = maxcount - o;
6694     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
6695     o += d;
6696   }
6697
6698   return o * sizeof(blocks[0]);
6699 }
6700
6701 void new_dynarec_load_blocks(const void *save, int size)
6702 {
6703   const struct savestate_block *blocks = save;
6704   int count = size / sizeof(blocks[0]);
6705   u_int regs_save[32];
6706   uint32_t f;
6707   int i, b;
6708
6709   get_addr(psxRegs.pc);
6710
6711   // change GPRs for speculation to at least partially work..
6712   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
6713   for (i = 1; i < 32; i++)
6714     psxRegs.GPR.r[i] = 0x80000000;
6715
6716   for (b = 0; b < count; b++) {
6717     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6718       if (f & 1)
6719         psxRegs.GPR.r[i] = 0x1f800000;
6720     }
6721
6722     get_addr(blocks[b].addr);
6723
6724     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6725       if (f & 1)
6726         psxRegs.GPR.r[i] = 0x80000000;
6727     }
6728   }
6729
6730   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
6731 }
6732
6733 int new_recompile_block(u_int addr)
6734 {
6735   u_int pagelimit = 0;
6736   u_int state_rflags = 0;
6737   int i;
6738
6739   assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out);
6740   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
6741   //if(debug)
6742   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
6743
6744   // this is just for speculation
6745   for (i = 1; i < 32; i++) {
6746     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
6747       state_rflags |= 1 << i;
6748   }
6749
6750   start = (u_int)addr&~3;
6751   //assert(((u_int)addr&1)==0); // start-in-delay-slot flag
6752   new_dynarec_did_compile=1;
6753   if (Config.HLE && start == 0x80001000) // hlecall
6754   {
6755     // XXX: is this enough? Maybe check hleSoftCall?
6756     void *beginning=start_block();
6757     u_int page=get_page(start);
6758
6759     invalid_code[start>>12]=0;
6760     emit_movimm(start,0);
6761     emit_writeword(0,&pcaddr);
6762     emit_far_jump(new_dyna_leave);
6763     literal_pool(0);
6764     end_block(beginning);
6765     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
6766     return 0;
6767   }
6768
6769   source = get_source_start(start, &pagelimit);
6770   if (source == NULL) {
6771     SysPrintf("Compile at bogus memory address: %08x\n", addr);
6772     abort();
6773   }
6774
6775   /* Pass 1: disassemble */
6776   /* Pass 2: register dependencies, branch targets */
6777   /* Pass 3: register allocation */
6778   /* Pass 4: branch dependencies */
6779   /* Pass 5: pre-alloc */
6780   /* Pass 6: optimize clean/dirty state */
6781   /* Pass 7: flag 32-bit registers */
6782   /* Pass 8: assembly */
6783   /* Pass 9: linker */
6784   /* Pass 10: garbage collection / free memory */
6785
6786   int j;
6787   int done=0;
6788   unsigned int type,op,op2;
6789
6790   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
6791
6792   /* Pass 1 disassembly */
6793
6794   for(i=0;!done;i++) {
6795     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
6796     minimum_free_regs[i]=0;
6797     opcode[i]=op=source[i]>>26;
6798     switch(op)
6799     {
6800       case 0x00: strcpy(insn[i],"special"); type=NI;
6801         op2=source[i]&0x3f;
6802         switch(op2)
6803         {
6804           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
6805           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
6806           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
6807           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
6808           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
6809           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
6810           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
6811           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
6812           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
6813           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
6814           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
6815           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
6816           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
6817           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
6818           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
6819           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
6820           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
6821           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
6822           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
6823           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
6824           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
6825           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
6826           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
6827           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
6828           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
6829           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
6830           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
6831           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
6832           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
6833           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
6834           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
6835           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
6836           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
6837           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
6838           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
6839 #if 0
6840           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
6841           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
6842           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
6843           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
6844           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
6845           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
6846           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
6847           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
6848           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
6849           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
6850           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
6851           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
6852           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
6853           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
6854           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
6855           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
6856           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
6857 #endif
6858         }
6859         break;
6860       case 0x01: strcpy(insn[i],"regimm"); type=NI;
6861         op2=(source[i]>>16)&0x1f;
6862         switch(op2)
6863         {
6864           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
6865           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
6866           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
6867           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
6868           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
6869           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
6870           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
6871           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
6872           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
6873           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
6874           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
6875           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
6876           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
6877           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
6878         }
6879         break;
6880       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
6881       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
6882       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
6883       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
6884       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
6885       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
6886       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
6887       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
6888       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
6889       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
6890       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
6891       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
6892       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
6893       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
6894       case 0x10: strcpy(insn[i],"cop0"); type=NI;
6895         op2=(source[i]>>21)&0x1f;
6896         switch(op2)
6897         {
6898           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
6899           case 0x02: strcpy(insn[i],"CFC0"); type=COP0; break;
6900           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
6901           case 0x06: strcpy(insn[i],"CTC0"); type=COP0; break;
6902           case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
6903         }
6904         break;
6905       case 0x11: strcpy(insn[i],"cop1"); type=COP1;
6906         op2=(source[i]>>21)&0x1f;
6907         break;
6908 #if 0
6909       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
6910       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
6911       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
6912       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
6913       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
6914       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
6915       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
6916       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
6917 #endif
6918       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
6919       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
6920       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
6921       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
6922       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
6923       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
6924       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
6925 #if 0
6926       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
6927 #endif
6928       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
6929       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
6930       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
6931       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
6932 #if 0
6933       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
6934       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
6935 #endif
6936       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
6937       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
6938       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
6939       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
6940 #if 0
6941       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
6942       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
6943       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
6944 #endif
6945       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
6946       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
6947 #if 0
6948       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
6949       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
6950       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
6951 #endif
6952       case 0x12: strcpy(insn[i],"COP2"); type=NI;
6953         op2=(source[i]>>21)&0x1f;
6954         //if (op2 & 0x10)
6955         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
6956           if (gte_handlers[source[i]&0x3f]!=NULL) {
6957             if (gte_regnames[source[i]&0x3f]!=NULL)
6958               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
6959             else
6960               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
6961             type=C2OP;
6962           }
6963         }
6964         else switch(op2)
6965         {
6966           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
6967           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
6968           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
6969           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
6970         }
6971         break;
6972       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
6973       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
6974       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
6975       default: strcpy(insn[i],"???"); type=NI;
6976         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
6977         break;
6978     }
6979     itype[i]=type;
6980     opcode2[i]=op2;
6981     /* Get registers/immediates */
6982     lt1[i]=0;
6983     dep1[i]=0;
6984     dep2[i]=0;
6985     gte_rs[i]=gte_rt[i]=0;
6986     switch(type) {
6987       case LOAD:
6988         rs1[i]=(source[i]>>21)&0x1f;
6989         rs2[i]=0;
6990         rt1[i]=(source[i]>>16)&0x1f;
6991         rt2[i]=0;
6992         imm[i]=(short)source[i];
6993         break;
6994       case STORE:
6995       case STORELR:
6996         rs1[i]=(source[i]>>21)&0x1f;
6997         rs2[i]=(source[i]>>16)&0x1f;
6998         rt1[i]=0;
6999         rt2[i]=0;
7000         imm[i]=(short)source[i];
7001         break;
7002       case LOADLR:
7003         // LWL/LWR only load part of the register,
7004         // therefore the target register must be treated as a source too
7005         rs1[i]=(source[i]>>21)&0x1f;
7006         rs2[i]=(source[i]>>16)&0x1f;
7007         rt1[i]=(source[i]>>16)&0x1f;
7008         rt2[i]=0;
7009         imm[i]=(short)source[i];
7010         if(op==0x26) dep1[i]=rt1[i]; // LWR
7011         break;
7012       case IMM16:
7013         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7014         else rs1[i]=(source[i]>>21)&0x1f;
7015         rs2[i]=0;
7016         rt1[i]=(source[i]>>16)&0x1f;
7017         rt2[i]=0;
7018         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7019           imm[i]=(unsigned short)source[i];
7020         }else{
7021           imm[i]=(short)source[i];
7022         }
7023         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7024         break;
7025       case UJUMP:
7026         rs1[i]=0;
7027         rs2[i]=0;
7028         rt1[i]=0;
7029         rt2[i]=0;
7030         // The JAL instruction writes to r31.
7031         if (op&1) {
7032           rt1[i]=31;
7033         }
7034         rs2[i]=CCREG;
7035         break;
7036       case RJUMP:
7037         rs1[i]=(source[i]>>21)&0x1f;
7038         rs2[i]=0;
7039         rt1[i]=0;
7040         rt2[i]=0;
7041         // The JALR instruction writes to rd.
7042         if (op2&1) {
7043           rt1[i]=(source[i]>>11)&0x1f;
7044         }
7045         rs2[i]=CCREG;
7046         break;
7047       case CJUMP:
7048         rs1[i]=(source[i]>>21)&0x1f;
7049         rs2[i]=(source[i]>>16)&0x1f;
7050         rt1[i]=0;
7051         rt2[i]=0;
7052         if(op&2) { // BGTZ/BLEZ
7053           rs2[i]=0;
7054         }
7055         likely[i]=op>>4;
7056         break;
7057       case SJUMP:
7058         rs1[i]=(source[i]>>21)&0x1f;
7059         rs2[i]=CCREG;
7060         rt1[i]=0;
7061         rt2[i]=0;
7062         if(op2&0x10) { // BxxAL
7063           rt1[i]=31;
7064           // NOTE: If the branch is not taken, r31 is still overwritten
7065         }
7066         likely[i]=(op2&2)>>1;
7067         break;
7068       case ALU:
7069         rs1[i]=(source[i]>>21)&0x1f; // source
7070         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7071         rt1[i]=(source[i]>>11)&0x1f; // destination
7072         rt2[i]=0;
7073         if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7074           dep1[i]=rs1[i];dep2[i]=rs2[i];
7075         }
7076         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7077           dep1[i]=rs1[i];dep2[i]=rs2[i];
7078         }
7079         break;
7080       case MULTDIV:
7081         rs1[i]=(source[i]>>21)&0x1f; // source
7082         rs2[i]=(source[i]>>16)&0x1f; // divisor
7083         rt1[i]=HIREG;
7084         rt2[i]=LOREG;
7085         break;
7086       case MOV:
7087         rs1[i]=0;
7088         rs2[i]=0;
7089         rt1[i]=0;
7090         rt2[i]=0;
7091         if(op2==0x10) rs1[i]=HIREG; // MFHI
7092         if(op2==0x11) rt1[i]=HIREG; // MTHI
7093         if(op2==0x12) rs1[i]=LOREG; // MFLO
7094         if(op2==0x13) rt1[i]=LOREG; // MTLO
7095         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7096         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7097         dep1[i]=rs1[i];
7098         break;
7099       case SHIFT:
7100         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7101         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7102         rt1[i]=(source[i]>>11)&0x1f; // destination
7103         rt2[i]=0;
7104         break;
7105       case SHIFTIMM:
7106         rs1[i]=(source[i]>>16)&0x1f;
7107         rs2[i]=0;
7108         rt1[i]=(source[i]>>11)&0x1f;
7109         rt2[i]=0;
7110         imm[i]=(source[i]>>6)&0x1f;
7111         // DSxx32 instructions
7112         if(op2>=0x3c) imm[i]|=0x20;
7113         break;
7114       case COP0:
7115         rs1[i]=0;
7116         rs2[i]=0;
7117         rt1[i]=0;
7118         rt2[i]=0;
7119         if(op2==0||op2==2) rt1[i]=(source[i]>>16)&0x1F; // MFC0/CFC0
7120         if(op2==4||op2==6) rs1[i]=(source[i]>>16)&0x1F; // MTC0/CTC0
7121         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7122         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7123         break;
7124       case COP1:
7125         rs1[i]=0;
7126         rs2[i]=0;
7127         rt1[i]=0;
7128         rt2[i]=0;
7129         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7130         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7131         rs2[i]=CSREG;
7132         break;
7133       case COP2:
7134         rs1[i]=0;
7135         rs2[i]=0;
7136         rt1[i]=0;
7137         rt2[i]=0;
7138         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7139         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7140         rs2[i]=CSREG;
7141         int gr=(source[i]>>11)&0x1F;
7142         switch(op2)
7143         {
7144           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7145           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7146           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7147           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7148         }
7149         break;
7150       case C1LS:
7151         rs1[i]=(source[i]>>21)&0x1F;
7152         rs2[i]=CSREG;
7153         rt1[i]=0;
7154         rt2[i]=0;
7155         imm[i]=(short)source[i];
7156         break;
7157       case C2LS:
7158         rs1[i]=(source[i]>>21)&0x1F;
7159         rs2[i]=0;
7160         rt1[i]=0;
7161         rt2[i]=0;
7162         imm[i]=(short)source[i];
7163         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7164         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7165         break;
7166       case C2OP:
7167         rs1[i]=0;
7168         rs2[i]=0;
7169         rt1[i]=0;
7170         rt2[i]=0;
7171         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7172         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7173         gte_rt[i]|=1ll<<63; // every op changes flags
7174         if((source[i]&0x3f)==GTE_MVMVA) {
7175           int v = (source[i] >> 15) & 3;
7176           gte_rs[i]&=~0xe3fll;
7177           if(v==3) gte_rs[i]|=0xe00ll;
7178           else gte_rs[i]|=3ll<<(v*2);
7179         }
7180         break;
7181       case SYSCALL:
7182       case HLECALL:
7183       case INTCALL:
7184         rs1[i]=CCREG;
7185         rs2[i]=0;
7186         rt1[i]=0;
7187         rt2[i]=0;
7188         break;
7189       default:
7190         rs1[i]=0;
7191         rs2[i]=0;
7192         rt1[i]=0;
7193         rt2[i]=0;
7194     }
7195     /* Calculate branch target addresses */
7196     if(type==UJUMP)
7197       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7198     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7199       ba[i]=start+i*4+8; // Ignore never taken branch
7200     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7201       ba[i]=start+i*4+8; // Ignore never taken branch
7202     else if(type==CJUMP||type==SJUMP)
7203       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7204     else ba[i]=-1;
7205     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)) {
7206       int do_in_intrp=0;
7207       // branch in delay slot?
7208       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP) {
7209         // don't handle first branch and call interpreter if it's hit
7210         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7211         do_in_intrp=1;
7212       }
7213       // basic load delay detection
7214       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7215         int t=(ba[i-1]-start)/4;
7216         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7217           // jump target wants DS result - potential load delay effect
7218           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7219           do_in_intrp=1;
7220           bt[t+1]=1; // expected return from interpreter
7221         }
7222         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7223               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7224           // v0 overwrite like this is a sign of trouble, bail out
7225           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7226           do_in_intrp=1;
7227         }
7228       }
7229       if(do_in_intrp) {
7230         rs1[i-1]=CCREG;
7231         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7232         ba[i-1]=-1;
7233         itype[i-1]=INTCALL;
7234         done=2;
7235         i--; // don't compile the DS
7236       }
7237     }
7238     /* Is this the end of the block? */
7239     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7240       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7241         done=2;
7242       }
7243       else {
7244         if(stop_after_jal) done=1;
7245         // Stop on BREAK
7246         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7247       }
7248       // Don't recompile stuff that's already compiled
7249       if(check_addr(start+i*4+4)) done=1;
7250       // Don't get too close to the limit
7251       if(i>MAXBLOCK/2) done=1;
7252     }
7253     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7254     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7255     if(done==2) {
7256       // Does the block continue due to a branch?
7257       for(j=i-1;j>=0;j--)
7258       {
7259         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7260         if(ba[j]==start+i*4+4) done=j=0;
7261         if(ba[j]==start+i*4+8) done=j=0;
7262       }
7263     }
7264     //assert(i<MAXBLOCK-1);
7265     if(start+i*4==pagelimit-4) done=1;
7266     assert(start+i*4<pagelimit);
7267     if (i==MAXBLOCK-1) done=1;
7268     // Stop if we're compiling junk
7269     if(itype[i]==NI&&opcode[i]==0x11) {
7270       done=stop_after_jal=1;
7271       SysPrintf("Disabled speculative precompilation\n");
7272     }
7273   }
7274   slen=i;
7275   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP) {
7276     if(start+i*4==pagelimit) {
7277       itype[i-1]=SPAN;
7278     }
7279   }
7280   assert(slen>0);
7281
7282   /* Pass 2 - Register dependencies and branch targets */
7283
7284   unneeded_registers(0,slen-1,0);
7285
7286   /* Pass 3 - Register allocation */
7287
7288   struct regstat current; // Current register allocations/status
7289   current.dirty=0;
7290   current.u=unneeded_reg[0];
7291   clear_all_regs(current.regmap);
7292   alloc_reg(&current,0,CCREG);
7293   dirty_reg(&current,CCREG);
7294   current.isconst=0;
7295   current.wasconst=0;
7296   current.waswritten=0;
7297   int ds=0;
7298   int cc=0;
7299   int hr=-1;
7300
7301   if((u_int)addr&1) {
7302     // First instruction is delay slot
7303     cc=-1;
7304     bt[1]=1;
7305     ds=1;
7306     unneeded_reg[0]=1;
7307     current.regmap[HOST_BTREG]=BTREG;
7308   }
7309
7310   for(i=0;i<slen;i++)
7311   {
7312     if(bt[i])
7313     {
7314       int hr;
7315       for(hr=0;hr<HOST_REGS;hr++)
7316       {
7317         // Is this really necessary?
7318         if(current.regmap[hr]==0) current.regmap[hr]=-1;
7319       }
7320       current.isconst=0;
7321       current.waswritten=0;
7322     }
7323
7324     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
7325     regs[i].wasconst=current.isconst;
7326     regs[i].wasdirty=current.dirty;
7327     regs[i].loadedconst=0;
7328     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP) {
7329       if(i+1<slen) {
7330         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7331         current.u|=1;
7332       } else {
7333         current.u=1;
7334       }
7335     } else {
7336       if(i+1<slen) {
7337         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
7338         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7339         current.u|=1;
7340       } else { SysPrintf("oops, branch at end of block with no delay slot\n");abort(); }
7341     }
    is_ds[i]=ds;
    if(ds) {
      ds=0; // Skip delay slot, already allocated as part of branch
      // ...but we need to alloc it in case something jumps here
      // Recompute the unneeded-register mask for this slot from the branch's
      // view, keeping the slot's own sources (rs1/rs2) marked as needed.
      if(i+1<slen) {
        current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
      }else{
        current.u=branch_unneeded_reg[i-1];
      }
      current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
      current.u|=1; // bit 0 = $zero, never needs to be carried
      // Allocate in a scratch copy so 'current' (the branch's register state)
      // is not disturbed; only the resulting regmap/dirty bits are recorded.
      struct regstat temp;
      memcpy(&temp,&current,sizeof(current));
      temp.wasdirty=temp.dirty;
      // TODO: Take into account unconditional branches, as below
      delayslot_alloc(&temp,i);
      memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
      regs[i].wasdirty=temp.wasdirty;
      regs[i].dirty=temp.dirty;
      regs[i].isconst=0;
      regs[i].wasconst=0;
      current.isconst=0;
      // Create entry (branch target) regmap
      for(hr=0;hr<HOST_REGS;hr++)
      {
        int r=temp.regmap[hr];
        if(r>=0) {
          if(r!=regmap_pre[i][hr]) {
            // Register moved to a different host reg; no fixed entry mapping
            regs[i].regmap_entry[hr]=-1;
          }
          else
          {
              assert(r < 64);
              if((current.u>>r)&1) {
                // Unneeded here: drop it from the entry map and this slot's map
                regs[i].regmap_entry[hr]=-1;
                regs[i].regmap[hr]=-1;
                //Don't clear regs in the delay slot as the branch might need them
                //current.regmap[hr]=-1;
              }else
                regs[i].regmap_entry[hr]=r;
          }
        } else {
          // First instruction expects CCREG to be allocated
          if(i==0&&hr==HOST_CCREG)
            regs[i].regmap_entry[hr]=CCREG;
          else
            regs[i].regmap_entry[hr]=-1;
        }
      }
    }
    else { // Not delay slot
      // Dispatch to the per-instruction-type register allocator.  Branch
      // types additionally set ds=1 so the next iteration treats the
      // following instruction as the (already-allocated) delay slot.
      switch(itype[i]) {
        case UJUMP:
          //current.isconst=0; // DEBUG
          //current.wasconst=0; // DEBUG
          //regs[i].wasconst=0; // DEBUG
          clear_const(&current,rt1[i]);
          alloc_cc(&current,i);
          dirty_reg(&current,CCREG);
          if (rt1[i]==31) {
            // JAL: allocate and dirty the link register $ra
            alloc_reg(&current,i,31);
            dirty_reg(&current,31);
            //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
            //assert(rt1[i+1]!=rt1[i]);
            #ifdef REG_PREFETCH
            alloc_reg(&current,i,PTEMP);
            #endif
          }
          ooo[i]=1; // delay slot allocated together with the branch ("out of order")
          delayslot_alloc(&current,i+1);
          //current.isconst=0; // DEBUG
          ds=1;
          //printf("i=%d, isconst=%x\n",i,current.isconst);
          break;
        case RJUMP:
          //current.isconst=0;
          //current.wasconst=0;
          //regs[i].wasconst=0;
          clear_const(&current,rs1[i]);
          clear_const(&current,rt1[i]);
          alloc_cc(&current,i);
          dirty_reg(&current,CCREG);
          if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
            alloc_reg(&current,i,rs1[i]);
            if (rt1[i]!=0) {
              // JALR with a link register
              alloc_reg(&current,i,rt1[i]);
              dirty_reg(&current,rt1[i]);
              assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
              assert(rt1[i+1]!=rt1[i]);
              #ifdef REG_PREFETCH
              alloc_reg(&current,i,PTEMP);
              #endif
            }
            #ifdef USE_MINI_HT
            if(rs1[i]==31) { // JALR
              alloc_reg(&current,i,RHASH);
              alloc_reg(&current,i,RHTBL);
            }
            #endif
            delayslot_alloc(&current,i+1);
          } else {
            // The delay slot overwrites our source register,
            // allocate a temporary register to hold the old value.
            current.isconst=0;
            current.wasconst=0;
            regs[i].wasconst=0;
            delayslot_alloc(&current,i+1);
            current.isconst=0;
            alloc_reg(&current,i,RTEMP);
          }
          //current.isconst=0; // DEBUG
          ooo[i]=1;
          ds=1;
          break;
        case CJUMP:
          //current.isconst=0;
          //current.wasconst=0;
          //regs[i].wasconst=0;
          clear_const(&current,rs1[i]);
          clear_const(&current,rs2[i]);
          if((opcode[i]&0x3E)==4) // BEQ/BNE
          {
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            if(rs1[i]) alloc_reg(&current,i,rs1[i]);
            if(rs2[i]) alloc_reg(&current,i,rs2[i]);
            if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
               (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
              // The delay slot overwrites one of our conditions.
              // Allocate the branch condition registers instead.
              current.isconst=0;
              current.wasconst=0;
              regs[i].wasconst=0;
              if(rs1[i]) alloc_reg(&current,i,rs1[i]);
              if(rs2[i]) alloc_reg(&current,i,rs2[i]);
            }
            else
            {
              ooo[i]=1;
              delayslot_alloc(&current,i+1);
            }
          }
          else
          if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
          {
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
            if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
              // The delay slot overwrites one of our conditions.
              // Allocate the branch condition registers instead.
              current.isconst=0;
              current.wasconst=0;
              regs[i].wasconst=0;
              if(rs1[i]) alloc_reg(&current,i,rs1[i]);
            }
            else
            {
              ooo[i]=1;
              delayslot_alloc(&current,i+1);
            }
          }
          else
          // Don't alloc the delay slot yet because we might not execute it
          if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
          {
            current.isconst=0;
            current.wasconst=0;
            regs[i].wasconst=0;
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
            alloc_reg(&current,i,rs2[i]);
          }
          else
          if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
          {
            current.isconst=0;
            current.wasconst=0;
            regs[i].wasconst=0;
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
          }
          ds=1;
          //current.isconst=0;
          break;
        case SJUMP:
          //current.isconst=0;
          //current.wasconst=0;
          //regs[i].wasconst=0;
          clear_const(&current,rs1[i]);
          clear_const(&current,rt1[i]);
          //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
          if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
          {
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
            if (rt1[i]==31) { // BLTZAL/BGEZAL
              alloc_reg(&current,i,31);
              dirty_reg(&current,31);
              //#ifdef REG_PREFETCH
              //alloc_reg(&current,i,PTEMP);
              //#endif
            }
            if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
               ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
              // Allocate the branch condition registers instead.
              current.isconst=0;
              current.wasconst=0;
              regs[i].wasconst=0;
              if(rs1[i]) alloc_reg(&current,i,rs1[i]);
            }
            else
            {
              ooo[i]=1;
              delayslot_alloc(&current,i+1);
            }
          }
          else
          // Don't alloc the delay slot yet because we might not execute it
          if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
          {
            current.isconst=0;
            current.wasconst=0;
            regs[i].wasconst=0;
            alloc_cc(&current,i);
            dirty_reg(&current,CCREG);
            alloc_reg(&current,i,rs1[i]);
          }
          ds=1;
          //current.isconst=0;
          break;
        case IMM16:
          imm16_alloc(&current,i);
          break;
        case LOAD:
        case LOADLR:
          load_alloc(&current,i);
          break;
        case STORE:
        case STORELR:
          store_alloc(&current,i);
          break;
        case ALU:
          alu_alloc(&current,i);
          break;
        case SHIFT:
          shift_alloc(&current,i);
          break;
        case MULTDIV:
          multdiv_alloc(&current,i);
          break;
        case SHIFTIMM:
          shiftimm_alloc(&current,i);
          break;
        case MOV:
          mov_alloc(&current,i);
          break;
        case COP0:
          cop0_alloc(&current,i);
          break;
        case COP1:
        case COP2:
          cop12_alloc(&current,i);
          break;
        case C1LS:
          c1ls_alloc(&current,i);
          break;
        case C2LS:
          c2ls_alloc(&current,i);
          break;
        case C2OP:
          c2op_alloc(&current,i);
          break;
        case SYSCALL:
        case HLECALL:
        case INTCALL:
          syscall_alloc(&current,i);
          break;
        case SPAN:
          pagespan_alloc(&current,i);
          break;
      }

      // Create entry (branch target) regmap
      for(hr=0;hr<HOST_REGS;hr++)
      {
        int r,or;
        r=current.regmap[hr];
        if(r>=0) {
          if(r!=regmap_pre[i][hr]) {
            // TODO: delay slot (?)
            or=get_reg(regmap_pre[i],r); // Get old mapping for this register
            if(or<0||(r&63)>=TEMPREG){
              regs[i].regmap_entry[hr]=-1;
            }
            else
            {
              // Just move it to a different register
              regs[i].regmap_entry[hr]=r;
              // If it was dirty before, it's still dirty
              if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
            }
          }
          else
          {
            // Unneeded
            if(r==0){
              // $zero: harmless to keep mapped at entry
              regs[i].regmap_entry[hr]=0;
            }
            else
            {
              assert(r<64);
              if((current.u>>r)&1) {
                // Value is unneeded from here on; release the host register
                regs[i].regmap_entry[hr]=-1;
                //regs[i].regmap[hr]=-1;
                current.regmap[hr]=-1;
              }else
                regs[i].regmap_entry[hr]=r;
            }
          }
        } else {
          // Branches expect CCREG to be allocated at the target
          if(regmap_pre[i][hr]==CCREG)
            regs[i].regmap_entry[hr]=CCREG;
          else
            regs[i].regmap_entry[hr]=-1;
        }
      }
      memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
    }
7675
    // Track which GPRs were recently used as the base of a store with a small
    // (<0x800) immediate offset.  NOTE(review): presumably consumed by the
    // store invalidation / self-modifying-code checks elsewhere — the exact
    // consumer is outside this view; confirm against users of 'waswritten'.
    if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
      current.waswritten|=1<<rs1[i-1];
    // A register that gets overwritten no longer refers to the stored-to base
    current.waswritten&=~(1<<rt1[i]);
    current.waswritten&=~(1<<rt2[i]);
    if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
      current.waswritten&=~(1<<rs1[i]);
7682
    /* Branch post-alloc */
    // After a delay slot has been processed, build branch_regs[i-1]: the
    // register state seen on the branch's taken path (condition registers,
    // cycle counter, link register), derived from 'current'.
    if(i>0)
    {
      current.wasdirty=current.dirty;
      switch(itype[i-1]) {
        case UJUMP:
          memcpy(&branch_regs[i-1],&current,sizeof(current));
          branch_regs[i-1].isconst=0;
          branch_regs[i-1].wasconst=0;
          branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
          alloc_cc(&branch_regs[i-1],i-1);
          dirty_reg(&branch_regs[i-1],CCREG);
          if(rt1[i-1]==31) { // JAL
            alloc_reg(&branch_regs[i-1],i-1,31);
            dirty_reg(&branch_regs[i-1],31);
          }
          memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
          break;
        case RJUMP:
          memcpy(&branch_regs[i-1],&current,sizeof(current));
          branch_regs[i-1].isconst=0;
          branch_regs[i-1].wasconst=0;
          branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
          alloc_cc(&branch_regs[i-1],i-1);
          dirty_reg(&branch_regs[i-1],CCREG);
          alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]); // jump target register
          if(rt1[i-1]!=0) { // JALR
            alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
            dirty_reg(&branch_regs[i-1],rt1[i-1]);
          }
          #ifdef USE_MINI_HT
          if(rs1[i-1]==31) { // JALR
            alloc_reg(&branch_regs[i-1],i-1,RHASH);
            alloc_reg(&branch_regs[i-1],i-1,RHTBL);
          }
          #endif
          memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
          break;
        case CJUMP:
          if((opcode[i-1]&0x3E)==4) // BEQ/BNE
          {
            alloc_cc(&current,i-1);
            dirty_reg(&current,CCREG);
            if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
               (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
              // The delay slot overwrote one of our conditions
              // Delay slot goes after the test (in order)
              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
              current.u|=1;
              delayslot_alloc(&current,i);
              current.isconst=0;
            }
            else
            {
              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
              // Alloc the branch condition registers
              if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
              if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
            }
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].isconst=0;
            branch_regs[i-1].wasconst=0;
            memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
            memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
          }
          else
          if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
          {
            alloc_cc(&current,i-1);
            dirty_reg(&current,CCREG);
            if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
              // The delay slot overwrote the branch condition
              // Delay slot goes after the test (in order)
              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
              current.u|=1;
              delayslot_alloc(&current,i);
              current.isconst=0;
            }
            else
            {
              current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
              // Alloc the branch condition register
              alloc_reg(&current,i-1,rs1[i-1]);
            }
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].isconst=0;
            branch_regs[i-1].wasconst=0;
            memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
            memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
          }
          else
          // Alloc the delay slot in case the branch is taken
          if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
          {
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
            alloc_cc(&branch_regs[i-1],i);
            dirty_reg(&branch_regs[i-1],CCREG);
            delayslot_alloc(&branch_regs[i-1],i);
            branch_regs[i-1].isconst=0;
            alloc_reg(&current,i,CCREG); // Not taken path
            dirty_reg(&current,CCREG);
            memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          }
          else
          if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
          {
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
            alloc_cc(&branch_regs[i-1],i);
            dirty_reg(&branch_regs[i-1],CCREG);
            delayslot_alloc(&branch_regs[i-1],i);
            branch_regs[i-1].isconst=0;
            alloc_reg(&current,i,CCREG); // Not taken path
            dirty_reg(&current,CCREG);
            memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          }
          break;
        case SJUMP:
          //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
          if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
          {
            alloc_cc(&current,i-1);
            dirty_reg(&current,CCREG);
            if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
              // The delay slot overwrote the branch condition
              // Delay slot goes after the test (in order)
              current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
              current.u|=1;
              delayslot_alloc(&current,i);
              current.isconst=0;
            }
            else
            {
              current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
              // Alloc the branch condition register
              alloc_reg(&current,i-1,rs1[i-1]);
            }
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].isconst=0;
            branch_regs[i-1].wasconst=0;
            memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
            memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
          }
          else
          // Alloc the delay slot in case the branch is taken
          if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
          {
            memcpy(&branch_regs[i-1],&current,sizeof(current));
            branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
            alloc_cc(&branch_regs[i-1],i);
            dirty_reg(&branch_regs[i-1],CCREG);
            delayslot_alloc(&branch_regs[i-1],i);
            branch_regs[i-1].isconst=0;
            alloc_reg(&current,i,CCREG); // Not taken path
            dirty_reg(&current,CCREG);
            memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
          }
          // FIXME: BLTZAL/BGEZAL
          if(opcode2[i-1]&0x10) { // BxxZAL
            // Linking variants write $ra on the taken path
            alloc_reg(&branch_regs[i-1],i-1,31);
            dirty_reg(&branch_regs[i-1],31);
          }
          break;
      }

      if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
      {
        if(rt1[i-1]==31) // JAL/JALR
        {
          // Subroutine call will return here, don't alloc any registers
          current.dirty=0;
          clear_all_regs(current.regmap);
          alloc_reg(&current,i,CCREG);
          dirty_reg(&current,CCREG);
        }
        else if(i+1<slen)
        {
          // Internal branch will jump here, match registers to caller
          current.dirty=0;
          clear_all_regs(current.regmap);
          alloc_reg(&current,i,CCREG);
          dirty_reg(&current,CCREG);
          // Adopt the regmap of the most recent earlier branch targeting this
          // address...
          for(j=i-1;j>=0;j--)
          {
            if(ba[j]==start+i*4+4) {
              memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
              current.dirty=branch_regs[j].dirty;
              break;
            }
          }
          // ...then invalidate any host reg on which the remaining earlier
          // branches to the same target disagree.
          while(j>=0) {
            if(ba[j]==start+i*4+4) {
              for(hr=0;hr<HOST_REGS;hr++) {
                if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
                  current.regmap[hr]=-1;
                }
                current.dirty&=branch_regs[j].dirty;
              }
            }
            j--;
          }
        }
      }
    }
7890
    // Count cycles in between branches
    // ccadj[i] records the cycle adjustment accumulated since the last
    // branch/syscall; cc is reset there and incremented per instruction,
    // with extra penalties from the heuristics below.
    ccadj[i]=cc;
    if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
    {
      cc=0;
    }
#if !defined(DRC_DBG)
    // Timing heuristics disabled under DRC_DBG so every instruction costs
    // exactly one cycle (easier to compare against a reference core).
    else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
    {
      // GTE runs in parallel until accessed, divide by 2 for a rough guess
      cc+=gte_cycletab[source[i]&0x3f]/2;
    }
    else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues
    {
      cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
    }
    else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
    {
      // Third store in a row (not a branch target): extra write-buffer penalty
      cc+=4;
    }
    else if(itype[i]==C2LS)
    {
      cc+=4;
    }
#endif
    else
    {
      cc++;
    }
7920
    // End-of-iteration bookkeeping: snapshot dirty/const state into regs[i].
    // Delay slots keep the values already written when the branch was
    // processed, so they are skipped here.
    if(!is_ds[i]) {
      regs[i].dirty=current.dirty;
      regs[i].isconst=current.isconst;
      memcpy(constmap[i],current_constmap,sizeof(current_constmap));
    }
    // A host reg whose mapping changed across this instruction cannot carry
    // a known-constant value in from before it.
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
        if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
          regs[i].wasconst&=~(1<<hr);
        }
      }
    }
    // Release the branch-target temp so it doesn't persist past this point
    if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
    regs[i].waswritten=current.waswritten;
7935   }
7936
  /* Pass 4 - Cull unused host registers */
  // Backward liveness scan: nr is a bitmask over host registers (bit hr set
  // = host reg hr still holds a needed value).  Registers not in nr are
  // deallocated so later passes don't spill/reload them pointlessly.

  uint64_t nr=0;

  for (i=slen-1;i>=0;i--)
  {
    int hr;
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
    {
      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, don't need anything
        nr=0;
      }
      else
      {
        // Internal branch
        // Need whatever matches the target
        nr=0;
        int t=(ba[i]-start)>>2;
        for(hr=0;hr<HOST_REGS;hr++)
        {
          if(regs[i].regmap_entry[hr]>=0) {
            if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
          }
        }
      }
      // Conditional branch may need registers for following instructions
      if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
      {
        if(i<slen-2) {
          nr|=needed_reg[i+2];
          for(hr=0;hr<HOST_REGS;hr++)
          {
            if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
            //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
          }
        }
      }
      // Don't need stuff which is overwritten
      //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
      //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
      // Merge in delay slot
      for(hr=0;hr<HOST_REGS;hr++)
      {
        if(!likely[i]) {
          // These are overwritten unless the branch is "likely"
          // and the delay slot is nullified if not taken
          if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
          if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
        }
        // Delay-slot sources are needed
        if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
        if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
        if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
        if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
        if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
          // Stores need the invalid-code pointer
          if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
          if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      nr=0;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      nr=0;
    }
    else // Non-branch
    {
      if(i<slen-1) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
          if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
          if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
          if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
        }
      }
    }
    for(hr=0;hr<HOST_REGS;hr++)
    {
      // Overwritten registers are not needed
      if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
      if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
      if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
      // Source registers are needed
      if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
      if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
      if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
      if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
      if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
        if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
        if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
      }
      // Don't store a register immediately after writing it,
      // may prevent dual-issue.
      // But do so if this is a branch target, otherwise we
      // might have to load the register before the branch.
      if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
        if((regmap_pre[i][hr]>0&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1))) {
          if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
          if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
        }
        if((regs[i].regmap_entry[hr]>0&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) {
          if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
          if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
        }
      }
    }
    // Cycle count is needed at branches.  Assume it is needed at the target too.
    if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==SPAN) {
      if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
      if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
    }
    // Save it
    needed_reg[i]=nr;

    // Deallocate unneeded registers
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(!((nr>>hr)&1)) {
        if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
        if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
           (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
           (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
        {
          if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
          {
            if(likely[i]) {
              regs[i].regmap[hr]=-1;
              regs[i].isconst&=~(1<<hr);
              if(i<slen-2) {
                regmap_pre[i+2][hr]=-1;
                regs[i+2].wasconst&=~(1<<hr);
              }
            }
          }
        }
        if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
        {
          // Branch: a host reg may only be freed if neither the branch, its
          // delay slot, nor any of the allocator's special temps still use it.
          int map=0,temp=0;
          if(itype[i+1]==STORE || itype[i+1]==STORELR ||
             (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
            map=INVCP;
          }
          if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
             itype[i+1]==C1LS || itype[i+1]==C2LS)
            temp=FTEMP;
          if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
             (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
             (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
             regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
             (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
             regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
             regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
             regs[i].regmap[hr]!=map )
          {
            regs[i].regmap[hr]=-1;
            regs[i].isconst&=~(1<<hr);
            // Same test against the taken-path state before freeing it there
            if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
               (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
               (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
               branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
               (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
               branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
               branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
               branch_regs[i].regmap[hr]!=map)
            {
              branch_regs[i].regmap[hr]=-1;
              branch_regs[i].regmap_entry[hr]=-1;
              if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
              {
                if(!likely[i]&&i<slen-2) {
                  regmap_pre[i+2][hr]=-1;
                  regs[i+2].wasconst&=~(1<<hr);
                }
              }
            }
          }
        }
        else
        {
          // Non-branch
          if(i>0)
          {
            int map=-1,temp=-1;
            if(itype[i]==STORE || itype[i]==STORELR ||
                      (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
              map=INVCP;
            }
            if(itype[i]==LOADLR || itype[i]==STORELR ||
               itype[i]==C1LS || itype[i]==C2LS)
              temp=FTEMP;
            if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
               regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
               (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
               (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
            {
              if(i<slen-1&&!is_ds[i]) {
                // Sanity check: the next instruction's pre-map must agree
                // with what we think this host reg held, or pass 3 went wrong.
                assert(regs[i].regmap[hr]<64);
                if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]>0)
                if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
                {
                  SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
                  assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
                }
                regmap_pre[i+1][hr]=-1;
                if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
                regs[i+1].wasconst&=~(1<<hr);
              }
              regs[i].regmap[hr]=-1;
              regs[i].isconst&=~(1<<hr);
            }
          }
        }
      } // if needed
    } // for hr
  }
8158
8159   /* Pass 5 - Pre-allocate registers */
8160
8161   // If a register is allocated during a loop, try to allocate it for the
8162   // entire loop, if possible.  This avoids loading/storing registers
8163   // inside of the loop.
8164
  // f_regmap: tentative host-reg -> guest-reg assignment we try to keep
  // live across a whole loop body (unassigned entries are -1)
8165   signed char f_regmap[HOST_REGS];
8166   clear_all_regs(f_regmap);
8167   for(i=0;i<slen-1;i++)
8168   {
8169     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
8170     {
      // Only consider backward branches whose target lies inside this block (a loop),
      // and whose delay slot is a simple instruction
8171       if(ba[i]>=start && ba[i]<(start+i*4))
8172       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
8173       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
8174       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
8175       ||itype[i+1]==SHIFT||itype[i+1]==COP1
8176       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
8177       {
        // t = index of the loop's target instruction within this block
8178         int t=(ba[i]-start)>>2;
8179         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP)) // loop_preload can't handle jumps into delay slots
8180         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
8181         for(hr=0;hr<HOST_REGS;hr++)
8182         {
8183           if(regs[i].regmap[hr]>=0) {
8184             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8185               // dealloc old register
8186               int n;
8187               for(n=0;n<HOST_REGS;n++)
8188               {
8189                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8190               }
8191               // and alloc new one
8192               f_regmap[hr]=regs[i].regmap[hr];
8193             }
8194           }
8195           if(branch_regs[i].regmap[hr]>=0) {
8196             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
8197               // dealloc old register
8198               int n;
8199               for(n=0;n<HOST_REGS;n++)
8200               {
8201                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
8202               }
8203               // and alloc new one
8204               f_regmap[hr]=branch_regs[i].regmap[hr];
8205             }
8206           }
          // If registers are already tight at the branch, just take the
          // branch-time mapping rather than inventing a new one
8207           if(ooo[i]) {
8208             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
8209               f_regmap[hr]=branch_regs[i].regmap[hr];
8210           }else{
8211             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
8212               f_regmap[hr]=branch_regs[i].regmap[hr];
8213           }
8214           // Avoid dirty->clean transition
8215           #ifdef DESTRUCTIVE_WRITEBACK
8216           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
8217           #endif
8218           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
8219           // case above, however it's always a good idea.  We can't hoist the
8220           // load if the register was already allocated, so there's no point
8221           // wasting time analyzing most of these cases.  It only "succeeds"
8222           // when the mapping was different and the load can be replaced with
8223           // a mov, which is of negligible benefit.  So such cases are
8224           // skipped below.
8225           if(f_regmap[hr]>0) {
8226             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
8227               int r=f_regmap[hr];
              // Scan the loop body [t..i] checking whether the mapping can
              // stay live the whole way through
8228               for(j=t;j<=i;j++)
8229               {
8230                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
8231                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
8232                 assert(r < 64);
8233                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
8234                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
8235                   int k;
8236                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
8237                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
8238                     if(r>63) {
8239                       if(get_reg(regs[i].regmap,r&63)<0) break;
8240                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
8241                     }
                    // Walk backwards from the branch to find how far the
                    // allocation can be extended toward the earlier use
8242                     k=i;
8243                     while(k>1&&regs[k-1].regmap[hr]==-1) {
8244                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
8245                         //printf("no free regs for store %x\n",start+(k-1)*4);
8246                         break;
8247                       }
8248                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
8249                         //printf("no-match due to different register\n");
8250                         break;
8251                       }
8252                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP) {
8253                         //printf("no-match due to branch\n");
8254                         break;
8255                       }
8256                       // call/ret fast path assumes no registers allocated
8257                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
8258                         break;
8259                       }
8260                       assert(r < 64);
8261                       k--;
8262                     }
8263                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
8264                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
8265                       while(k<i) {
8266                         regs[k].regmap_entry[hr]=f_regmap[hr];
8267                         regs[k].regmap[hr]=f_regmap[hr];
8268                         regmap_pre[k+1][hr]=f_regmap[hr];
8269                         regs[k].wasdirty&=~(1<<hr);
8270                         regs[k].dirty&=~(1<<hr);
8271                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
8272                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
8273                         regs[k].wasconst&=~(1<<hr);
8274                         regs[k].isconst&=~(1<<hr);
8275                         k++;
8276                       }
8277                     }
8278                     else {
8279                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
8280                       break;
8281                     }
                    // Fill in the branch instruction itself plus its delay-slot state
8282                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
8283                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
8284                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
8285                       regs[i].regmap_entry[hr]=f_regmap[hr];
8286                       regs[i].regmap[hr]=f_regmap[hr];
8287                       regs[i].wasdirty&=~(1<<hr);
8288                       regs[i].dirty&=~(1<<hr);
8289                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
8290                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
8291                       regs[i].wasconst&=~(1<<hr);
8292                       regs[i].isconst&=~(1<<hr);
8293                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
8294                       branch_regs[i].wasdirty&=~(1<<hr);
8295                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
8296                       branch_regs[i].regmap[hr]=f_regmap[hr];
8297                       branch_regs[i].dirty&=~(1<<hr);
8298                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
8299                       branch_regs[i].wasconst&=~(1<<hr);
8300                       branch_regs[i].isconst&=~(1<<hr);
8301                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
8302                         regmap_pre[i+2][hr]=f_regmap[hr];
8303                         regs[i+2].wasdirty&=~(1<<hr);
8304                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
8305                       }
8306                     }
8307                   }
8308                   for(k=t;k<j;k++) {
8309                     // Alloc register clean at beginning of loop,
8310                     // but may dirty it in pass 6
8311                     regs[k].regmap_entry[hr]=f_regmap[hr];
8312                     regs[k].regmap[hr]=f_regmap[hr];
8313                     regs[k].dirty&=~(1<<hr);
8314                     regs[k].wasconst&=~(1<<hr);
8315                     regs[k].isconst&=~(1<<hr);
8316                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP) {
8317                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
8318                       branch_regs[k].regmap[hr]=f_regmap[hr];
8319                       branch_regs[k].dirty&=~(1<<hr);
8320                       branch_regs[k].wasconst&=~(1<<hr);
8321                       branch_regs[k].isconst&=~(1<<hr);
8322                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
8323                         regmap_pre[k+2][hr]=f_regmap[hr];
8324                         regs[k+2].wasdirty&=~(1<<hr);
8325                       }
8326                     }
8327                     else
8328                     {
8329                       regmap_pre[k+1][hr]=f_regmap[hr];
8330                       regs[k+1].wasdirty&=~(1<<hr);
8331                     }
8332                   }
8333                   if(regs[j].regmap[hr]==f_regmap[hr])
8334                     regs[j].regmap_entry[hr]=f_regmap[hr];
8335                   break;
8336                 }
8337                 if(j==i) break;
8338                 if(regs[j].regmap[hr]>=0)
8339                   break;
8340                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
8341                   //printf("no-match due to different register\n");
8342                   break;
8343                 }
8344                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
8345                 {
8346                   // Stop on unconditional branch
8347                   break;
8348                 }
8349                 if(itype[j]==CJUMP||itype[j]==SJUMP)
8350                 {
8351                   if(ooo[j]) {
8352                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
8353                       break;
8354                   }else{
8355                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
8356                       break;
8357                   }
8358                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
8359                     //printf("no-match due to different register (branch)\n");
8360                     break;
8361                   }
8362                 }
8363                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
8364                   //printf("No free regs for store %x\n",start+j*4);
8365                   break;
8366                 }
8367                 assert(f_regmap[hr]<64);
8368               }
8369             }
8370           }
8371         }
8372       }
8373     }else{
8374       // Non branch or undetermined branch target
8375       for(hr=0;hr<HOST_REGS;hr++)
8376       {
8377         if(hr!=EXCLUDE_REG) {
8378           if(regs[i].regmap[hr]>=0) {
8379             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8380               // dealloc old register
8381               int n;
8382               for(n=0;n<HOST_REGS;n++)
8383               {
8384                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8385               }
8386               // and alloc new one
8387               f_regmap[hr]=regs[i].regmap[hr];
8388             }
8389           }
8390         }
8391       }
8392       // Try to restore cycle count at branch targets
8393       if(bt[i]) {
8394         for(j=i;j<slen-1;j++) {
8395           if(regs[j].regmap[HOST_CCREG]!=-1) break;
8396           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
8397             //printf("no free regs for store %x\n",start+j*4);
8398             break;
8399           }
8400         }
8401         if(regs[j].regmap[HOST_CCREG]==CCREG) {
8402           int k=i;
8403           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
8404           while(k<j) {
8405             regs[k].regmap_entry[HOST_CCREG]=CCREG;
8406             regs[k].regmap[HOST_CCREG]=CCREG;
8407             regmap_pre[k+1][HOST_CCREG]=CCREG;
8408             regs[k+1].wasdirty|=1<<HOST_CCREG;
8409             regs[k].dirty|=1<<HOST_CCREG;
8410             regs[k].wasconst&=~(1<<HOST_CCREG);
8411             regs[k].isconst&=~(1<<HOST_CCREG);
8412             k++;
8413           }
8414           regs[j].regmap_entry[HOST_CCREG]=CCREG;
8415         }
8416         // Work backwards from the branch target
8417         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
8418         {
8419           //printf("Extend backwards\n");
8420           int k;
8421           k=i;
8422           while(regs[k-1].regmap[HOST_CCREG]==-1) {
8423             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
8424               //printf("no free regs for store %x\n",start+(k-1)*4);
8425               break;
8426             }
8427             k--;
8428           }
8429           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
8430             //printf("Extend CC, %x ->\n",start+k*4);
8431             while(k<=i) {
8432               regs[k].regmap_entry[HOST_CCREG]=CCREG;
8433               regs[k].regmap[HOST_CCREG]=CCREG;
8434               regmap_pre[k+1][HOST_CCREG]=CCREG;
8435               regs[k+1].wasdirty|=1<<HOST_CCREG;
8436               regs[k].dirty|=1<<HOST_CCREG;
8437               regs[k].wasconst&=~(1<<HOST_CCREG);
8438               regs[k].isconst&=~(1<<HOST_CCREG);
8439               k++;
8440             }
8441           }
8442           else {
8443             //printf("Fail Extend CC, %x ->\n",start+k*4);
8444           }
8445         }
8446       }
      // Reset the tentative mapping at instructions whose type may clobber
      // arbitrary registers (anything not in this simple-op whitelist)
8447       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
8448          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
8449          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1)
8450       {
8451         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
8452       }
8453     }
8454   }
8455
8456   // This allocates registers (if possible) one instruction prior
8457   // to use, which can avoid a load-use penalty on certain CPUs.
8458   for(i=0;i<slen-1;i++)
8459   {
    // Skip delay slots (previous instruction was a branch)
8460     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP))
8461     {
8462       if(!bt[i+1])
8463       {
8464         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
8465            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
8466         {
          // Pull the next instruction's source-register allocations one slot
          // earlier so the value is already loaded when i+1 executes
8467           if(rs1[i+1]) {
8468             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
8469             {
8470               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8471               {
8472                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
8473                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
8474                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
8475                 regs[i].isconst&=~(1<<hr);
8476                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8477                 constmap[i][hr]=constmap[i+1][hr];
8478                 regs[i+1].wasdirty&=~(1<<hr);
8479                 regs[i].dirty&=~(1<<hr);
8480               }
8481             }
8482           }
8483           if(rs2[i+1]) {
8484             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
8485             {
8486               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8487               {
8488                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
8489                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
8490                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
8491                 regs[i].isconst&=~(1<<hr);
8492                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8493                 constmap[i][hr]=constmap[i+1][hr];
8494                 regs[i+1].wasdirty&=~(1<<hr);
8495                 regs[i].dirty&=~(1<<hr);
8496               }
8497             }
8498           }
8499           // Preload target address for load instruction (non-constant)
8500           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8501             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
8502             {
8503               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8504               {
8505                 regs[i].regmap[hr]=rs1[i+1];
8506                 regmap_pre[i+1][hr]=rs1[i+1];
8507                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8508                 regs[i].isconst&=~(1<<hr);
8509                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8510                 constmap[i][hr]=constmap[i+1][hr];
8511                 regs[i+1].wasdirty&=~(1<<hr);
8512                 regs[i].dirty&=~(1<<hr);
8513               }
8514             }
8515           }
8516           // Load source into target register
8517           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8518             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
8519             {
8520               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8521               {
8522                 regs[i].regmap[hr]=rs1[i+1];
8523                 regmap_pre[i+1][hr]=rs1[i+1];
8524                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8525                 regs[i].isconst&=~(1<<hr);
8526                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8527                 constmap[i][hr]=constmap[i+1][hr];
8528                 regs[i+1].wasdirty&=~(1<<hr);
8529                 regs[i].dirty&=~(1<<hr);
8530               }
8531             }
8532           }
8533           // Address for store instruction (non-constant)
8534           if(itype[i+1]==STORE||itype[i+1]==STORELR
8535              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
8536             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8537               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
8538               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
8539               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
8540               assert(hr>=0);
8541               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8542               {
8543                 regs[i].regmap[hr]=rs1[i+1];
8544                 regmap_pre[i+1][hr]=rs1[i+1];
8545                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8546                 regs[i].isconst&=~(1<<hr);
8547                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8548                 constmap[i][hr]=constmap[i+1][hr];
8549                 regs[i+1].wasdirty&=~(1<<hr);
8550                 regs[i].dirty&=~(1<<hr);
8551               }
8552             }
8553           }
8554           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
8555             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8556               int nr;
8557               hr=get_reg(regs[i+1].regmap,FTEMP);
8558               assert(hr>=0);
8559               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8560               {
8561                 regs[i].regmap[hr]=rs1[i+1];
8562                 regmap_pre[i+1][hr]=rs1[i+1];
8563                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8564                 regs[i].isconst&=~(1<<hr);
8565                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8566                 constmap[i][hr]=constmap[i+1][hr];
8567                 regs[i+1].wasdirty&=~(1<<hr);
8568                 regs[i].dirty&=~(1<<hr);
8569               }
8570               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
8571               {
8572                 // move it to another register
8573                 regs[i+1].regmap[hr]=-1;
8574                 regmap_pre[i+2][hr]=-1;
8575                 regs[i+1].regmap[nr]=FTEMP;
8576                 regmap_pre[i+2][nr]=FTEMP;
8577                 regs[i].regmap[nr]=rs1[i+1];
8578                 regmap_pre[i+1][nr]=rs1[i+1];
8579                 regs[i+1].regmap_entry[nr]=rs1[i+1];
8580                 regs[i].isconst&=~(1<<nr);
8581                 regs[i+1].isconst&=~(1<<nr);
8582                 regs[i].dirty&=~(1<<nr);
8583                 regs[i+1].wasdirty&=~(1<<nr);
8584                 regs[i+1].dirty&=~(1<<nr);
8585                 regs[i+2].wasdirty&=~(1<<nr);
8586               }
8587             }
8588           }
          // When the address is a known constant, allocate an address-generation
          // temp (AGEN1) one instruction early instead
8589           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
8590             if(itype[i+1]==LOAD)
8591               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
8592             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
8593               hr=get_reg(regs[i+1].regmap,FTEMP);
8594             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
8595               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
8596               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
8597             }
8598             if(hr>=0&&regs[i].regmap[hr]<0) {
8599               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
8600               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
8601                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
8602                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
8603                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
8604                 regs[i].isconst&=~(1<<hr);
8605                 regs[i+1].wasdirty&=~(1<<hr);
8606                 regs[i].dirty&=~(1<<hr);
8607               }
8608             }
8609           }
8610         }
8611       }
8612     }
8613   }
8614
8615   /* Pass 6 - Optimize clean/dirty state */
8616   clean_registers(0,slen-1,1);
8617
8618   /* Pass 7 - Identify 32-bit registers */
  // NOTE(review): despite the pass name, only branch-target marking is
  // visible here in this (PSX) version of the recompiler.
8619   for (i=slen-1;i>=0;i--)
8620   {
8621     if(itype[i]==CJUMP||itype[i]==SJUMP)
8622     {
8623       // Conditional branch
8624       if((source[i]>>16)!=0x1000&&i<slen-2) {
8625         // Mark this address as a branch target since it may be called
8626         // upon return from interrupt
8627         bt[i+2]=1;
8628       }
8629     }
8630   }
8631
8632   if(itype[slen-1]==SPAN) {
8633     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
8634   }
8635
#ifdef DISASM
  /* Debug/disassembly */
  // Per-instruction dump of unneeded registers, pre/entry register maps,
  // dirty and constant bits, and (for branches) the branch-time state.
  // Purely diagnostic; compiled in only with -DDISASM.
8638   for(i=0;i<slen;i++)
8639   {
8640     printf("U:");
8641     int r;
8642     for(r=1;r<=CCREG;r++) {
8643       if((unneeded_reg[i]>>r)&1) {
8644         if(r==HIREG) printf(" HI");
8645         else if(r==LOREG) printf(" LO");
8646         else printf(" r%d",r);
8647       }
8648     }
8649     printf("\n");
8650     #if defined(__i386__) || defined(__x86_64__)
8651     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
8652     #endif
8653     #ifdef __arm__
8654     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
8655     #endif
8656     #if defined(__i386__) || defined(__x86_64__)
8657     printf("needs: ");
8658     if(needed_reg[i]&1) printf("eax ");
8659     if((needed_reg[i]>>1)&1) printf("ecx ");
8660     if((needed_reg[i]>>2)&1) printf("edx ");
8661     if((needed_reg[i]>>3)&1) printf("ebx ");
8662     if((needed_reg[i]>>5)&1) printf("ebp ");
8663     if((needed_reg[i]>>6)&1) printf("esi ");
8664     if((needed_reg[i]>>7)&1) printf("edi ");
8665     printf("\n");
8666     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
8667     printf("dirty: ");
8668     if(regs[i].wasdirty&1) printf("eax ");
8669     if((regs[i].wasdirty>>1)&1) printf("ecx ");
8670     if((regs[i].wasdirty>>2)&1) printf("edx ");
8671     if((regs[i].wasdirty>>3)&1) printf("ebx ");
8672     if((regs[i].wasdirty>>5)&1) printf("ebp ");
8673     if((regs[i].wasdirty>>6)&1) printf("esi ");
8674     if((regs[i].wasdirty>>7)&1) printf("edi ");
8675     #endif
8676     #ifdef __arm__
8677     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
8678     printf("dirty: ");
8679     if(regs[i].wasdirty&1) printf("r0 ");
8680     if((regs[i].wasdirty>>1)&1) printf("r1 ");
8681     if((regs[i].wasdirty>>2)&1) printf("r2 ");
8682     if((regs[i].wasdirty>>3)&1) printf("r3 ");
8683     if((regs[i].wasdirty>>4)&1) printf("r4 ");
8684     if((regs[i].wasdirty>>5)&1) printf("r5 ");
8685     if((regs[i].wasdirty>>6)&1) printf("r6 ");
8686     if((regs[i].wasdirty>>7)&1) printf("r7 ");
8687     if((regs[i].wasdirty>>8)&1) printf("r8 ");
8688     if((regs[i].wasdirty>>9)&1) printf("r9 ");
8689     if((regs[i].wasdirty>>10)&1) printf("r10 ");
8690     if((regs[i].wasdirty>>12)&1) printf("r12 ");
8691     #endif
8692     printf("\n");
8693     disassemble_inst(i);
8694     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
8695     #if defined(__i386__) || defined(__x86_64__)
8696     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
8697     if(regs[i].dirty&1) printf("eax ");
8698     if((regs[i].dirty>>1)&1) printf("ecx ");
8699     if((regs[i].dirty>>2)&1) printf("edx ");
8700     if((regs[i].dirty>>3)&1) printf("ebx ");
8701     if((regs[i].dirty>>5)&1) printf("ebp ");
8702     if((regs[i].dirty>>6)&1) printf("esi ");
8703     if((regs[i].dirty>>7)&1) printf("edi ");
8704     #endif
8705     #ifdef __arm__
8706     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
8707     if(regs[i].dirty&1) printf("r0 ");
8708     if((regs[i].dirty>>1)&1) printf("r1 ");
8709     if((regs[i].dirty>>2)&1) printf("r2 ");
8710     if((regs[i].dirty>>3)&1) printf("r3 ");
8711     if((regs[i].dirty>>4)&1) printf("r4 ");
8712     if((regs[i].dirty>>5)&1) printf("r5 ");
8713     if((regs[i].dirty>>6)&1) printf("r6 ");
8714     if((regs[i].dirty>>7)&1) printf("r7 ");
8715     if((regs[i].dirty>>8)&1) printf("r8 ");
8716     if((regs[i].dirty>>9)&1) printf("r9 ");
8717     if((regs[i].dirty>>10)&1) printf("r10 ");
8718     if((regs[i].dirty>>12)&1) printf("r12 ");
8719     #endif
8720     printf("\n");
8721     if(regs[i].isconst) {
8722       printf("constants: ");
8723       #if defined(__i386__) || defined(__x86_64__)
8724       if(regs[i].isconst&1) printf("eax=%x ",(u_int)constmap[i][0]);
8725       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(u_int)constmap[i][1]);
8726       if((regs[i].isconst>>2)&1) printf("edx=%x ",(u_int)constmap[i][2]);
8727       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(u_int)constmap[i][3]);
8728       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(u_int)constmap[i][5]);
8729       if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]);
8730       if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]);
8731       #endif
8732       #if defined(__arm__) || defined(__aarch64__)
8733       int r;
8734       for (r = 0; r < ARRAY_SIZE(constmap[i]); r++)
8735         if ((regs[i].isconst >> r) & 1)
8736           printf(" r%d=%x", r, (u_int)constmap[i][r]);
8737       #endif
8738       printf("\n");
8739     }
8740     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
8741       #if defined(__i386__) || defined(__x86_64__)
8742       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
8743       if(branch_regs[i].dirty&1) printf("eax ");
8744       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
8745       if((branch_regs[i].dirty>>2)&1) printf("edx ");
8746       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
8747       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
8748       if((branch_regs[i].dirty>>6)&1) printf("esi ");
8749       if((branch_regs[i].dirty>>7)&1) printf("edi ");
8750       #endif
8751       #ifdef __arm__
8752       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
8753       if(branch_regs[i].dirty&1) printf("r0 ");
8754       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
8755       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
8756       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
8757       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
8758       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
8759       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
8760       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
8761       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
8762       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
8763       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
8764       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
8765       #endif
8766     }
8767   }
8768 #endif // DISASM
8769
8770   /* Pass 8 - Assembly */
8771   linkcount=0;stubcount=0;
8772   ds=0;is_delayslot=0;
  // dirty_pre: dirty bits carried from the previous instruction, used by the
  // non-DESTRUCTIVE_WRITEBACK writeback logic in the emit loop below
8773   u_int dirty_pre=0;
8774   void *beginning=start_block();
  // An odd entry address flags a page-spanning delay slot entry point
8775   if((u_int)addr&1) {
8776     ds=1;
8777     pagespan_ds();
8778   }
8779   void *instr_addr0_override = NULL;
8780
8781   if (start == 0x80030000) {
8782     // nasty hack for the fastbios thing
8783     // override block entry to this code
8784     instr_addr0_override = out;
8785     emit_movimm(start,0);
8786     // abuse io address var as a flag that we
8787     // have already returned here once
8788     emit_readword(&address,1);
8789     emit_writeword(0,&pcaddr);
8790     emit_writeword(0,&address);
8791     emit_cmp(0,1);
8792     #ifdef __aarch64__
8793     emit_jeq(out + 4*2);
8794     emit_far_jump(new_dyna_leave);
8795     #else
8796     emit_jne(new_dyna_leave);
8797     #endif
8798   }
  // Main emit loop: one iteration per guest instruction (continues past this view)
8799   for(i=0;i<slen;i++)
8800   {
8801     //if(ds) printf("ds: ");
8802     disassemble_inst(i);
8803     if(ds) {
8804       ds=0; // Skip delay slot
8805       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
8806       instr_addr[i] = NULL;
8807     } else {
8808       speculate_register_values(i);
8809       #ifndef DESTRUCTIVE_WRITEBACK
8810       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
8811       {
8812         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,unneeded_reg[i]);
8813       }
      // Carry dirty state forward; for non-likely conditional branches the
      // branch-time state is what reaches the next instruction
8814       if((itype[i]==CJUMP||itype[i]==SJUMP)&&!likely[i]) {
8815         dirty_pre=branch_regs[i].dirty;
8816       }else{
8817         dirty_pre=regs[i].dirty;
8818       }
8819       #endif
8820       // write back
8821       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
8822       {
8823         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,unneeded_reg[i]);
8824         loop_preload(regmap_pre[i],regs[i].regmap_entry);
8825       }
8826       // branch target entry point
8827       instr_addr[i] = out;
8828       assem_debug("<->\n");
8829       drc_dbg_emit_do_cmp(i);
8830
8831       // load regs
8832       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
8833         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty);
8834       load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i],rs2[i]);
8835       address_generation(i,&regs[i],regs[i].regmap_entry);
8836       load_consts(regmap_pre[i],regs[i].regmap,i);
8837       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
8838       {
8839         // Load the delay slot registers if necessary
8840         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
8841           load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1]);
8842         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
8843           load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1]);
8844         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
8845           load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
8846       }
8847       else if(i+1<slen)
8848       {
8849         // Preload registers for following instruction
8850         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
8851           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
8852             load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1]);
8853         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
8854           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
8855             load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1]);
8856       }
8857       // TODO: if(is_ooo(i)) address_generation(i+1);
8858       if(itype[i]==CJUMP)
8859         load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
8860       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
8861         load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
8862       // assemble
8863       switch(itype[i]) {
8864         case ALU:
8865           alu_assemble(i,&regs[i]);break;
8866         case IMM16:
8867           imm16_assemble(i,&regs[i]);break;
8868         case SHIFT:
8869           shift_assemble(i,&regs[i]);break;
8870         case SHIFTIMM:
8871           shiftimm_assemble(i,&regs[i]);break;
8872         case LOAD:
8873           load_assemble(i,&regs[i]);break;
8874         case LOADLR:
8875           loadlr_assemble(i,&regs[i]);break;
8876         case STORE:
8877           store_assemble(i,&regs[i]);break;
8878         case STORELR:
8879           storelr_assemble(i,&regs[i]);break;
8880         case COP0:
8881           cop0_assemble(i,&regs[i]);break;
8882         case COP1:
8883           cop1_assemble(i,&regs[i]);break;
8884         case C1LS:
8885           c1ls_assemble(i,&regs[i]);break;
8886         case COP2:
8887           cop2_assemble(i,&regs[i]);break;
8888         case C2LS:
8889           c2ls_assemble(i,&regs[i]);break;
8890         case C2OP:
8891           c2op_assemble(i,&regs[i]);break;
8892         case MULTDIV:
8893           multdiv_assemble(i,&regs[i]);break;
8894         case MOV:
8895           mov_assemble(i,&regs[i]);break;
8896         case SYSCALL:
8897           syscall_assemble(i,&regs[i]);break;
8898         case HLECALL:
8899           hlecall_assemble(i,&regs[i]);break;
8900         case INTCALL:
8901           intcall_assemble(i,&regs[i]);break;
8902         case UJUMP:
8903           ujump_assemble(i,&regs[i]);ds=1;break;
8904         case RJUMP:
8905           rjump_assemble(i,&regs[i]);ds=1;break;
8906         case CJUMP:
8907           cjump_assemble(i,&regs[i]);ds=1;break;
8908         case SJUMP:
8909           sjump_assemble(i,&regs[i]);ds=1;break;
8910         case SPAN:
8911           pagespan_assemble(i,&regs[i]);break;
8912       }
8913       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
8914         literal_pool(1024);
8915       else
8916         literal_pool_jumpover(256);
8917     }
8918   }
8919   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
8920   // If the block did not end with an unconditional branch,
8921   // add a jump to the next instruction.
8922   if(i>1) {
8923     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
8924       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP);
8925       assert(i==slen);
8926       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP) {
8927         store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
8928         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
8929           emit_loadreg(CCREG,HOST_CCREG);
8930         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
8931       }
8932       else if(!likely[i-2])
8933       {
8934         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].dirty,start+i*4);
8935         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
8936       }
8937       else
8938       {
8939         store_regs_bt(regs[i-2].regmap,regs[i-2].dirty,start+i*4);
8940         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
8941       }
8942       add_to_linker(out,start+i*4,0);
8943       emit_jmp(0);
8944     }
8945   }
8946   else
8947   {
8948     assert(i>0);
8949     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP);
8950     store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
8951     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
8952       emit_loadreg(CCREG,HOST_CCREG);
8953     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
8954     add_to_linker(out,start+i*4,0);
8955     emit_jmp(0);
8956   }
8957
8958   // TODO: delay slot stubs?
8959   // Stubs
8960   for(i=0;i<stubcount;i++)
8961   {
8962     switch(stubs[i].type)
8963     {
8964       case LOADB_STUB:
8965       case LOADH_STUB:
8966       case LOADW_STUB:
8967       case LOADD_STUB:
8968       case LOADBU_STUB:
8969       case LOADHU_STUB:
8970         do_readstub(i);break;
8971       case STOREB_STUB:
8972       case STOREH_STUB:
8973       case STOREW_STUB:
8974       case STORED_STUB:
8975         do_writestub(i);break;
8976       case CC_STUB:
8977         do_ccstub(i);break;
8978       case INVCODE_STUB:
8979         do_invstub(i);break;
8980       case FP_STUB:
8981         do_cop1stub(i);break;
8982       case STORELR_STUB:
8983         do_unalignedwritestub(i);break;
8984     }
8985   }
8986
8987   if (instr_addr0_override)
8988     instr_addr[0] = instr_addr0_override;
8989
8990   /* Pass 9 - Linker */
8991   for(i=0;i<linkcount;i++)
8992   {
8993     assem_debug("%p -> %8x\n",link_addr[i].addr,link_addr[i].target);
8994     literal_pool(64);
8995     if (!link_addr[i].ext)
8996     {
8997       void *stub = out;
8998       void *addr = check_addr(link_addr[i].target);
8999       emit_extjump(link_addr[i].addr, link_addr[i].target);
9000       if (addr) {
9001         set_jump_target(link_addr[i].addr, addr);
9002         add_link(link_addr[i].target,stub);
9003       }
9004       else
9005         set_jump_target(link_addr[i].addr, stub);
9006     }
9007     else
9008     {
9009       // Internal branch
9010       int target=(link_addr[i].target-start)>>2;
9011       assert(target>=0&&target<slen);
9012       assert(instr_addr[target]);
9013       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9014       //set_jump_target_fillslot(link_addr[i].addr,instr_addr[target],link_addr[i].ext>>1);
9015       //#else
9016       set_jump_target(link_addr[i].addr, instr_addr[target]);
9017       //#endif
9018     }
9019   }
9020   // External Branch Targets (jump_in)
9021   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
9022   for(i=0;i<slen;i++)
9023   {
9024     if(bt[i]||i==0)
9025     {
9026       if(instr_addr[i]) // TODO - delay slots (=null)
9027       {
9028         u_int vaddr=start+i*4;
9029         u_int page=get_page(vaddr);
9030         u_int vpage=get_vpage(vaddr);
9031         literal_pool(256);
9032         {
9033           assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
9034           assem_debug("jump_in: %x\n",start+i*4);
9035           ll_add(jump_dirty+vpage,vaddr,out);
9036           void *entry_point = do_dirty_stub(i);
9037           ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
9038           // If there was an existing entry in the hash table,
9039           // replace it with the new address.
9040           // Don't add new entries.  We'll insert the
9041           // ones that actually get used in check_addr().
9042           struct ht_entry *ht_bin = hash_table_get(vaddr);
9043           if (ht_bin->vaddr[0] == vaddr)
9044             ht_bin->tcaddr[0] = entry_point;
9045           if (ht_bin->vaddr[1] == vaddr)
9046             ht_bin->tcaddr[1] = entry_point;
9047         }
9048       }
9049     }
9050   }
9051   // Write out the literal pool if necessary
9052   literal_pool(0);
9053   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9054   // Align code
9055   if(((u_int)out)&7) emit_addnop(13);
9056   #endif
9057   assert(out - (u_char *)beginning < MAX_OUTPUT_BLOCK_SIZE);
9058   //printf("shadow buffer: %p-%p\n",copy,(u_char *)copy+slen*4);
9059   memcpy(copy,source,slen*4);
9060   copy+=slen*4;
9061
9062   end_block(beginning);
9063
9064   // If we're within 256K of the end of the buffer,
9065   // start over from the beginning. (Is 256K enough?)
9066   if (out > ndrc->translation_cache + sizeof(ndrc->translation_cache) - MAX_OUTPUT_BLOCK_SIZE)
9067     out = ndrc->translation_cache;
9068
9069   // Trap writes to any of the pages we compiled
9070   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
9071     invalid_code[i]=0;
9072   }
9073   inv_code_start=inv_code_end=~0;
9074
9075   // for PCSX we need to mark all mirrors too
9076   if(get_page(start)<(RAM_SIZE>>12))
9077     for(i=start>>12;i<=(start+slen*4)>>12;i++)
9078       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
9079       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
9080       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
9081
9082   /* Pass 10 - Free memory by expiring oldest blocks */
9083
9084   int end=(((out-ndrc->translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535;
9085   while(expirep!=end)
9086   {
9087     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
9088     uintptr_t base=(uintptr_t)ndrc->translation_cache+((expirep>>13)<<shift); // Base address of this block
9089     inv_debug("EXP: Phase %d\n",expirep);
9090     switch((expirep>>11)&3)
9091     {
9092       case 0:
9093         // Clear jump_in and jump_dirty
9094         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
9095         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
9096         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
9097         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
9098         break;
9099       case 1:
9100         // Clear pointers
9101         ll_kill_pointers(jump_out[expirep&2047],base,shift);
9102         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
9103         break;
9104       case 2:
9105         // Clear hash table
9106         for(i=0;i<32;i++) {
9107           struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
9108           if (((uintptr_t)ht_bin->tcaddr[1]>>shift) == (base>>shift) ||
9109              (((uintptr_t)ht_bin->tcaddr[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
9110             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
9111             ht_bin->vaddr[1] = -1;
9112             ht_bin->tcaddr[1] = NULL;
9113           }
9114           if (((uintptr_t)ht_bin->tcaddr[0]>>shift) == (base>>shift) ||
9115              (((uintptr_t)ht_bin->tcaddr[0]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
9116             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
9117             ht_bin->vaddr[0] = ht_bin->vaddr[1];
9118             ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
9119             ht_bin->vaddr[1] = -1;
9120             ht_bin->tcaddr[1] = NULL;
9121           }
9122         }
9123         break;
9124       case 3:
9125         // Clear jump_out
9126         if((expirep&2047)==0)
9127           do_clear_cache();
9128         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
9129         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
9130         break;
9131     }
9132     expirep=(expirep+1)&65535;
9133   }
9134   return 0;
9135 }
9136
9137 // vim:shiftwidth=2:expandtab