drc: drop 'likely' branch support, simplify
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h"
39 #include "../psxinterpreter.h"
40 #include "../gte.h"
41 #include "emu_if.h" // emulator interface
42
43 #define noinline __attribute__((noinline,noclone))
44 #ifndef ARRAY_SIZE
45 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
46 #endif
47 #ifndef min
48 #define min(a, b) ((b) < (a) ? (b) : (a))
49 #endif
50 #ifndef max
51 #define max(a, b) ((b) > (a) ? (b) : (a))
52 #endif
53
54 //#define DISASM
55 //#define ASSEM_PRINT
56
57 #ifdef ASSEM_PRINT
58 #define assem_debug printf
59 #else
60 #define assem_debug(...)
61 #endif
62 //#define inv_debug printf
63 #define inv_debug(...)
64
65 #ifdef __i386__
66 #include "assem_x86.h"
67 #endif
68 #ifdef __x86_64__
69 #include "assem_x64.h"
70 #endif
71 #ifdef __arm__
72 #include "assem_arm.h"
73 #endif
74 #ifdef __aarch64__
75 #include "assem_arm64.h"
76 #endif
77
78 #define RAM_SIZE 0x200000
79 #define MAXBLOCK 4096
80 #define MAX_OUTPUT_BLOCK_SIZE 262144
81
82 struct ndrc_mem
83 {
84   u_char translation_cache[1 << TARGET_SIZE_2];
85   struct
86   {
87     struct tramp_insns ops[2048 / sizeof(struct tramp_insns)];
88     const void *f[2048 / sizeof(void *)];
89   } tramp;
90 };
91
92 #ifdef BASE_ADDR_DYNAMIC
93 static struct ndrc_mem *ndrc;
94 #else
95 static struct ndrc_mem ndrc_ __attribute__((aligned(4096)));
96 static struct ndrc_mem *ndrc = &ndrc_;
97 #endif
98
99 // stubs
100 enum stub_type {
101   CC_STUB = 1,
102   FP_STUB = 2,
103   LOADB_STUB = 3,
104   LOADH_STUB = 4,
105   LOADW_STUB = 5,
106   LOADD_STUB = 6,
107   LOADBU_STUB = 7,
108   LOADHU_STUB = 8,
109   STOREB_STUB = 9,
110   STOREH_STUB = 10,
111   STOREW_STUB = 11,
112   STORED_STUB = 12,
113   STORELR_STUB = 13,
114   INVCODE_STUB = 14,
115 };
116
117 struct regstat
118 {
119   signed char regmap_entry[HOST_REGS];
120   signed char regmap[HOST_REGS];
121   uint64_t wasdirty;
122   uint64_t dirty;
123   uint64_t u;
124   u_int wasconst;
125   u_int isconst;
126   u_int loadedconst;             // host regs that have constants loaded
127   u_int waswritten;              // MIPS regs that were used as store base before
128 };
129
130 // note: asm depends on this layout
131 struct ll_entry
132 {
133   u_int vaddr;
134   u_int reg_sv_flags;
135   void *addr;
136   struct ll_entry *next;
137 };
138
139 struct ht_entry
140 {
141   u_int vaddr[2];
142   void *tcaddr[2];
143 };
144
145 struct code_stub
146 {
147   enum stub_type type;
148   void *addr;
149   void *retaddr;
150   u_int a;
151   uintptr_t b;
152   uintptr_t c;
153   u_int d;
154   u_int e;
155 };
156
157 struct link_entry
158 {
159   void *addr;
160   u_int target;
161   u_int ext;
162 };
163
164 static struct decoded_insn
165 {
166   u_char itype;
167   u_char opcode;
168   u_char opcode2;
169   u_char rs1;
170   u_char rs2;
171   u_char rt1;
172   u_char rt2;
173   u_char lt1;
174   u_char bt:1;
175   u_char ooo:1;
176   u_char is_ds:1;
177   u_char is_jump:1;
178   u_char is_ujump:1;
179 } dops[MAXBLOCK];
180
181   // used by asm:
182   u_char *out;
183   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
184   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
185   struct ll_entry *jump_dirty[4096];
186
187   static struct ll_entry *jump_out[4096];
188   static u_int start;
189   static u_int *source;
190   static char insn[MAXBLOCK][10];
191   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
192   static uint64_t gte_rt[MAXBLOCK];
193   static uint64_t gte_unneeded[MAXBLOCK];
194   static u_int smrv[32]; // speculated MIPS register values
195   static u_int smrv_strong; // mask or regs that are likely to have correct values
196   static u_int smrv_weak; // same, but somewhat less likely
197   static u_int smrv_strong_next; // same, but after current insn executes
198   static u_int smrv_weak_next;
199   static int imm[MAXBLOCK];
200   static u_int ba[MAXBLOCK];
201   static uint64_t unneeded_reg[MAXBLOCK];
202   static uint64_t branch_unneeded_reg[MAXBLOCK];
203   static signed char regmap_pre[MAXBLOCK][HOST_REGS]; // pre-instruction i?
204   // contains 'real' consts at [i] insn, but may differ from what's actually
205   // loaded in host reg as 'final' value is always loaded, see get_final_value()
206   static uint32_t current_constmap[HOST_REGS];
207   static uint32_t constmap[MAXBLOCK][HOST_REGS];
208   static struct regstat regs[MAXBLOCK];
209   static struct regstat branch_regs[MAXBLOCK];
210   static signed char minimum_free_regs[MAXBLOCK];
211   static u_int needed_reg[MAXBLOCK];
212   static u_int wont_dirty[MAXBLOCK];
213   static u_int will_dirty[MAXBLOCK];
214   static int ccadj[MAXBLOCK];
215   static int slen;
216   static void *instr_addr[MAXBLOCK];
217   static struct link_entry link_addr[MAXBLOCK];
218   static int linkcount;
219   static struct code_stub stubs[MAXBLOCK*3];
220   static int stubcount;
221   static u_int literals[1024][2];
222   static int literalcount;
223   static int is_delayslot;
224   static char shadow[1048576]  __attribute__((aligned(16)));
225   static void *copy;
226   static int expirep;
227   static u_int stop_after_jal;
228 #ifndef RAM_FIXED
229   static uintptr_t ram_offset;
230 #else
231   static const uintptr_t ram_offset=0;
232 #endif
233
234   int new_dynarec_hacks;
235   int new_dynarec_hacks_pergame;
236   int new_dynarec_hacks_old;
237   int new_dynarec_did_compile;
238
239   #define HACK_ENABLED(x) ((new_dynarec_hacks | new_dynarec_hacks_pergame) & (x))
240
241   extern int cycle_count; // ... until end of the timeslice, counts -N -> 0
242   extern int last_count;  // last absolute target, often = next_interupt
243   extern int pcaddr;
244   extern int pending_exception;
245   extern int branch_target;
246   extern uintptr_t mini_ht[32][2];
247   extern u_char restore_candidate[512];
248
249   /* registers that may be allocated */
250   /* 1-31 gpr */
251 #define LOREG 32 // lo
252 #define HIREG 33 // hi
253 //#define FSREG 34 // FPU status (FCSR)
254 #define CSREG 35 // Coprocessor status
255 #define CCREG 36 // Cycle count
256 #define INVCP 37 // Pointer to invalid_code
257 //#define MMREG 38 // Pointer to memory_map
258 //#define ROREG 39 // ram offset (if rdram!=0x80000000)
259 #define TEMPREG 40
260 #define FTEMP 40 // FPU temporary register
261 #define PTEMP 41 // Prefetch temporary register
262 //#define TLREG 42 // TLB mapping offset
263 #define RHASH 43 // Return address hash
264 #define RHTBL 44 // Return address hash table address
265 #define RTEMP 45 // JR/JALR address register
266 #define MAXREG 45
267 #define AGEN1 46 // Address generation temporary register
268 //#define AGEN2 47 // Address generation temporary register
269 //#define MGEN1 48 // Maptable address generation temporary register
270 //#define MGEN2 49 // Maptable address generation temporary register
271 #define BTREG 50 // Branch target temporary register
272
273   /* instruction types */
274 #define NOP 0     // No operation
275 #define LOAD 1    // Load
276 #define STORE 2   // Store
277 #define LOADLR 3  // Unaligned load
278 #define STORELR 4 // Unaligned store
279 #define MOV 5     // Move
280 #define ALU 6     // Arithmetic/logic
281 #define MULTDIV 7 // Multiply/divide
282 #define SHIFT 8   // Shift by register
283 #define SHIFTIMM 9// Shift by immediate
284 #define IMM16 10  // 16-bit immediate
285 #define RJUMP 11  // Unconditional jump to register
286 #define UJUMP 12  // Unconditional jump
287 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
288 #define SJUMP 14  // Conditional branch (regimm format)
289 #define COP0 15   // Coprocessor 0
290 #define COP1 16   // Coprocessor 1
291 #define C1LS 17   // Coprocessor 1 load/store
292 //#define FJUMP 18  // Conditional branch (floating point)
293 //#define FLOAT 19  // Floating point unit
294 //#define FCONV 20  // Convert integer to float
295 //#define FCOMP 21  // Floating point compare (sets FSREG)
296 #define SYSCALL 22// SYSCALL
297 #define OTHER 23  // Other
298 #define SPAN 24   // Branch/delay slot spans 2 pages
299 #define NI 25     // Not implemented
300 #define HLECALL 26// PCSX fake opcodes for HLE
301 #define COP2 27   // Coprocessor 2 move
302 #define C2LS 28   // Coprocessor 2 load/store
303 #define C2OP 29   // Coprocessor 2 operation
304 #define INTCALL 30// Call interpreter to handle rare corner cases
305
306   /* branch codes */
307 #define TAKEN 1
308 #define NOTTAKEN 2
309 #define NULLDS 3
310
311 #define DJT_1 (void *)1l // no function, just a label in assem_debug log
312 #define DJT_2 (void *)2l
313
314 // asm linkage
315 int new_recompile_block(u_int addr);
316 void *get_addr_ht(u_int vaddr);
317 void invalidate_block(u_int block);
318 void invalidate_addr(u_int addr);
319 void remove_hash(int vaddr);
320 void dyna_linker();
321 void dyna_linker_ds();
322 void verify_code();
323 void verify_code_ds();
324 void cc_interrupt();
325 void fp_exception();
326 void fp_exception_ds();
327 void jump_to_new_pc();
328 void call_gteStall();
329 void new_dyna_leave();
330
331 // Needed by assembler
332 static void wb_register(signed char r,signed char regmap[],uint64_t dirty);
333 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty);
334 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr);
335 static void load_all_regs(signed char i_regmap[]);
336 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
337 static void load_regs_entry(int t);
338 static void load_all_consts(signed char regmap[],u_int dirty,int i);
339 static u_int get_host_reglist(const signed char *regmap);
340
341 static int verify_dirty(const u_int *ptr);
342 static int get_final_value(int hr, int i, int *value);
343 static void add_stub(enum stub_type type, void *addr, void *retaddr,
344   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
345 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
346   int i, int addr_reg, const struct regstat *i_regs, int ccadj, u_int reglist);
347 static void add_to_linker(void *addr, u_int target, int ext);
348 static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override);
349 static void *get_direct_memhandler(void *table, u_int addr,
350   enum stub_type type, uintptr_t *addr_host);
351 static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u_int reglist);
352 static void pass_args(int a0, int a1);
353 static void emit_far_jump(const void *f);
354 static void emit_far_call(const void *f);
355
// Toggle the translation cache between writable and executable.
// Only does anything on platforms with W^X restrictions (NO_WRITE_EXEC);
// is_x != 0 requests execute permission, else write permission.
static void mprotect_w_x(void *start, void *end, int is_x)
{
#ifdef NO_WRITE_EXEC
  #if defined(VITA)
  // *Open* enables write on all memory that was
  // allocated by sceKernelAllocMemBlockForVM()?
  if (is_x)
    sceKernelCloseVMDomain();
  else
    sceKernelOpenVMDomain();
  #else
  // round start down to a 4K page boundary; mprotect() requires it
  u_long mstart = (u_long)start & ~4095ul;
  u_long mend = (u_long)end;
  if (mprotect((void *)mstart, mend - mstart,
               PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
    SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
  #endif
#endif
}
375
// Make [start, end) of the translation cache writable before emitting code.
static void start_tcache_write(void *start, void *end)
{
  mprotect_w_x(start, end, 0);
}
380
// Finish writing code to [start, end): flush the instruction cache
// (needed on ARM/AArch64 only; the mechanism is platform-specific)
// and flip the range back to executable.
static void end_tcache_write(void *start, void *end)
{
#if defined(__arm__) || defined(__aarch64__)
  size_t len = (char *)end - (char *)start;
  #if   defined(__BLACKBERRY_QNX__)
  msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
  #elif defined(__MACH__)
  sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
  #elif defined(VITA)
  sceKernelSyncVMDomain(sceBlock, start, len);
  #elif defined(_3DS)
  ctr_flush_invalidate_cache();
  #elif defined(__aarch64__)
  // as of 2021, __clear_cache() is still broken on arm64
  // so here is a custom one :(
  clear_cache_arm64(start, end);
  #else
  __clear_cache(start, end);
  #endif
  (void)len;  // some branches above don't use len
#endif

  mprotect_w_x(start, end, 1);
}
405
// Begin emitting a new block: make the next MAX_OUTPUT_BLOCK_SIZE bytes
// of the translation cache writable (clamped at the end of the cache)
// and return the current output position.
static void *start_block(void)
{
  u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
  if (end > ndrc->translation_cache + sizeof(ndrc->translation_cache))
    end = ndrc->translation_cache + sizeof(ndrc->translation_cache);
  start_tcache_write(out, end);
  return out;
}
414
// Finish the block started at 'start': flush the icache over everything
// emitted so far (up to 'out') and restore execute permission.
static void end_block(void *start)
{
  end_tcache_write(start, out);
}
419
420 // also takes care of w^x mappings when patching code
421 static u_int needs_clear_cache[1<<(TARGET_SIZE_2-17)];
422
// Mark the 4K page containing 'target' as needing an icache flush and
// make it writable for patching.  needs_clear_cache[] holds one bit per
// page, 32 pages (128K) per entry; already-marked pages are skipped.
static void mark_clear_cache(void *target)
{
  uintptr_t offset = (u_char *)target - ndrc->translation_cache;
  u_int mask = 1u << ((offset >> 12) & 31);
  if (!(needs_clear_cache[offset >> 17] & mask)) {
    char *start = (char *)((uintptr_t)target & ~4095l);
    start_tcache_write(start, start + 4095);
    needs_clear_cache[offset >> 17] |= mask;
  }
}
433
434 // Clearing the cache is rather slow on ARM Linux, so mark the areas
435 // that need to be cleared, and then only clear these areas once.
// Flush all pages marked by mark_clear_cache() and reset the bitmap.
static void do_clear_cache(void)
{
  int i, j;
  for (i = 0; i < (1<<(TARGET_SIZE_2-17)); i++)
  {
    u_int bitmap = needs_clear_cache[i];
    if (!bitmap)
      continue;
    for (j = 0; j < 32; j++)
    {
      u_char *start, *end;
      if (!(bitmap & (1<<j)))
        continue;

      start = ndrc->translation_cache + i*131072 + j*4096;
      end = start + 4095;
      // extend over the run of consecutive marked pages so a whole run
      // is flushed with a single end_tcache_write() call; note j keeps
      // advancing here so the outer loop resumes after the run
      for (j++; j < 32; j++) {
        if (!(bitmap & (1<<j)))
          break;
        end += 4096;
      }
      end_tcache_write(start, end);
    }
    needs_clear_cache[i] = 0;
  }
}
462
463 //#define DEBUG_CYCLE_COUNT 1
464
465 #define NO_CYCLE_PENALTY_THR 12
466
467 int cycle_multiplier; // 100 for 1.0
468 int cycle_multiplier_override;
469 int cycle_multiplier_old;
470
471 static int CLOCK_ADJUST(int x)
472 {
473   int m = cycle_multiplier_override
474         ? cycle_multiplier_override : cycle_multiplier;
475   int s=(x>>31)|1;
476   return (x * m + s * 50) / 100;
477 }
478
479 static int ds_writes_rjump_rs(int i)
480 {
481   return dops[i].rs1 != 0 && (dops[i].rs1 == dops[i+1].rt1 || dops[i].rs1 == dops[i+1].rt2);
482 }
483
484 static u_int get_page(u_int vaddr)
485 {
486   u_int page=vaddr&~0xe0000000;
487   if (page < 0x1000000)
488     page &= ~0x0e00000; // RAM mirrors
489   page>>=12;
490   if(page>2048) page=2048+(page&2047);
491   return page;
492 }
493
// no virtual mem in PCSX, so the dirty-list (virtual) page is the same
// as the clean-list page
static u_int get_vpage(u_int vaddr)
{
  return get_page(vaddr);
}
499
// Map a virtual address to its 2-way hash table bin (64K bins).
static struct ht_entry *hash_table_get(u_int vaddr)
{
  return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
}
504
// Insert (vaddr, tcaddr) as the most-recently-used entry of a bin:
// slot 0 is demoted to slot 1, the old slot-1 entry is discarded.
static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
{
  ht_bin->vaddr[1] = ht_bin->vaddr[0];
  ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
  ht_bin->vaddr[0] = vaddr;
  ht_bin->tcaddr[0] = tcaddr;
}
512
// some messy ari64's code, seems to rely on unsigned 32bit overflow
// Returns nonzero when tcaddr is far enough ahead of the current output
// pointer 'out' (distance scaled into the top bits of a u_int, so wrap
// is intentional) that the block won't be expired from the cache soon.
static int doesnt_expire_soon(void *tcaddr)
{
  u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
  return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
}
519
// Get address from virtual address
// This is called from the recompiled JR/JALR instructions
void noinline *get_addr(u_int vaddr)
{
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  struct ll_entry *head;
  //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
  // 1) look for an already-compiled, clean block on this page
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
  //printf("TRACE: count=%d next=%d (get_addr match %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // cache the hit so get_addr_ht() finds it on the fast path
      hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
      return head->addr;
    }
    head=head->next;
  }
  // 2) try to revalidate a dirty block instead of recompiling
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr))
      if (verify_dirty(head->addr)) {
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        // source still matches the compiled code: mark the page valid again
        invalid_code[vaddr>>12]=0;
        inv_code_start=inv_code_end=~0;
        if(vpage<2048) {
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        struct ht_entry *ht_bin = hash_table_get(vaddr);
        if (ht_bin->vaddr[0] == vaddr)
          ht_bin->tcaddr[0] = head->addr; // Replace existing entry
        else
          hash_table_add(ht_bin, vaddr, head->addr);

        return head->addr;
      }
    }
    head=head->next;
  }
  //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
  // 3) nothing usable - compile the block now; 0 means success
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault exception
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}
574 // Look up address in hash table first
575 void *get_addr_ht(u_int vaddr)
576 {
577   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
578   const struct ht_entry *ht_bin = hash_table_get(vaddr);
579   if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
580   if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
581   return get_addr(vaddr);
582 }
583
584 void clear_all_regs(signed char regmap[])
585 {
586   int hr;
587   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
588 }
589
590 static signed char get_reg(const signed char regmap[],int r)
591 {
592   int hr;
593   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
594   return -1;
595 }
596
597 // Find a register that is available for two consecutive cycles
598 static signed char get_reg2(signed char regmap1[], const signed char regmap2[], int r)
599 {
600   int hr;
601   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
602   return -1;
603 }
604
605 int count_free_regs(signed char regmap[])
606 {
607   int count=0;
608   int hr;
609   for(hr=0;hr<HOST_REGS;hr++)
610   {
611     if(hr!=EXCLUDE_REG) {
612       if(regmap[hr]<0) count++;
613     }
614   }
615   return count;
616 }
617
618 void dirty_reg(struct regstat *cur,signed char reg)
619 {
620   int hr;
621   if(!reg) return;
622   for (hr=0;hr<HOST_REGS;hr++) {
623     if((cur->regmap[hr]&63)==reg) {
624       cur->dirty|=1<<hr;
625     }
626   }
627 }
628
629 static void set_const(struct regstat *cur, signed char reg, uint32_t value)
630 {
631   int hr;
632   if(!reg) return;
633   for (hr=0;hr<HOST_REGS;hr++) {
634     if(cur->regmap[hr]==reg) {
635       cur->isconst|=1<<hr;
636       current_constmap[hr]=value;
637     }
638   }
639 }
640
641 static void clear_const(struct regstat *cur, signed char reg)
642 {
643   int hr;
644   if(!reg) return;
645   for (hr=0;hr<HOST_REGS;hr++) {
646     if((cur->regmap[hr]&63)==reg) {
647       cur->isconst&=~(1<<hr);
648     }
649   }
650 }
651
652 static int is_const(struct regstat *cur, signed char reg)
653 {
654   int hr;
655   if(reg<0) return 0;
656   if(!reg) return 1;
657   for (hr=0;hr<HOST_REGS;hr++) {
658     if((cur->regmap[hr]&63)==reg) {
659       return (cur->isconst>>hr)&1;
660     }
661   }
662   return 0;
663 }
664
665 static uint32_t get_const(struct regstat *cur, signed char reg)
666 {
667   int hr;
668   if(!reg) return 0;
669   for (hr=0;hr<HOST_REGS;hr++) {
670     if(cur->regmap[hr]==reg) {
671       return current_constmap[hr];
672     }
673   }
674   SysPrintf("Unknown constant in r%d\n",reg);
675   abort();
676 }
677
// Least soon needed registers
// Look at the next ten instructions and see which registers
// will be used.  Try not to reallocate these.
// On exit hsn[reg] holds the distance (in instructions) to the next use
// of 'reg'; smaller means needed sooner.  Also pins CCREG around
// branches and various assembler temporaries (FTEMP/RHASH/RHTBL).
void lsn(u_char hsn[], int i, int *preferred_reg)
{
  int j;
  int b=-1;  // offset of the last branch seen in the scan window
  // find how far ahead we can scan without leaving the block
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if (dops[i+j].is_ujump)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
  }
  // scan backwards so nearer uses overwrite farther ones
  for(;j>=0;j--)
  {
    if(dops[i+j].rs1) hsn[dops[i+j].rs1]=j;
    if(dops[i+j].rs2) hsn[dops[i+j].rs2]=j;
    if(dops[i+j].rt1) hsn[dops[i+j].rt1]=j;
    if(dops[i+j].rt2) hsn[dops[i+j].rt2]=j;
    if(dops[i+j].itype==STORE || dops[i+j].itype==STORELR) {
      // Stores can allocate zero
      hsn[dops[i+j].rs1]=j;
      hsn[dops[i+j].rs2]=j;
    }
    // On some architectures stores need invc_ptr
    #if defined(HOST_IMM8)
    if(dops[i+j].itype==STORE || dops[i+j].itype==STORELR || (dops[i+j].opcode&0x3b)==0x39 || (dops[i+j].opcode&0x3b)==0x3a) {
      hsn[INVCP]=j;
    }
    #endif
    if(i+j>=0&&(dops[i+j].itype==UJUMP||dops[i+j].itype==CJUMP||dops[i+j].itype==SJUMP))
    {
      hsn[CCREG]=j;
      b=j;
    }
  }
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      // uses at the branch target count as farther away (+b+2)
      for(;j>=0;j--)
      {
        if(dops[t+j].rs1) if(hsn[dops[t+j].rs1]>j+b+2) hsn[dops[t+j].rs1]=j+b+2;
        if(dops[t+j].rs2) if(hsn[dops[t+j].rs2]>j+b+2) hsn[dops[t+j].rs2]=j+b+2;
        //if(dops[t+j].rt1) if(hsn[dops[t+j].rt1]>j+b+2) hsn[dops[t+j].rt1]=j+b+2;
        //if(dops[t+j].rt2) if(hsn[dops[t+j].rt2]>j+b+2) hsn[dops[t+j].rt2]=j+b+2;
      }
    }
    // TODO: preferred register based on backward branch
  }
  // Delay slot should preferably not overwrite branch conditions or cycle count
  if (i > 0 && dops[i-1].is_jump) {
    if(dops[i-1].rs1) if(hsn[dops[i-1].rs1]>1) hsn[dops[i-1].rs1]=1;
    if(dops[i-1].rs2) if(hsn[dops[i-1].rs2]>1) hsn[dops[i-1].rs2]=1;
    hsn[CCREG]=1;
    // ...or hash tables
    hsn[RHASH]=1;
    hsn[RHTBL]=1;
  }
  // Coprocessor load/store needs FTEMP, even if not declared
  if(dops[i].itype==C1LS||dops[i].itype==C2LS) {
    hsn[FTEMP]=0;
  }
  // Load L/R also uses FTEMP as a temporary register
  if(dops[i].itype==LOADLR) {
    hsn[FTEMP]=0;
  }
  // Also SWL/SWR/SDL/SDR
  if(dops[i].opcode==0x2a||dops[i].opcode==0x2e||dops[i].opcode==0x2c||dops[i].opcode==0x2d) {
    hsn[FTEMP]=0;
  }
  // Don't remove the miniht registers
  if(dops[i].itype==UJUMP||dops[i].itype==RJUMP)
  {
    hsn[RHASH]=0;
    hsn[RHTBL]=0;
  }
}
766
// We only want to allocate registers if we're going to use them again soon
// Returns 1 if MIPS register r is read within the next ~9 instructions
// (and not marked unneeded before that use), 0 otherwise.
int needed_again(int r, int i)
{
  int j;
  int b=-1;
  int rn=10;  // distance to next use; 10 == "not seen"

  if (i > 0 && dops[i-1].is_ujump)
  {
    if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
      return 0; // Don't need any registers if exiting the block
  }
  // clamp the scan window to the block / first unconditional jump /
  // first call-out to the interpreter
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if (dops[i+j].is_ujump)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
    if(dops[i+j].itype==SYSCALL||dops[i+j].itype==HLECALL||dops[i+j].itype==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
    {
      break;
    }
  }
  // scan backwards; a later "unneeded" mark overrides an earlier use
  for(;j>=1;j--)
  {
    if(dops[i+j].rs1==r) rn=j;
    if(dops[i+j].rs2==r) rn=j;
    if((unneeded_reg[i+j]>>r)&1) rn=10;
    if(i+j>=0&&(dops[i+j].itype==UJUMP||dops[i+j].itype==CJUMP||dops[i+j].itype==SJUMP))
    {
      b=j;
    }
  }
  if(rn<10) return 1;
  (void)b;
  return 0;
}
810
// Try to match register allocations at the end of a loop with those
// at the beginning
// If a backward branch in the scan window targets a point where r is
// already allocated, return that host register; otherwise return hr
// unchanged.
int loop_reg(int i, int r, int hr)
{
  int j,k;
  // limit the scan window to the block / first unconditional jump
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if (dops[i+j].is_ujump)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
  }
  k=0;
  // if the previous insn is a branch, include it in the scan
  if(i>0){
    if(dops[i-1].itype==UJUMP||dops[i-1].itype==CJUMP||dops[i-1].itype==SJUMP)
      k--;
  }
  for(;k<j;k++)
  {
    assert(r < 64);
    if((unneeded_reg[i+k]>>r)&1) return hr;
    if(i+k>=0&&(dops[i+k].itype==UJUMP||dops[i+k].itype==CJUMP||dops[i+k].itype==SJUMP))
    {
      // backward branch into this block: prefer the allocation at its target
      if(ba[i+k]>=start && ba[i+k]<(start+i*4))
      {
        int t=(ba[i+k]-start)>>2;
        int reg=get_reg(regs[t].regmap_entry,r);
        if(reg>=0) return reg;
        //reg=get_reg(regs[t+1].regmap_entry,r);
        //if(reg>=0) return reg;
      }
    }
  }
  return hr;
}
852
853
854 // Allocate every register, preserving source/target regs
855 void alloc_all(struct regstat *cur,int i)
856 {
857   int hr;
858
859   for(hr=0;hr<HOST_REGS;hr++) {
860     if(hr!=EXCLUDE_REG) {
861       if(((cur->regmap[hr]&63)!=dops[i].rs1)&&((cur->regmap[hr]&63)!=dops[i].rs2)&&
862          ((cur->regmap[hr]&63)!=dops[i].rt1)&&((cur->regmap[hr]&63)!=dops[i].rt2))
863       {
864         cur->regmap[hr]=-1;
865         cur->dirty&=~(1<<hr);
866       }
867       // Don't need zeros
868       if((cur->regmap[hr]&63)==0)
869       {
870         cur->regmap[hr]=-1;
871         cur->dirty&=~(1<<hr);
872       }
873     }
874   }
875 }
876
#ifndef NDEBUG
// Debug-build guard: catches acquiring the host temp register twice
// without an intervening release.
static int host_tempreg_in_use;

static void host_tempreg_acquire(void)
{
  assert(!host_tempreg_in_use);
  host_tempreg_in_use = 1;
}

static void host_tempreg_release(void)
{
  host_tempreg_in_use = 0;
}
#else
// release builds: no tracking overhead
static void host_tempreg_acquire(void) {}
static void host_tempreg_release(void) {}
#endif
894
#ifdef ASSEM_PRINT
extern void gen_interupt();
extern void do_insn_cmp();
// Address -> printable name table for the assembly debug log.
#define FUNCNAME(f) { f, " " #f }
static const struct {
  void *addr;
  const char *name;
} function_names[] = {
  FUNCNAME(cc_interrupt),
  FUNCNAME(gen_interupt),
  FUNCNAME(get_addr_ht),
  FUNCNAME(get_addr),
  FUNCNAME(jump_handler_read8),
  FUNCNAME(jump_handler_read16),
  FUNCNAME(jump_handler_read32),
  FUNCNAME(jump_handler_write8),
  FUNCNAME(jump_handler_write16),
  FUNCNAME(jump_handler_write32),
  FUNCNAME(invalidate_addr),
  FUNCNAME(jump_to_new_pc),
  FUNCNAME(call_gteStall),
  FUNCNAME(new_dyna_leave),
  FUNCNAME(pcsx_mtc0),
  FUNCNAME(pcsx_mtc0_ds),
#ifdef DRC_DBG
  FUNCNAME(do_insn_cmp),
#endif
#ifdef __arm__
  FUNCNAME(verify_code),
#endif
};

// Look up a function's debug name by address; "" if unknown.
static const char *func_name(const void *a)
{
  int i;
  for (i = 0; i < sizeof(function_names)/sizeof(function_names[0]); i++)
    if (function_names[i].addr == a)
      return function_names[i].name;
  return "";
}
#else
#define func_name(x) ""
#endif
938
939 #ifdef __i386__
940 #include "assem_x86.c"
941 #endif
942 #ifdef __x86_64__
943 #include "assem_x64.c"
944 #endif
945 #ifdef __arm__
946 #include "assem_arm.c"
947 #endif
948 #ifdef __aarch64__
949 #include "assem_arm64.c"
950 #endif
951
// Return the trampoline slot paired with target function f, allocating
// a new slot (first NULL entry) on first use.  Used by emit_far_jump/
// emit_far_call when f is out of direct branch range.  Aborts if the
// fixed-size table fills up.
static void *get_trampoline(const void *f)
{
  size_t i;

  for (i = 0; i < ARRAY_SIZE(ndrc->tramp.f); i++) {
    if (ndrc->tramp.f[i] == f || ndrc->tramp.f[i] == NULL)
      break;
  }
  if (i == ARRAY_SIZE(ndrc->tramp.f)) {
    SysPrintf("trampoline table is full, last func %p\n", f);
    abort();
  }
  if (ndrc->tramp.f[i] == NULL) {
    // the table lives in the (possibly W^X) translation cache region
    start_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
    ndrc->tramp.f[i] = f;
    end_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
  }
  return &ndrc->tramp.ops[i];
}
971
// Emit a jump to f, routing through a trampoline when f is out of
// direct branch range.
static void emit_far_jump(const void *f)
{
  if (!can_jump_or_call(f))
    f = get_trampoline(f);
  emit_jmp(f);
}
982
// Emit a call to f, routing through a trampoline when f is out of
// direct branch range.
static void emit_far_call(const void *f)
{
  if (!can_jump_or_call(f))
    f = get_trampoline(f);
  emit_call(f);
}
993
994 // Add virtual address mapping to linked list
995 void ll_add(struct ll_entry **head,int vaddr,void *addr)
996 {
997   struct ll_entry *new_entry;
998   new_entry=malloc(sizeof(struct ll_entry));
999   assert(new_entry!=NULL);
1000   new_entry->vaddr=vaddr;
1001   new_entry->reg_sv_flags=0;
1002   new_entry->addr=addr;
1003   new_entry->next=*head;
1004   *head=new_entry;
1005 }
1006
// Like ll_add(), but also sets reg_sv_flags on the newly added entry.
void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
{
  ll_add(head,vaddr,addr);
  (*head)->reg_sv_flags=reg_sv_flags;
}
1012
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
void *check_addr(u_int vaddr)
{
  // fast path: hash table, only clean and non-expiring entries
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  size_t i;
  for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
    if (ht_bin->vaddr[i] == vaddr)
      if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
        if (isclean(ht_bin->tcaddr[i]))
          return ht_bin->tcaddr[i];
  }
  // slow path: walk the per-page compiled-block list
  u_int page=get_page(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while (head != NULL) {
    if (head->vaddr == vaddr) {
      if (doesnt_expire_soon(head->addr)) {
        // Update existing entry with current address
        if (ht_bin->vaddr[0] == vaddr) {
          ht_bin->tcaddr[0] = head->addr;
          return head->addr;
        }
        if (ht_bin->vaddr[1] == vaddr) {
          ht_bin->tcaddr[1] = head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        if (ht_bin->vaddr[0] == -1) {
          ht_bin->vaddr[0] = vaddr;
          ht_bin->tcaddr[0] = head->addr;
        }
        else if (ht_bin->vaddr[1] == -1) {
          ht_bin->vaddr[1] = vaddr;
          ht_bin->tcaddr[1] = head->addr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  return 0;
}
1058
// Drop any hash-table entries for vaddr.
// Slot 1 is cleared first; then, if slot 0 matches, slot 1 (possibly
// the one just cleared) is shifted down so slot 0 keeps holding the
// surviving higher-priority entry.  The order of these two checks is
// deliberate and must not be swapped.
void remove_hash(int vaddr)
{
  //printf("remove hash: %x\n",vaddr);
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  if (ht_bin->vaddr[1] == vaddr) {
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
  if (ht_bin->vaddr[0] == vaddr) {
    ht_bin->vaddr[0] = ht_bin->vaddr[1];
    ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
}
1074
// Remove (and free) every list entry whose translation-cache offset
// falls in the expiry window identified by base_offs_s/shift.
// Both the entry's own offset and the offset minus MAX_OUTPUT_BLOCK_SIZE
// are tested so blocks straddling the window boundary are caught too.
// Uses a pointer-to-pointer walk so deletion relinks in place.
static void ll_remove_matching_addrs(struct ll_entry **head,
  uintptr_t base_offs_s, int shift)
{
  struct ll_entry *next;
  while(*head) {
    uintptr_t o1 = (u_char *)(*head)->addr - ndrc->translation_cache;
    uintptr_t o2 = o1 - MAX_OUTPUT_BLOCK_SIZE;
    if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s)
    {
      inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr);
      remove_hash((*head)->vaddr);
      next=(*head)->next;
      free(*head);
      *head=next;
    }
    else
    {
      head=&((*head)->next);
    }
  }
}
1096
1097 // Remove all entries from linked list
1098 void ll_clear(struct ll_entry **head)
1099 {
1100   struct ll_entry *cur;
1101   struct ll_entry *next;
1102   if((cur=*head)) {
1103     *head=0;
1104     while(cur) {
1105       next=cur->next;
1106       free(cur);
1107       cur=next;
1108     }
1109   }
1110 }
1111
1112 // Dereference the pointers and remove if it matches
1113 static void ll_kill_pointers(struct ll_entry *head,
1114   uintptr_t base_offs_s, int shift)
1115 {
1116   while(head) {
1117     u_char *ptr = get_pointer(head->addr);
1118     uintptr_t o1 = ptr - ndrc->translation_cache;
1119     uintptr_t o2 = o1 - MAX_OUTPUT_BLOCK_SIZE;
1120     inv_debug("EXP: Lookup pointer to %p at %p (%x)\n",ptr,head->addr,head->vaddr);
1121     if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s)
1122     {
1123       inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr);
1124       void *host_addr=find_extjump_insn(head->addr);
1125       mark_clear_cache(host_addr);
1126       set_jump_target(host_addr, head->addr);
1127     }
1128     head=head->next;
1129   }
1130 }
1131
1132 // This is called when we write to a compiled block (see do_invstub)
1133 static void invalidate_page(u_int page)
1134 {
1135   struct ll_entry *head;
1136   struct ll_entry *next;
1137   head=jump_in[page];
1138   jump_in[page]=0;
1139   while(head!=NULL) {
1140     inv_debug("INVALIDATE: %x\n",head->vaddr);
1141     remove_hash(head->vaddr);
1142     next=head->next;
1143     free(head);
1144     head=next;
1145   }
1146   head=jump_out[page];
1147   jump_out[page]=0;
1148   while(head!=NULL) {
1149     inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr);
1150     void *host_addr=find_extjump_insn(head->addr);
1151     mark_clear_cache(host_addr);
1152     set_jump_target(host_addr, head->addr); // point back to dyna_linker
1153     next=head->next;
1154     free(head);
1155     head=next;
1156   }
1157 }
1158
// Invalidate the pages spanned by a block: 'block' is the 4K page
// index of the write, first/last bound the pages the block's code
// actually covers (both sides of a page-crossing block).
static void invalidate_block_range(u_int block, u_int first, u_int last)
{
  u_int page=get_page(block<<12);
  //printf("first=%d last=%d\n",first,last);
  invalidate_page(page);
  assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
  assert(last<page+5);
  // Invalidate the adjacent pages if a block crosses a 4K boundary
  while(first<page) {
    invalidate_page(first);
    first++;
  }
  // NOTE(review): this loop stops before 'last', so only pages
  // page+1..last-1 are invalidated here while callers appear to pass
  // 'last' as an inclusive page index — confirm whether 'last' is
  // meant to be exclusive, or whether the final page is skipped.
  for(first=page+1;first<last;first++) {
    invalidate_page(first);
  }
  do_clear_cache();

  // Don't trap writes
  invalid_code[block]=1;

  #ifdef USE_MINI_HT
  memset(mini_ht,-1,sizeof(mini_ht));
  #endif
}
1183
// Invalidate the compiled code for one 4K guest page ('block' is the
// page index, i.e. vaddr>>12).  Walks the dirty list to widen the
// first/last page range when a block's code spans page boundaries,
// then hands off to invalidate_block_range().
void invalidate_block(u_int block)
{
  u_int page=get_page(block<<12);
  u_int vpage=get_vpage(block<<12);
  inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
  //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
  u_int first,last;
  first=last=page;
  struct ll_entry *head;
  head=jump_dirty[vpage];
  //printf("page=%d vpage=%d\n",page,vpage);
  while(head!=NULL) {
    if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
      u_char *start, *end;
      // start/end bound the guest memory this compiled block reads.
      get_bounds(head->addr, &start, &end);
      //printf("start: %p end: %p\n", start, end);
      if (page < 2048 && start >= rdram && end < rdram+RAM_SIZE) {
        // Widen the page range if this block's source spans our page.
        if (((start-rdram)>>12) <= page && ((end-1-rdram)>>12) >= page) {
          if ((((start-rdram)>>12)&2047) < first) first = ((start-rdram)>>12)&2047;
          if ((((end-1-rdram)>>12)&2047) > last)  last = ((end-1-rdram)>>12)&2047;
        }
      }
    }
    head=head->next;
  }
  invalidate_block_range(block,first,last);
}
1211
// Invalidate compiled code covering a single written address.
// For RAM it scans the dirty lists to decide between a real
// invalidation and just narrowing the inv_code_start/end "known
// clean" window (so future writes in that window can be skipped
// by the caller without reaching this function).
void invalidate_addr(u_int addr)
{
  //static int rhits;
  // this check is done by the caller
  //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
  u_int page=get_vpage(addr);
  if(page<2048) { // RAM
    struct ll_entry *head;
    u_int addr_min=~0, addr_max=0;
    u_int mask=RAM_SIZE-1;
    u_int addr_main=0x80000000|(addr&mask);
    int pg1;
    // Start with the written 4K page as the candidate clean window.
    inv_code_start=addr_main&~0xfff;
    inv_code_end=addr_main|0xfff;
    pg1=page;
    if (pg1>0) {
      // must check previous page too because of spans..
      pg1--;
      inv_code_start-=0x1000;
    }
    for(;pg1<=page;pg1++) {
      for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
        u_char *start_h, *end_h;
        u_int start, end;
        get_bounds(head->addr, &start_h, &end_h);
        start = (uintptr_t)start_h - ram_offset;
        end = (uintptr_t)end_h - ram_offset;
        if(start<=addr_main&&addr_main<end) {
          // Block covers the written address: widen the hit range.
          if(start<addr_min) addr_min=start;
          if(end>addr_max) addr_max=end;
        }
        else if(addr_main<start) {
          // Block lies above: shrink the clean window from the top.
          if(start<inv_code_end)
            inv_code_end=start-1;
        }
        else {
          // Block lies below: shrink the clean window from the bottom.
          if(end>inv_code_start)
            inv_code_start=end;
        }
      }
    }
    if (addr_min!=~0) {
      // At least one compiled block covers the write: invalidate it.
      inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
      inv_code_start=inv_code_end=~0;
      invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
      return;
    }
    else {
      // Miss: just publish the clean window in the caller's mirror.
      inv_code_start=(addr&~mask)|(inv_code_start&mask);
      inv_code_end=(addr&~mask)|(inv_code_end&mask);
      inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
      return;
    }
  }
  invalidate_block(addr>>12);
}
1268
1269 // This is called when loading a save state.
1270 // Anything could have changed, so invalidate everything.
1271 void invalidate_all_pages(void)
1272 {
1273   u_int page;
1274   for(page=0;page<4096;page++)
1275     invalidate_page(page);
1276   for(page=0;page<1048576;page++)
1277     if(!invalid_code[page]) {
1278       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1279       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1280     }
1281   #ifdef USE_MINI_HT
1282   memset(mini_ht,-1,sizeof(mini_ht));
1283   #endif
1284   do_clear_cache();
1285 }
1286
// Emit the out-of-line invalidation stub n: save caller-visible
// registers, call invalidate_addr() (moving the address into arg0
// if it isn't there already), restore, and jump back.  The emit
// order is the stub's ABI — do not reorder.
static void do_invstub(int n)
{
  literal_pool(20);
  u_int reglist=stubs[n].a;      // registers live at the stub site
  set_jump_target(stubs[n].addr, out);
  save_regs(reglist);
  if(stubs[n].b!=0) emit_mov(stubs[n].b,0); // move addr into arg reg 0
  emit_far_call(invalidate_addr);
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
}
1298
1299 // Add an entry to jump_out after making a link
1300 // src should point to code by emit_extjump2()
1301 void add_jump_out(u_int vaddr,void *src)
1302 {
1303   u_int page=get_page(vaddr);
1304   inv_debug("add_jump_out: %p -> %x (%d)\n",src,vaddr,page);
1305   check_extjump2(src);
1306   ll_add(jump_out+page,vaddr,src);
1307   //inv_debug("add_jump_out:  to %p\n",get_pointer(src));
1308 }
1309
// If a code block was found to be unmodified (bit was set in
// restore_candidate) and it remains unmodified (bit is clear
// in invalid_code) then move the entries for that 4K page from
// the dirty list to the clean list.
void clean_blocks(u_int page)
{
  struct ll_entry *head;
  inv_debug("INV: clean_blocks page=%d\n",page);
  head=jump_dirty[page];
  while(head!=NULL) {
    if(!invalid_code[head->vaddr>>12]) {
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr)) {
        if(verify_dirty(head->addr)) {
          u_char *start, *end;
          //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr);
          u_int i;
          u_int inv=0;
          // Check every guest page the block's source spans.
          get_bounds(head->addr, &start, &end);
          if (start - rdram < RAM_SIZE) {
            for (i = (start-rdram+0x80000000)>>12; i <= (end-1-rdram+0x80000000)>>12; i++) {
              inv|=invalid_code[i];
            }
          }
          else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
            inv=1;
          }
          if(!inv) {
            void *clean_addr = get_clean_addr(head->addr);
            if (doesnt_expire_soon(clean_addr)) {
              u_int ppage=page;
              inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr);
              //printf("page=%x, addr=%x\n",page,head->vaddr);
              //assert(head->vaddr>>12==(page|0x80000));
              // Publish the clean entry point and refresh the hash table.
              ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
              struct ht_entry *ht_bin = hash_table_get(head->vaddr);
              if (ht_bin->vaddr[0] == head->vaddr)
                ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
              if (ht_bin->vaddr[1] == head->vaddr)
                ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
            }
          }
        }
      }
    }
    head=head->next;
  }
}
1358
1359 /* Register allocation */
1360
// Note: registers are allocated clean (unmodified state)
// if you intend to modify the register, you must call dirty_reg().
//
// Allocate a host register to hold guest register 'reg' at
// instruction i, updating cur->regmap/dirty/isconst in place.
// Tries, in order: preferred register, any free register (biased
// away from registers the previous instruction touched), then
// eviction of the register least likely to be needed soon.
static void alloc_reg(struct regstat *cur,int i,signed char reg)
{
  int r,hr;
  // Preference: low 3 bits of the guest reg number by default;
  // CCREG and the PTEMP/FTEMP temporaries get fixed homes.
  int preferred_reg = (reg&7);
  if(reg==CCREG) preferred_reg=HOST_CCREG;
  if(reg==PTEMP||reg==FTEMP) preferred_reg=12;

  // Don't allocate unused registers
  if((cur->u>>reg)&1) return;

  // see if it's already allocated
  for(hr=0;hr<HOST_REGS;hr++)
  {
    if(cur->regmap[hr]==reg) return;
  }

  // Keep the same mapping if the register was already allocated in a loop
  preferred_reg = loop_reg(i,reg,preferred_reg);

  // Try to allocate the preferred register
  if(cur->regmap[preferred_reg]==-1) {
    cur->regmap[preferred_reg]=reg;
    cur->dirty&=~(1<<preferred_reg);
    cur->isconst&=~(1<<preferred_reg);
    return;
  }
  // Preferred register is taken; steal it if its occupant is unneeded.
  r=cur->regmap[preferred_reg];
  assert(r < 64);
  if((cur->u>>r)&1) {
    cur->regmap[preferred_reg]=reg;
    cur->dirty&=~(1<<preferred_reg);
    cur->isconst&=~(1<<preferred_reg);
    return;
  }

  // Clear any unneeded registers
  // We try to keep the mapping consistent, if possible, because it
  // makes branches easier (especially loops).  So we try to allocate
  // first (see above) before removing old mappings.  If this is not
  // possible then go ahead and clear out the registers that are no
  // longer needed.
  for(hr=0;hr<HOST_REGS;hr++)
  {
    r=cur->regmap[hr];
    if(r>=0) {
      assert(r < 64);
      if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;}
    }
  }
  // Try to allocate any available register, but prefer
  // registers that have not been used recently.
  if(i>0) {
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
        if(regs[i-1].regmap[hr]!=dops[i-1].rs1&&regs[i-1].regmap[hr]!=dops[i-1].rs2&&regs[i-1].regmap[hr]!=dops[i-1].rt1&&regs[i-1].regmap[hr]!=dops[i-1].rt2) {
          cur->regmap[hr]=reg;
          cur->dirty&=~(1<<hr);
          cur->isconst&=~(1<<hr);
          return;
        }
      }
    }
  }
  // Try to allocate any available register
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
      cur->regmap[hr]=reg;
      cur->dirty&=~(1<<hr);
      cur->isconst&=~(1<<hr);
      return;
    }
  }

  // Ok, now we have to evict someone
  // Pick a register we hopefully won't need soon
  // hsn = "hot spot number": lower means needed sooner (lsn fills it in).
  u_char hsn[MAXREG+1];
  memset(hsn,10,sizeof(hsn));
  int j;
  lsn(hsn,i,&preferred_reg);
  //printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",cur->regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]);
  //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
  if(i>0) {
    // Don't evict the cycle count at entry points, otherwise the entry
    // stub will have to write it.
    if(dops[i].bt&&hsn[CCREG]>2) hsn[CCREG]=2;
    if (i>1 && hsn[CCREG] > 2 && dops[i-2].is_jump) hsn[CCREG]=2;
    // Evict the coldest candidate first (j counts down from coldest).
    for(j=10;j>=3;j--)
    {
      // Alloc preferred register if available
      if(hsn[r=cur->regmap[preferred_reg]&63]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          // Evict both parts of a 64-bit register
          if((cur->regmap[hr]&63)==r) {
            cur->regmap[hr]=-1;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
          }
        }
        cur->regmap[preferred_reg]=reg;
        return;
      }
      for(r=1;r<=MAXREG;r++)
      {
        // Skip registers the previous instruction touched.
        if(hsn[r]==j&&r!=dops[i-1].rs1&&r!=dops[i-1].rs2&&r!=dops[i-1].rt1&&r!=dops[i-1].rt2) {
          for(hr=0;hr<HOST_REGS;hr++) {
            if(hr!=HOST_CCREG||j<hsn[CCREG]) {
              if(cur->regmap[hr]==r) {
                cur->regmap[hr]=reg;
                cur->dirty&=~(1<<hr);
                cur->isconst&=~(1<<hr);
                return;
              }
            }
          }
        }
      }
    }
  }
  // Last resort: evict anything, coldest first.
  for(j=10;j>=0;j--)
  {
    for(r=1;r<=MAXREG;r++)
    {
      if(hsn[r]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(cur->regmap[hr]==r) {
            cur->regmap[hr]=reg;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
            return;
          }
        }
      }
    }
  }
  SysPrintf("This shouldn't happen (alloc_reg)");abort();
}
1499
// Allocate a temporary register.  This is done without regard to
// dirty status or whether the register we request is on the unneeded list
// Note: This will only allocate one register, even if called multiple times
static void alloc_reg_temp(struct regstat *cur,int i,signed char reg)
{
  int r,hr;
  int preferred_reg = -1;  // no preference for temporaries

  // see if it's already allocated
  for(hr=0;hr<HOST_REGS;hr++)
  {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==reg) return;
  }

  // Try to allocate any available register
  // (scanned top-down, unlike alloc_reg, to reduce collisions)
  for(hr=HOST_REGS-1;hr>=0;hr--) {
    if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
      cur->regmap[hr]=reg;
      cur->dirty&=~(1<<hr);
      cur->isconst&=~(1<<hr);
      return;
    }
  }

  // Find an unneeded register
  for(hr=HOST_REGS-1;hr>=0;hr--)
  {
    r=cur->regmap[hr];
    if(r>=0) {
      assert(r < 64);
      if((cur->u>>r)&1) {
        if(i==0||((unneeded_reg[i-1]>>r)&1)) {
          cur->regmap[hr]=reg;
          cur->dirty&=~(1<<hr);
          cur->isconst&=~(1<<hr);
          return;
        }
      }
    }
  }

  // Ok, now we have to evict someone
  // Pick a register we hopefully won't need soon
  // TODO: we might want to follow unconditional jumps here
  // TODO: get rid of dupe code and make this into a function
  u_char hsn[MAXREG+1];
  memset(hsn,10,sizeof(hsn));
  int j;
  lsn(hsn,i,&preferred_reg);
  //printf("hsn: %d %d %d %d %d %d %d\n",hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
  if(i>0) {
    // Don't evict the cycle count at entry points, otherwise the entry
    // stub will have to write it.
    if(dops[i].bt&&hsn[CCREG]>2) hsn[CCREG]=2;
    if (i>1 && hsn[CCREG] > 2 && dops[i-2].is_jump) hsn[CCREG]=2;
    // Evict coldest candidate first, skipping registers the previous
    // instruction touched.
    for(j=10;j>=3;j--)
    {
      for(r=1;r<=MAXREG;r++)
      {
        if(hsn[r]==j&&r!=dops[i-1].rs1&&r!=dops[i-1].rs2&&r!=dops[i-1].rt1&&r!=dops[i-1].rt2) {
          for(hr=0;hr<HOST_REGS;hr++) {
            if(hr!=HOST_CCREG||hsn[CCREG]>2) {
              if(cur->regmap[hr]==r) {
                cur->regmap[hr]=reg;
                cur->dirty&=~(1<<hr);
                cur->isconst&=~(1<<hr);
                return;
              }
            }
          }
        }
      }
    }
  }
  // Last resort: evict anything, coldest first.
  for(j=10;j>=0;j--)
  {
    for(r=1;r<=MAXREG;r++)
    {
      if(hsn[r]==j) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(cur->regmap[hr]==r) {
            cur->regmap[hr]=reg;
            cur->dirty&=~(1<<hr);
            cur->isconst&=~(1<<hr);
            return;
          }
        }
      }
    }
  }
  SysPrintf("This shouldn't happen");abort();
}
1592
// Register allocation for MFHI/MFLO/MTHI/MTLO style moves:
// only the destination needs a host register.
static void mov_alloc(struct regstat *current,int i)
{
  if (dops[i].rs1 == HIREG || dops[i].rs1 == LOREG) {
    // logically this is needed but just won't work, no idea why
    //alloc_cc(current,i); // for stalls
    //dirty_reg(current,CCREG);
  }

  // Note: Don't need to actually alloc the source registers
  //alloc_reg(current,i,dops[i].rs1);
  alloc_reg(current,i,dops[i].rt1);

  clear_const(current,dops[i].rs1);
  clear_const(current,dops[i].rt1);
  dirty_reg(current,dops[i].rt1);
}
1609
1610 static void shiftimm_alloc(struct regstat *current,int i)
1611 {
1612   if(dops[i].opcode2<=0x3) // SLL/SRL/SRA
1613   {
1614     if(dops[i].rt1) {
1615       if(dops[i].rs1&&needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
1616       else dops[i].lt1=dops[i].rs1;
1617       alloc_reg(current,i,dops[i].rt1);
1618       dirty_reg(current,dops[i].rt1);
1619       if(is_const(current,dops[i].rs1)) {
1620         int v=get_const(current,dops[i].rs1);
1621         if(dops[i].opcode2==0x00) set_const(current,dops[i].rt1,v<<imm[i]);
1622         if(dops[i].opcode2==0x02) set_const(current,dops[i].rt1,(u_int)v>>imm[i]);
1623         if(dops[i].opcode2==0x03) set_const(current,dops[i].rt1,v>>imm[i]);
1624       }
1625       else clear_const(current,dops[i].rt1);
1626     }
1627   }
1628   else
1629   {
1630     clear_const(current,dops[i].rs1);
1631     clear_const(current,dops[i].rt1);
1632   }
1633
1634   if(dops[i].opcode2>=0x38&&dops[i].opcode2<=0x3b) // DSLL/DSRL/DSRA
1635   {
1636     assert(0);
1637   }
1638   if(dops[i].opcode2==0x3c) // DSLL32
1639   {
1640     assert(0);
1641   }
1642   if(dops[i].opcode2==0x3e) // DSRL32
1643   {
1644     assert(0);
1645   }
1646   if(dops[i].opcode2==0x3f) // DSRA32
1647   {
1648     assert(0);
1649   }
1650 }
1651
1652 static void shift_alloc(struct regstat *current,int i)
1653 {
1654   if(dops[i].rt1) {
1655     if(dops[i].opcode2<=0x07) // SLLV/SRLV/SRAV
1656     {
1657       if(dops[i].rs1) alloc_reg(current,i,dops[i].rs1);
1658       if(dops[i].rs2) alloc_reg(current,i,dops[i].rs2);
1659       alloc_reg(current,i,dops[i].rt1);
1660       if(dops[i].rt1==dops[i].rs2) {
1661         alloc_reg_temp(current,i,-1);
1662         minimum_free_regs[i]=1;
1663       }
1664     } else { // DSLLV/DSRLV/DSRAV
1665       assert(0);
1666     }
1667     clear_const(current,dops[i].rs1);
1668     clear_const(current,dops[i].rs2);
1669     clear_const(current,dops[i].rt1);
1670     dirty_reg(current,dops[i].rt1);
1671   }
1672 }
1673
1674 static void alu_alloc(struct regstat *current,int i)
1675 {
1676   if(dops[i].opcode2>=0x20&&dops[i].opcode2<=0x23) { // ADD/ADDU/SUB/SUBU
1677     if(dops[i].rt1) {
1678       if(dops[i].rs1&&dops[i].rs2) {
1679         alloc_reg(current,i,dops[i].rs1);
1680         alloc_reg(current,i,dops[i].rs2);
1681       }
1682       else {
1683         if(dops[i].rs1&&needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
1684         if(dops[i].rs2&&needed_again(dops[i].rs2,i)) alloc_reg(current,i,dops[i].rs2);
1685       }
1686       alloc_reg(current,i,dops[i].rt1);
1687     }
1688   }
1689   if(dops[i].opcode2==0x2a||dops[i].opcode2==0x2b) { // SLT/SLTU
1690     if(dops[i].rt1) {
1691       alloc_reg(current,i,dops[i].rs1);
1692       alloc_reg(current,i,dops[i].rs2);
1693       alloc_reg(current,i,dops[i].rt1);
1694     }
1695   }
1696   if(dops[i].opcode2>=0x24&&dops[i].opcode2<=0x27) { // AND/OR/XOR/NOR
1697     if(dops[i].rt1) {
1698       if(dops[i].rs1&&dops[i].rs2) {
1699         alloc_reg(current,i,dops[i].rs1);
1700         alloc_reg(current,i,dops[i].rs2);
1701       }
1702       else
1703       {
1704         if(dops[i].rs1&&needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
1705         if(dops[i].rs2&&needed_again(dops[i].rs2,i)) alloc_reg(current,i,dops[i].rs2);
1706       }
1707       alloc_reg(current,i,dops[i].rt1);
1708     }
1709   }
1710   if(dops[i].opcode2>=0x2c&&dops[i].opcode2<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1711     assert(0);
1712   }
1713   clear_const(current,dops[i].rs1);
1714   clear_const(current,dops[i].rs2);
1715   clear_const(current,dops[i].rt1);
1716   dirty_reg(current,dops[i].rt1);
1717 }
1718
1719 static void imm16_alloc(struct regstat *current,int i)
1720 {
1721   if(dops[i].rs1&&needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
1722   else dops[i].lt1=dops[i].rs1;
1723   if(dops[i].rt1) alloc_reg(current,i,dops[i].rt1);
1724   if(dops[i].opcode==0x18||dops[i].opcode==0x19) { // DADDI/DADDIU
1725     assert(0);
1726   }
1727   else if(dops[i].opcode==0x0a||dops[i].opcode==0x0b) { // SLTI/SLTIU
1728     clear_const(current,dops[i].rs1);
1729     clear_const(current,dops[i].rt1);
1730   }
1731   else if(dops[i].opcode>=0x0c&&dops[i].opcode<=0x0e) { // ANDI/ORI/XORI
1732     if(is_const(current,dops[i].rs1)) {
1733       int v=get_const(current,dops[i].rs1);
1734       if(dops[i].opcode==0x0c) set_const(current,dops[i].rt1,v&imm[i]);
1735       if(dops[i].opcode==0x0d) set_const(current,dops[i].rt1,v|imm[i]);
1736       if(dops[i].opcode==0x0e) set_const(current,dops[i].rt1,v^imm[i]);
1737     }
1738     else clear_const(current,dops[i].rt1);
1739   }
1740   else if(dops[i].opcode==0x08||dops[i].opcode==0x09) { // ADDI/ADDIU
1741     if(is_const(current,dops[i].rs1)) {
1742       int v=get_const(current,dops[i].rs1);
1743       set_const(current,dops[i].rt1,v+imm[i]);
1744     }
1745     else clear_const(current,dops[i].rt1);
1746   }
1747   else {
1748     set_const(current,dops[i].rt1,imm[i]<<16); // LUI
1749   }
1750   dirty_reg(current,dops[i].rt1);
1751 }
1752
1753 static void load_alloc(struct regstat *current,int i)
1754 {
1755   clear_const(current,dops[i].rt1);
1756   //if(dops[i].rs1!=dops[i].rt1&&needed_again(dops[i].rs1,i)) clear_const(current,dops[i].rs1); // Does this help or hurt?
1757   if(!dops[i].rs1) current->u&=~1LL; // Allow allocating r0 if it's the source register
1758   if(needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
1759   if(dops[i].rt1&&!((current->u>>dops[i].rt1)&1)) {
1760     alloc_reg(current,i,dops[i].rt1);
1761     assert(get_reg(current->regmap,dops[i].rt1)>=0);
1762     if(dops[i].opcode==0x27||dops[i].opcode==0x37) // LWU/LD
1763     {
1764       assert(0);
1765     }
1766     else if(dops[i].opcode==0x1A||dops[i].opcode==0x1B) // LDL/LDR
1767     {
1768       assert(0);
1769     }
1770     dirty_reg(current,dops[i].rt1);
1771     // LWL/LWR need a temporary register for the old value
1772     if(dops[i].opcode==0x22||dops[i].opcode==0x26)
1773     {
1774       alloc_reg(current,i,FTEMP);
1775       alloc_reg_temp(current,i,-1);
1776       minimum_free_regs[i]=1;
1777     }
1778   }
1779   else
1780   {
1781     // Load to r0 or unneeded register (dummy load)
1782     // but we still need a register to calculate the address
1783     if(dops[i].opcode==0x22||dops[i].opcode==0x26)
1784     {
1785       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1786     }
1787     alloc_reg_temp(current,i,-1);
1788     minimum_free_regs[i]=1;
1789     if(dops[i].opcode==0x1A||dops[i].opcode==0x1B) // LDL/LDR
1790     {
1791       assert(0);
1792     }
1793   }
1794 }
1795
// Register allocation for store instructions.
// Note: the #if block below attaches an 'else' to the preceding if,
// so the statement layout around it must not be rearranged.
void store_alloc(struct regstat *current,int i)
{
  clear_const(current,dops[i].rs2);
  if(!(dops[i].rs2)) current->u&=~1LL; // Allow allocating r0 if necessary
  if(needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
  alloc_reg(current,i,dops[i].rs2);
  if(dops[i].opcode==0x2c||dops[i].opcode==0x2d||dops[i].opcode==0x3f) { // 64-bit SDL/SDR/SD
    assert(0);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else alloc_reg(current,i,INVCP);
  #endif
  if(dops[i].opcode==0x2a||dops[i].opcode==0x2e||dops[i].opcode==0x2c||dops[i].opcode==0x2d) { // SWL/SWL/SDL/SDR
    alloc_reg(current,i,FTEMP);
  }
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1816
// Register allocation for COP1 loads/stores (LWC1/SWC1 etc).
// Note: the #if block below attaches an 'else if' to the preceding if,
// so the statement layout around it must not be rearranged.
void c1ls_alloc(struct regstat *current,int i)
{
  //clear_const(current,dops[i].rs1); // FIXME
  clear_const(current,dops[i].rt1);
  if(needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
  alloc_reg(current,i,CSREG); // Status
  alloc_reg(current,i,FTEMP); // data register for the transfer
  if(dops[i].opcode==0x35||dops[i].opcode==0x3d) { // 64-bit LDC1/SDC1
    assert(0);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else if((dops[i].opcode&0x3b)==0x39) // SWC1/SDC1
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
}
1835
1836 void c2ls_alloc(struct regstat *current,int i)
1837 {
1838   clear_const(current,dops[i].rt1);
1839   if(needed_again(dops[i].rs1,i)) alloc_reg(current,i,dops[i].rs1);
1840   alloc_reg(current,i,FTEMP);
1841   #if defined(HOST_IMM8)
1842   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1843   if((dops[i].opcode&0x3b)==0x3a) // SWC2/SDC2
1844     alloc_reg(current,i,INVCP);
1845   #endif
1846   // We need a temporary register for address generation
1847   alloc_reg_temp(current,i,-1);
1848   minimum_free_regs[i]=1;
1849 }
1850
#ifndef multdiv_alloc
// Register allocation for MULT/MULTU/DIV/DIVU (results in HI/LO).
// May be overridden by an arch-specific macro of the same name.
void multdiv_alloc(struct regstat *current,int i)
{
  //  case 0x18: MULT
  //  case 0x19: MULTU
  //  case 0x1A: DIV
  //  case 0x1B: DIVU
  //  case 0x1C: DMULT
  //  case 0x1D: DMULTU
  //  case 0x1E: DDIV
  //  case 0x1F: DDIVU
  clear_const(current,dops[i].rs1);
  clear_const(current,dops[i].rs2);
  alloc_cc(current,i); // for stalls
  if(dops[i].rs1&&dops[i].rs2)
  {
    if((dops[i].opcode2&4)==0) // 32-bit
    {
      // HI/LO become live outputs of this op.
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      alloc_reg(current,i,HIREG);
      alloc_reg(current,i,LOREG);
      alloc_reg(current,i,dops[i].rs1);
      alloc_reg(current,i,dops[i].rs2);
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
    }
    else // 64-bit
    {
      assert(0);
    }
  }
  else
  {
    // Multiply by zero is zero.
    // MIPS does not have a divide by zero exception.
    // The result is undefined, we return zero.
    alloc_reg(current,i,HIREG);
    alloc_reg(current,i,LOREG);
    dirty_reg(current,HIREG);
    dirty_reg(current,LOREG);
  }
}
#endif
1895
1896 void cop0_alloc(struct regstat *current,int i)
1897 {
1898   if(dops[i].opcode2==0) // MFC0
1899   {
1900     if(dops[i].rt1) {
1901       clear_const(current,dops[i].rt1);
1902       alloc_all(current,i);
1903       alloc_reg(current,i,dops[i].rt1);
1904       dirty_reg(current,dops[i].rt1);
1905     }
1906   }
1907   else if(dops[i].opcode2==4) // MTC0
1908   {
1909     if(dops[i].rs1){
1910       clear_const(current,dops[i].rs1);
1911       alloc_reg(current,i,dops[i].rs1);
1912       alloc_all(current,i);
1913     }
1914     else {
1915       alloc_all(current,i); // FIXME: Keep r0
1916       current->u&=~1LL;
1917       alloc_reg(current,i,0);
1918     }
1919   }
1920   else
1921   {
1922     // TLBR/TLBWI/TLBWR/TLBP/ERET
1923     assert(dops[i].opcode2==0x10);
1924     alloc_all(current,i);
1925   }
1926   minimum_free_regs[i]=HOST_REGS;
1927 }
1928
1929 static void cop2_alloc(struct regstat *current,int i)
1930 {
1931   if (dops[i].opcode2 < 3) // MFC2/CFC2
1932   {
1933     alloc_cc(current,i); // for stalls
1934     dirty_reg(current,CCREG);
1935     if(dops[i].rt1){
1936       clear_const(current,dops[i].rt1);
1937       alloc_reg(current,i,dops[i].rt1);
1938       dirty_reg(current,dops[i].rt1);
1939     }
1940   }
1941   else if (dops[i].opcode2 > 3) // MTC2/CTC2
1942   {
1943     if(dops[i].rs1){
1944       clear_const(current,dops[i].rs1);
1945       alloc_reg(current,i,dops[i].rs1);
1946     }
1947     else {
1948       current->u&=~1LL;
1949       alloc_reg(current,i,0);
1950     }
1951   }
1952   alloc_reg_temp(current,i,-1);
1953   minimum_free_regs[i]=1;
1954 }
1955
1956 void c2op_alloc(struct regstat *current,int i)
1957 {
1958   alloc_cc(current,i); // for stalls
1959   dirty_reg(current,CCREG);
1960   alloc_reg_temp(current,i,-1);
1961 }
1962
1963 void syscall_alloc(struct regstat *current,int i)
1964 {
1965   alloc_cc(current,i);
1966   dirty_reg(current,CCREG);
1967   alloc_all(current,i);
1968   minimum_free_regs[i]=HOST_REGS;
1969   current->isconst=0;
1970 }
1971
// Register allocation for the instruction in a branch delay slot:
// dispatch to the per-itype allocator.
void delayslot_alloc(struct regstat *current,int i)
{
  switch(dops[i].itype) {
    case UJUMP:
    case CJUMP:
    case SJUMP:
    case RJUMP:
    case SYSCALL:
    case HLECALL:
    case SPAN:
      // A branch/trap in a delay slot is not valid MIPS; instead of
      // aborting, give up on speculative precompilation.
      assem_debug("jump in the delay slot.  this shouldn't happen.\n");//abort();
      SysPrintf("Disabled speculative precompilation\n");
      stop_after_jal=1;
      break;
    case IMM16:
      imm16_alloc(current,i);
      break;
    case LOAD:
    case LOADLR:
      load_alloc(current,i);
      break;
    case STORE:
    case STORELR:
      store_alloc(current,i);
      break;
    case ALU:
      alu_alloc(current,i);
      break;
    case SHIFT:
      shift_alloc(current,i);
      break;
    case MULTDIV:
      multdiv_alloc(current,i);
      break;
    case SHIFTIMM:
      shiftimm_alloc(current,i);
      break;
    case MOV:
      mov_alloc(current,i);
      break;
    case COP0:
      cop0_alloc(current,i);
      break;
    case COP1:
      // no registers needed
      break;
    case COP2:
      cop2_alloc(current,i);
      break;
    case C1LS:
      c1ls_alloc(current,i);
      break;
    case C2LS:
      c2ls_alloc(current,i);
      break;
    case C2OP:
      c2op_alloc(current,i);
      break;
  }
}
2031
// Special case where a branch and delay slot span two pages in virtual memory
// Everything is spilled (alloc_all + minimum_free_regs=HOST_REGS); only the
// registers the branch itself reads/writes get dedicated allocations.
static void pagespan_alloc(struct regstat *current,int i)
{
  current->isconst=0;
  current->wasconst=0;
  regs[i].wasconst=0;
  minimum_free_regs[i]=HOST_REGS;
  alloc_all(current,i);
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  if(dops[i].opcode==3) // JAL
  {
    // link register receives the return address
    alloc_reg(current,i,31);
    dirty_reg(current,31);
  }
  if(dops[i].opcode==0&&(dops[i].opcode2&0x3E)==8) // JR/JALR
  {
    alloc_reg(current,i,dops[i].rs1);
    if (dops[i].rt1!=0) {
      // JALR with a link register
      alloc_reg(current,i,dops[i].rt1);
      dirty_reg(current,dops[i].rt1);
    }
  }
  if((dops[i].opcode&0x2E)==4) // BEQ/BNE/BEQL/BNEL
  {
    // both comparison sources must be live
    if(dops[i].rs1) alloc_reg(current,i,dops[i].rs1);
    if(dops[i].rs2) alloc_reg(current,i,dops[i].rs2);
  }
  else
  if((dops[i].opcode&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
  {
    // compare-against-zero needs only one source
    if(dops[i].rs1) alloc_reg(current,i,dops[i].rs1);
  }
  //else ...
}
2067
2068 static void add_stub(enum stub_type type, void *addr, void *retaddr,
2069   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
2070 {
2071   assert(stubcount < ARRAY_SIZE(stubs));
2072   stubs[stubcount].type = type;
2073   stubs[stubcount].addr = addr;
2074   stubs[stubcount].retaddr = retaddr;
2075   stubs[stubcount].a = a;
2076   stubs[stubcount].b = b;
2077   stubs[stubcount].c = c;
2078   stubs[stubcount].d = d;
2079   stubs[stubcount].e = e;
2080   stubcount++;
2081 }
2082
// Convenience wrapper around add_stub() for the common case where the
// stub parameters are instruction index, address register, register
// state, cycle adjustment and host register list.
static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
  int i, int addr_reg, const struct regstat *i_regs, int ccadj, u_int reglist)
{
  add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
}
2088
2089 // Write out a single register
2090 static void wb_register(signed char r,signed char regmap[],uint64_t dirty)
2091 {
2092   int hr;
2093   for(hr=0;hr<HOST_REGS;hr++) {
2094     if(hr!=EXCLUDE_REG) {
2095       if((regmap[hr]&63)==r) {
2096         if((dirty>>hr)&1) {
2097           assert(regmap[hr]<64);
2098           emit_storereg(r,hr);
2099         }
2100       }
2101     }
2102   }
2103 }
2104
2105 static void wb_valid(signed char pre[],signed char entry[],u_int dirty_pre,u_int dirty,uint64_t u)
2106 {
2107   //if(dirty_pre==dirty) return;
2108   int hr,reg;
2109   for(hr=0;hr<HOST_REGS;hr++) {
2110     if(hr!=EXCLUDE_REG) {
2111       reg=pre[hr];
2112       if(((~u)>>(reg&63))&1) {
2113         if(reg>0) {
2114           if(((dirty_pre&~dirty)>>hr)&1) {
2115             if(reg>0&&reg<34) {
2116               emit_storereg(reg,hr);
2117             }
2118             else if(reg>=64) {
2119               assert(0);
2120             }
2121           }
2122         }
2123       }
2124     }
2125   }
2126 }
2127
// Move values from host registers a0/a1 into argument registers 0/1
// before a C call. trashes r2 (used as scratch for the swap case).
// a0/a1 may be -1 meaning "no value to place in that slot".
static void pass_args(int a0, int a1)
{
  if(a0==1&&a1==0) {
    // must swap: r1->r0 and r0->r1 via r2
    emit_mov(a0,2); emit_mov(a1,1); emit_mov(2,0);
  }
  else if(a0!=0&&a1==0) {
    // a1's value sits in r0: move it to r1 first so a0 can't clobber it
    emit_mov(a1,1);
    if (a0>=0) emit_mov(a0,0);
  }
  else {
    // no overlap hazard; move each operand if present and misplaced
    if(a0>=0&&a0!=0) emit_mov(a0,0);
    if(a1>=0&&a1!=1) emit_mov(a1,1);
  }
}
2144
// Emit host code for a SPECIAL-encoded ALU instruction (ADD/SUB group,
// SLT/SLTU, AND/OR/XOR/NOR). Operands that read r0 are specialized;
// results targeting r0 (rt1==0) emit nothing.
static void alu_assemble(int i,struct regstat *i_regs)
{
  if(dops[i].opcode2>=0x20&&dops[i].opcode2<=0x23) { // ADD/ADDU/SUB/SUBU
    if(dops[i].rt1) {
      signed char s1,s2,t;
      t=get_reg(i_regs->regmap,dops[i].rt1);
      if(t>=0) {
        s1=get_reg(i_regs->regmap,dops[i].rs1);
        s2=get_reg(i_regs->regmap,dops[i].rs2);
        if(dops[i].rs1&&dops[i].rs2) {
          assert(s1>=0);
          assert(s2>=0);
          // bit 1 of opcode2 distinguishes SUB(U) from ADD(U)
          if(dops[i].opcode2&2) emit_sub(s1,s2,t);
          else emit_add(s1,s2,t);
        }
        else if(dops[i].rs1) {
          // rs2 is r0: result is just rs1 (SUB rs1-0 == rs1)
          if(s1>=0) emit_mov(s1,t);
          else emit_loadreg(dops[i].rs1,t);
        }
        else if(dops[i].rs2) {
          // rs1 is r0: result is rs2 (ADD) or -rs2 (SUB)
          if(s2>=0) {
            if(dops[i].opcode2&2) emit_neg(s2,t);
            else emit_mov(s2,t);
          }
          else {
            emit_loadreg(dops[i].rs2,t);
            if(dops[i].opcode2&2) emit_neg(t,t);
          }
        }
        else emit_zeroreg(t); // r0 op r0
      }
    }
  }
  if(dops[i].opcode2>=0x2c&&dops[i].opcode2<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    assert(0); // 64-bit ops don't occur on PSX
  }
  if(dops[i].opcode2==0x2a||dops[i].opcode2==0x2b) { // SLT/SLTU
    if(dops[i].rt1) {
      signed char s1l,s2l,t;
      {
        t=get_reg(i_regs->regmap,dops[i].rt1);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,dops[i].rs1);
          s2l=get_reg(i_regs->regmap,dops[i].rs2);
          if(dops[i].rs2==0) // rx<r0
          {
            if(dops[i].opcode2==0x2a&&dops[i].rs1!=0) { // SLT
              assert(s1l>=0);
              // signed x<0 is just the sign bit
              emit_shrimm(s1l,31,t);
            }
            else // SLTU (unsigned can not be less than zero, 0<0)
              emit_zeroreg(t);
          }
          else if(dops[i].rs1==0) // r0<rx
          {
            assert(s2l>=0);
            if(dops[i].opcode2==0x2a) // SLT
              emit_set_gz32(s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz32(s2l,t);
          }
          else{
            assert(s1l>=0);assert(s2l>=0);
            if(dops[i].opcode2==0x2a) // SLT
              emit_set_if_less32(s1l,s2l,t);
            else // SLTU
              emit_set_if_carry32(s1l,s2l,t);
          }
        }
      }
    }
  }
  if(dops[i].opcode2>=0x24&&dops[i].opcode2<=0x27) { // AND/OR/XOR/NOR
    if(dops[i].rt1) {
      signed char s1l,s2l,tl;
      tl=get_reg(i_regs->regmap,dops[i].rt1);
      {
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,dops[i].rs1);
          s2l=get_reg(i_regs->regmap,dops[i].rs2);
          if(dops[i].rs1&&dops[i].rs2) {
            assert(s1l>=0);
            assert(s2l>=0);
            if(dops[i].opcode2==0x24) { // AND
              emit_and(s1l,s2l,tl);
            } else
            if(dops[i].opcode2==0x25) { // OR
              emit_or(s1l,s2l,tl);
            } else
            if(dops[i].opcode2==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
            } else
            if(dops[i].opcode2==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_not(tl,tl);
            }
          }
          else
          {
            // at least one source is r0 (reads as zero)
            if(dops[i].opcode2==0x24) { // AND
              emit_zeroreg(tl);
            } else
            if(dops[i].opcode2==0x25||dops[i].opcode2==0x26) { // OR/XOR
              // x|0 == x^0 == x: copy the non-zero source
              if(dops[i].rs1){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(dops[i].rs1,tl); // CHECK: regmap_entry?
              }
              else
              if(dops[i].rs2){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(dops[i].rs2,tl); // CHECK: regmap_entry?
              }
              else emit_zeroreg(tl);
            } else
            if(dops[i].opcode2==0x27) { // NOR
              // ~(x|0) == ~x; ~(0|0) == -1
              if(dops[i].rs1){
                if(s1l>=0) emit_not(s1l,tl);
                else {
                  emit_loadreg(dops[i].rs1,tl);
                  emit_not(tl,tl);
                }
              }
              else
              if(dops[i].rs2){
                if(s2l>=0) emit_not(s2l,tl);
                else {
                  emit_loadreg(dops[i].rs2,tl);
                  emit_not(tl,tl);
                }
              }
              else emit_movimm(-1,tl);
            }
          }
        }
      }
    }
  }
}
2284
// Emit host code for immediate-operand instructions (LUI, ADDI(U),
// SLTI(U), ANDI/ORI/XORI). Known-constant results (isconst bit set for
// the target) are skipped; known-constant sources are folded.
void imm16_assemble(int i,struct regstat *i_regs)
{
  if (dops[i].opcode==0x0f) { // LUI
    if(dops[i].rt1) {
      signed char t;
      t=get_reg(i_regs->regmap,dops[i].rt1);
      //assert(t>=0);
      if(t>=0) {
        if(!((i_regs->isconst>>t)&1))
          emit_movimm(imm[i]<<16,t);
      }
    }
  }
  if(dops[i].opcode==0x08||dops[i].opcode==0x09) { // ADDI/ADDIU
    if(dops[i].rt1) {
      signed char s,t;
      t=get_reg(i_regs->regmap,dops[i].rt1);
      s=get_reg(i_regs->regmap,dops[i].rs1);
      if(dops[i].rs1) {
        //assert(t>=0);
        //assert(s>=0);
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1)) {
            if(s<0) {
              // source not in a host register: reload into t first
              if(i_regs->regmap_entry[t]!=dops[i].rs1) emit_loadreg(dops[i].rs1,t);
              emit_addimm(t,imm[i],t);
            }else{
              if(!((i_regs->wasconst>>s)&1))
                emit_addimm(s,imm[i],t);
              else
                // fold: source was a known constant
                emit_movimm(constmap[i][s]+imm[i],t);
            }
          }
        }
      } else {
        // rs1 is r0: result is just the immediate
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1))
            emit_movimm(imm[i],t);
        }
      }
    }
  }
  if(dops[i].opcode==0x18||dops[i].opcode==0x19) { // DADDI/DADDIU
    if(dops[i].rt1) {
      signed char sl,tl;
      tl=get_reg(i_regs->regmap,dops[i].rt1);
      sl=get_reg(i_regs->regmap,dops[i].rs1);
      if(tl>=0) {
        if(dops[i].rs1) {
          assert(sl>=0);
          emit_addimm(sl,imm[i],tl);
        } else {
          emit_movimm(imm[i],tl);
        }
      }
    }
  }
  else if(dops[i].opcode==0x0a||dops[i].opcode==0x0b) { // SLTI/SLTIU
    if(dops[i].rt1) {
      //assert(dops[i].rs1!=0); // r0 might be valid, but it's probably a bug
      signed char sl,t;
      t=get_reg(i_regs->regmap,dops[i].rt1);
      sl=get_reg(i_regs->regmap,dops[i].rs1);
      //assert(t>=0);
      if(t>=0) {
        if(dops[i].rs1>0) {
            if(dops[i].opcode==0x0a) { // SLTI
              if(sl<0) {
                // reload the source into t, then compare in place
                if(i_regs->regmap_entry[t]!=dops[i].rs1) emit_loadreg(dops[i].rs1,t);
                emit_slti32(t,imm[i],t);
              }else{
                emit_slti32(sl,imm[i],t);
              }
            }
            else { // SLTIU
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=dops[i].rs1) emit_loadreg(dops[i].rs1,t);
                emit_sltiu32(t,imm[i],t);
              }else{
                emit_sltiu32(sl,imm[i],t);
              }
            }
        }else{
          // SLTI(U) with r0 is just stupid,
          // nonetheless examples can be found
          if(dops[i].opcode==0x0a) // SLTI
            if(0<imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          else // SLTIU
          {
            if(imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          }
        }
      }
    }
  }
  else if(dops[i].opcode>=0x0c&&dops[i].opcode<=0x0e) { // ANDI/ORI/XORI
    if(dops[i].rt1) {
      signed char sl,tl;
      tl=get_reg(i_regs->regmap,dops[i].rt1);
      sl=get_reg(i_regs->regmap,dops[i].rs1);
      if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
        if(dops[i].opcode==0x0c) //ANDI
        {
          if(dops[i].rs1) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=dops[i].rs1) emit_loadreg(dops[i].rs1,tl);
              emit_andimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_andimm(sl,imm[i],tl);
              else
                emit_movimm(constmap[i][sl]&imm[i],tl);
            }
          }
          else
            emit_zeroreg(tl); // r0 & imm == 0
        }
        else
        {
          if(dops[i].rs1) {
            if(sl<0) {
              // reload once; both ORI and XORI below then operate on tl
              if(i_regs->regmap_entry[tl]!=dops[i].rs1) emit_loadreg(dops[i].rs1,tl);
            }
            if(dops[i].opcode==0x0d) { // ORI
              if(sl<0) {
                emit_orimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_orimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]|imm[i],tl);
              }
            }
            if(dops[i].opcode==0x0e) { // XORI
              if(sl<0) {
                emit_xorimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_xorimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]^imm[i],tl);
              }
            }
          }
          else {
            // r0 | imm == r0 ^ imm == imm
            emit_movimm(imm[i],tl);
          }
        }
      }
    }
  }
}
2439
// Emit host code for shift-by-immediate instructions (SLL/SRL/SRA).
// 64-bit variants assert: they can't occur in PSX code.
void shiftimm_assemble(int i,struct regstat *i_regs)
{
  if(dops[i].opcode2<=0x3) // SLL/SRL/SRA
  {
    if(dops[i].rt1) {
      signed char s,t;
      t=get_reg(i_regs->regmap,dops[i].rt1);
      s=get_reg(i_regs->regmap,dops[i].rs1);
      //assert(t>=0);
      if(t>=0&&!((i_regs->isconst>>t)&1)){
        if(dops[i].rs1==0)
        {
          emit_zeroreg(t); // shifting r0 gives 0
        }
        else
        {
          // if the source isn't in a register, reload it into t and shift in place
          if(s<0&&i_regs->regmap_entry[t]!=dops[i].rs1) emit_loadreg(dops[i].rs1,t);
          if(imm[i]) {
            if(dops[i].opcode2==0) // SLL
            {
              emit_shlimm(s<0?t:s,imm[i],t);
            }
            if(dops[i].opcode2==2) // SRL
            {
              emit_shrimm(s<0?t:s,imm[i],t);
            }
            if(dops[i].opcode2==3) // SRA
            {
              emit_sarimm(s<0?t:s,imm[i],t);
            }
          }else{
            // Shift by zero
            if(s>=0 && s!=t) emit_mov(s,t);
          }
        }
      }
      //emit_storereg(dops[i].rt1,t); //DEBUG
    }
  }
  if(dops[i].opcode2>=0x38&&dops[i].opcode2<=0x3b) // DSLL/DSRL/DSRA
  {
    assert(0);
  }
  if(dops[i].opcode2==0x3c) // DSLL32
  {
    assert(0);
  }
  if(dops[i].opcode2==0x3e) // DSRL32
  {
    assert(0);
  }
  if(dops[i].opcode2==0x3f) // DSRA32
  {
    assert(0);
  }
}
2496
2497 #ifndef shift_assemble
2498 static void shift_assemble(int i,struct regstat *i_regs)
2499 {
2500   signed char s,t,shift;
2501   if (dops[i].rt1 == 0)
2502     return;
2503   assert(dops[i].opcode2<=0x07); // SLLV/SRLV/SRAV
2504   t = get_reg(i_regs->regmap, dops[i].rt1);
2505   s = get_reg(i_regs->regmap, dops[i].rs1);
2506   shift = get_reg(i_regs->regmap, dops[i].rs2);
2507   if (t < 0)
2508     return;
2509
2510   if(dops[i].rs1==0)
2511     emit_zeroreg(t);
2512   else if(dops[i].rs2==0) {
2513     assert(s>=0);
2514     if(s!=t) emit_mov(s,t);
2515   }
2516   else {
2517     host_tempreg_acquire();
2518     emit_andimm(shift,31,HOST_TEMPREG);
2519     switch(dops[i].opcode2) {
2520     case 4: // SLLV
2521       emit_shl(s,HOST_TEMPREG,t);
2522       break;
2523     case 6: // SRLV
2524       emit_shr(s,HOST_TEMPREG,t);
2525       break;
2526     case 7: // SRAV
2527       emit_sar(s,HOST_TEMPREG,t);
2528       break;
2529     default:
2530       assert(0);
2531     }
2532     host_tempreg_release();
2533   }
2534 }
2535
2536 #endif
2537
// Coarse classification of a guest address, used by
// emit_fastpath_cmp_jump() to pick the cheapest address transformation.
enum {
  MTYPE_8000 = 0, // kseg0 (0x80000000) RAM - the default mapping
  MTYPE_8020,     // RAM mirror at 0x80200000+
  MTYPE_0000,     // kuseg RAM mirror at 0
  MTYPE_A000,     // kseg1 RAM mirror at 0xa0000000
  MTYPE_1F80,     // scratchpad / hardware register area
};
2545
2546 static int get_ptr_mem_type(u_int a)
2547 {
2548   if(a < 0x00200000) {
2549     if(a<0x1000&&((start>>20)==0xbfc||(start>>24)==0xa0))
2550       // return wrong, must use memhandler for BIOS self-test to pass
2551       // 007 does similar stuff from a00 mirror, weird stuff
2552       return MTYPE_8000;
2553     return MTYPE_0000;
2554   }
2555   if(0x1f800000 <= a && a < 0x1f801000)
2556     return MTYPE_1F80;
2557   if(0x80200000 <= a && a < 0x80800000)
2558     return MTYPE_8020;
2559   if(0xa0000000 <= a && a < 0xa0200000)
2560     return MTYPE_A000;
2561   return MTYPE_8000;
2562 }
2563
// Emit the inline "is this plain RAM?" check for a load/store.
// Uses the speculative memory-region tracking (smrv) to guess which
// mirror the address lies in and transform it to the canonical mapping.
// Returns the location of the emitted conditional branch (to be patched
// to the slow-path stub), or NULL if no check was needed. May redirect
// the address into HOST_TEMPREG via *addr_reg_override; the caller is
// responsible for releasing the temp reg in that case.
static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override)
{
  void *jaddr = NULL;
  int type=0;
  int mr=dops[i].rs1;
  if(((smrv_strong|smrv_weak)>>mr)&1) {
    // we have a (strong or weak) guess for the base register's value
    type=get_ptr_mem_type(smrv[mr]);
    //printf("set %08x @%08x r%d %d\n", smrv[mr], start+i*4, mr, type);
  }
  else {
    // use the mirror we are running on
    type=get_ptr_mem_type(start);
    //printf("set nospec   @%08x r%d %d\n", start+i*4, mr, type);
  }

  // For RAM mirrors, rewrite the address into the canonical mapping and
  // fall through to the generic RAM_SIZE check (type reset to 0).
  if(type==MTYPE_8020) { // RAM 80200000+ mirror
    host_tempreg_acquire();
    emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_0000) { // RAM 0 mirror
    host_tempreg_acquire();
    emit_orimm(addr,0x80000000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_A000) { // RAM A mirror
    host_tempreg_acquire();
    emit_andimm(addr,~0x20000000,HOST_TEMPREG);
    addr=*addr_reg_override=HOST_TEMPREG;
    type=0;
  }
  else if(type==MTYPE_1F80) { // scratchpad
    if (psxH == (void *)0x1f800000) {
      // scratchpad is mapped at its real address: range-check directly
      host_tempreg_acquire();
      emit_xorimm(addr,0x1f800000,HOST_TEMPREG);
      emit_cmpimm(HOST_TEMPREG,0x1000);
      host_tempreg_release();
      jaddr=out;
      emit_jc(0);
    }
    else {
      // do the usual RAM check, jump will go to the right handler
      type=0;
    }
  }

  if(type==0)
  {
    // generic check: addresses below RAM_SIZE (after masking above) are RAM
    emit_cmpimm(addr,RAM_SIZE);
    jaddr=out;
    #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
    // Hint to branch predictor that the branch is unlikely to be taken
    if(dops[i].rs1>=28)
      emit_jno_unlikely(0);
    else
    #endif
      emit_jno(0);
    if(ram_offset!=0) {
      // RAM not mapped at 0: apply the host-side offset
      host_tempreg_acquire();
      emit_addimm(addr,ram_offset,HOST_TEMPREG);
      addr=*addr_reg_override=HOST_TEMPREG;
    }
  }

  return jaddr;
}
2632
2633 // return memhandler, or get directly accessable address and return 0
2634 static void *get_direct_memhandler(void *table, u_int addr,
2635   enum stub_type type, uintptr_t *addr_host)
2636 {
2637   uintptr_t l1, l2 = 0;
2638   l1 = ((uintptr_t *)table)[addr>>12];
2639   if ((l1 & (1ul << (sizeof(l1)*8-1))) == 0) {
2640     uintptr_t v = l1 << 1;
2641     *addr_host = v + addr;
2642     return NULL;
2643   }
2644   else {
2645     l1 <<= 1;
2646     if (type == LOADB_STUB || type == LOADBU_STUB || type == STOREB_STUB)
2647       l2 = ((uintptr_t *)l1)[0x1000/4 + 0x1000/2 + (addr&0xfff)];
2648     else if (type == LOADH_STUB || type == LOADHU_STUB || type == STOREH_STUB)
2649       l2=((uintptr_t *)l1)[0x1000/4 + (addr&0xfff)/2];
2650     else
2651       l2=((uintptr_t *)l1)[(addr&0xfff)/4];
2652     if ((l2 & (1<<31)) == 0) {
2653       uintptr_t v = l2 << 1;
2654       *addr_host = v + (addr&0xfff);
2655       return NULL;
2656     }
2657     return (void *)(l2 << 1);
2658   }
2659 }
2660
2661 static u_int get_host_reglist(const signed char *regmap)
2662 {
2663   u_int reglist = 0, hr;
2664   for (hr = 0; hr < HOST_REGS; hr++) {
2665     if (hr != EXCLUDE_REG && regmap[hr] >= 0)
2666       reglist |= 1 << hr;
2667   }
2668   return reglist;
2669 }
2670
2671 static u_int reglist_exclude(u_int reglist, int r1, int r2)
2672 {
2673   if (r1 >= 0)
2674     reglist &= ~(1u << r1);
2675   if (r2 >= 0)
2676     reglist &= ~(1u << r2);
2677   return reglist;
2678 }
2679
2680 // find a temp caller-saved register not in reglist (so assumed to be free)
2681 static int reglist_find_free(u_int reglist)
2682 {
2683   u_int free_regs = ~reglist & CALLER_SAVE_REGS;
2684   if (free_regs == 0)
2685     return -1;
2686   return __builtin_ctz(free_regs);
2687 }
2688
// Emit host code for aligned loads (LB/LH/LW/LBU/LHU). Known-constant
// addresses that target RAM skip the inline check; otherwise
// emit_fastpath_cmp_jump() emits the RAM test and a stub handles the
// slow path. Loads to r0 are still performed (could hit a FIFO) but the
// result is discarded into a temp.
static void load_assemble(int i, const struct regstat *i_regs)
{
  int s,tl,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastio_reg_override=-1;
  u_int reglist=get_host_reglist(i_regs->regmap);
  tl=get_reg(i_regs->regmap,dops[i].rt1);
  s=get_reg(i_regs->regmap,dops[i].rs1);
  offset=imm[i];
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(s>=0) {
    // c: base register value is a known constant;
    // memtarget: that constant address falls in RAM
    c=(i_regs->wasconst>>s)&1;
    if (c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  // FIXME: Even if the load is a NOP, we should check for pagefaults...
  if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
    ||dops[i].rt1==0) {
      // could be FIFO, must perform the read
      // ||dummy read
      assem_debug("(forced read)\n");
      tl=get_reg(i_regs->regmap,-1); // discard into the temp register
      assert(tl>=0);
  }
  if(offset||s<0||c) addr=tl;
  else addr=s;
  //if(tl<0) tl=get_reg(i_regs->regmap,-1);
 if(tl>=0) {
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
  reglist&=~(1<<tl);
  if(!c) {
    #ifdef R29_HACK
    // Strmnnrmn's speed hack
    if(dops[i].rs1!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
    #endif
    {
      // unknown address: emit the inline RAM check
      jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
    }
  }
  else if(ram_offset&&memtarget) {
    // known RAM address, but RAM isn't mapped at 0 on the host
    host_tempreg_acquire();
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastio_reg_override=HOST_TEMPREG;
  }
  int dummy=(dops[i].rt1==0)||(tl!=get_reg(i_regs->regmap,dops[i].rt1)); // ignore loads to r0 and unneeded reg
  if (dops[i].opcode==0x20) { // LB
    if(!c||memtarget) {
      if(!dummy) {
        {
          int x=0,a=tl;
          if(!c) a=addr;
          if(fastio_reg_override>=0) a=fastio_reg_override;

          emit_movsbl_indexed(x,a,tl);
        }
      }
      if(jaddr)
        add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      // known non-RAM address: call the handler directly, no inline path
      inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj[i],reglist);
  }
  if (dops[i].opcode==0x21) { // LH
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_movswl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj[i],reglist);
  }
  if (dops[i].opcode==0x23) { // LW
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_readword_indexed(0,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj[i],reglist);
  }
  if (dops[i].opcode==0x24) { // LBU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;

        emit_movzbl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj[i],reglist);
  }
  if (dops[i].opcode==0x25) { // LHU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastio_reg_override>=0) a=fastio_reg_override;
        emit_movzwl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,dops[i].rt1,ccadj[i],reglist);
  }
  if (dops[i].opcode==0x27) { // LWU
    assert(0); // 64-bit load, can't occur on PSX
  }
  if (dops[i].opcode==0x37) { // LD
    assert(0);
  }
 }
 if (fastio_reg_override == HOST_TEMPREG)
   host_tempreg_release();
}
2824
2825 #ifndef loadlr_assemble
// Emit host code for unaligned loads (LWL/LWR). Reads the containing
// aligned word, then merges the relevant bytes into rt1 using a
// shift + bic mask; temp holds the bit shift amount (byte offset * 8).
static void loadlr_assemble(int i, const struct regstat *i_regs)
{
  int s,tl,temp,temp2,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastio_reg_override=-1;
  u_int reglist=get_host_reglist(i_regs->regmap);
  tl=get_reg(i_regs->regmap,dops[i].rt1);
  s=get_reg(i_regs->regmap,dops[i].rs1);
  temp=get_reg(i_regs->regmap,-1);
  temp2=get_reg(i_regs->regmap,FTEMP);
  addr=get_reg(i_regs->regmap,AGEN1+(i&1));
  assert(addr<0);
  offset=imm[i];
  reglist|=1<<temp;
  // NOTE(review): c is still 0 here - it is only assigned below - so the
  // "||c" term is dead at this point; confirm against load_assemble(),
  // which does this selection after computing c.
  if(offset||s<0||c) addr=temp2;
  else addr=s;
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1;
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  if(!c) {
    // temp = bit shift amount (low address bits * 8),
    // temp2 = aligned address of the containing word/doubleword
    emit_shlimm(addr,3,temp);
    if (dops[i].opcode==0x22||dops[i].opcode==0x26) {
      emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR
    }else{
      emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR
    }
    jaddr=emit_fastpath_cmp_jump(i,temp2,&fastio_reg_override);
  }
  else {
    if(ram_offset&&memtarget) {
      host_tempreg_acquire();
      emit_addimm(temp2,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    // constant address: shift amount is known at compile time
    if (dops[i].opcode==0x22||dops[i].opcode==0x26) {
      emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR
    }else{
      emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR
    }
  }
  if (dops[i].opcode==0x22||dops[i].opcode==0x26) { // LWL/LWR
    if(!c||memtarget) {
      int a=temp2;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_readword_indexed(0,a,temp2);
      if(fastio_reg_override==HOST_TEMPREG) host_tempreg_release();
      if(jaddr) add_stub_r(LOADW_STUB,jaddr,out,i,temp2,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj[i],reglist);
    if(dops[i].rt1) {
      assert(tl>=0);
      // merge: shift loaded word into place, clear the replaced bytes
      // of rt1 with a shifted all-ones mask, then OR the pieces
      emit_andimm(temp,24,temp);
      if (dops[i].opcode==0x22) // LWL
        emit_xorimm(temp,24,temp);
      host_tempreg_acquire();
      emit_movimm(-1,HOST_TEMPREG);
      if (dops[i].opcode==0x26) {
        emit_shr(temp2,temp,temp2);
        emit_bic_lsr(tl,HOST_TEMPREG,temp,tl);
      }else{
        emit_shl(temp2,temp,temp2);
        emit_bic_lsl(tl,HOST_TEMPREG,temp,tl);
      }
      host_tempreg_release();
      emit_or(temp2,tl,tl);
    }
    //emit_storereg(dops[i].rt1,tl); // DEBUG
  }
  if (dops[i].opcode==0x1A||dops[i].opcode==0x1B) { // LDL/LDR
    assert(0); // 64-bit, can't occur on PSX
  }
}
2904 #endif
2905
2906 void store_assemble(int i, const struct regstat *i_regs)
2907 {
2908   int s,tl;
2909   int addr,temp;
2910   int offset;
2911   void *jaddr=0;
2912   enum stub_type type;
2913   int memtarget=0,c=0;
2914   int agr=AGEN1+(i&1);
2915   int fastio_reg_override=-1;
2916   u_int reglist=get_host_reglist(i_regs->regmap);
2917   tl=get_reg(i_regs->regmap,dops[i].rs2);
2918   s=get_reg(i_regs->regmap,dops[i].rs1);
2919   temp=get_reg(i_regs->regmap,agr);
2920   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2921   offset=imm[i];
2922   if(s>=0) {
2923     c=(i_regs->wasconst>>s)&1;
2924     if(c) {
2925       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2926     }
2927   }
2928   assert(tl>=0);
2929   assert(temp>=0);
2930   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2931   if(offset||s<0||c) addr=temp;
2932   else addr=s;
2933   if(!c) {
2934     jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
2935   }
2936   else if(ram_offset&&memtarget) {
2937     host_tempreg_acquire();
2938     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2939     fastio_reg_override=HOST_TEMPREG;
2940   }
2941
2942   if (dops[i].opcode==0x28) { // SB
2943     if(!c||memtarget) {
2944       int x=0,a=temp;
2945       if(!c) a=addr;
2946       if(fastio_reg_override>=0) a=fastio_reg_override;
2947       emit_writebyte_indexed(tl,x,a);
2948     }
2949     type=STOREB_STUB;
2950   }
2951   if (dops[i].opcode==0x29) { // SH
2952     if(!c||memtarget) {
2953       int x=0,a=temp;
2954       if(!c) a=addr;
2955       if(fastio_reg_override>=0) a=fastio_reg_override;
2956       emit_writehword_indexed(tl,x,a);
2957     }
2958     type=STOREH_STUB;
2959   }
2960   if (dops[i].opcode==0x2B) { // SW
2961     if(!c||memtarget) {
2962       int a=addr;
2963       if(fastio_reg_override>=0) a=fastio_reg_override;
2964       emit_writeword_indexed(tl,0,a);
2965     }
2966     type=STOREW_STUB;
2967   }
2968   if (dops[i].opcode==0x3F) { // SD
2969     assert(0);
2970     type=STORED_STUB;
2971   }
2972   if(fastio_reg_override==HOST_TEMPREG)
2973     host_tempreg_release();
2974   if(jaddr) {
2975     // PCSX store handlers don't check invcode again
2976     reglist|=1<<addr;
2977     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2978     jaddr=0;
2979   }
2980   if(!(i_regs->waswritten&(1<<dops[i].rs1)) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
2981     if(!c||memtarget) {
2982       #ifdef DESTRUCTIVE_SHIFT
2983       // The x86 shift operation is 'destructive'; it overwrites the
2984       // source register, so we need to make a copy first and use that.
2985       addr=temp;
2986       #endif
2987       #if defined(HOST_IMM8)
2988       int ir=get_reg(i_regs->regmap,INVCP);
2989       assert(ir>=0);
2990       emit_cmpmem_indexedsr12_reg(ir,addr,1);
2991       #else
2992       emit_cmpmem_indexedsr12_imm(invalid_code,addr,1);
2993       #endif
2994       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2995       emit_callne(invalidate_addr_reg[addr]);
2996       #else
2997       void *jaddr2 = out;
2998       emit_jne(0);
2999       add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3000       #endif
3001     }
3002   }
3003   u_int addr_val=constmap[i][s]+offset;
3004   if(jaddr) {
3005     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
3006   } else if(c&&!memtarget) {
3007     inline_writestub(type,i,addr_val,i_regs->regmap,dops[i].rs2,ccadj[i],reglist);
3008   }
3009   // basic current block modification detection..
3010   // not looking back as that should be in mips cache already
3011   // (see Spyro2 title->attract mode)
3012   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
3013     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
3014     assert(i_regs->regmap==regs[i].regmap); // not delay slot
3015     if(i_regs->regmap==regs[i].regmap) {
3016       load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
3017       wb_dirtys(regs[i].regmap_entry,regs[i].wasdirty);
3018       emit_movimm(start+i*4+4,0);
3019       emit_writeword(0,&pcaddr);
3020       emit_addimm(HOST_CCREG,2,HOST_CCREG);
3021       emit_far_call(get_addr_ht);
3022       emit_jmpreg(0);
3023     }
3024   }
3025 }
3026
// Emit code for SWL/SWR (unaligned store left/right).  The target word
// is selected by aligning the address; which bytes get written is
// dispatched at runtime on (addr&3) via a 4-way branch tree, using
// rotates and partial stores.  SDL/SDR assert (no 64-bit on PSX).
static void storelr_assemble(int i, const struct regstat *i_regs)
{
  int s,tl;
  int temp;
  int offset;
  void *jaddr=0;
  void *case1, *case2, *case3;   // branch targets for addr&3 == 1/2/3
  void *done0, *done1, *done2;   // jumps past the remaining cases
  int memtarget=0,c=0;
  int agr=AGEN1+(i&1);
  u_int reglist=get_host_reglist(i_regs->regmap);
  tl=get_reg(i_regs->regmap,dops[i].rs2);  // value to store
  s=get_reg(i_regs->regmap,dops[i].rs1);   // base register (may be <0)
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    c=(i_regs->isconst>>s)&1;              // address is a known constant
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  assert(temp>=0);
  if(!c) {
    // runtime RAM range check; out-of-range goes to the STORELR stub
    emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
    if(!offset&&s!=temp) emit_mov(s,temp);
    jaddr=out;
    emit_jno(0);
  }
  else
  {
    if(!memtarget||!dops[i].rs1) {
      jaddr=out;
      emit_jmp(0);
    }
  }
  if(ram_offset)
    emit_addimm_no_flags(ram_offset,temp);

  if (dops[i].opcode==0x2C||dops[i].opcode==0x2D) { // SDL/SDR
    assert(0);
  }

  // note: temp holds addr^3 from here on, so the byte offsets below
  // are relative to the flipped address
  emit_xorimm(temp,3,temp);
  emit_testimm(temp,2);
  case2=out;
  emit_jne(0);
  emit_testimm(temp,1);
  case1=out;
  emit_jne(0);
  // 0
  if (dops[i].opcode==0x2A) { // SWL
    emit_writeword_indexed(tl,0,temp);
  }
  else if (dops[i].opcode==0x2E) { // SWR
    emit_writebyte_indexed(tl,3,temp);
  }
  else
    assert(0);
  done0=out;
  emit_jmp(0);
  // 1
  set_jump_target(case1, out);
  if (dops[i].opcode==0x2A) { // SWL
    // Write 3 msb into three least significant bytes
    if(dops[i].rs2) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,-1,temp);
    if(dops[i].rs2) emit_rorimm(tl,16,tl);
    emit_writebyte_indexed(tl,1,temp);
    if(dops[i].rs2) emit_rorimm(tl,8,tl);
  }
  else if (dops[i].opcode==0x2E) { // SWR
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  done1=out;
  emit_jmp(0);
  // 2
  set_jump_target(case2, out);
  emit_testimm(temp,1);
  case3=out;
  emit_jne(0);
  if (dops[i].opcode==0x2A) { // SWL
    // Write two msb into two least significant bytes
    if(dops[i].rs2) emit_rorimm(tl,16,tl);
    emit_writehword_indexed(tl,-2,temp);
    if(dops[i].rs2) emit_rorimm(tl,16,tl);
  }
  else if (dops[i].opcode==0x2E) { // SWR
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(dops[i].rs2) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(dops[i].rs2) emit_rorimm(tl,24,tl);
  }
  done2=out;
  emit_jmp(0);
  // 3
  set_jump_target(case3, out);
  if (dops[i].opcode==0x2A) { // SWL
    // Write msb into least significant byte
    if(dops[i].rs2) emit_rorimm(tl,24,tl);
    emit_writebyte_indexed(tl,-3,temp);
    if(dops[i].rs2) emit_rorimm(tl,8,tl);
  }
  else if (dops[i].opcode==0x2E) { // SWR
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  set_jump_target(done0, out);
  set_jump_target(done1, out);
  set_jump_target(done2, out);
  if(!c||!memtarget)
    add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
  // self-modifying-code check (invalid_code), unless hack-disabled
  if(!(i_regs->waswritten&(1<<dops[i].rs1)) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
    emit_addimm_no_flags(-ram_offset,temp);  // undo the rebase for the check
    #if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,temp,1);
    #else
    emit_cmpmem_indexedsr12_imm(invalid_code,temp,1);
    #endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[temp]);
    #else
    void *jaddr2 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
    #endif
  }
}
3160
// Emit code for COP0 ops: MFC0 (read cop0 reg), MTC0 (write cop0 reg,
// with cycle-count sync and pending-interrupt handling for SR/CAUSE),
// and RFE (restore the interrupt/mode bits of the Status register).
static void cop0_assemble(int i,struct regstat *i_regs)
{
  if(dops[i].opcode2==0) // MFC0
  {
    signed char t=get_reg(i_regs->regmap,dops[i].rt1);
    u_int copr=(source[i]>>11)&0x1f;
    //assert(t>=0); // Why does this happen?  OOT is weird
    if(t>=0&&dops[i].rt1!=0) {
      emit_readword(&reg_cop0[copr],t);
    }
  }
  else if(dops[i].opcode2==4) // MTC0
  {
    signed char s=get_reg(i_regs->regmap,dops[i].rs1);
    char copr=(source[i]>>11)&0x1f;
    assert(s>=0);
    wb_register(dops[i].rs1,i_regs->regmap,i_regs->dirty);
    // Count/Compare/SR/CAUSE depend on the current cycle count, so
    // materialize Count in psxRegs before calling the C handler.
    if(copr==9||copr==11||copr==12||copr==13) {
      emit_readword(&last_count,HOST_TEMPREG);
      emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc
      emit_add(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
      emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
      emit_writeword(HOST_CCREG,&Count);
    }
    // What a mess.  The status register (12) can enable interrupts,
    // so needs a special case to handle a pending interrupt.
    // The interrupt must be taken immediately, because a subsequent
    // instruction might disable interrupts again.
    if(copr==12||copr==13) {
      if (is_delayslot) {
        // burn cycles to cause cc_interrupt, which will
        // reschedule next_interupt. Relies on CCREG from above.
        assem_debug("MTC0 DS %d\n", copr);
        emit_writeword(HOST_CCREG,&last_count);
        emit_movimm(0,HOST_CCREG);
        emit_storereg(CCREG,HOST_CCREG);
        emit_loadreg(dops[i].rs1,1);
        emit_movimm(copr,0);
        emit_far_call(pcsx_mtc0_ds);
        emit_loadreg(dops[i].rs1,s);
        return;
      }
      emit_movimm(start+i*4+4,HOST_TEMPREG);
      emit_writeword(HOST_TEMPREG,&pcaddr);
      emit_movimm(0,HOST_TEMPREG);
      emit_writeword(HOST_TEMPREG,&pending_exception);
    }
    // args for pcsx_mtc0: r0 = cop0 reg number, r1 = value
    if(s==HOST_CCREG)
      emit_loadreg(dops[i].rs1,1);
    else if(s!=1)
      emit_mov(s,1);
    emit_movimm(copr,0);
    emit_far_call(pcsx_mtc0);
    if(copr==9||copr==11||copr==12||copr==13) {
      // re-derive cycle count registers after the C call may have
      // rescheduled next_interupt
      emit_readword(&Count,HOST_CCREG);
      emit_readword(&next_interupt,HOST_TEMPREG);
      emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
      emit_sub(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
      emit_writeword(HOST_TEMPREG,&last_count);
      emit_storereg(CCREG,HOST_CCREG);
    }
    if(copr==12||copr==13) {
      assert(!is_delayslot);
      // if an exception became pending, jump to its handler now
      emit_readword(&pending_exception,14);
      emit_test(14,14);
      void *jaddr = out;
      emit_jeq(0);
      emit_readword(&pcaddr, 0);
      emit_addimm(HOST_CCREG,2,HOST_CCREG);
      emit_far_call(get_addr_ht);
      emit_jmpreg(0);
      set_jump_target(jaddr, out);
    }
    emit_loadreg(dops[i].rs1,s);  // restore clobbered source register
  }
  else
  {
    assert(dops[i].opcode2==0x10);
    //if((source[i]&0x3f)==0x10) // RFE
    {
      // Status = (Status & ~0xf) | ((Status >> 2) & 0x3c):
      // pop the KU/IE interrupt/mode bit stack
      emit_readword(&Status,0);
      emit_andimm(0,0x3c,1);
      emit_andimm(0,~0xf,0);
      emit_orrshr_imm(1,2,0);
      emit_writeword(0,&Status);
    }
  }
}
3249
3250 static void cop1_unusable(int i,struct regstat *i_regs)
3251 {
3252   // XXX: should just just do the exception instead
3253   //if(!cop1_usable)
3254   {
3255     void *jaddr=out;
3256     emit_jmp(0);
3257     add_stub_r(FP_STUB,jaddr,out,i,0,i_regs,is_delayslot,0);
3258   }
3259 }
3260
// COP1 register ops: no FPU on PSX, always raise coprocessor-unusable.
static void cop1_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3265
// LWC1/SWC1: no FPU on PSX, always raise coprocessor-unusable.
static void c1ls_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3270
3271 // FP_STUB
// FP_STUB
// Out-of-line stub for COP1 (FPU-unusable) exceptions: write back dirty
// registers, load the faulting PC and cycle count, then jump to the
// C exception handler (delay-slot variant if needed).
static void do_cop1stub(int n)
{
  literal_pool(256);
  assem_debug("do_cop1stub %x\n",start+stubs[n].a*4);
  set_jump_target(stubs[n].addr, out);
  int i=stubs[n].a;      // instruction index within the block
//  int rs=stubs[n].b;
  struct regstat *i_regs=(struct regstat *)stubs[n].c;
  int ds=stubs[n].d;     // nonzero if the op was in a delay slot
  if(!ds) {
    load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
    //if(i_regs!=&regs[i]) printf("oops: regs[i]=%x i_regs=%x",(int)&regs[i],(int)i_regs);
  }
  //else {printf("fp exception in delay slot\n");}
  wb_dirtys(i_regs->regmap_entry,i_regs->wasdirty);
  if(regs[i].regmap_entry[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
  emit_movimm(start+(i-ds)*4,EAX); // Get PC
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
  emit_far_jump(ds?fp_exception_ds:fp_exception);
}
3292
3293 static int cop2_is_stalling_op(int i, int *cycles)
3294 {
3295   if (dops[i].opcode == 0x3a) { // SWC2
3296     *cycles = 0;
3297     return 1;
3298   }
3299   if (dops[i].itype == COP2 && (dops[i].opcode2 == 0 || dops[i].opcode2 == 2)) { // MFC2/CFC2
3300     *cycles = 0;
3301     return 1;
3302   }
3303   if (dops[i].itype == C2OP) {
3304     *cycles = gte_cycletab[source[i] & 0x3f];
3305     return 1;
3306   }
3307   // ... what about MTC2/CTC2/LWC2?
3308   return 0;
3309 }
3310
// Debug-only helpers for tracing GTE stall behavior; compiled out by
// default.  emit_log_gte_stall generates a call into log_gte_stall with
// the (known or runtime) stall amount and the current cycle.
#if 0
static void log_gte_stall(int stall, u_int cycle)
{
  if ((u_int)stall <= 44)
    printf("x    stall %2d %u\n", stall, cycle + last_count);
}

static void emit_log_gte_stall(int i, int stall, u_int reglist)
{
  save_regs(reglist);
  if (stall > 0)
    emit_movimm(stall, 0);
  else
    emit_mov(HOST_TEMPREG, 0);
  emit_addimm(HOST_CCREG, CLOCK_ADJUST(ccadj[i]), 1);
  emit_far_call(log_gte_stall);
  restore_regs(reglist);
}
#endif
3330
// Account for GTE busy-time before a GTE-dependent op: if a previous GTE
// command's cycle count is statically known, add the remaining stall to
// CCREG directly; otherwise emit a runtime check against gteBusyCycle.
// Afterwards, if no later op in the block will do the accounting, save
// the new busy deadline to psxRegs.gteBusyCycle.
// 'op' is the GTE command opcode (0 for loads/stores/moves).
static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u_int reglist)
{
  int j = i, other_gte_op_cycles = -1, stall = -MAXBLOCK, cycles_passed;
  int rtmp = reglist_find_free(reglist);

  if (HACK_ENABLED(NDHACK_NO_STALLS))
    return;
  if (get_reg(i_regs->regmap, CCREG) != HOST_CCREG) {
    // happens occasionally... cc evicted? Don't bother then
    //printf("no cc %08x\n", start + i*4);
    return;
  }
  // scan backwards for the most recent GTE op in this block
  // (only when not at a branch target, where predecessors are unknown)
  if (!dops[i].bt) {
    for (j = i - 1; j >= 0; j--) {
      //if (dops[j].is_ds) break;
      if (cop2_is_stalling_op(j, &other_gte_op_cycles) || dops[j].bt)
        break;
    }
    j = max(j, 0);
  }
  cycles_passed = CLOCK_ADJUST(ccadj[i] - ccadj[j]);
  if (other_gte_op_cycles >= 0)
    stall = other_gte_op_cycles - cycles_passed;
  else if (cycles_passed >= 44)
    stall = 0; // can't stall
  if (stall == -MAXBLOCK && rtmp >= 0) {
    // unknown stall, do the expensive runtime check
    assem_debug("; cop2_do_stall_check\n");
#if 0 // too slow
    save_regs(reglist);
    emit_movimm(gte_cycletab[op], 0);
    emit_addimm(HOST_CCREG, CLOCK_ADJUST(ccadj[i]), 1);
    emit_far_call(call_gteStall);
    restore_regs(reglist);
#else
    // if (gteBusyCycle - cc < 44) cc = gteBusyCycle - ccadj[i];
    // i.e. advance the cycle counter to the GTE completion time
    host_tempreg_acquire();
    emit_readword(&psxRegs.gteBusyCycle, rtmp);
    emit_addimm(rtmp, -CLOCK_ADJUST(ccadj[i]), rtmp);
    emit_sub(rtmp, HOST_CCREG, HOST_TEMPREG);
    emit_cmpimm(HOST_TEMPREG, 44);
    emit_cmovb_reg(rtmp, HOST_CCREG);
    //emit_log_gte_stall(i, 0, reglist);
    host_tempreg_release();
#endif
  }
  else if (stall > 0) {
    // statically-known stall: just burn the cycles
    //emit_log_gte_stall(i, stall, reglist);
    emit_addimm(HOST_CCREG, stall, HOST_CCREG);
  }

  // save gteBusyCycle, if needed
  if (gte_cycletab[op] == 0)
    return;
  other_gte_op_cycles = -1;
  // scan forwards: if a later op in the block stalls, it will do the
  // accounting itself and we can skip the store
  for (j = i + 1; j < slen; j++) {
    if (cop2_is_stalling_op(j, &other_gte_op_cycles))
      break;
    if (dops[j].is_jump) {
      // check ds
      if (j + 1 < slen && cop2_is_stalling_op(j + 1, &other_gte_op_cycles))
        j++;
      break;
    }
  }
  if (other_gte_op_cycles >= 0)
    // will handle stall when assembling that op
    return;
  cycles_passed = CLOCK_ADJUST(ccadj[min(j, slen -1)] - ccadj[i]);
  if (cycles_passed >= 44)
    return;
  assem_debug("; save gteBusyCycle\n");
  host_tempreg_acquire();
#if 0
  emit_readword(&last_count, HOST_TEMPREG);
  emit_add(HOST_TEMPREG, HOST_CCREG, HOST_TEMPREG);
  emit_addimm(HOST_TEMPREG, CLOCK_ADJUST(ccadj[i]), HOST_TEMPREG);
  emit_addimm(HOST_TEMPREG, gte_cycletab[op]), HOST_TEMPREG);
  emit_writeword(HOST_TEMPREG, &psxRegs.gteBusyCycle);
#else
  emit_addimm(HOST_CCREG, CLOCK_ADJUST(ccadj[i]) + gte_cycletab[op], HOST_TEMPREG);
  emit_writeword(HOST_TEMPREG, &psxRegs.gteBusyCycle);
#endif
  host_tempreg_release();
}
3415
3416 static int is_mflohi(int i)
3417 {
3418   return (dops[i].itype == MOV && (dops[i].rs1 == HIREG || dops[i].rs1 == LOREG));
3419 }
3420
3421 static int check_multdiv(int i, int *cycles)
3422 {
3423   if (dops[i].itype != MULTDIV)
3424     return 0;
3425   if (dops[i].opcode2 == 0x18 || dops[i].opcode2 == 0x19) // MULT(U)
3426     *cycles = 11; // approx from 7 11 14
3427   else
3428     *cycles = 37;
3429   return 1;
3430 }
3431
// After assembling a MULT/DIV: if no MFHI/MFLO later in this block will
// consume the result (and thus handle the stall itself), record the
// cycle at which the mul/div unit becomes free in muldivBusyCycle so a
// later block can stall correctly.
static void multdiv_prepare_stall(int i, const struct regstat *i_regs)
{
  int j, found = 0, c = 0;
  if (HACK_ENABLED(NDHACK_NO_STALLS))
    return;
  if (get_reg(i_regs->regmap, CCREG) != HOST_CCREG) {
    // happens occasionally... cc evicted? Don't bother then
    return;
  }
  // look forward for an MFHI/MFLO within this block
  for (j = i + 1; j < slen; j++) {
    if (dops[j].bt)
      break;
    if ((found = is_mflohi(j)))
      break;
    if (dops[j].is_jump) {
      // check ds
      if (j + 1 < slen && (found = is_mflohi(j + 1)))
        j++;
      break;
    }
  }
  if (found)
    // handle all in multdiv_do_stall()
    return;
  check_multdiv(i, &c);
  assert(c > 0);
  assem_debug("; muldiv prepare stall %d\n", c);
  // muldivBusyCycle = current cycle + op latency
  host_tempreg_acquire();
  emit_addimm(HOST_CCREG, CLOCK_ADJUST(ccadj[i]) + c, HOST_TEMPREG);
  emit_writeword(HOST_TEMPREG, &psxRegs.muldivBusyCycle);
  host_tempreg_release();
}
3464
// At an MFHI/MFLO: stall until the mul/div unit is free.  If the
// producing MULT/DIV is in this block the stall is computed statically;
// otherwise a runtime check against psxRegs.muldivBusyCycle is emitted.
static void multdiv_do_stall(int i, const struct regstat *i_regs)
{
  int j, known_cycles = 0;
  u_int reglist = get_host_reglist(i_regs->regmap);
  int rtmp = get_reg(i_regs->regmap, -1);
  if (rtmp < 0)
    rtmp = reglist_find_free(reglist);
  if (HACK_ENABLED(NDHACK_NO_STALLS))
    return;
  if (get_reg(i_regs->regmap, CCREG) != HOST_CCREG || rtmp < 0) {
    // happens occasionally... cc evicted? Don't bother then
    //printf("no cc/rtmp %08x\n", start + i*4);
    return;
  }
  // scan backwards for the producing MULT/DIV in this block
  if (!dops[i].bt) {
    for (j = i - 1; j >= 0; j--) {
      if (dops[j].is_ds) break;
      if (check_multdiv(j, &known_cycles) || dops[j].bt)
        break;
      if (is_mflohi(j))
        // already handled by this op
        return;
    }
    j = max(j, 0);
  }
  if (known_cycles > 0) {
    // static case: burn the remaining latency directly
    known_cycles -= CLOCK_ADJUST(ccadj[i] - ccadj[j]);
    assem_debug("; muldiv stall resolved %d\n", known_cycles);
    if (known_cycles > 0)
      emit_addimm(HOST_CCREG, known_cycles, HOST_CCREG);
    return;
  }
  assem_debug("; muldiv stall unresolved\n");
  // runtime case: if (muldivBusyCycle - cc < 37) cc = busy deadline
  // (37 is the worst-case DIV latency)
  host_tempreg_acquire();
  emit_readword(&psxRegs.muldivBusyCycle, rtmp);
  emit_addimm(rtmp, -CLOCK_ADJUST(ccadj[i]), rtmp);
  emit_sub(rtmp, HOST_CCREG, HOST_TEMPREG);
  emit_cmpimm(HOST_TEMPREG, 37);
  emit_cmovb_reg(rtmp, HOST_CCREG);
  //emit_log_gte_stall(i, 0, reglist);
  host_tempreg_release();
}
3507
// Emit a read of GTE data register 'copr' into host register tl.
// Several GTE data registers read back with special semantics
// (sign/zero extension, mirroring), which are replicated here.
static void cop2_get_dreg(u_int copr,signed char tl,signed char temp)
{
  switch (copr) {
    // these registers read back sign-extended to 16 bits
    case 1:
    case 3:
    case 5:
    case 8:
    case 9:
    case 10:
    case 11:
      emit_readword(&reg_cop2d[copr],tl);
      emit_signextend16(tl,tl);
      emit_writeword(tl,&reg_cop2d[copr]); // hmh
      break;
    // these read back zero-extended to 16 bits
    case 7:
    case 16:
    case 17:
    case 18:
    case 19:
      emit_readword(&reg_cop2d[copr],tl);
      emit_andimm(tl,0xffff,tl);
      emit_writeword(tl,&reg_cop2d[copr]);
      break;
    case 15:
      emit_readword(&reg_cop2d[14],tl); // SXY2
      emit_writeword(tl,&reg_cop2d[copr]);
      break;
    // IRGB/ORGB: recombine from IR1-IR3 (28 intentionally shares the
    // 29 handler)
    case 28:
    case 29:
      c2op_mfc2_29_assemble(tl,temp);
      break;
    default:
      emit_readword(&reg_cop2d[copr],tl);
      break;
  }
}
3544
// Emit a write of host register sl into GTE data register 'copr',
// replicating the GTE's special write side effects (SXY FIFO push,
// IRGB color unpack, LZCS/LZCR leading-bit count).  'temp' is a
// scratch host register.
static void cop2_put_dreg(u_int copr,signed char sl,signed char temp)
{
  switch (copr) {
    case 15:
      // writing SXYP pushes the screen-XY FIFO: SXY0<-SXY1, SXY1<-SXY2,
      // SXY2<-new value
      emit_readword(&reg_cop2d[13],temp);  // SXY1
      emit_writeword(sl,&reg_cop2d[copr]);
      emit_writeword(temp,&reg_cop2d[12]); // SXY0
      emit_readword(&reg_cop2d[14],temp);  // SXY2
      emit_writeword(sl,&reg_cop2d[14]);
      emit_writeword(temp,&reg_cop2d[13]); // SXY1
      break;
    case 28:
      // writing IRGB unpacks the 5:5:5 color into IR1/IR2/IR3 (<<4)
      emit_andimm(sl,0x001f,temp);
      emit_shlimm(temp,7,temp);
      emit_writeword(temp,&reg_cop2d[9]);
      emit_andimm(sl,0x03e0,temp);
      emit_shlimm(temp,2,temp);
      emit_writeword(temp,&reg_cop2d[10]);
      emit_andimm(sl,0x7c00,temp);
      emit_shrimm(temp,3,temp);
      emit_writeword(temp,&reg_cop2d[11]);
      emit_writeword(sl,&reg_cop2d[28]);
      break;
    case 30:
      // writing LZCS also computes LZCR = count of leading bits equal
      // to the sign bit
      emit_xorsar_imm(sl,sl,31,temp);
#if defined(HAVE_ARMV5) || defined(__aarch64__)
      emit_clz(temp,temp);
#else
      // no CLZ instruction: count leading zeros with a shift loop
      emit_movs(temp,HOST_TEMPREG);
      emit_movimm(0,temp);
      emit_jeq((int)out+4*4);
      emit_addpl_imm(temp,1,temp);
      emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG);
      emit_jns((int)out-2*4);
#endif
      emit_writeword(sl,&reg_cop2d[30]);
      emit_writeword(temp,&reg_cop2d[31]);
      break;
    case 31:
      // LZCR is read-only
      break;
    default:
      emit_writeword(sl,&reg_cop2d[copr]);
      break;
  }
}
3590
// Emit code for LWC2/SWC2 (GTE data register load/store via memory),
// including the GTE stall check, fast-path RAM access with stub
// fallback, and (for SWC2) the self-modifying-code check.
static void c2ls_assemble(int i, const struct regstat *i_regs)
{
  int s,tl;
  int ar;
  int offset;
  int memtarget=0,c=0;
  void *jaddr2=NULL;
  enum stub_type type;
  int agr=AGEN1+(i&1);
  int fastio_reg_override=-1;
  u_int reglist=get_host_reglist(i_regs->regmap);
  u_int copr=(source[i]>>16)&0x1f;    // GTE data register number
  s=get_reg(i_regs->regmap,dops[i].rs1);
  tl=get_reg(i_regs->regmap,FTEMP);   // holds the loaded/stored value
  offset=imm[i];
  assert(dops[i].rs1>0);
  assert(tl>=0);

  if(i_regs->regmap[HOST_CCREG]==CCREG)
    reglist&=~(1<<HOST_CCREG);

  // get the address
  if (dops[i].opcode==0x3a) { // SWC2
    ar=get_reg(i_regs->regmap,agr);
    if(ar<0) ar=get_reg(i_regs->regmap,-1);
    reglist|=1<<ar;
  } else { // LWC2
    ar=tl;
  }
  if(s>=0) c=(i_regs->wasconst>>s)&1;
  memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
  if (!offset&&!c&&s>=0) ar=s;
  assert(ar>=0);

  cop2_do_stall_check(0, i, i_regs, reglist);

  if (dops[i].opcode==0x3a) { // SWC2
    // fetch the GTE register value to store before touching memory
    cop2_get_dreg(copr,tl,-1);
    type=STOREW_STUB;
  }
  else
    type=LOADW_STUB;

  if(c&&!memtarget) {
    // constant address outside RAM: always take the slow path
    jaddr2=out;
    emit_jmp(0); // inline_readstub/inline_writestub?
  }
  else {
    if(!c) {
      jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
    }
    else if(ram_offset&&memtarget) {
      host_tempreg_acquire();
      emit_addimm(ar,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    if (dops[i].opcode==0x32) { // LWC2
      int a=ar;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_readword_indexed(0,a,tl);
    }
    if (dops[i].opcode==0x3a) { // SWC2
      #ifdef DESTRUCTIVE_SHIFT
      if(!offset&&!c&&s>=0) emit_mov(s,ar);
      #endif
      int a=ar;
      if(fastio_reg_override>=0) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
  }
  if(fastio_reg_override==HOST_TEMPREG)
    host_tempreg_release();
  if(jaddr2)
    add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
  // SWC2 can modify code: emit the invalid_code check unless disabled
  if(dops[i].opcode==0x3a) // SWC2
  if(!(i_regs->waswritten&(1<<dops[i].rs1)) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
#if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,ar,1);
#else
    emit_cmpmem_indexedsr12_imm(invalid_code,ar,1);
#endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[ar]);
    #else
    void *jaddr3 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
    #endif
  }
  if (dops[i].opcode==0x32) { // LWC2
    // apply GTE write side effects for the loaded value
    host_tempreg_acquire();
    cop2_put_dreg(copr,tl,HOST_TEMPREG);
    host_tempreg_release();
  }
}
3688
3689 static void cop2_assemble(int i, const struct regstat *i_regs)
3690 {
3691   u_int copr = (source[i]>>11) & 0x1f;
3692   signed char temp = get_reg(i_regs->regmap, -1);
3693
3694   if (!HACK_ENABLED(NDHACK_NO_STALLS)) {
3695     u_int reglist = reglist_exclude(get_host_reglist(i_regs->regmap), temp, -1);
3696     if (dops[i].opcode2 == 0 || dops[i].opcode2 == 2) { // MFC2/CFC2
3697       signed char tl = get_reg(i_regs->regmap, dops[i].rt1);
3698       reglist = reglist_exclude(reglist, tl, -1);
3699     }
3700     cop2_do_stall_check(0, i, i_regs, reglist);
3701   }
3702   if (dops[i].opcode2==0) { // MFC2
3703     signed char tl=get_reg(i_regs->regmap,dops[i].rt1);
3704     if(tl>=0&&dops[i].rt1!=0)
3705       cop2_get_dreg(copr,tl,temp);
3706   }
3707   else if (dops[i].opcode2==4) { // MTC2
3708     signed char sl=get_reg(i_regs->regmap,dops[i].rs1);
3709     cop2_put_dreg(copr,sl,temp);
3710   }
3711   else if (dops[i].opcode2==2) // CFC2
3712   {
3713     signed char tl=get_reg(i_regs->regmap,dops[i].rt1);
3714     if(tl>=0&&dops[i].rt1!=0)
3715       emit_readword(&reg_cop2c[copr],tl);
3716   }
3717   else if (dops[i].opcode2==6) // CTC2
3718   {
3719     signed char sl=get_reg(i_regs->regmap,dops[i].rs1);
3720     switch(copr) {
3721       case 4:
3722       case 12:
3723       case 20:
3724       case 26:
3725       case 27:
3726       case 29:
3727       case 30:
3728         emit_signextend16(sl,temp);
3729         break;
3730       case 31:
3731         c2op_ctc2_31_assemble(sl,temp);
3732         break;
3733       default:
3734         temp=sl;
3735         break;
3736     }
3737     emit_writeword(temp,&reg_cop2c[copr]);
3738     assert(sl>=0);
3739   }
3740 }
3741
// Out-of-line slow path for SWL/SWR when the address is outside RAM:
// save registers, pass address+value to the C handler
// (jump_handle_swl/swr) along with the cycle count, then restore and
// return to compiled code.
static void do_unalignedwritestub(int n)
{
  assem_debug("do_unalignedwritestub %x\n",start+stubs[n].a*4);
  literal_pool(256);
  set_jump_target(stubs[n].addr, out);

  int i=stubs[n].a;                    // instruction index
  struct regstat *i_regs=(struct regstat *)stubs[n].c;
  int addr=stubs[n].b;                 // host reg holding the address
  u_int reglist=stubs[n].e;
  signed char *i_regmap=i_regs->regmap;
  int temp2=get_reg(i_regmap,FTEMP);
  int rt;
  rt=get_reg(i_regmap,dops[i].rs2);    // host reg holding the value
  assert(rt>=0);
  assert(addr>=0);
  assert(dops[i].opcode==0x2a||dops[i].opcode==0x2e); // SWL/SWR only implemented
  reglist|=(1<<addr);
  reglist&=~(1<<temp2);

  // don't bother with it and call write handler
  save_regs(reglist);
  pass_args(addr,rt);
  int cc=get_reg(i_regmap,CCREG);
  if(cc<0)
    emit_loadreg(CCREG,2);
  // pass the cycle count in arg 2; the handler returns it adjusted
  emit_addimm(cc<0?2:cc,CLOCK_ADJUST((int)stubs[n].d+1),2);
  emit_far_call((dops[i].opcode==0x2a?jump_handle_swl:jump_handle_swr));
  emit_addimm(0,-CLOCK_ADJUST((int)stubs[n].d+1),cc<0?2:cc);
  if(cc<0)
    emit_storereg(CCREG,2);
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
}
3776
// Fallback when no architecture-specific multdiv_assemble is provided:
// abort, since MULT/DIV cannot be compiled on this target.
#ifndef multdiv_assemble
void multdiv_assemble(int i,struct regstat *i_regs)
{
  printf("Need multdiv_assemble for this architecture.\n");
  abort();
}
#endif
3784
// Emit code for register moves, which includes MFHI/MFLO and MTHI/MTLO.
// MFHI/MFLO also trigger a mul/div unit stall check.
static void mov_assemble(int i,struct regstat *i_regs)
{
  //if(dops[i].opcode2==0x10||dops[i].opcode2==0x12) { // MFHI/MFLO
  //if(dops[i].opcode2==0x11||dops[i].opcode2==0x13) { // MTHI/MTLO
  if(dops[i].rt1) {
    signed char sl,tl;
    tl=get_reg(i_regs->regmap,dops[i].rt1);
    //assert(tl>=0);
    if(tl>=0) {
      sl=get_reg(i_regs->regmap,dops[i].rs1);
      if(sl>=0) emit_mov(sl,tl);       // source is in a host register
      else emit_loadreg(dops[i].rs1,tl); // otherwise load from memory
    }
  }
  if (dops[i].rs1 == HIREG || dops[i].rs1 == LOREG) // MFHI/MFLO
    multdiv_do_stall(i, i_regs);
}
3802
// call interpreter, exception handler, things that change pc/regs/cycles ...
// Syncs psxRegs.pc and psxRegs.cycle with the emulated state, calls
// 'func', then jumps to jump_to_new_pc to resume at whatever PC the
// handler set.  Never returns to the following compiled code.
static void call_c_cpu_handler(int i, const struct regstat *i_regs, u_int pc, void *func)
{
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  (void)ccreg;

  emit_movimm(pc,3); // Get PC
  emit_readword(&last_count,2);
  emit_writeword(3,&psxRegs.pc);
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
  emit_add(2,HOST_CCREG,2);
  emit_writeword(2,&psxRegs.cycle);
  emit_far_call(func);
  emit_far_jump(jump_to_new_pc);
}
3820
// SYSCALL: raise the syscall exception via the C exception handler.
static void syscall_assemble(int i,struct regstat *i_regs)
{
  emit_movimm(0x20,0); // cause code
  emit_movimm(0,1);    // not in delay slot
  call_c_cpu_handler(i,i_regs,start+i*4,psxException);
}
3827
// HLE BIOS call: dispatch to the high-level emulation function encoded
// in the instruction's low 26 bits (psxNULL if out of range).
static void hlecall_assemble(int i,struct regstat *i_regs)
{
  void *hlefunc = psxNULL;
  uint32_t hleCode = source[i] & 0x03ffffff;
  if (hleCode < ARRAY_SIZE(psxHLEt))
    hlefunc = psxHLEt[hleCode];

  call_c_cpu_handler(i,i_regs,start+i*4+4,hlefunc);
}
3837
// Fall back to the interpreter (execI) for this instruction.
static void intcall_assemble(int i,struct regstat *i_regs)
{
  call_c_cpu_handler(i,i_regs,start+i*4,execI);
}
3842
3843 static void speculate_mov(int rs,int rt)
3844 {
3845   if(rt!=0) {
3846     smrv_strong_next|=1<<rt;
3847     smrv[rt]=smrv[rs];
3848   }
3849 }
3850
3851 static void speculate_mov_weak(int rs,int rt)
3852 {
3853   if(rt!=0) {
3854     smrv_weak_next|=1<<rt;
3855     smrv[rt]=smrv[rs];
3856   }
3857 }
3858
// Track speculated MIPS register values (smrv) through the block so
// memory access fast paths can be specialized.  "Strong" speculation
// comes from constants; "weak" is value propagation through ALU ops.
// Any op that produces an unpredictable value clears the speculation.
static void speculate_register_values(int i)
{
  if(i==0) {
    memcpy(smrv,psxRegs.GPR.r,sizeof(smrv));
    // gp,sp are likely to stay the same throughout the block
    smrv_strong_next=(1<<28)|(1<<29)|(1<<30);
    smrv_weak_next=~smrv_strong_next;
    //printf(" llr %08x\n", smrv[4]);
  }
  smrv_strong=smrv_strong_next;
  smrv_weak=smrv_weak_next;
  switch(dops[i].itype) {
    case ALU:
      // result speculates as one of the (speculated) inputs
      if     ((smrv_strong>>dops[i].rs1)&1) speculate_mov(dops[i].rs1,dops[i].rt1);
      else if((smrv_strong>>dops[i].rs2)&1) speculate_mov(dops[i].rs2,dops[i].rt1);
      else if((smrv_weak>>dops[i].rs1)&1) speculate_mov_weak(dops[i].rs1,dops[i].rt1);
      else if((smrv_weak>>dops[i].rs2)&1) speculate_mov_weak(dops[i].rs2,dops[i].rt1);
      else {
        smrv_strong_next&=~(1<<dops[i].rt1);
        smrv_weak_next&=~(1<<dops[i].rt1);
      }
      break;
    case SHIFTIMM:
      smrv_strong_next&=~(1<<dops[i].rt1);
      smrv_weak_next&=~(1<<dops[i].rt1);
      // fallthrough
    case IMM16:
      if(dops[i].rt1&&is_const(&regs[i],dops[i].rt1)) {
        // constant-propagated result: speculate it strongly
        int value,hr=get_reg(regs[i].regmap,dops[i].rt1);
        if(hr>=0) {
          if(get_final_value(hr,i,&value))
               smrv[dops[i].rt1]=value;
          else smrv[dops[i].rt1]=constmap[i][hr];
          smrv_strong_next|=1<<dops[i].rt1;
        }
      }
      else {
        if     ((smrv_strong>>dops[i].rs1)&1) speculate_mov(dops[i].rs1,dops[i].rt1);
        else if((smrv_weak>>dops[i].rs1)&1) speculate_mov_weak(dops[i].rs1,dops[i].rt1);
      }
      break;
    case LOAD:
      if(start<0x2000&&(dops[i].rt1==26||(smrv[dops[i].rt1]>>24)==0xa0)) {
        // special case for BIOS
        smrv[dops[i].rt1]=0xa0000000;
        smrv_strong_next|=1<<dops[i].rt1;
        break;
      }
      // fallthrough
    case SHIFT:
    case LOADLR:
    case MOV:
      // unpredictable result: clear speculation for the destination
      smrv_strong_next&=~(1<<dops[i].rt1);
      smrv_weak_next&=~(1<<dops[i].rt1);
      break;
    case COP0:
    case COP2:
      if(dops[i].opcode2==0||dops[i].opcode2==2) { // MFC/CFC
        smrv_strong_next&=~(1<<dops[i].rt1);
        smrv_weak_next&=~(1<<dops[i].rt1);
      }
      break;
    case C2LS:
      if (dops[i].opcode==0x32) { // LWC2
        smrv_strong_next&=~(1<<dops[i].rt1);
        smrv_weak_next&=~(1<<dops[i].rt1);
      }
      break;
  }
#if 0
  int r=4;
  printf("x %08x %08x %d %d c %08x %08x\n",smrv[r],start+i*4,
    ((smrv_strong>>r)&1),(smrv_weak>>r)&1,regs[i].isconst,regs[i].wasconst);
#endif
}
3934
// Assemble instruction i as a branch delay slot.  This is the same
// per-itype dispatch as the main assembly loop, but with is_delayslot set
// so the individual assemblers can special-case delay-slot behavior.
// A jump type here indicates a branch in a delay slot, which is reported
// as a probable bug rather than assembled.
static void ds_assemble(int i,struct regstat *i_regs)
{
  speculate_register_values(i);
  is_delayslot=1;
  switch(dops[i].itype) {
    case ALU:
      alu_assemble(i,i_regs);break;
    case IMM16:
      imm16_assemble(i,i_regs);break;
    case SHIFT:
      shift_assemble(i,i_regs);break;
    case SHIFTIMM:
      shiftimm_assemble(i,i_regs);break;
    case LOAD:
      load_assemble(i,i_regs);break;
    case LOADLR:
      loadlr_assemble(i,i_regs);break;
    case STORE:
      store_assemble(i,i_regs);break;
    case STORELR:
      storelr_assemble(i,i_regs);break;
    case COP0:
      cop0_assemble(i,i_regs);break;
    case COP1:
      cop1_assemble(i,i_regs);break;
    case C1LS:
      c1ls_assemble(i,i_regs);break;
    case COP2:
      cop2_assemble(i,i_regs);break;
    case C2LS:
      c2ls_assemble(i,i_regs);break;
    case C2OP:
      c2op_assemble(i,i_regs);break;
    case MULTDIV:
      multdiv_assemble(i,i_regs);
      multdiv_prepare_stall(i,i_regs);
      break;
    case MOV:
      mov_assemble(i,i_regs);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  is_delayslot=0;
}
3986
3987 // Is the branch target a valid internal jump?
3988 static int internal_branch(int addr)
3989 {
3990   if(addr&1) return 0; // Indirect (register) jump
3991   if(addr>=start && addr<start+slen*4-4)
3992   {
3993     return 1;
3994   }
3995   return 0;
3996 }
3997
3998 static void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t u)
3999 {
4000   int hr;
4001   for(hr=0;hr<HOST_REGS;hr++) {
4002     if(hr!=EXCLUDE_REG) {
4003       if(pre[hr]!=entry[hr]) {
4004         if(pre[hr]>=0) {
4005           if((dirty>>hr)&1) {
4006             if(get_reg(entry,pre[hr])<0) {
4007               assert(pre[hr]<64);
4008               if(!((u>>pre[hr])&1))
4009                 emit_storereg(pre[hr],hr);
4010             }
4011           }
4012         }
4013       }
4014     }
4015   }
4016   // Move from one register to another (no writeback)
4017   for(hr=0;hr<HOST_REGS;hr++) {
4018     if(hr!=EXCLUDE_REG) {
4019       if(pre[hr]!=entry[hr]) {
4020         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
4021           int nr;
4022           if((nr=get_reg(entry,pre[hr]))>=0) {
4023             emit_mov(hr,nr);
4024           }
4025         }
4026       }
4027     }
4028   }
4029 }
4030
4031 // Load the specified registers
4032 // This only loads the registers given as arguments because
4033 // we don't want to load things that will be overwritten
4034 static void load_regs(signed char entry[],signed char regmap[],int rs1,int rs2)
4035 {
4036   int hr;
4037   // Load 32-bit regs
4038   for(hr=0;hr<HOST_REGS;hr++) {
4039     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4040       if(entry[hr]!=regmap[hr]) {
4041         if(regmap[hr]==rs1||regmap[hr]==rs2)
4042         {
4043           if(regmap[hr]==0) {
4044             emit_zeroreg(hr);
4045           }
4046           else
4047           {
4048             emit_loadreg(regmap[hr],hr);
4049           }
4050         }
4051       }
4052     }
4053   }
4054 }
4055
4056 // Load registers prior to the start of a loop
4057 // so that they are not loaded within the loop
4058 static void loop_preload(signed char pre[],signed char entry[])
4059 {
4060   int hr;
4061   for(hr=0;hr<HOST_REGS;hr++) {
4062     if(hr!=EXCLUDE_REG) {
4063       if(pre[hr]!=entry[hr]) {
4064         if(entry[hr]>=0) {
4065           if(get_reg(pre,entry[hr])<0) {
4066             assem_debug("loop preload:\n");
4067             //printf("loop preload: %d\n",hr);
4068             if(entry[hr]==0) {
4069               emit_zeroreg(hr);
4070             }
4071             else if(entry[hr]<TEMPREG)
4072             {
4073               emit_loadreg(entry[hr],hr);
4074             }
4075             else if(entry[hr]-64<TEMPREG)
4076             {
4077               emit_loadreg(entry[hr],hr);
4078             }
4079           }
4080         }
4081       }
4082     }
4083   }
4084 }
4085
// Generate address for load/store instruction
// goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
void address_generation(int i,struct regstat *i_regs,signed char entry[])
{
  if(dops[i].itype==LOAD||dops[i].itype==LOADLR||dops[i].itype==STORE||dops[i].itype==STORELR||dops[i].itype==C1LS||dops[i].itype==C2LS) {
    int ra=-1;               // host reg that will receive the address
    int agr=AGEN1+(i&1);     // AGEN registers alternate between even/odd slots
    if(dops[i].itype==LOAD) {
      // Loads compute the address directly into the destination register
      ra=get_reg(i_regs->regmap,dops[i].rt1);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
      assert(ra>=0);
    }
    if(dops[i].itype==LOADLR) {
      ra=get_reg(i_regs->regmap,FTEMP);
    }
    if(dops[i].itype==STORE||dops[i].itype==STORELR) {
      ra=get_reg(i_regs->regmap,agr);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
    }
    if(dops[i].itype==C1LS||dops[i].itype==C2LS) {
      if ((dops[i].opcode&0x3b)==0x31||(dops[i].opcode&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
        ra=get_reg(i_regs->regmap,FTEMP);
      else { // SWC1/SDC1/SWC2/SDC2
        ra=get_reg(i_regs->regmap,agr);
        if(ra<0) ra=get_reg(i_regs->regmap,-1);
      }
    }
    int rs=get_reg(i_regs->regmap,dops[i].rs1);
    if(ra>=0) {
      int offset=imm[i];
      int c=(i_regs->wasconst>>rs)&1; // base register value is a known constant
      if(dops[i].rs1==0) {
        // Using r0 as a base address
        if(!entry||entry[ra]!=agr) {
          // LWL/LWR and LDL/LDR access the aligned word/dword, so mask
          // the low address bits accordingly
          if (dops[i].opcode==0x22||dops[i].opcode==0x26) {
            emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
          }else if (dops[i].opcode==0x1a||dops[i].opcode==0x1b) {
            emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
          }else{
            emit_movimm(offset,ra);
          }
        } // else did it in the previous cycle
      }
      else if(rs<0) {
        // Base register is not mapped: load it from memory
        if(!entry||entry[ra]!=dops[i].rs1)
          emit_loadreg(dops[i].rs1,ra);
        //if(!entry||entry[ra]!=dops[i].rs1)
        //  printf("poor load scheduling!\n");
      }
      else if(c) {
        // Constant base: compute the full address as an immediate
        if(dops[i].rs1!=dops[i].rt1||dops[i].itype!=LOAD) {
          if(!entry||entry[ra]!=agr) {
            if (dops[i].opcode==0x22||dops[i].opcode==0x26) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
            }else if (dops[i].opcode==0x1a||dops[i].opcode==0x1b) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
            }else{
              emit_movimm(constmap[i][rs]+offset,ra);
              regs[i].loadedconst|=1<<ra;
            }
          } // else did it in the previous cycle
        } // else load_consts already did it
      }
      // Non-constant base with a nonzero offset: add it at runtime
      if(offset&&!c&&dops[i].rs1) {
        if(rs>=0) {
          emit_addimm(rs,offset,ra);
        }else{
          emit_addimm(ra,offset,ra);
        }
      }
    }
  }
  // Preload constants for next instruction
  if(dops[i+1].itype==LOAD||dops[i+1].itype==LOADLR||dops[i+1].itype==STORE||dops[i+1].itype==STORELR||dops[i+1].itype==C1LS||dops[i+1].itype==C2LS) {
    int agr,ra;
    // Actual address
    agr=AGEN1+((i+1)&1);
    ra=get_reg(i_regs->regmap,agr);
    if(ra>=0) {
      int rs=get_reg(regs[i+1].regmap,dops[i+1].rs1);
      int offset=imm[i+1];
      int c=(regs[i+1].wasconst>>rs)&1;
      if(c&&(dops[i+1].rs1!=dops[i+1].rt1||dops[i+1].itype!=LOAD)) {
        if (dops[i+1].opcode==0x22||dops[i+1].opcode==0x26) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
        }else if (dops[i+1].opcode==0x1a||dops[i+1].opcode==0x1b) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(constmap[i+1][rs]+offset,ra);
          regs[i+1].loadedconst|=1<<ra;
        }
      }
      else if(dops[i+1].rs1==0) {
        // Using r0 as a base address
        if (dops[i+1].opcode==0x22||dops[i+1].opcode==0x26) {
          emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
        }else if (dops[i+1].opcode==0x1a||dops[i+1].opcode==0x1b) {
          emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(offset,ra);
        }
      }
    }
  }
}
4191
// Determine the final constant value that host register hr should be
// loaded with at instruction i, looking ahead while the same guest
// register stays constant in the same host register.  Writes the value
// to *value and returns 1 if it should be loaded, 0 if the constant
// turns out to be unneeded past this point.
static int get_final_value(int hr, int i, int *value)
{
  int reg=regs[i].regmap[hr];
  // Scan forward while the constant stays live in this host register
  while(i<slen-1) {
    if(regs[i+1].regmap[hr]!=reg) break;
    if(!((regs[i+1].isconst>>hr)&1)) break;
    if(dops[i+1].bt) break; // stop at branch targets
    i++;
  }
  if(i<slen-1) {
    if (dops[i].is_jump) {
      *value=constmap[i][hr];
      return 1;
    }
    if(!dops[i+1].bt) {
      if (dops[i+1].is_jump) {
        // Load in delay slot, out-of-order execution
        if(dops[i+2].itype==LOAD&&dops[i+2].rs1==reg&&dops[i+2].rt1==reg&&((regs[i+1].wasconst>>hr)&1))
        {
          // Precompute load address
          *value=constmap[i][hr]+imm[i+2];
          return 1;
        }
      }
      if(dops[i+1].itype==LOAD&&dops[i+1].rs1==reg&&dops[i+1].rt1==reg)
      {
        // Precompute load address
        *value=constmap[i][hr]+imm[i+1];
        //printf("c=%x imm=%lx\n",(long)constmap[i][hr],imm[i+1]);
        return 1;
      }
    }
  }
  *value=constmap[i][hr];
  //printf("c=%lx\n",(long)constmap[i][hr]);
  if(i==slen-1) return 1;
  assert(reg < 64);
  // Only worth loading if the register is still needed afterwards
  return !((unneeded_reg[i+1]>>reg)&1);
}
4231
// Load registers with known constants
// Emits immediate loads for host registers holding known-constant guest
// registers at instruction i, skipping ones already loaded earlier
// (tracked via the per-instruction loadedconst bitmask).
static void load_consts(signed char pre[],signed char regmap[],int i)
{
  int hr,hr2;
  // propagate loaded constant flags
  if(i==0||dops[i].bt)
    regs[i].loadedconst=0; // block entry / branch target: nothing loaded yet
  else {
    for(hr=0;hr<HOST_REGS;hr++) {
      // Carry the flag over when the same guest reg stayed constant in the
      // same host register across the previous instruction
      if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
         &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
      {
        regs[i].loadedconst|=1<<hr;
      }
    }
  }
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(!((regs[i].loadedconst>>hr)&1)) {
        assert(regmap[hr]<64);
        if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
          int value,similar=0;
          if(get_final_value(hr,i,&value)) {
            // see if some other register has similar value
            for(hr2=0;hr2<HOST_REGS;hr2++) {
              if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
                if(is_similar_value(value,constmap[i][hr2])) {
                  similar=1;
                  break;
                }
              }
            }
            if(similar) {
              // Derive the value from the similar register (cheaper than a
              // full immediate load on some hosts)
              int value2;
              if(get_final_value(hr2,i,&value2)) // is this needed?
                emit_movimm_from(value2,hr2,value,hr);
              else
                emit_movimm(value,hr);
            }
            else if(value==0) {
              emit_zeroreg(hr);
            }
            else {
              emit_movimm(value,hr);
            }
          }
          regs[i].loadedconst|=1<<hr;
        }
      }
    }
  }
}
4286
4287 void load_all_consts(signed char regmap[], u_int dirty, int i)
4288 {
4289   int hr;
4290   // Load 32-bit regs
4291   for(hr=0;hr<HOST_REGS;hr++) {
4292     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4293       assert(regmap[hr] < 64);
4294       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
4295         int value=constmap[i][hr];
4296         if(value==0) {
4297           emit_zeroreg(hr);
4298         }
4299         else {
4300           emit_movimm(value,hr);
4301         }
4302       }
4303     }
4304   }
4305 }
4306
4307 // Write out all dirty registers (except cycle count)
4308 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty)
4309 {
4310   int hr;
4311   for(hr=0;hr<HOST_REGS;hr++) {
4312     if(hr!=EXCLUDE_REG) {
4313       if(i_regmap[hr]>0) {
4314         if(i_regmap[hr]!=CCREG) {
4315           if((i_dirty>>hr)&1) {
4316             assert(i_regmap[hr]<64);
4317             emit_storereg(i_regmap[hr],hr);
4318           }
4319         }
4320       }
4321     }
4322   }
4323 }
4324
4325 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4326 // This writes the registers not written by store_regs_bt
4327 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr)
4328 {
4329   int hr;
4330   int t=(addr-start)>>2;
4331   for(hr=0;hr<HOST_REGS;hr++) {
4332     if(hr!=EXCLUDE_REG) {
4333       if(i_regmap[hr]>0) {
4334         if(i_regmap[hr]!=CCREG) {
4335           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1)) {
4336             if((i_dirty>>hr)&1) {
4337               assert(i_regmap[hr]<64);
4338               emit_storereg(i_regmap[hr],hr);
4339             }
4340           }
4341         }
4342       }
4343     }
4344   }
4345 }
4346
4347 // Load all registers (except cycle count)
4348 void load_all_regs(signed char i_regmap[])
4349 {
4350   int hr;
4351   for(hr=0;hr<HOST_REGS;hr++) {
4352     if(hr!=EXCLUDE_REG) {
4353       if(i_regmap[hr]==0) {
4354         emit_zeroreg(hr);
4355       }
4356       else
4357       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4358       {
4359         emit_loadreg(i_regmap[hr],hr);
4360       }
4361     }
4362   }
4363 }
4364
4365 // Load all current registers also needed by next instruction
4366 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4367 {
4368   int hr;
4369   for(hr=0;hr<HOST_REGS;hr++) {
4370     if(hr!=EXCLUDE_REG) {
4371       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4372         if(i_regmap[hr]==0) {
4373           emit_zeroreg(hr);
4374         }
4375         else
4376         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4377         {
4378           emit_loadreg(i_regmap[hr],hr);
4379         }
4380       }
4381     }
4382   }
4383 }
4384
4385 // Load all regs, storing cycle count if necessary
4386 void load_regs_entry(int t)
4387 {
4388   int hr;
4389   if(dops[t].is_ds) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4390   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4391   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4392     emit_storereg(CCREG,HOST_CCREG);
4393   }
4394   // Load 32-bit regs
4395   for(hr=0;hr<HOST_REGS;hr++) {
4396     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4397       if(regs[t].regmap_entry[hr]==0) {
4398         emit_zeroreg(hr);
4399       }
4400       else if(regs[t].regmap_entry[hr]!=CCREG)
4401       {
4402         emit_loadreg(regs[t].regmap_entry[hr],hr);
4403       }
4404     }
4405   }
4406 }
4407
4408 // Store dirty registers prior to branch
4409 void store_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4410 {
4411   if(internal_branch(addr))
4412   {
4413     int t=(addr-start)>>2;
4414     int hr;
4415     for(hr=0;hr<HOST_REGS;hr++) {
4416       if(hr!=EXCLUDE_REG) {
4417         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4418           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1)) {
4419             if((i_dirty>>hr)&1) {
4420               assert(i_regmap[hr]<64);
4421               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4422                 emit_storereg(i_regmap[hr],hr);
4423             }
4424           }
4425         }
4426       }
4427     }
4428   }
4429   else
4430   {
4431     // Branch out of this block, write out all dirty regs
4432     wb_dirtys(i_regmap,i_dirty);
4433   }
4434 }
4435
4436 // Load all needed registers for branch target
4437 static void load_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4438 {
4439   //if(addr>=start && addr<(start+slen*4))
4440   if(internal_branch(addr))
4441   {
4442     int t=(addr-start)>>2;
4443     int hr;
4444     // Store the cycle count before loading something else
4445     if(i_regmap[HOST_CCREG]!=CCREG) {
4446       assert(i_regmap[HOST_CCREG]==-1);
4447     }
4448     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4449       emit_storereg(CCREG,HOST_CCREG);
4450     }
4451     // Load 32-bit regs
4452     for(hr=0;hr<HOST_REGS;hr++) {
4453       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4454         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4455           if(regs[t].regmap_entry[hr]==0) {
4456             emit_zeroreg(hr);
4457           }
4458           else if(regs[t].regmap_entry[hr]!=CCREG)
4459           {
4460             emit_loadreg(regs[t].regmap_entry[hr],hr);
4461           }
4462         }
4463       }
4464     }
4465   }
4466 }
4467
// Check whether the current register state (map + dirty bits) is
// compatible with the branch target at addr, i.e. whether we can jump
// there directly without any register shuffling or writeback.
// Returns 1 when compatible, 0 otherwise.
static int match_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
{
  if(addr>=start && addr<start+slen*4-4)
  {
    // Internal target: compare against its recorded entry state
    int t=(addr-start)>>2;
    int hr;
    if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]!=regs[t].regmap_entry[hr])
        {
          // NOTE(review): the |64 comparison appears to test
          // (regmap_entry & 63) < TEMPREG, i.e. "target expects a real
          // guest reg here" — TODO confirm against get_reg conventions
          if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
          {
            return 0;
          }
          else
          if((i_dirty>>hr)&1)
          {
            if(i_regmap[hr]<TEMPREG)
            {
              // Dirty value the target doesn't expect: only acceptable
              // if the guest reg is unneeded at the target
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
                return 0;
            }
            else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
            {
              assert(0);
            }
          }
        }
        else // Same register but is it 32-bit or dirty?
        if(i_regmap[hr]>=0)
        {
          if(!((regs[t].dirty>>hr)&1))
          {
            if((i_dirty>>hr)&1)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
              {
                //printf("%x: dirty no match\n",addr);
                return 0;
              }
            }
          }
        }
      }
    }
    // Delay slots are not valid branch targets
    //if(t>0&&(dops[t-1].is_jump) return 0;
    // Delay slots require additional processing, so do not match
    if(dops[t].is_ds) return 0;
  }
  else
  {
    // External target: all dirty registers (except CC) must be clean,
    // since the target will reload everything from memory
    int hr;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]>=0)
        {
          if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
          {
            if((i_dirty>>hr)&1)
            {
              return 0;
            }
          }
        }
      }
    }
  }
  return 1;
}
4543
#ifdef DRC_DBG
// Debug-only: emit a call to do_insn_cmp before instruction i so the
// dynarec's state can be compared against the interpreter's.
// Compiled to a no-op macro when DRC_DBG is not defined.
static void drc_dbg_emit_do_cmp(int i)
{
  extern void do_insn_cmp();
  //extern int cycle;
  u_int hr, reglist = get_host_reglist(regs[i].regmap);

  assem_debug("//do_insn_cmp %08x\n", start+i*4);
  save_regs(reglist);
  // write out changed consts to match the interpreter
  if (i > 0 && !dops[i].bt) {
    for (hr = 0; hr < HOST_REGS; hr++) {
      int reg = regs[i-1].regmap[hr];
      if (hr == EXCLUDE_REG || reg < 0)
        continue;
      if (!((regs[i-1].isconst >> hr) & 1))
        continue;
      // skip consts unchanged since the instruction before
      if (i > 1 && reg == regs[i-2].regmap[hr] && constmap[i-1][hr] == constmap[i-2][hr])
        continue;
      emit_movimm(constmap[i-1][hr],0);
      emit_storereg(reg, 0);
    }
  }
  // record the PC of the instruction being compared
  emit_movimm(start+i*4,0);
  emit_writeword(0,&pcaddr);
  emit_far_call(do_insn_cmp);
  //emit_readword(&cycle,0);
  //emit_addimm(0,2,0);
  //emit_writeword(0,&cycle);
  (void)get_reg2;
  restore_regs(reglist);
  assem_debug("\\\\do_insn_cmp\n");
}
#else
#define drc_dbg_emit_do_cmp(x)
#endif
4580
// Used when a branch jumps into the delay slot of another branch
// Assembles the delay-slot instruction at target t=(ba[i]-start)>>2 as a
// standalone block entry, then branches to the instruction after it.
static void ds_assemble_entry(int i)
{
  int t=(ba[i]-start)>>2;
  if (!instr_addr[t])
    instr_addr[t] = out;
  assem_debug("Assemble delay slot at %x\n",ba[i]);
  assem_debug("<->\n");
  drc_dbg_emit_do_cmp(t);
  // Spill CC if the entry map expects it in memory rather than its host reg
  if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty);
  load_regs(regs[t].regmap_entry,regs[t].regmap,dops[t].rs1,dops[t].rs2);
  address_generation(t,&regs[t],regs[t].regmap_entry);
  // Stores also need the invalid-code pointer loaded (0x39/0x3a: SWC1/SWC2)
  if(dops[t].itype==STORE||dops[t].itype==STORELR||(dops[t].opcode&0x3b)==0x39||(dops[t].opcode&0x3b)==0x3a)
    load_regs(regs[t].regmap_entry,regs[t].regmap,INVCP,INVCP);
  is_delayslot=0;
  switch(dops[t].itype) {
    case ALU:
      alu_assemble(t,&regs[t]);break;
    case IMM16:
      imm16_assemble(t,&regs[t]);break;
    case SHIFT:
      shift_assemble(t,&regs[t]);break;
    case SHIFTIMM:
      shiftimm_assemble(t,&regs[t]);break;
    case LOAD:
      load_assemble(t,&regs[t]);break;
    case LOADLR:
      loadlr_assemble(t,&regs[t]);break;
    case STORE:
      store_assemble(t,&regs[t]);break;
    case STORELR:
      storelr_assemble(t,&regs[t]);break;
    case COP0:
      cop0_assemble(t,&regs[t]);break;
    case COP1:
      cop1_assemble(t,&regs[t]);break;
    case C1LS:
      c1ls_assemble(t,&regs[t]);break;
    case COP2:
      cop2_assemble(t,&regs[t]);break;
    case C2LS:
      c2ls_assemble(t,&regs[t]);break;
    case C2OP:
      c2op_assemble(t,&regs[t]);break;
    case MULTDIV:
      multdiv_assemble(t,&regs[t]);
      // NOTE(review): passes the branch index i, not the slot index t,
      // unlike every other call in this switch — confirm this is intended
      // (the stall bookkeeping may key off the branch instruction)
      multdiv_prepare_stall(i,&regs[t]);
      break;
    case MOV:
      mov_assemble(t,&regs[t]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Continue at the instruction following the delay slot
  store_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
  load_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
  if(internal_branch(ba[i]+4))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  assert(internal_branch(ba[i]+4));
  add_to_linker(out,ba[i]+4,internal_branch(ba[i]+4));
  emit_jmp(0);
}
4652
// Patch an external jump at addr to go through the normal dynamic linker
static void emit_extjump(void *addr, u_int target)
{
  emit_extjump2(addr, target, dyna_linker);
}
4657
// Same as emit_extjump, but uses the delay-slot variant of the linker
static void emit_extjump_ds(void *addr, u_int target)
{
  emit_extjump2(addr, target, dyna_linker_ds);
}
4662
// Load 2 immediates optimizing for small code size
// Loads imm1 into rt1 normally; emit_movimm_from then loads imm2 into rt2,
// presumably deriving it from (imm1, rt1) when cheaper — TODO confirm
// against the host-specific implementation.
static void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2)
{
  emit_movimm(imm1,rt1);
  emit_movimm_from(imm1,rt1,imm2,rt2);
}
4669
// Emit the cycle-count check for branch instruction i, creating a CC_STUB
// that handles interrupts when the counter expires.  *adj receives the
// cycle adjustment recorded for the branch target (0 for external or
// register-indirect targets, -1 when branching into a delay slot).
void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
{
  int count;
  void *jaddr;
  void *idle=NULL;
  int t=0;
  if(dops[i].itype==RJUMP)
  {
    *adj=0;
  }
  //if(ba[i]>=start && ba[i]<(start+slen*4))
  if(internal_branch(ba[i]))
  {
    t=(ba[i]-start)>>2;
    if(dops[t].is_ds) *adj=-1; // Branch into delay slot adds an extra cycle
    else *adj=ccadj[t];
  }
  else
  {
    *adj=0;
  }
  count=ccadj[i];
  // A branch to itself with a nop delay slot is an idle loop
  if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
    // Idle loop
    if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
    idle=out;
    //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
    emit_andimm(HOST_CCREG,3,HOST_CCREG);
    jaddr=out;
    emit_jmp(0);
  }
  else if(*adj==0||invert) {
    // Add the cycles up front and branch to the stub on overflow
    int cycles=CLOCK_ADJUST(count+2);
    // faster loop HACK
#if 0
    if (t&&*adj) {
      int rel=t-i;
      if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
        cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
    }
#endif
    emit_addimm_and_set_flags(cycles,HOST_CCREG);
    jaddr=out;
    emit_jns(0);
  }
  else
  {
    // Compare only; the addition is deferred to the target's adjustment
    emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
    jaddr=out;
    emit_jns(0);
  }
  add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
}
4723
4724 static void do_ccstub(int n)
4725 {
4726   literal_pool(256);
4727   assem_debug("do_ccstub %x\n",start+(u_int)stubs[n].b*4);
4728   set_jump_target(stubs[n].addr, out);
4729   int i=stubs[n].b;
4730   if(stubs[n].d==NULLDS) {
4731     // Delay slot instruction is nullified ("likely" branch)
4732     wb_dirtys(regs[i].regmap,regs[i].dirty);
4733   }
4734   else if(stubs[n].d!=TAKEN) {
4735     wb_dirtys(branch_regs[i].regmap,branch_regs[i].dirty);
4736   }
4737   else {
4738     if(internal_branch(ba[i]))
4739       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4740   }
4741   if(stubs[n].c!=-1)
4742   {
4743     // Save PC as return address
4744     emit_movimm(stubs[n].c,EAX);
4745     emit_writeword(EAX,&pcaddr);
4746   }
4747   else
4748   {
4749     // Return address depends on which way the branch goes
4750     if(dops[i].itype==CJUMP||dops[i].itype==SJUMP)
4751     {
4752       int s1l=get_reg(branch_regs[i].regmap,dops[i].rs1);
4753       int s2l=get_reg(branch_regs[i].regmap,dops[i].rs2);
4754       if(dops[i].rs1==0)
4755       {
4756         s1l=s2l;
4757         s2l=-1;
4758       }
4759       else if(dops[i].rs2==0)
4760       {
4761         s2l=-1;
4762       }
4763       assert(s1l>=0);
4764       #ifdef DESTRUCTIVE_WRITEBACK
4765       if(dops[i].rs1) {
4766         if((branch_regs[i].dirty>>s1l)&&1)
4767           emit_loadreg(dops[i].rs1,s1l);
4768       }
4769       else {
4770         if((branch_regs[i].dirty>>s1l)&1)
4771           emit_loadreg(dops[i].rs2,s1l);
4772       }
4773       if(s2l>=0)
4774         if((branch_regs[i].dirty>>s2l)&1)
4775           emit_loadreg(dops[i].rs2,s2l);
4776       #endif
4777       int hr=0;
4778       int addr=-1,alt=-1,ntaddr=-1;
4779       while(hr<HOST_REGS)
4780       {
4781         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4782            (branch_regs[i].regmap[hr]&63)!=dops[i].rs1 &&
4783            (branch_regs[i].regmap[hr]&63)!=dops[i].rs2 )
4784         {
4785           addr=hr++;break;
4786         }
4787         hr++;
4788       }
4789       while(hr<HOST_REGS)
4790       {
4791         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4792            (branch_regs[i].regmap[hr]&63)!=dops[i].rs1 &&
4793            (branch_regs[i].regmap[hr]&63)!=dops[i].rs2 )
4794         {
4795           alt=hr++;break;
4796         }
4797         hr++;
4798       }
4799       if((dops[i].opcode&0x2E)==6) // BLEZ/BGTZ needs another register
4800       {
4801         while(hr<HOST_REGS)
4802         {
4803           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4804              (branch_regs[i].regmap[hr]&63)!=dops[i].rs1 &&
4805              (branch_regs[i].regmap[hr]&63)!=dops[i].rs2 )
4806           {
4807             ntaddr=hr;break;
4808           }
4809           hr++;
4810         }
4811         assert(hr<HOST_REGS);
4812       }
4813       if((dops[i].opcode&0x2f)==4) // BEQ
4814       {
4815         #ifdef HAVE_CMOV_IMM
4816         if(s2l>=0) emit_cmp(s1l,s2l);
4817         else emit_test(s1l,s1l);
4818         emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4819         #else
4820         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4821         if(s2l>=0) emit_cmp(s1l,s2l);
4822         else emit_test(s1l,s1l);
4823         emit_cmovne_reg(alt,addr);
4824         #endif
4825       }
4826       if((dops[i].opcode&0x2f)==5) // BNE
4827       {
4828         #ifdef HAVE_CMOV_IMM
4829         if(s2l>=0) emit_cmp(s1l,s2l);
4830         else emit_test(s1l,s1l);
4831         emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4832         #else
4833         emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4834         if(s2l>=0) emit_cmp(s1l,s2l);
4835         else emit_test(s1l,s1l);
4836         emit_cmovne_reg(alt,addr);
4837         #endif
4838       }
4839       if((dops[i].opcode&0x2f)==6) // BLEZ
4840       {
4841         //emit_movimm(ba[i],alt);
4842         //emit_movimm(start+i*4+8,addr);
4843         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4844         emit_cmpimm(s1l,1);
4845         emit_cmovl_reg(alt,addr);
4846       }
4847       if((dops[i].opcode&0x2f)==7) // BGTZ
4848       {
4849         //emit_movimm(ba[i],addr);
4850         //emit_movimm(start+i*4+8,ntaddr);
4851         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4852         emit_cmpimm(s1l,1);
4853         emit_cmovl_reg(ntaddr,addr);
4854       }
4855       if((dops[i].opcode==1)&&(dops[i].opcode2&0x2D)==0) // BLTZ
4856       {
4857         //emit_movimm(ba[i],alt);
4858         //emit_movimm(start+i*4+8,addr);
4859         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4860         emit_test(s1l,s1l);
4861         emit_cmovs_reg(alt,addr);
4862       }
4863       if((dops[i].opcode==1)&&(dops[i].opcode2&0x2D)==1) // BGEZ
4864       {
4865         //emit_movimm(ba[i],addr);
4866         //emit_movimm(start+i*4+8,alt);
4867         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4868         emit_test(s1l,s1l);
4869         emit_cmovs_reg(alt,addr);
4870       }
4871       if(dops[i].opcode==0x11 && dops[i].opcode2==0x08 ) {
4872         if(source[i]&0x10000) // BC1T
4873         {
4874           //emit_movimm(ba[i],alt);
4875           //emit_movimm(start+i*4+8,addr);
4876           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4877           emit_testimm(s1l,0x800000);
4878           emit_cmovne_reg(alt,addr);
4879         }
4880         else // BC1F
4881         {
4882           //emit_movimm(ba[i],addr);
4883           //emit_movimm(start+i*4+8,alt);
4884           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4885           emit_testimm(s1l,0x800000);
4886           emit_cmovne_reg(alt,addr);
4887         }
4888       }
4889       emit_writeword(addr,&pcaddr);
4890     }
4891     else
4892     if(dops[i].itype==RJUMP)
4893     {
4894       int r=get_reg(branch_regs[i].regmap,dops[i].rs1);
4895       if (ds_writes_rjump_rs(i)) {
4896         r=get_reg(branch_regs[i].regmap,RTEMP);
4897       }
4898       emit_writeword(r,&pcaddr);
4899     }
4900     else {SysPrintf("Unknown branch type in do_ccstub\n");abort();}
4901   }
4902   // Update cycle count
4903   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4904   if(stubs[n].a) emit_addimm(HOST_CCREG,CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4905   emit_far_call(cc_interrupt);
4906   if(stubs[n].a) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4907   if(stubs[n].d==TAKEN) {
4908     if(internal_branch(ba[i]))
4909       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4910     else if(dops[i].itype==RJUMP) {
4911       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4912         emit_readword(&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4913       else
4914         emit_loadreg(dops[i].rs1,get_reg(branch_regs[i].regmap,dops[i].rs1));
4915     }
4916   }else if(stubs[n].d==NOTTAKEN) {
4917     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4918     else load_all_regs(branch_regs[i].regmap);
4919   }else if(stubs[n].d==NULLDS) {
4920     // Delay slot instruction is nullified ("likely" branch)
4921     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4922     else load_all_regs(regs[i].regmap);
4923   }else{
4924     load_all_regs(branch_regs[i].regmap);
4925   }
4926   if (stubs[n].retaddr)
4927     emit_jmp(stubs[n].retaddr);
4928   else
4929     do_jump_vaddr(stubs[n].e);
4930 }
4931
4932 static void add_to_linker(void *addr, u_int target, int ext)
4933 {
4934   assert(linkcount < ARRAY_SIZE(link_addr));
4935   link_addr[linkcount].addr = addr;
4936   link_addr[linkcount].target = target;
4937   link_addr[linkcount].ext = ext;
4938   linkcount++;
4939 }
4940
// Emit the return-address (r31) write for a JAL at slot i: loads the guest
// address of the instruction after the delay slot (start+i*4+8) into the
// host register holding $ra, if one is allocated.
static void ujump_assemble_write_ra(int i)
{
  int rt;
  unsigned int return_address;
  rt=get_reg(branch_regs[i].regmap,31);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  //assert(rt>=0);
  return_address=start+i*4+8;
  if(rt>=0) {  // only if $ra is mapped to a host register
    #ifdef USE_MINI_HT
    // For internal returns, insert into the mini hash table instead of a
    // plain immediate load (but not if the delay slot clobbers r31).
    if(internal_branch(return_address)&&dops[i+1].rt1!=31) {
      int temp=-1; // note: must be ds-safe
      #ifdef HOST_TEMPREG
      temp=HOST_TEMPREG;
      #endif
      if(temp>=0) do_miniht_insert(return_address,rt,temp);
      else emit_movimm(return_address,rt);
    }
    else
    #endif
    {
      #ifdef REG_PREFETCH
      // NOTE(review): 'temp' and 'i_regmap' are not declared in this
      // function; this block appears to compile only because REG_PREFETCH
      // is normally undefined — TODO confirm before enabling it.
      if(temp>=0)
      {
        if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
      }
      #endif
      emit_movimm(return_address,rt); // PC into link register
      #ifdef IMM_PREFETCH
      emit_prefetch(hash_table_get(return_address));
      #endif
    }
  }
}
4975
// Assemble an unconditional jump (J/JAL) at slot i: emit the delay slot,
// write back $ra if needed, account for cycles, then either fall into an
// internal delay-slot entry or emit a jump to be patched by the linker.
static void ujump_assemble(int i,struct regstat *i_regs)
{
  int ra_done=0;
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  int temp=get_reg(branch_regs[i].regmap,PTEMP);
  if(dops[i].rt1==31&&temp>=0)
  {
    signed char *i_regmap=i_regs->regmap;
    int return_address=start+i*4+8;
    if(get_reg(branch_regs[i].regmap,31)>0)
    if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  // If the delay slot reads $ra, the link register must be written before
  // the delay slot executes.
  if(dops[i].rt1==31&&(dops[i].rt1==dops[i+1].rs1||dops[i].rt1==dops[i+1].rs2)) {
    ujump_assemble_write_ra(i); // writeback ra for DS
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  // Registers dead after the branch (plus r0 and the link target) need no
  // writeback; invalidate them instead.
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<dops[i].rt1);
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
  if(!ra_done&&dops[i].rt1==31)
    ujump_assemble_write_ra(i);
  int cc,adj;
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  #ifdef REG_PREFETCH
  if(dops[i].rt1==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  // Cycle-count check/adjustment for the taken path.
  do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
  if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
  load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  if(internal_branch(ba[i]))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  // A branch into another branch's delay slot needs a special entry stub.
  if (internal_branch(ba[i]) && dops[(ba[i]-start)>>2].is_ds) {
    ds_assemble_entry(i);
  }
  else {
    add_to_linker(out,ba[i],internal_branch(ba[i]));
    emit_jmp(0);
  }
}
5024
// Emit the link-register write for a JALR at slot i: loads the guest
// address after the delay slot (start+i*4+8) into the host register
// allocated for the instruction's destination (rt1).
static void rjump_assemble_write_ra(int i)
{
  int rt,return_address;
  // The delay slot must not clobber the link destination.
  assert(dops[i+1].rt1!=dops[i].rt1);
  assert(dops[i+1].rt2!=dops[i].rt1);
  rt=get_reg(branch_regs[i].regmap,dops[i].rt1);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  assert(rt>=0);
  return_address=start+i*4+8;
  #ifdef REG_PREFETCH
  // NOTE(review): 'temp' and 'i_regmap' are not declared in this function;
  // this block appears to compile only because REG_PREFETCH is normally
  // undefined — TODO confirm before enabling it.
  if(temp>=0)
  {
    if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  emit_movimm(return_address,rt); // PC into link register
  #ifdef IMM_PREFETCH
  emit_prefetch(hash_table_get(return_address));
  #endif
}
5045
// Assemble a register-indirect jump (JR/JALR) at slot i: emit the delay
// slot, write back the link register if needed, charge cycles, then
// dispatch through do_jump_vaddr (or the mini hash table for 'jr $ra').
static void rjump_assemble(int i,struct regstat *i_regs)
{
  int temp;
  int rs,cc;
  int ra_done=0;
  rs=get_reg(branch_regs[i].regmap,dops[i].rs1);
  assert(rs>=0);
  if (ds_writes_rjump_rs(i)) {
    // Delay slot abuse, make a copy of the branch address register
    temp=get_reg(branch_regs[i].regmap,RTEMP);
    assert(temp>=0);
    assert(regs[i].regmap[temp]==RTEMP);
    emit_mov(rs,temp);
    rs=temp;
  }
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  if(dops[i].rt1==31)
  {
    if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
      signed char *i_regmap=i_regs->regmap;
      int return_address=start+i*4+8;
      if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
    }
  }
  #endif
  #ifdef USE_MINI_HT
  if(dops[i].rs1==31) {
    int rh=get_reg(regs[i].regmap,RHASH);
    if(rh>=0) do_preload_rhash(rh);
  }
  #endif
  // If the delay slot reads the link destination, write it before the DS.
  if(dops[i].rt1!=0&&(dops[i].rt1==dops[i+1].rs1||dops[i].rt1==dops[i+1].rs2)) {
    rjump_assemble_write_ra(i);
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  // Writeback is unneeded for registers dead after the jump, but the jump
  // target register itself must stay live.
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<dops[i].rt1);
  bc_unneeded&=~(1LL<<dops[i].rs1);
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i].rs1,CCREG);
  if(!ra_done&&dops[i].rt1!=0)
    rjump_assemble_write_ra(i);
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  (void)cc;
  #ifdef USE_MINI_HT
  int rh=get_reg(branch_regs[i].regmap,RHASH);
  int ht=get_reg(branch_regs[i].regmap,RHTBL);
  if(dops[i].rs1==31) {
    if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
    do_preload_rhtbl(ht);
    do_rhash(rs,rh);
  }
  #endif
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
  #ifdef DESTRUCTIVE_WRITEBACK
  if((branch_regs[i].dirty>>rs)&1) {
    if(dops[i].rs1!=dops[i+1].rt1&&dops[i].rs1!=dops[i+1].rt2) {
      emit_loadreg(dops[i].rs1,rs);
    }
  }
  #endif
  #ifdef REG_PREFETCH
  if(dops[i].rt1==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  #ifdef USE_MINI_HT
  if(dops[i].rs1==31) {
    do_miniht_load(ht,rh);
  }
  #endif
  //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
  //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
  //assert(adj==0);
  // Charge the cycles and test for a pending interrupt via the sign flag.
  emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  add_stub(CC_STUB,out,NULL,0,i,-1,TAKEN,rs);
  if(dops[i+1].itype==COP0&&(source[i+1]&0x3f)==0x10)
    // special case for RFE
    emit_jmp(0);
  else
    emit_jns(0);
  //load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
  #ifdef USE_MINI_HT
  if(dops[i].rs1==31) {
    do_miniht_jump(rs,rh,ht);
  }
  else
  #endif
  {
    do_jump_vaddr(rs);
  }
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(dops[i].rt1!=31&&i<slen-2&&(((u_int)out)&7) emit_mov(13,13);
  #endif
}
5142
// Assemble a conditional branch comparing registers (BEQ/BNE/BLEZ/BGTZ)
// at slot i.  Two strategies: out-of-order (delay slot first, then the
// conditional jump) when dops[i].ooo is set, otherwise in-order (branch
// test first, then the delay slot on each arm).  'invert' selects emitting
// the branch with an inverted condition that skips over a taken-path stub,
// used when register state does not match at the branch target.
static void cjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  assem_debug("match=%d\n",match);
  int s1l,s2l;
  int unconditional=0,nop=0;
  int invert=0;
  int internal=internal_branch(ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;
  #endif
  #ifdef __aarch64__
  invert=1; // because of near cond. branches
  #endif

  // Host registers holding the two comparison sources.
  if(dops[i].ooo) {
    s1l=get_reg(branch_regs[i].regmap,dops[i].rs1);
    s2l=get_reg(branch_regs[i].regmap,dops[i].rs2);
  }
  else {
    s1l=get_reg(i_regmap,dops[i].rs1);
    s2l=get_reg(i_regmap,dops[i].rs2);
  }
  // r0-vs-r0 compares degenerate to always/never taken; a single r0
  // operand is folded into a test against zero (s2l=-1).
  if(dops[i].rs1==0&&dops[i].rs2==0)
  {
    if(dops[i].opcode&1) nop=1;
    else unconditional=1;
    //assert(dops[i].opcode!=5);
    //assert(dops[i].opcode!=7);
    //assert(dops[i].opcode!=0x15);
    //assert(dops[i].opcode!=0x17);
  }
  else if(dops[i].rs1==0)
  {
    s1l=s2l;
    s2l=-1;
  }
  else if(dops[i].rs2==0)
  {
    s2l=-1;
  }

  if(dops[i].ooo) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    uint64_t bc_unneeded=branch_regs[i].u;
    bc_unneeded&=~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
    bc_unneeded|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
    load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i].rs1,dops[i].rs2);
    load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    //assem_debug("cycle count (adj)\n");
    if(unconditional) {
      // Always-taken: emit the jump unless this is a pure idle loop.
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if (internal && dops[(ba[i]-start)>>2].is_ds) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nop) {
      // Never-taken: just charge cycles and check for interrupts.
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      // NOTE(review): 'taken' and 'nottaken1' are never assigned in this
      // path (leftovers from the removed "likely" branch support); the
      // set_jump_target checks below on them are effectively dead.
      void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);

      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      assert(s1l>=0);
      if(dops[i].opcode==4) // BEQ
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        if(invert){
          nottaken=out;
          emit_jne(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jeq(0);
        }
      }
      if(dops[i].opcode==5) // BNE
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        if(invert){
          nottaken=out;
          emit_jeq(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jne(0);
        }
      }
      if(dops[i].opcode==6) // BLEZ
      {
        emit_cmpimm(s1l,1);
        if(invert){
          nottaken=out;
          emit_jge(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jl(0);
        }
      }
      if(dops[i].opcode==7) // BGTZ
      {
        emit_cmpimm(s1l,1);
        if(invert){
          nottaken=out;
          emit_jl(DJT_1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jge(0);
        }
      }
      if(invert) {
        // Inverted form: fall through here on the taken path, emit the
        // actual jump to the target, then resolve the not-taken skip.
        if(taken) set_jump_target(taken, out);
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if (match && (!internal || !dops[(ba[i]-start)>>2].is_ds)) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if (internal && dops[(ba[i] - start) >> 2].is_ds) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken, out);
      }

      if(nottaken1) set_jump_target(nottaken1, out);
      if(adj) {
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
    if(!unconditional&&!nop) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      assert(s1l>=0);
      // The &0x2f masks fold the former "likely" opcode variants onto the
      // plain ones.
      if((dops[i].opcode&0x2f)==4) // BEQ
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        nottaken=out;
        emit_jne(DJT_2);
      }
      if((dops[i].opcode&0x2f)==5) // BNE
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        nottaken=out;
        emit_jeq(DJT_2);
      }
      if((dops[i].opcode&0x2f)==6) // BLEZ
      {
        emit_cmpimm(s1l,1);
        nottaken=out;
        emit_jge(DJT_2);
      }
      if((dops[i].opcode&0x2f)==7) // BGTZ
      {
        emit_cmpimm(s1l,1);
        nottaken=out;
        emit_jl(DJT_2);
      }
    } // if(!unconditional)
    int adj;
    uint64_t ds_unneeded=branch_regs[i].u;
    ds_unneeded&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
    ds_unneeded|=1;
    // branch taken
    if(!nop) {
      if(taken) set_jump_target(taken, out);
      assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if (internal && dops[(ba[i] - start) >> 2].is_ds) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    if(!unconditional) {
      if(nottaken1) set_jump_target(nottaken1, out);
      set_jump_target(nottaken, out);
      assem_debug("2:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
      load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if (cc == -1) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
      }
    }
  }
}
5429
// Assemble a REGIMM sign-compare branch (BLTZ/BGEZ and their -AL linking
// variants) at slot i.  Structure mirrors cjump_assemble: out-of-order
// (delay slot first) vs in-order paths, with 'invert' selecting an
// inverted-condition emission when register state mismatches the target.
// The -AL variants write the return address into $ra whether or not the
// branch is taken.
static void sjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  assem_debug("smatch=%d\n",match);
  int s1l;
  int unconditional=0,nevertaken=0;
  int invert=0;
  int internal=internal_branch(ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;
  #endif
  #ifdef __aarch64__
  invert=1; // because of near cond. branches
  #endif

  //if(dops[i].opcode2>=0x10) return; // FIXME (BxxZAL)
  //assert(dops[i].opcode2<0x10||dops[i].rs1==0); // FIXME (BxxZAL)

  if(dops[i].ooo) {
    s1l=get_reg(branch_regs[i].regmap,dops[i].rs1);
  }
  else {
    s1l=get_reg(i_regmap,dops[i].rs1);
  }
  // r0 source: BGEZ-type is always taken, BLTZ-type never.
  if(dops[i].rs1==0)
  {
    if(dops[i].opcode2&1) unconditional=1;
    else nevertaken=1;
    // These are never taken (r0 is never less than zero)
    //assert(dops[i].opcode2!=0);
    //assert(dops[i].opcode2!=2);
    //assert(dops[i].opcode2!=0x10);
    //assert(dops[i].opcode2!=0x12);
  }

  if(dops[i].ooo) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    uint64_t bc_unneeded=branch_regs[i].u;
    bc_unneeded&=~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
    bc_unneeded|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
    load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i].rs1,dops[i].rs1);
    load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
    if(dops[i].rt1==31) {
      // Linking variant (BLTZAL/BGEZAL): $ra is written unconditionally.
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        if(!nevertaken) emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    assem_debug("cycle count (adj)\n");
    if(unconditional) {
      // Always-taken: emit the jump unless this is a pure idle loop.
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if (internal && dops[(ba[i] - start) >> 2].is_ds) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nevertaken) {
      // Never-taken: just charge cycles and check for interrupts.
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      void *nottaken = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2),cc);
      {
        assert(s1l>=0);
        if((dops[i].opcode2&0xf)==0) // BLTZ/BLTZAL
        {
          // Taken when the sign bit is set (rs < 0).
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_jns(DJT_1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_js(0);
          }
        }
        if((dops[i].opcode2&0xf)==1) // BGEZ/BLTZAL
        {
          // Taken when the sign bit is clear (rs >= 0).
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_js(DJT_1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_jns(0);
          }
        }
      }

      if(invert) {
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if (match && (!internal || !dops[(ba[i] - start) >> 2].is_ds)) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if (internal && dops[(ba[i] - start) >> 2].is_ds) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken, out);
      }

      if(adj) {
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //printf("IOE\n");
    void *nottaken = NULL;
    if(dops[i].rt1==31) {
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    if(!unconditional) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
        assert(s1l>=0);
        if((dops[i].opcode2&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_jns(DJT_1);
        }
        if((dops[i].opcode2&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_js(DJT_1);
        }
    } // if(!unconditional)
    int adj;
    uint64_t ds_unneeded=branch_regs[i].u;
    ds_unneeded&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
    ds_unneeded|=1;
    // branch taken
    if(!nevertaken) {
      //assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if (internal && dops[(ba[i] - start) >> 2].is_ds) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    if(!unconditional) {
      set_jump_target(nottaken, out);
      assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
      load_regs(regs[i].regmap,branch_regs[i].regmap,dops[i+1].rs1,dops[i+1].rs2);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if (cc == -1) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
      }
    }
  }
}
5694
// Assemble a branch/jump that is the last instruction before a page
// boundary, i.e. whose delay slot lies on the next page (which may not be
// compiled yet).  The taken/not-taken target address is computed into a
// host register, dirty registers are written back, the target is parked in
// HOST_BTREG, and control transfers to the specially compiled delay-slot
// entry point at start+i*4+5 (odd-marked address; see pagespan_ds below).
static void pagespan_assemble(int i,struct regstat *i_regs)
{
  // Host registers holding the branch source operands (-1 if unmapped)
  int s1l=get_reg(i_regs->regmap,dops[i].rs1);
  int s2l=get_reg(i_regs->regmap,dops[i].rs2);
  void *taken = NULL;
  void *nottaken = NULL;
  int unconditional=0;
  // Comparisons involving $zero need only one operand register
  if(dops[i].rs1==0)
  {
    s1l=s2l;
    s2l=-1;
  }
  else if(dops[i].rs2==0)
  {
    s2l=-1;
  }
  int hr=0;
  // Scratch host registers:
  //   addr   - receives the resolved branch target address
  //   alt    - alternate (other-path) address for conditional-move selection
  //   ntaddr - extra register needed by BLEZ/BGTZ
  int addr=-1,alt=-1,ntaddr=-1;
  // Prefer HOST_BTREG for the target if it is currently free; otherwise
  // scan for host registers that don't clash with CC or the operands.
  if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
  else {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
         (i_regs->regmap[hr]&63)!=dops[i].rs1 &&
         (i_regs->regmap[hr]&63)!=dops[i].rs2 )
      {
        addr=hr++;break;
      }
      hr++;
    }
  }
  while(hr<HOST_REGS)
  {
    if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
       (i_regs->regmap[hr]&63)!=dops[i].rs1 &&
       (i_regs->regmap[hr]&63)!=dops[i].rs2 )
    {
      alt=hr++;break;
    }
    hr++;
  }
  if((dops[i].opcode&0x2E)==6) // BLEZ/BGTZ needs another register
  {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
         (i_regs->regmap[hr]&63)!=dops[i].rs1 &&
         (i_regs->regmap[hr]&63)!=dops[i].rs2 )
      {
        ntaddr=hr;break;
      }
      hr++;
    }
  }
  assert(hr<HOST_REGS);
  if((dops[i].opcode&0x2e)==4||dops[i].opcode==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
    load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
  }
  // Charge the cycles for the branch and its delay slot
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  if(dops[i].opcode==2) // J
  {
    unconditional=1;
  }
  if(dops[i].opcode==3) // JAL
  {
    // TODO: mini_ht
    // Write the return address (instruction after the delay slot) to $ra
    int rt=get_reg(i_regs->regmap,31);
    emit_movimm(start+i*4+8,rt);
    unconditional=1;
  }
  if(dops[i].opcode==0&&(dops[i].opcode2&0x3E)==8) // JR/JALR
  {
    // Target comes from a register; copy it into the addr scratch register
    emit_mov(s1l,addr);
    if(dops[i].opcode2==9) // JALR
    {
      int rt=get_reg(i_regs->regmap,dops[i].rt1);
      emit_movimm(start+i*4+8,rt);
    }
  }
  if((dops[i].opcode&0x3f)==4) // BEQ
  {
    if(dops[i].rs1==dops[i].rs2)
    {
      // BEQ with identical operands always takes the branch
      unconditional=1;
    }
    else
    #ifdef HAVE_CMOV_IMM
    if(1) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
    }
    else
    #endif
    {
      // No cmov-with-immediate: load both candidate addresses, then
      // conditionally select the fall-through one.
      assert(s1l>=0);
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  if((dops[i].opcode&0x3f)==5) // BNE
  {
    #ifdef HAVE_CMOV_IMM
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
    #else
    assert(s1l>=0);
    emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    emit_cmovne_reg(alt,addr);
    #endif
  }
  if((dops[i].opcode&0x3f)==0x14) // BEQL
  {
    // "Likely" branch: the not-taken path skips the delay slot, so emit a
    // conditional jump to be patched instead of a cmov-selected address.
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    if(nottaken) set_jump_target(nottaken, out);
    nottaken=out;
    emit_jne(0);
  }
  if((dops[i].opcode&0x3f)==0x15) // BNEL
  {
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    nottaken=out;
    emit_jeq(0);
    if(taken) set_jump_target(taken, out);
  }
  if((dops[i].opcode&0x3f)==6) // BLEZ
  {
    // Taken when rs1 < 1 (i.e. rs1 <= 0, signed)
    emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
    emit_cmpimm(s1l,1);
    emit_cmovl_reg(alt,addr);
  }
  if((dops[i].opcode&0x3f)==7) // BGTZ
  {
    // Not taken when rs1 < 1; select the fall-through address in that case
    emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
    emit_cmpimm(s1l,1);
    emit_cmovl_reg(ntaddr,addr);
  }
  if((dops[i].opcode&0x3f)==0x16) // BLEZL
  {
    // Not expected here (likely branches were dropped; see history)
    assert((dops[i].opcode&0x3f)!=0x16);
  }
  if((dops[i].opcode&0x3f)==0x17) // BGTZL
  {
    assert((dops[i].opcode&0x3f)!=0x17);
  }
  assert(dops[i].opcode!=1); // BLTZ/BGEZ

  //FIXME: Check CSREG
  if(dops[i].opcode==0x11 && dops[i].opcode2==0x08 ) {
    // COP1 branches: condition is bit 23 of the FP status word (in s1l)
    if((source[i]&0x30000)==0) // BC1F
    {
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x10000) // BC1T
    {
      emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x20000) // BC1FL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jne(0);
    }
    if((source[i]&0x30000)==0x30000) // BC1TL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jeq(0);
    }
  }

  assert(i_regs->regmap[HOST_CCREG]==CCREG);
  // Flush dirty registers before leaving the block
  wb_dirtys(regs[i].regmap,regs[i].dirty);
  // Park the resolved target in HOST_BTREG for pagespan_ds to consume
  if(unconditional)
  {
    emit_movimm(ba[i],HOST_BTREG);
  }
  else if(addr!=HOST_BTREG)
  {
    emit_mov(addr,HOST_BTREG);
  }
  void *branch_addr=out;
  emit_jmp(0);
  // start+i*4+5 = (delay slot address)+1: the odd marker distinguishes the
  // page-spanning delay-slot entry (compiled by pagespan_ds) from the
  // normal entry point at the same instruction.
  int target_addr=start+i*4+5;
  void *stub=out;
  void *compiled_target_addr=check_addr(target_addr);
  emit_extjump_ds(branch_addr, target_addr);
  if(compiled_target_addr) {
    // Delay-slot block already compiled: link directly and record the
    // out-jump so it can be unlinked if the target is invalidated.
    set_jump_target(branch_addr, compiled_target_addr);
    add_jump_out(target_addr,stub);
  }
  else set_jump_target(branch_addr, stub);
}
5899
// Assemble the delay slot for the above.
// This compiles the special entry point (vaddr = start+1, the odd-marked
// address targeted by pagespan_assemble) that executes instruction 0 of this
// block as the delay slot of a page-spanning branch, then dispatches to the
// branch target that was left in HOST_BTREG / branch_target.
static void pagespan_ds()
{
  assem_debug("initial delay slot:\n");
  u_int vaddr=start+1;  // odd address marks the delay-slot entry point
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  // Register this entry point and its dirty-check stub
  ll_add(jump_dirty+vpage,vaddr,(void *)out);
  do_dirty_stub_ds(slen*4);
  ll_add(jump_in+page,vaddr,(void *)out);
  assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty);
  // If HOST_BTREG will be repurposed by instruction 0, spill the branch
  // target to memory so it can be reloaded afterwards.
  if(regs[0].regmap[HOST_BTREG]!=BTREG)
    emit_writeword(HOST_BTREG,&branch_target);
  load_regs(regs[0].regmap_entry,regs[0].regmap,dops[0].rs1,dops[0].rs2);
  address_generation(0,&regs[0],regs[0].regmap_entry);
  if(dops[0].itype==STORE||dops[0].itype==STORELR||(dops[0].opcode&0x3b)==0x39||(dops[0].opcode&0x3b)==0x3a)
    load_regs(regs[0].regmap_entry,regs[0].regmap,INVCP,INVCP);
  is_delayslot=0;
  // Assemble instruction 0 by type
  switch(dops[0].itype) {
    case ALU:
      alu_assemble(0,&regs[0]);break;
    case IMM16:
      imm16_assemble(0,&regs[0]);break;
    case SHIFT:
      shift_assemble(0,&regs[0]);break;
    case SHIFTIMM:
      shiftimm_assemble(0,&regs[0]);break;
    case LOAD:
      load_assemble(0,&regs[0]);break;
    case LOADLR:
      loadlr_assemble(0,&regs[0]);break;
    case STORE:
      store_assemble(0,&regs[0]);break;
    case STORELR:
      storelr_assemble(0,&regs[0]);break;
    case COP0:
      cop0_assemble(0,&regs[0]);break;
    case COP1:
      cop1_assemble(0,&regs[0]);break;
    case C1LS:
      c1ls_assemble(0,&regs[0]);break;
    case COP2:
      cop2_assemble(0,&regs[0]);break;
    case C2LS:
      c2ls_assemble(0,&regs[0]);break;
    case C2OP:
      c2op_assemble(0,&regs[0]);break;
    case MULTDIV:
      multdiv_assemble(0,&regs[0]);
      multdiv_prepare_stall(0,&regs[0]);
      break;
    case MOV:
      mov_assemble(0,&regs[0]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Recover the branch target address saved by pagespan_assemble
  int btaddr=get_reg(regs[0].regmap,BTREG);
  if(btaddr<0) {
    // BTREG not mapped; pick a register (get_reg with -1) and reload the
    // target from the branch_target spill slot.
    btaddr=get_reg(regs[0].regmap,-1);
    emit_readword(&branch_target,btaddr);
  }
  assert(btaddr!=HOST_CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
#ifdef HOST_IMM8
  // Host compare-immediate is limited to 8 bits; materialize the constant
  host_tempreg_acquire();
  emit_movimm(start+4,HOST_TEMPREG);
  emit_cmp(btaddr,HOST_TEMPREG);
  host_tempreg_release();
#else
  emit_cmpimm(btaddr,start+4);
#endif
  void *branch = out;
  emit_jeq(0);
  // Target is elsewhere: flush for an unknown target and jump indirectly
  store_regs_bt(regs[0].regmap,regs[0].dirty,-1);
  do_jump_vaddr(btaddr);
  set_jump_target(branch, out);
  // Target is start+4 (the instruction after the delay slot): fall through
  // into the rest of this block.
  store_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
  load_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
}
5988
// Basic liveness analysis for MIPS registers.
// Walks instructions iend..istart backwards computing, for each i, a bitmask
// unneeded_reg[i] of guest registers whose current value is dead at i (it
// will be overwritten before being read again), plus gte_unneeded[i] for
// GTE (COP2) registers.  Bit 0 ($zero) is always set.  Also sets the 'bt'
// (branch target) flags on dops[] and fills branch_unneeded_reg[].
//   istart,iend - inclusive instruction index range to analyze
//   r           - recursion depth for backward-branch re-analysis (capped)
void unneeded_registers(int istart,int iend,int r)
{
  int i;
  uint64_t u,gte_u,b,gte_b;
  uint64_t temp_u,temp_gte_u=0;
  uint64_t gte_u_unknown=0;
  // With this hack, treat all GTE regs as unneeded at block exits
  if (HACK_ENABLED(NDHACK_GTE_UNNEEDED))
    gte_u_unknown=~0ll;
  if(iend==slen-1) {
    // End of block: nothing is needed afterwards (only $zero marked)
    u=1;
    gte_u=gte_u_unknown;
  }else{
    //u=unneeded_reg[iend+1];
    u=1;
    gte_u=gte_unneeded[iend+1];
  }

  for (i=iend;i>=istart;i--)
  {
    //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
    if(dops[i].is_jump)
    {
      // If subroutine call, flag return address as a possible branch target
      if(dops[i].rt1==31 && i<slen-2) dops[i+2].bt=1;

      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, flush all regs
        u=1;
        gte_u=gte_u_unknown;
        branch_unneeded_reg[i]=u;
        // Merge in delay slot
        u|=(1LL<<dops[i+1].rt1)|(1LL<<dops[i+1].rt2);
        u&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
        u|=1;
        gte_u|=gte_rt[i+1];
        gte_u&=~gte_rs[i+1];
      }
      else
      {
        // Internal branch, flag target
        dops[(ba[i]-start)>>2].bt=1;
        if(ba[i]<=start+i*4) {
          // Backward branch
          if(dops[i].is_ujump)
          {
            // Unconditional branch
            temp_u=1;
            temp_gte_u=0;
          } else {
            // Conditional branch (not taken case)
            temp_u=unneeded_reg[i+2];
            temp_gte_u&=gte_unneeded[i+2];
          }
          // Merge in delay slot
          temp_u|=(1LL<<dops[i+1].rt1)|(1LL<<dops[i+1].rt2);
          temp_u&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
          temp_u|=1;
          temp_gte_u|=gte_rt[i+1];
          temp_gte_u&=~gte_rs[i+1];
          // Merge in the branch instruction itself
          temp_u|=(1LL<<dops[i].rt1)|(1LL<<dops[i].rt2);
          temp_u&=~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
          temp_u|=1;
          temp_gte_u|=gte_rt[i];
          temp_gte_u&=~gte_rs[i];
          unneeded_reg[i]=temp_u;
          gte_unneeded[i]=temp_gte_u;
          // Only go three levels deep.  This recursion can take an
          // excessive amount of time if there are a lot of nested loops.
          if(r<2) {
            unneeded_registers((ba[i]-start)>>2,i-1,r+1);
          }else{
            // Recursion limit hit: conservatively assume everything is
            // needed at the loop head (except $zero / unknown GTE regs).
            unneeded_reg[(ba[i]-start)>>2]=1;
            gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
          }
        } /*else*/ if(1) {
          if (dops[i].is_ujump)
          {
            // Unconditional branch
            u=unneeded_reg[(ba[i]-start)>>2];
            gte_u=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=u;
            // Merge in delay slot
            u|=(1LL<<dops[i+1].rt1)|(1LL<<dops[i+1].rt2);
            u&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
            u|=1;
            gte_u|=gte_rt[i+1];
            gte_u&=~gte_rs[i+1];
          } else {
            // Conditional branch: intersect the taken-path liveness (b)
            // with the fall-through liveness (u) already accumulated.
            b=unneeded_reg[(ba[i]-start)>>2];
            gte_b=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=b;
            // Branch delay slot
            b|=(1LL<<dops[i+1].rt1)|(1LL<<dops[i+1].rt2);
            b&=~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
            b|=1;
            gte_b|=gte_rt[i+1];
            gte_b&=~gte_rs[i+1];
            u&=b;
            gte_u&=gte_b;
            if(i<slen-1) {
              branch_unneeded_reg[i]&=unneeded_reg[i+2];
            } else {
              branch_unneeded_reg[i]=1;
            }
          }
        }
      }
    }
    else if(dops[i].itype==SYSCALL||dops[i].itype==HLECALL||dops[i].itype==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      u=1;
    }
    else if(dops[i].itype==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      u=1;
    }
    //u=1; // DEBUG
    // Written registers are unneeded
    u|=1LL<<dops[i].rt1;
    u|=1LL<<dops[i].rt2;
    gte_u|=gte_rt[i];
    // Accessed registers are needed
    u&=~(1LL<<dops[i].rs1);
    u&=~(1LL<<dops[i].rs2);
    gte_u&=~gte_rs[i];
    if(gte_rs[i]&&dops[i].rt1&&(unneeded_reg[i+1]&(1ll<<dops[i].rt1)))
      gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
    // Source-target dependencies
    // R0 is always unneeded
    u|=1;
    // Save it
    unneeded_reg[i]=u;
    gte_unneeded[i]=gte_u;
    /*
    printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
    printf("U:");
    int r;
    for(r=1;r<=CCREG;r++) {
      if((unneeded_reg[i]>>r)&1) {
        if(r==HIREG) printf(" HI");
        else if(r==LOREG) printf(" LO");
        else printf(" r%d",r);
      }
    }
    printf("\n");
    */
  }
}
6142
6143 // Write back dirty registers as soon as we will no longer modify them,
6144 // so that we don't end up with lots of writes at the branches.
6145 void clean_registers(int istart,int iend,int wr)
6146 {
6147   int i;
6148   int r;
6149   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6150   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6151   if(iend==slen-1) {
6152     will_dirty_i=will_dirty_next=0;
6153     wont_dirty_i=wont_dirty_next=0;
6154   }else{
6155     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6156     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6157   }
6158   for (i=iend;i>=istart;i--)
6159   {
6160     if(dops[i].is_jump)
6161     {
6162       if(ba[i]<start || ba[i]>=(start+slen*4))
6163       {
6164         // Branch out of this block, flush all regs
6165         if (dops[i].is_ujump)
6166         {
6167           // Unconditional branch
6168           will_dirty_i=0;
6169           wont_dirty_i=0;
6170           // Merge in delay slot (will dirty)
6171           for(r=0;r<HOST_REGS;r++) {
6172             if(r!=EXCLUDE_REG) {
6173               if((branch_regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
6174               if((branch_regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
6175               if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
6176               if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
6177               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6178               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6179               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6180               if((regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
6181               if((regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
6182               if((regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
6183               if((regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
6184               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6185               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6186               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6187             }
6188           }
6189         }
6190         else
6191         {
6192           // Conditional branch
6193           will_dirty_i=0;
6194           wont_dirty_i=wont_dirty_next;
6195           // Merge in delay slot (will dirty)
6196           for(r=0;r<HOST_REGS;r++) {
6197             if(r!=EXCLUDE_REG) {
6198               if (1) { // !dops[i].likely) {
6199                 // Might not dirty if likely branch is not taken
6200                 if((branch_regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
6201                 if((branch_regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
6202                 if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
6203                 if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
6204                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6205                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6206                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6207                 //if((regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
6208                 //if((regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
6209                 if((regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
6210                 if((regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
6211                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6212                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6213                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6214               }
6215             }
6216           }
6217         }
6218         // Merge in delay slot (wont dirty)
6219         for(r=0;r<HOST_REGS;r++) {
6220           if(r!=EXCLUDE_REG) {
6221             if((regs[i].regmap[r]&63)==dops[i].rt1) wont_dirty_i|=1<<r;
6222             if((regs[i].regmap[r]&63)==dops[i].rt2) wont_dirty_i|=1<<r;
6223             if((regs[i].regmap[r]&63)==dops[i+1].rt1) wont_dirty_i|=1<<r;
6224             if((regs[i].regmap[r]&63)==dops[i+1].rt2) wont_dirty_i|=1<<r;
6225             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6226             if((branch_regs[i].regmap[r]&63)==dops[i].rt1) wont_dirty_i|=1<<r;
6227             if((branch_regs[i].regmap[r]&63)==dops[i].rt2) wont_dirty_i|=1<<r;
6228             if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) wont_dirty_i|=1<<r;
6229             if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) wont_dirty_i|=1<<r;
6230             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6231           }
6232         }
6233         if(wr) {
6234           #ifndef DESTRUCTIVE_WRITEBACK
6235           branch_regs[i].dirty&=wont_dirty_i;
6236           #endif
6237           branch_regs[i].dirty|=will_dirty_i;
6238         }
6239       }
6240       else
6241       {
6242         // Internal branch
6243         if(ba[i]<=start+i*4) {
6244           // Backward branch
6245           if (dops[i].is_ujump)
6246           {
6247             // Unconditional branch
6248             temp_will_dirty=0;
6249             temp_wont_dirty=0;
6250             // Merge in delay slot (will dirty)
6251             for(r=0;r<HOST_REGS;r++) {
6252               if(r!=EXCLUDE_REG) {
6253                 if((branch_regs[i].regmap[r]&63)==dops[i].rt1) temp_will_dirty|=1<<r;
6254                 if((branch_regs[i].regmap[r]&63)==dops[i].rt2) temp_will_dirty|=1<<r;
6255                 if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) temp_will_dirty|=1<<r;
6256                 if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) temp_will_dirty|=1<<r;
6257                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6258                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6259                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6260                 if((regs[i].regmap[r]&63)==dops[i].rt1) temp_will_dirty|=1<<r;
6261                 if((regs[i].regmap[r]&63)==dops[i].rt2) temp_will_dirty|=1<<r;
6262                 if((regs[i].regmap[r]&63)==dops[i+1].rt1) temp_will_dirty|=1<<r;
6263                 if((regs[i].regmap[r]&63)==dops[i+1].rt2) temp_will_dirty|=1<<r;
6264                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6265                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6266                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6267               }
6268             }
6269           } else {
6270             // Conditional branch (not taken case)
6271             temp_will_dirty=will_dirty_next;
6272             temp_wont_dirty=wont_dirty_next;
6273             // Merge in delay slot (will dirty)
6274             for(r=0;r<HOST_REGS;r++) {
6275               if(r!=EXCLUDE_REG) {
6276                 if (1) { // !dops[i].likely) {
6277                   // Will not dirty if likely branch is not taken
6278                   if((branch_regs[i].regmap[r]&63)==dops[i].rt1) temp_will_dirty|=1<<r;
6279                   if((branch_regs[i].regmap[r]&63)==dops[i].rt2) temp_will_dirty|=1<<r;
6280                   if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) temp_will_dirty|=1<<r;
6281                   if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) temp_will_dirty|=1<<r;
6282                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6283                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6284                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6285                   //if((regs[i].regmap[r]&63)==dops[i].rt1) temp_will_dirty|=1<<r;
6286                   //if((regs[i].regmap[r]&63)==dops[i].rt2) temp_will_dirty|=1<<r;
6287                   if((regs[i].regmap[r]&63)==dops[i+1].rt1) temp_will_dirty|=1<<r;
6288                   if((regs[i].regmap[r]&63)==dops[i+1].rt2) temp_will_dirty|=1<<r;
6289                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6290                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6291                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6292                 }
6293               }
6294             }
6295           }
6296           // Merge in delay slot (wont dirty)
6297           for(r=0;r<HOST_REGS;r++) {
6298             if(r!=EXCLUDE_REG) {
6299               if((regs[i].regmap[r]&63)==dops[i].rt1) temp_wont_dirty|=1<<r;
6300               if((regs[i].regmap[r]&63)==dops[i].rt2) temp_wont_dirty|=1<<r;
6301               if((regs[i].regmap[r]&63)==dops[i+1].rt1) temp_wont_dirty|=1<<r;
6302               if((regs[i].regmap[r]&63)==dops[i+1].rt2) temp_wont_dirty|=1<<r;
6303               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6304               if((branch_regs[i].regmap[r]&63)==dops[i].rt1) temp_wont_dirty|=1<<r;
6305               if((branch_regs[i].regmap[r]&63)==dops[i].rt2) temp_wont_dirty|=1<<r;
6306               if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) temp_wont_dirty|=1<<r;
6307               if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) temp_wont_dirty|=1<<r;
6308               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6309             }
6310           }
6311           // Deal with changed mappings
6312           if(i<iend) {
6313             for(r=0;r<HOST_REGS;r++) {
6314               if(r!=EXCLUDE_REG) {
6315                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6316                   temp_will_dirty&=~(1<<r);
6317                   temp_wont_dirty&=~(1<<r);
6318                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6319                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6320                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6321                   } else {
6322                     temp_will_dirty|=1<<r;
6323                     temp_wont_dirty|=1<<r;
6324                   }
6325                 }
6326               }
6327             }
6328           }
6329           if(wr) {
6330             will_dirty[i]=temp_will_dirty;
6331             wont_dirty[i]=temp_wont_dirty;
6332             clean_registers((ba[i]-start)>>2,i-1,0);
6333           }else{
6334             // Limit recursion.  It can take an excessive amount
6335             // of time if there are a lot of nested loops.
6336             will_dirty[(ba[i]-start)>>2]=0;
6337             wont_dirty[(ba[i]-start)>>2]=-1;
6338           }
6339         }
6340         /*else*/ if(1)
6341         {
6342           if (dops[i].is_ujump)
6343           {
6344             // Unconditional branch
6345             will_dirty_i=0;
6346             wont_dirty_i=0;
6347           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6348             for(r=0;r<HOST_REGS;r++) {
6349               if(r!=EXCLUDE_REG) {
6350                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6351                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6352                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6353                 }
6354                 if(branch_regs[i].regmap[r]>=0) {
6355                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6356                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6357                 }
6358               }
6359             }
6360           //}
6361             // Merge in delay slot
6362             for(r=0;r<HOST_REGS;r++) {
6363               if(r!=EXCLUDE_REG) {
6364                 if((branch_regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
6365                 if((branch_regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
6366                 if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
6367                 if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
6368                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6369                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6370                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6371                 if((regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
6372                 if((regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
6373                 if((regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
6374                 if((regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
6375                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6376                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6377                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6378               }
6379             }
6380           } else {
6381             // Conditional branch
6382             will_dirty_i=will_dirty_next;
6383             wont_dirty_i=wont_dirty_next;
6384           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6385             for(r=0;r<HOST_REGS;r++) {
6386               if(r!=EXCLUDE_REG) {
6387                 signed char target_reg=branch_regs[i].regmap[r];
6388                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6389                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6390                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6391                 }
6392                 else if(target_reg>=0) {
6393                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6394                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6395                 }
6396               }
6397             }
6398           //}
6399             // Merge in delay slot
6400             for(r=0;r<HOST_REGS;r++) {
6401               if(r!=EXCLUDE_REG) {
6402                 if (1) { // !dops[i].likely) {
6403                   // Might not dirty if likely branch is not taken
6404                   if((branch_regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
6405                   if((branch_regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
6406                   if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
6407                   if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
6408                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6409                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6410                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6411                   //if((regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
6412                   //if((regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
6413                   if((regs[i].regmap[r]&63)==dops[i+1].rt1) will_dirty_i|=1<<r;
6414                   if((regs[i].regmap[r]&63)==dops[i+1].rt2) will_dirty_i|=1<<r;
6415                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6416                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6417                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6418                 }
6419               }
6420             }
6421           }
6422           // Merge in delay slot (won't dirty)
6423           for(r=0;r<HOST_REGS;r++) {
6424             if(r!=EXCLUDE_REG) {
6425               if((regs[i].regmap[r]&63)==dops[i].rt1) wont_dirty_i|=1<<r;
6426               if((regs[i].regmap[r]&63)==dops[i].rt2) wont_dirty_i|=1<<r;
6427               if((regs[i].regmap[r]&63)==dops[i+1].rt1) wont_dirty_i|=1<<r;
6428               if((regs[i].regmap[r]&63)==dops[i+1].rt2) wont_dirty_i|=1<<r;
6429               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6430               if((branch_regs[i].regmap[r]&63)==dops[i].rt1) wont_dirty_i|=1<<r;
6431               if((branch_regs[i].regmap[r]&63)==dops[i].rt2) wont_dirty_i|=1<<r;
6432               if((branch_regs[i].regmap[r]&63)==dops[i+1].rt1) wont_dirty_i|=1<<r;
6433               if((branch_regs[i].regmap[r]&63)==dops[i+1].rt2) wont_dirty_i|=1<<r;
6434               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6435             }
6436           }
6437           if(wr) {
6438             #ifndef DESTRUCTIVE_WRITEBACK
6439             branch_regs[i].dirty&=wont_dirty_i;
6440             #endif
6441             branch_regs[i].dirty|=will_dirty_i;
6442           }
6443         }
6444       }
6445     }
6446     else if(dops[i].itype==SYSCALL||dops[i].itype==HLECALL||dops[i].itype==INTCALL)
6447     {
6448       // SYSCALL instruction (software interrupt)
6449       will_dirty_i=0;
6450       wont_dirty_i=0;
6451     }
6452     else if(dops[i].itype==COP0 && (source[i]&0x3f)==0x18)
6453     {
6454       // ERET instruction (return from interrupt)
6455       will_dirty_i=0;
6456       wont_dirty_i=0;
6457     }
6458     will_dirty_next=will_dirty_i;
6459     wont_dirty_next=wont_dirty_i;
6460     for(r=0;r<HOST_REGS;r++) {
6461       if(r!=EXCLUDE_REG) {
6462         if((regs[i].regmap[r]&63)==dops[i].rt1) will_dirty_i|=1<<r;
6463         if((regs[i].regmap[r]&63)==dops[i].rt2) will_dirty_i|=1<<r;
6464         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6465         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6466         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6467         if((regs[i].regmap[r]&63)==dops[i].rt1) wont_dirty_i|=1<<r;
6468         if((regs[i].regmap[r]&63)==dops[i].rt2) wont_dirty_i|=1<<r;
6469         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6470         if(i>istart) {
6471           if (!dops[i].is_jump)
6472           {
6473             // Don't store a register immediately after writing it,
6474             // may prevent dual-issue.
6475             if((regs[i].regmap[r]&63)==dops[i-1].rt1) wont_dirty_i|=1<<r;
6476             if((regs[i].regmap[r]&63)==dops[i-1].rt2) wont_dirty_i|=1<<r;
6477           }
6478         }
6479       }
6480     }
6481     // Save it
6482     will_dirty[i]=will_dirty_i;
6483     wont_dirty[i]=wont_dirty_i;
6484     // Mark registers that won't be dirtied as not dirty
6485     if(wr) {
6486         regs[i].dirty|=will_dirty_i;
6487         #ifndef DESTRUCTIVE_WRITEBACK
6488         regs[i].dirty&=wont_dirty_i;
6489         if(dops[i].is_jump)
6490         {
6491           if (i < iend-1 && !dops[i].is_ujump) {
6492             for(r=0;r<HOST_REGS;r++) {
6493               if(r!=EXCLUDE_REG) {
6494                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6495                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6496                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6497               }
6498             }
6499           }
6500         }
6501         else
6502         {
6503           if(i<iend) {
6504             for(r=0;r<HOST_REGS;r++) {
6505               if(r!=EXCLUDE_REG) {
6506                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6507                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6508                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6509               }
6510             }
6511           }
6512         }
6513         #endif
6514       //}
6515     }
6516     // Deal with changed mappings
6517     temp_will_dirty=will_dirty_i;
6518     temp_wont_dirty=wont_dirty_i;
6519     for(r=0;r<HOST_REGS;r++) {
6520       if(r!=EXCLUDE_REG) {
6521         int nr;
6522         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6523           if(wr) {
6524             #ifndef DESTRUCTIVE_WRITEBACK
6525             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6526             #endif
6527             regs[i].wasdirty|=will_dirty_i&(1<<r);
6528           }
6529         }
6530         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6531           // Register moved to a different register
6532           will_dirty_i&=~(1<<r);
6533           wont_dirty_i&=~(1<<r);
6534           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6535           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6536           if(wr) {
6537             #ifndef DESTRUCTIVE_WRITEBACK
6538             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6539             #endif
6540             regs[i].wasdirty|=will_dirty_i&(1<<r);
6541           }
6542         }
6543         else {
6544           will_dirty_i&=~(1<<r);
6545           wont_dirty_i&=~(1<<r);
6546           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6547             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6548             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6549           } else {
6550             wont_dirty_i|=1<<r;
6551             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
6552           }
6553         }
6554       }
6555     }
6556   }
6557 }
6558
6559 #ifdef DISASM
6560   /* disassembly */
/* Print a one-line disassembly of decoded instruction slot i to stdout.
 * Uses the decode results filled in by pass 1 (dops[], insn[], imm[], ba[])
 * rather than re-decoding source[i]; only raw cop register numbers are
 * re-extracted from source[i].  A leading '*' marks a branch target (bt). */
void disassemble_inst(int i)
{
    if (dops[i].bt) printf("*"); else printf(" ");
    switch(dops[i].itype) {
      case UJUMP:
        printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
      case CJUMP:
        // (source[i]<<16)>>14 sign-extends the 16-bit branch offset and scales by 4
        printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],dops[i].rs1,dops[i].rs2,i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
      case SJUMP:
        printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],dops[i].rs1,start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
      case RJUMP:
        // JALR with a non-$ra link register prints both registers
        if (dops[i].opcode==0x9&&dops[i].rt1!=31)
          printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1);
        else
          printf (" %x: %s r%d\n",start+i*4,insn[i],dops[i].rs1);
        break;
      case SPAN:
        printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],dops[i].rs1,dops[i].rs2,ba[i]);break;
      case IMM16:
        if(dops[i].opcode==0xf) //LUI
          printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],dops[i].rt1,imm[i]&0xffff);
        else
          printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1,imm[i]);
        break;
      case LOAD:
      case LOADLR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1,imm[i]);
        break;
      case STORE:
      case STORELR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],dops[i].rs2,dops[i].rs1,imm[i]);
        break;
      case ALU:
      case SHIFT:
        printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1,dops[i].rs2);
        break;
      case MULTDIV:
        printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],dops[i].rs1,dops[i].rs2);
        break;
      case SHIFTIMM:
        printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],dops[i].rt1,dops[i].rs1,imm[i]);
        break;
      case MOV:
        if((dops[i].opcode2&0x1d)==0x10)
          printf (" %x: %s r%d\n",start+i*4,insn[i],dops[i].rt1); // MFHI/MFLO
        else if((dops[i].opcode2&0x1d)==0x11)
          printf (" %x: %s r%d\n",start+i*4,insn[i],dops[i].rs1); // MTHI/MTLO
        else
          printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP0:
        if(dops[i].opcode2==0)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],dops[i].rt1,(source[i]>>11)&0x1f); // MFC0
        else if(dops[i].opcode2==4)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],dops[i].rs1,(source[i]>>11)&0x1f); // MTC0
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP1:
        if(dops[i].opcode2<3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],dops[i].rt1,(source[i]>>11)&0x1f); // MFC1
        else if(dops[i].opcode2>3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],dops[i].rs1,(source[i]>>11)&0x1f); // MTC1
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP2:
        if(dops[i].opcode2<3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],dops[i].rt1,(source[i]>>11)&0x1f); // MFC2
        else if(dops[i].opcode2>3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],dops[i].rs1,(source[i]>>11)&0x1f); // MTC2
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case C1LS:
        printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,dops[i].rs1,imm[i]);
        break;
      case C2LS:
        printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,dops[i].rs1,imm[i]);
        break;
      case INTCALL:
        printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
        break;
      default:
        //printf (" %s %8x\n",insn[i],source[i]);
        printf (" %x: %s\n",start+i*4,insn[i]);
    }
}
6646 #else
// No-op stub used when the DISASM build option is disabled.
static void disassemble_inst(int i) {}
6648 #endif // DISASM
6649
6650 #define DRC_TEST_VAL 0x74657374
6651
6652 static void new_dynarec_test(void)
6653 {
6654   int (*testfunc)(void);
6655   void *beginning;
6656   int ret[2];
6657   size_t i;
6658
6659   // check structure linkage
6660   if ((u_char *)rcnts - (u_char *)&psxRegs != sizeof(psxRegs))
6661   {
6662     SysPrintf("linkage_arm* miscompilation/breakage detected.\n");
6663   }
6664
6665   SysPrintf("testing if we can run recompiled code...\n");
6666   ((volatile u_int *)out)[0]++; // make cache dirty
6667
6668   for (i = 0; i < ARRAY_SIZE(ret); i++) {
6669     out = ndrc->translation_cache;
6670     beginning = start_block();
6671     emit_movimm(DRC_TEST_VAL + i, 0); // test
6672     emit_ret();
6673     literal_pool(0);
6674     end_block(beginning);
6675     testfunc = beginning;
6676     ret[i] = testfunc();
6677   }
6678
6679   if (ret[0] == DRC_TEST_VAL && ret[1] == DRC_TEST_VAL + 1)
6680     SysPrintf("test passed.\n");
6681   else
6682     SysPrintf("test failed, will likely crash soon (r=%08x %08x)\n", ret[0], ret[1]);
6683   out = ndrc->translation_cache;
6684 }
6685
6686 // clear the state completely, instead of just marking
6687 // things invalid like invalidate_all_pages() does
6688 void new_dynarec_clear_full(void)
6689 {
6690   int n;
6691   out = ndrc->translation_cache;
6692   memset(invalid_code,1,sizeof(invalid_code));
6693   memset(hash_table,0xff,sizeof(hash_table));
6694   memset(mini_ht,-1,sizeof(mini_ht));
6695   memset(restore_candidate,0,sizeof(restore_candidate));
6696   memset(shadow,0,sizeof(shadow));
6697   copy=shadow;
6698   expirep=16384; // Expiry pointer, +2 blocks
6699   pending_exception=0;
6700   literalcount=0;
6701   stop_after_jal=0;
6702   inv_code_start=inv_code_end=~0;
6703   // TLB
6704   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6705   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6706   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6707
6708   cycle_multiplier_old = cycle_multiplier;
6709   new_dynarec_hacks_old = new_dynarec_hacks;
6710 }
6711
/* One-time initialization: obtain (or make executable) the memory for
 * the translation cache, reset all recompiler state and run the
 * emit-and-execute self-test.  Platform differences are handled by the
 * BASE_ADDR_DYNAMIC / VITA / NO_WRITE_EXEC build options. */
void new_dynarec_init(void)
{
  SysPrintf("Init new dynarec\n");

#ifdef BASE_ADDR_DYNAMIC
  #ifdef VITA
  // Vita: executable memory must come from the kernel VM block API
  sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
  if (sceBlock < 0)
    SysPrintf("sceKernelAllocMemBlockForVM failed\n");
  int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&ndrc);
  if (ret < 0)
    SysPrintf("sceKernelGetMemBlockBase failed\n");
  #else
  uintptr_t desired_addr = 0;
  #ifdef __ELF__
  // hint mmap to the first 16MB boundary past the program image
  // (presumably to keep the cache within short-branch range - confirm)
  extern char _end;
  desired_addr = ((uintptr_t)&_end + 0xffffff) & ~0xffffffl;
  #endif
  ndrc = mmap((void *)desired_addr, sizeof(*ndrc),
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (ndrc == MAP_FAILED) {
    SysPrintf("mmap() failed: %s\n", strerror(errno));
    abort();
  }
  #endif
#else
  #ifndef NO_WRITE_EXEC
  // not all systems allow execute in data segment by default
  if (mprotect(ndrc, sizeof(ndrc->translation_cache) + sizeof(ndrc->tramp.ops),
               PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    SysPrintf("mprotect() failed: %s\n", strerror(errno));
  #endif
#endif
  out = ndrc->translation_cache;
  cycle_multiplier=200;
  new_dynarec_clear_full();
#ifdef HOST_IMM8
  // Copy this into local area so we don't have to put it in every literal pool
  invc_ptr=invalid_code;
#endif
  arch_init();
  new_dynarec_test();
#ifndef RAM_FIXED
  // host offset that maps guest address 0x80000000 onto rdram
  ram_offset=(uintptr_t)rdram-0x80000000;
#endif
  if (ram_offset!=0)
    SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
}
6761
/* Tear down what new_dynarec_init() set up: release the translation
 * cache memory (platform-specific) and free all per-page block lists. */
void new_dynarec_cleanup(void)
{
  int n;
#ifdef BASE_ADDR_DYNAMIC
  #ifdef VITA
  sceKernelFreeMemBlock(sceBlock);
  sceBlock = -1;
  #else
  if (munmap(ndrc, sizeof(*ndrc)) < 0)
    SysPrintf("munmap() failed\n");
  #endif
#endif
  // free the same 4096-entry page lists that new_dynarec_clear_full() resets
  for(n=0;n<4096;n++) ll_clear(jump_in+n);
  for(n=0;n<4096;n++) ll_clear(jump_out+n);
  for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
  #ifdef ROM_COPY
  if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
  #endif
}
6781
6782 static u_int *get_source_start(u_int addr, u_int *limit)
6783 {
6784   if (!HACK_ENABLED(NDHACK_OVERRIDE_CYCLE_M))
6785     cycle_multiplier_override = 0;
6786
6787   if (addr < 0x00200000 ||
6788     (0xa0000000 <= addr && addr < 0xa0200000))
6789   {
6790     // used for BIOS calls mostly?
6791     *limit = (addr&0xa0000000)|0x00200000;
6792     return (u_int *)(rdram + (addr&0x1fffff));
6793   }
6794   else if (!Config.HLE && (
6795     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
6796     (0xbfc00000 <= addr && addr < 0xbfc80000)))
6797   {
6798     // BIOS. The multiplier should be much higher as it's uncached 8bit mem,
6799     // but timings in PCSX are too tied to the interpreter's BIAS
6800     if (!HACK_ENABLED(NDHACK_OVERRIDE_CYCLE_M))
6801       cycle_multiplier_override = 200;
6802
6803     *limit = (addr & 0xfff00000) | 0x80000;
6804     return (u_int *)((u_char *)psxR + (addr&0x7ffff));
6805   }
6806   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
6807     *limit = (addr & 0x80600000) + 0x00200000;
6808     return (u_int *)(rdram + (addr&0x1fffff));
6809   }
6810   return NULL;
6811 }
6812
6813 static u_int scan_for_ret(u_int addr)
6814 {
6815   u_int limit = 0;
6816   u_int *mem;
6817
6818   mem = get_source_start(addr, &limit);
6819   if (mem == NULL)
6820     return addr;
6821
6822   if (limit > addr + 0x1000)
6823     limit = addr + 0x1000;
6824   for (; addr < limit; addr += 4, mem++) {
6825     if (*mem == 0x03e00008) // jr $ra
6826       return addr + 8;
6827   }
6828   return addr;
6829 }
6830
// One compiled block as recorded in a savestate: the block's guest start
// address plus a bitmask of GPRs that the speculation pass assumed to
// hold scratchpad (0x1f80xxxx) pointers (see new_dynarec_load_blocks).
struct savestate_block {
  uint32_t addr;
  uint32_t regflags;
};

/* qsort comparator ordering savestate_blocks by ascending addr.
 * Uses the (a>b)-(a<b) idiom instead of subtraction: subtracting two
 * uint32_t addresses wraps around for differences >= 2^31 and the
 * uint->int conversion of the result is implementation-defined, which
 * could make the comparator return the wrong sign and violate qsort's
 * consistency requirement. */
static int addr_cmp(const void *p1_, const void *p2_)
{
  const struct savestate_block *p1 = p1_, *p2 = p2_;
  return (p1->addr > p2->addr) - (p1->addr < p2->addr);
}
6841
6842 int new_dynarec_save_blocks(void *save, int size)
6843 {
6844   struct savestate_block *blocks = save;
6845   int maxcount = size / sizeof(blocks[0]);
6846   struct savestate_block tmp_blocks[1024];
6847   struct ll_entry *head;
6848   int p, s, d, o, bcnt;
6849   u_int addr;
6850
6851   o = 0;
6852   for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
6853     bcnt = 0;
6854     for (head = jump_in[p]; head != NULL; head = head->next) {
6855       tmp_blocks[bcnt].addr = head->vaddr;
6856       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
6857       bcnt++;
6858     }
6859     if (bcnt < 1)
6860       continue;
6861     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
6862
6863     addr = tmp_blocks[0].addr;
6864     for (s = d = 0; s < bcnt; s++) {
6865       if (tmp_blocks[s].addr < addr)
6866         continue;
6867       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
6868         tmp_blocks[d++] = tmp_blocks[s];
6869       addr = scan_for_ret(tmp_blocks[s].addr);
6870     }
6871
6872     if (o + d > maxcount)
6873       d = maxcount - o;
6874     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
6875     o += d;
6876   }
6877
6878   return o * sizeof(blocks[0]);
6879 }
6880
6881 void new_dynarec_load_blocks(const void *save, int size)
6882 {
6883   const struct savestate_block *blocks = save;
6884   int count = size / sizeof(blocks[0]);
6885   u_int regs_save[32];
6886   uint32_t f;
6887   int i, b;
6888
6889   get_addr(psxRegs.pc);
6890
6891   // change GPRs for speculation to at least partially work..
6892   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
6893   for (i = 1; i < 32; i++)
6894     psxRegs.GPR.r[i] = 0x80000000;
6895
6896   for (b = 0; b < count; b++) {
6897     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6898       if (f & 1)
6899         psxRegs.GPR.r[i] = 0x1f800000;
6900     }
6901
6902     get_addr(blocks[b].addr);
6903
6904     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6905       if (f & 1)
6906         psxRegs.GPR.r[i] = 0x80000000;
6907     }
6908   }
6909
6910   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
6911 }
6912
6913 int new_recompile_block(u_int addr)
6914 {
6915   u_int pagelimit = 0;
6916   u_int state_rflags = 0;
6917   int i;
6918
6919   assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out);
6920   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
6921   //if(debug)
6922   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
6923
6924   // this is just for speculation
6925   for (i = 1; i < 32; i++) {
6926     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
6927       state_rflags |= 1 << i;
6928   }
6929
6930   start = (u_int)addr&~3;
6931   //assert(((u_int)addr&1)==0); // start-in-delay-slot flag
6932   new_dynarec_did_compile=1;
6933   if (Config.HLE && start == 0x80001000) // hlecall
6934   {
6935     // XXX: is this enough? Maybe check hleSoftCall?
6936     void *beginning=start_block();
6937     u_int page=get_page(start);
6938
6939     invalid_code[start>>12]=0;
6940     emit_movimm(start,0);
6941     emit_writeword(0,&pcaddr);
6942     emit_far_jump(new_dyna_leave);
6943     literal_pool(0);
6944     end_block(beginning);
6945     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
6946     return 0;
6947   }
6948
6949   source = get_source_start(start, &pagelimit);
6950   if (source == NULL) {
6951     SysPrintf("Compile at bogus memory address: %08x\n", addr);
6952     abort();
6953   }
6954
6955   /* Pass 1: disassemble */
6956   /* Pass 2: register dependencies, branch targets */
6957   /* Pass 3: register allocation */
6958   /* Pass 4: branch dependencies */
6959   /* Pass 5: pre-alloc */
6960   /* Pass 6: optimize clean/dirty state */
6961   /* Pass 7: flag 32-bit registers */
6962   /* Pass 8: assembly */
6963   /* Pass 9: linker */
6964   /* Pass 10: garbage collection / free memory */
6965
6966   int j;
6967   int done=0;
6968   unsigned int type,op,op2;
6969
6970   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
6971
6972   /* Pass 1 disassembly */
6973
6974   for(i=0;!done;i++) {
6975     dops[i].bt=0;
6976     dops[i].ooo=0;
6977     op2=0;
6978     minimum_free_regs[i]=0;
6979     dops[i].opcode=op=source[i]>>26;
6980     switch(op)
6981     {
6982       case 0x00: strcpy(insn[i],"special"); type=NI;
6983         op2=source[i]&0x3f;
6984         switch(op2)
6985         {
6986           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
6987           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
6988           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
6989           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
6990           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
6991           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
6992           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
6993           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
6994           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
6995           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
6996           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
6997           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
6998           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
6999           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7000           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7001           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7002           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7003           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7004           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7005           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7006           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7007           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7008           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7009           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7010           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7011           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7012           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7013           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7014           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7015           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7016           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7017           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7018           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7019           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7020           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7021 #if 0
7022           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7023           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7024           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7025           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7026           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7027           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7028           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7029           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7030           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7031           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7032           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7033           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7034           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7035           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7036           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7037           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7038           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7039 #endif
7040         }
7041         break;
7042       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7043         op2=(source[i]>>16)&0x1f;
7044         switch(op2)
7045         {
7046           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7047           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7048           //case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7049           //case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7050           //case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7051           //case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7052           //case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7053           //case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7054           //case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7055           //case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7056           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7057           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7058           //case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7059           //case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7060         }
7061         break;
7062       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7063       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7064       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7065       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7066       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7067       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7068       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7069       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7070       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7071       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7072       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7073       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7074       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7075       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7076       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7077         op2=(source[i]>>21)&0x1f;
7078         switch(op2)
7079         {
7080           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7081           case 0x02: strcpy(insn[i],"CFC0"); type=COP0; break;
7082           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7083           case 0x06: strcpy(insn[i],"CTC0"); type=COP0; break;
7084           case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7085         }
7086         break;
7087       case 0x11: strcpy(insn[i],"cop1"); type=COP1;
7088         op2=(source[i]>>21)&0x1f;
7089         break;
7090 #if 0
7091       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7092       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7093       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7094       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7095       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7096       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7097       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7098       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7099 #endif
7100       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7101       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7102       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7103       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7104       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7105       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7106       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7107 #if 0
7108       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7109 #endif
7110       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7111       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7112       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7113       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7114 #if 0
7115       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7116       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7117 #endif
7118       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7119       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7120       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7121       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7122 #if 0
7123       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7124       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7125       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7126 #endif
7127       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7128       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7129 #if 0
7130       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7131       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7132       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7133 #endif
7134       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7135         op2=(source[i]>>21)&0x1f;
7136         //if (op2 & 0x10)
7137         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7138           if (gte_handlers[source[i]&0x3f]!=NULL) {
7139             if (gte_regnames[source[i]&0x3f]!=NULL)
7140               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7141             else
7142               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7143             type=C2OP;
7144           }
7145         }
7146         else switch(op2)
7147         {
7148           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7149           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7150           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7151           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7152         }
7153         break;
7154       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7155       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7156       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7157       default: strcpy(insn[i],"???"); type=NI;
7158         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7159         break;
7160     }
7161     dops[i].itype=type;
7162     dops[i].opcode2=op2;
7163     /* Get registers/immediates */
7164     dops[i].lt1=0;
7165     gte_rs[i]=gte_rt[i]=0;
7166     switch(type) {
7167       case LOAD:
7168         dops[i].rs1=(source[i]>>21)&0x1f;
7169         dops[i].rs2=0;
7170         dops[i].rt1=(source[i]>>16)&0x1f;
7171         dops[i].rt2=0;
7172         imm[i]=(short)source[i];
7173         break;
7174       case STORE:
7175       case STORELR:
7176         dops[i].rs1=(source[i]>>21)&0x1f;
7177         dops[i].rs2=(source[i]>>16)&0x1f;
7178         dops[i].rt1=0;
7179         dops[i].rt2=0;
7180         imm[i]=(short)source[i];
7181         break;
7182       case LOADLR:
7183         // LWL/LWR only load part of the register,
7184         // therefore the target register must be treated as a source too
7185         dops[i].rs1=(source[i]>>21)&0x1f;
7186         dops[i].rs2=(source[i]>>16)&0x1f;
7187         dops[i].rt1=(source[i]>>16)&0x1f;
7188         dops[i].rt2=0;
7189         imm[i]=(short)source[i];
7190         break;
7191       case IMM16:
7192         if (op==0x0f) dops[i].rs1=0; // LUI instruction has no source register
7193         else dops[i].rs1=(source[i]>>21)&0x1f;
7194         dops[i].rs2=0;
7195         dops[i].rt1=(source[i]>>16)&0x1f;
7196         dops[i].rt2=0;
7197         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7198           imm[i]=(unsigned short)source[i];
7199         }else{
7200           imm[i]=(short)source[i];
7201         }
7202         break;
7203       case UJUMP:
7204         dops[i].rs1=0;
7205         dops[i].rs2=0;
7206         dops[i].rt1=0;
7207         dops[i].rt2=0;
7208         // The JAL instruction writes to r31.
7209         if (op&1) {
7210           dops[i].rt1=31;
7211         }
7212         dops[i].rs2=CCREG;
7213         break;
7214       case RJUMP:
7215         dops[i].rs1=(source[i]>>21)&0x1f;
7216         dops[i].rs2=0;
7217         dops[i].rt1=0;
7218         dops[i].rt2=0;
7219         // The JALR instruction writes to rd.
7220         if (op2&1) {
7221           dops[i].rt1=(source[i]>>11)&0x1f;
7222         }
7223         dops[i].rs2=CCREG;
7224         break;
7225       case CJUMP:
7226         dops[i].rs1=(source[i]>>21)&0x1f;
7227         dops[i].rs2=(source[i]>>16)&0x1f;
7228         dops[i].rt1=0;
7229         dops[i].rt2=0;
7230         if(op&2) { // BGTZ/BLEZ
7231           dops[i].rs2=0;
7232         }
7233         break;
7234       case SJUMP:
7235         dops[i].rs1=(source[i]>>21)&0x1f;
7236         dops[i].rs2=CCREG;
7237         dops[i].rt1=0;
7238         dops[i].rt2=0;
7239         if(op2&0x10) { // BxxAL
7240           dops[i].rt1=31;
7241           // NOTE: If the branch is not taken, r31 is still overwritten
7242         }
7243         break;
7244       case ALU:
7245         dops[i].rs1=(source[i]>>21)&0x1f; // source
7246         dops[i].rs2=(source[i]>>16)&0x1f; // subtract amount
7247         dops[i].rt1=(source[i]>>11)&0x1f; // destination
7248         dops[i].rt2=0;
7249         break;
7250       case MULTDIV:
7251         dops[i].rs1=(source[i]>>21)&0x1f; // source
7252         dops[i].rs2=(source[i]>>16)&0x1f; // divisor
7253         dops[i].rt1=HIREG;
7254         dops[i].rt2=LOREG;
7255         break;
7256       case MOV:
7257         dops[i].rs1=0;
7258         dops[i].rs2=0;
7259         dops[i].rt1=0;
7260         dops[i].rt2=0;
7261         if(op2==0x10) dops[i].rs1=HIREG; // MFHI
7262         if(op2==0x11) dops[i].rt1=HIREG; // MTHI
7263         if(op2==0x12) dops[i].rs1=LOREG; // MFLO
7264         if(op2==0x13) dops[i].rt1=LOREG; // MTLO
7265         if((op2&0x1d)==0x10) dops[i].rt1=(source[i]>>11)&0x1f; // MFxx
7266         if((op2&0x1d)==0x11) dops[i].rs1=(source[i]>>21)&0x1f; // MTxx
7267         break;
7268       case SHIFT:
7269         dops[i].rs1=(source[i]>>16)&0x1f; // target of shift
7270         dops[i].rs2=(source[i]>>21)&0x1f; // shift amount
7271         dops[i].rt1=(source[i]>>11)&0x1f; // destination
7272         dops[i].rt2=0;
7273         break;
7274       case SHIFTIMM:
7275         dops[i].rs1=(source[i]>>16)&0x1f;
7276         dops[i].rs2=0;
7277         dops[i].rt1=(source[i]>>11)&0x1f;
7278         dops[i].rt2=0;
7279         imm[i]=(source[i]>>6)&0x1f;
7280         // DSxx32 instructions
7281         if(op2>=0x3c) imm[i]|=0x20;
7282         break;
      case COP0:
        // COP0 moves: track the GPR transferred to/from the coprocessor.
        dops[i].rs1=0;
        dops[i].rs2=0;
        dops[i].rt1=0;
        dops[i].rt2=0;
        if(op2==0||op2==2) dops[i].rt1=(source[i]>>16)&0x1F; // MFC0/CFC0
        if(op2==4||op2==6) dops[i].rs1=(source[i]>>16)&0x1F; // MTC0/CTC0
        // Writes to cop0 reg 12 (Status) can flip interrupt/ISC state, so
        // flag CSREG as an output; ERET reads the cycle count (CCREG) since
        // it may dispatch a pending interrupt.
        if(op2==4&&((source[i]>>11)&0x1f)==12) dops[i].rt2=CSREG; // Status
        if(op2==16) if((source[i]&0x3f)==0x18) dops[i].rs2=CCREG; // ERET
        break;
7293       case COP1:
7294         dops[i].rs1=0;
7295         dops[i].rs2=0;
7296         dops[i].rt1=0;
7297         dops[i].rt2=0;
7298         if(op2<3) dops[i].rt1=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7299         if(op2>3) dops[i].rs1=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7300         dops[i].rs2=CSREG;
7301         break;
      case COP2:
        // GTE (COP2) moves: track the GPR side like COP0 above.
        dops[i].rs1=0;
        dops[i].rs2=0;
        dops[i].rt1=0;
        dops[i].rt2=0;
        if(op2<3) dops[i].rt1=(source[i]>>16)&0x1F; // MFC2/CFC2
        if(op2>3) dops[i].rs1=(source[i]>>16)&0x1F; // MTC2/CTC2
        dops[i].rs2=CSREG;
        // gte_rs/gte_rt are 64-bit masks over GTE registers:
        // bits 0-31 = data regs, bits 32-63 = control regs.
        int gr=(source[i]>>11)&0x1F;
        switch(op2)
        {
          case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
          case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
          case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
          case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
        }
        break;
7319       case C1LS:
7320         dops[i].rs1=(source[i]>>21)&0x1F;
7321         dops[i].rs2=CSREG;
7322         dops[i].rt1=0;
7323         dops[i].rt2=0;
7324         imm[i]=(short)source[i];
7325         break;
7326       case C2LS:
7327         dops[i].rs1=(source[i]>>21)&0x1F;
7328         dops[i].rs2=0;
7329         dops[i].rt1=0;
7330         dops[i].rt2=0;
7331         imm[i]=(short)source[i];
7332         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7333         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7334         break;
      case C2OP:
        // GTE arithmetic op: no GPRs involved; reads/writes come from the
        // per-function GTE register tables indexed by the funct field.
        dops[i].rs1=0;
        dops[i].rs2=0;
        dops[i].rt1=0;
        dops[i].rt2=0;
        gte_rs[i]=gte_reg_reads[source[i]&0x3f];
        gte_rt[i]=gte_reg_writes[source[i]&0x3f];
        gte_rt[i]|=1ll<<63; // every op changes flags
        if((source[i]&0x3f)==GTE_MVMVA) {
          // MVMVA only reads the vector selected by the 2-bit field at
          // bit 15: V0/V1/V2 (data reg pairs) or, for v==3, IR1-IR3.
          // Narrow the table's generic read mask to just that vector.
          int v = (source[i] >> 15) & 3;
          gte_rs[i]&=~0xe3fll;
          if(v==3) gte_rs[i]|=0xe00ll;
          else gte_rs[i]|=3ll<<(v*2);
        }
        break;
7350       case SYSCALL:
7351       case HLECALL:
7352       case INTCALL:
7353         dops[i].rs1=CCREG;
7354         dops[i].rs2=0;
7355         dops[i].rt1=0;
7356         dops[i].rt2=0;
7357         break;
7358       default:
7359         dops[i].rs1=0;
7360         dops[i].rs2=0;
7361         dops[i].rt1=0;
7362         dops[i].rt2=0;
7363     }
    /* Calculate branch target addresses */
    if(type==UJUMP)
      // J/JAL: top 4 bits of the delay-slot PC combined with the
      // 26-bit instruction index scaled by 4 (<<6 then >>4 isolates it)
      ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
    else if(type==CJUMP&&dops[i].rs1==dops[i].rs2&&(op&1))
      ba[i]=start+i*4+8; // Ignore never taken branch (BNE rx,rx)
    else if(type==SJUMP&&dops[i].rs1==0&&!(op2&1))
      ba[i]=start+i*4+8; // Ignore never taken branch (BLTZ r0)
    else if(type==CJUMP||type==SJUMP)
      // sign-extended 16-bit word offset, relative to the delay slot
      ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
    else ba[i]=-1;

    /* simplify always (not)taken branches */
    if (type == CJUMP && dops[i].rs1 == dops[i].rs2) {
      // BEQ/BNE with identical operands: the condition regs are irrelevant
      dops[i].rs1 = dops[i].rs2 = 0;
      if (!(op & 1)) {
        // BEQ rx,rx is always taken - treat as an unconditional jump
        dops[i].itype = type = UJUMP;
        dops[i].rs2 = CCREG;
      }
    }
    else if (type == SJUMP && dops[i].rs1 == 0 && (op2 & 1))
      dops[i].itype = type = UJUMP; // BGEZ r0 is always taken

    dops[i].is_jump = (dops[i].itype == RJUMP || dops[i].itype == UJUMP || dops[i].itype == CJUMP || dops[i].itype == SJUMP);
    dops[i].is_ujump = (dops[i].itype == RJUMP || dops[i].itype == UJUMP); // || (source[i] >> 16) == 0x1000 // beq r0,r0
7388
    /* messy cases to just pass over to the interpreter */
    if (i > 0 && dops[i-1].is_jump) {
      int do_in_intrp=0;
      // branch in delay slot?
      if (dops[i].is_jump) {
        // don't handle first branch and call interpreter if it's hit
        SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
        do_in_intrp=1;
      }
      // basic load delay detection
      else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&dops[i].rt1!=0) {
        // index of the branch target within this block (if it is internal)
        int t=(ba[i-1]-start)/4;
        if(0 <= t && t < i &&(dops[i].rt1==dops[t].rs1||dops[i].rt1==dops[t].rs2)&&dops[t].itype!=CJUMP&&dops[t].itype!=SJUMP) {
          // jump target wants DS result - potential load delay effect
          // (on a real R3000 the target would still see the OLD reg value)
          SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
          do_in_intrp=1;
          dops[t+1].bt=1; // expected return from interpreter
        }
        else if(i>=2&&dops[i-2].rt1==2&&dops[i].rt1==2&&dops[i].rs1!=2&&dops[i].rs2!=2&&dops[i-1].rs1!=2&&dops[i-1].rs2!=2&&
              !(i>=3&&dops[i-3].is_jump)) {
          // v0 overwrite like this is a sign of trouble, bail out
          // (heuristic: reg 2 == $v0 written twice without being read)
          SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
          do_in_intrp=1;
        }
      }
      if(do_in_intrp) {
        // Replace the branch itself with a call into the interpreter and
        // end the block there; the delay slot instruction is not compiled.
        dops[i-1].rs1=CCREG;
        dops[i-1].rs2=dops[i-1].rt1=dops[i-1].rt2=0;
        ba[i-1]=-1;
        dops[i-1].itype=INTCALL;
        done=2;
        i--; // don't compile the DS
      }
    }
7423
    /* Is this the end of the block?
       done==1: hard stop; done==2: stop unless a branch seen so far
       targets past this point (checked below). */
    if (i > 0 && dops[i-1].is_ujump) {
      if(dops[i-1].rt1==0) { // Continue past subroutine call (JAL)
        done=2;
      }
      else {
        if(stop_after_jal) done=1;
        // Stop on BREAK (SPECIAL funct 0x0d in the word after the DS)
        if((source[i+1]&0xfc00003f)==0x0d) done=1;
      }
      // Don't recompile stuff that's already compiled
      if(check_addr(start+i*4+4)) done=1;
      // Don't get too close to the limit
      if(i>MAXBLOCK/2) done=1;
    }
    if(dops[i].itype==SYSCALL&&stop_after_jal) done=1;
    if(dops[i].itype==HLECALL||dops[i].itype==INTCALL) done=2;
    if(done==2) {
      // Does the block continue due to a branch?
      for(j=i-1;j>=0;j--)
      {
        if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
        if(ba[j]==start+i*4+4) done=j=0;
        if(ba[j]==start+i*4+8) done=j=0;
      }
    }
    //assert(i<MAXBLOCK-1);
    // Never decode across the page limit or past the block-size cap.
    if(start+i*4==pagelimit-4) done=1;
    assert(start+i*4<pagelimit);
    if (i==MAXBLOCK-1) done=1;
    // Stop if we're compiling junk
    if(dops[i].itype==NI&&dops[i].opcode==0x11) {
      done=stop_after_jal=1;
      SysPrintf("Disabled speculative precompilation\n");
    }
7459   }
7460   slen=i;
7461   if (dops[i-1].is_jump) {
7462     if(start+i*4==pagelimit) {
7463       dops[i-1].itype=SPAN;
7464     }
7465   }
7466   assert(slen>0);
7467
7468   /* Pass 2 - Register dependencies and branch targets */
7469
7470   unneeded_registers(0,slen-1,0);
7471
7472   /* Pass 3 - Register allocation */
7473
7474   struct regstat current; // Current register allocations/status
7475   current.dirty=0;
7476   current.u=unneeded_reg[0];
7477   clear_all_regs(current.regmap);
7478   alloc_reg(&current,0,CCREG);
7479   dirty_reg(&current,CCREG);
7480   current.isconst=0;
7481   current.wasconst=0;
7482   current.waswritten=0;
7483   int ds=0;
7484   int cc=0;
7485   int hr=-1;
7486
  // Low bit of addr flags a block whose entry point is a branch delay slot.
  if((u_int)addr&1) {
    // First instruction is delay slot
    cc=-1;
    dops[1].bt=1;
    ds=1;
    unneeded_reg[0]=1;
    // NOTE(review): BTREG presumably carries the pending branch target
    // into this entry point - confirm against the dispatcher.
    current.regmap[HOST_BTREG]=BTREG;
  }
7495
7496   for(i=0;i<slen;i++)
7497   {
7498     if(dops[i].bt)
7499     {
7500       int hr;
7501       for(hr=0;hr<HOST_REGS;hr++)
7502       {
7503         // Is this really necessary?
7504         if(current.regmap[hr]==0) current.regmap[hr]=-1;
7505       }
7506       current.isconst=0;
7507       current.waswritten=0;
7508     }
7509
7510     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
7511     regs[i].wasconst=current.isconst;
7512     regs[i].wasdirty=current.dirty;
7513     regs[i].loadedconst=0;
    // Compute current.u: registers whose values are dead after this point.
    // Start from the liveness info of the following instruction and clear
    // the bits for this instruction's own sources; bit 0 (r0) is always set.
    if (!dops[i].is_jump) {
      if(i+1<slen) {
        current.u=unneeded_reg[i+1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
        current.u|=1;
      } else {
        current.u=1;
      }
    } else {
      // For a branch, also keep the delay slot's sources live.
      if(i+1<slen) {
        current.u=branch_unneeded_reg[i]&~((1LL<<dops[i+1].rs1)|(1LL<<dops[i+1].rs2));
        current.u&=~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
        current.u|=1;
      } else { SysPrintf("oops, branch at end of block with no delay slot\n");abort(); }
    }
7528     dops[i].is_ds=ds;
7529     if(ds) {
7530       ds=0; // Skip delay slot, already allocated as part of branch
7531       // ...but we need to alloc it in case something jumps here
7532       if(i+1<slen) {
7533         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
7534       }else{
7535         current.u=branch_unneeded_reg[i-1];
7536       }
7537       current.u&=~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
7538       current.u|=1;
7539       struct regstat temp;
7540       memcpy(&temp,&current,sizeof(current));
7541       temp.wasdirty=temp.dirty;
7542       // TODO: Take into account unconditional branches, as below
7543       delayslot_alloc(&temp,i);
7544       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
7545       regs[i].wasdirty=temp.wasdirty;
7546       regs[i].dirty=temp.dirty;
7547       regs[i].isconst=0;
7548       regs[i].wasconst=0;
7549       current.isconst=0;
7550       // Create entry (branch target) regmap
7551       for(hr=0;hr<HOST_REGS;hr++)
7552       {
7553         int r=temp.regmap[hr];
7554         if(r>=0) {
7555           if(r!=regmap_pre[i][hr]) {
7556             regs[i].regmap_entry[hr]=-1;
7557           }
7558           else
7559           {
7560               assert(r < 64);
7561               if((current.u>>r)&1) {
7562                 regs[i].regmap_entry[hr]=-1;
7563                 regs[i].regmap[hr]=-1;
7564                 //Don't clear regs in the delay slot as the branch might need them
7565                 //current.regmap[hr]=-1;
7566               }else
7567                 regs[i].regmap_entry[hr]=r;
7568           }
7569         } else {
7570           // First instruction expects CCREG to be allocated
7571           if(i==0&&hr==HOST_CCREG)
7572             regs[i].regmap_entry[hr]=CCREG;
7573           else
7574             regs[i].regmap_entry[hr]=-1;
7575         }
7576       }
7577     }
7578     else { // Not delay slot
7579       switch(dops[i].itype) {
7580         case UJUMP:
7581           //current.isconst=0; // DEBUG
7582           //current.wasconst=0; // DEBUG
7583           //regs[i].wasconst=0; // DEBUG
7584           clear_const(&current,dops[i].rt1);
7585           alloc_cc(&current,i);
7586           dirty_reg(&current,CCREG);
7587           if (dops[i].rt1==31) {
7588             alloc_reg(&current,i,31);
7589             dirty_reg(&current,31);
7590             //assert(dops[i+1].rs1!=31&&dops[i+1].rs2!=31);
7591             //assert(dops[i+1].rt1!=dops[i].rt1);
7592             #ifdef REG_PREFETCH
7593             alloc_reg(&current,i,PTEMP);
7594             #endif
7595           }
7596           dops[i].ooo=1;
7597           delayslot_alloc(&current,i+1);
7598           //current.isconst=0; // DEBUG
7599           ds=1;
7600           //printf("i=%d, isconst=%x\n",i,current.isconst);
7601           break;
7602         case RJUMP:
7603           //current.isconst=0;
7604           //current.wasconst=0;
7605           //regs[i].wasconst=0;
7606           clear_const(&current,dops[i].rs1);
7607           clear_const(&current,dops[i].rt1);
7608           alloc_cc(&current,i);
7609           dirty_reg(&current,CCREG);
7610           if (!ds_writes_rjump_rs(i)) {
7611             alloc_reg(&current,i,dops[i].rs1);
7612             if (dops[i].rt1!=0) {
7613               alloc_reg(&current,i,dops[i].rt1);
7614               dirty_reg(&current,dops[i].rt1);
7615               assert(dops[i+1].rs1!=dops[i].rt1&&dops[i+1].rs2!=dops[i].rt1);
7616               assert(dops[i+1].rt1!=dops[i].rt1);
7617               #ifdef REG_PREFETCH
7618               alloc_reg(&current,i,PTEMP);
7619               #endif
7620             }
7621             #ifdef USE_MINI_HT
7622             if(dops[i].rs1==31) { // JALR
7623               alloc_reg(&current,i,RHASH);
7624               alloc_reg(&current,i,RHTBL);
7625             }
7626             #endif
7627             delayslot_alloc(&current,i+1);
7628           } else {
7629             // The delay slot overwrites our source register,
7630             // allocate a temporary register to hold the old value.
7631             current.isconst=0;
7632             current.wasconst=0;
7633             regs[i].wasconst=0;
7634             delayslot_alloc(&current,i+1);
7635             current.isconst=0;
7636             alloc_reg(&current,i,RTEMP);
7637           }
7638           //current.isconst=0; // DEBUG
7639           dops[i].ooo=1;
7640           ds=1;
7641           break;
7642         case CJUMP:
7643           //current.isconst=0;
7644           //current.wasconst=0;
7645           //regs[i].wasconst=0;
7646           clear_const(&current,dops[i].rs1);
7647           clear_const(&current,dops[i].rs2);
7648           if((dops[i].opcode&0x3E)==4) // BEQ/BNE
7649           {
7650             alloc_cc(&current,i);
7651             dirty_reg(&current,CCREG);
7652             if(dops[i].rs1) alloc_reg(&current,i,dops[i].rs1);
7653             if(dops[i].rs2) alloc_reg(&current,i,dops[i].rs2);
7654             if((dops[i].rs1&&(dops[i].rs1==dops[i+1].rt1||dops[i].rs1==dops[i+1].rt2))||
7655                (dops[i].rs2&&(dops[i].rs2==dops[i+1].rt1||dops[i].rs2==dops[i+1].rt2))) {
7656               // The delay slot overwrites one of our conditions.
7657               // Allocate the branch condition registers instead.
7658               current.isconst=0;
7659               current.wasconst=0;
7660               regs[i].wasconst=0;
7661               if(dops[i].rs1) alloc_reg(&current,i,dops[i].rs1);
7662               if(dops[i].rs2) alloc_reg(&current,i,dops[i].rs2);
7663             }
7664             else
7665             {
7666               dops[i].ooo=1;
7667               delayslot_alloc(&current,i+1);
7668             }
7669           }
7670           else
7671           if((dops[i].opcode&0x3E)==6) // BLEZ/BGTZ
7672           {
7673             alloc_cc(&current,i);
7674             dirty_reg(&current,CCREG);
7675             alloc_reg(&current,i,dops[i].rs1);
7676             if(dops[i].rs1&&(dops[i].rs1==dops[i+1].rt1||dops[i].rs1==dops[i+1].rt2)) {
7677               // The delay slot overwrites one of our conditions.
7678               // Allocate the branch condition registers instead.
7679               current.isconst=0;
7680               current.wasconst=0;
7681               regs[i].wasconst=0;
7682               if(dops[i].rs1) alloc_reg(&current,i,dops[i].rs1);
7683             }
7684             else
7685             {
7686               dops[i].ooo=1;
7687               delayslot_alloc(&current,i+1);
7688             }
7689           }
7690           else
7691           // Don't alloc the delay slot yet because we might not execute it
7692           if((dops[i].opcode&0x3E)==0x14) // BEQL/BNEL
7693           {
7694             current.isconst=0;
7695             current.wasconst=0;
7696             regs[i].wasconst=0;
7697             alloc_cc(&current,i);
7698             dirty_reg(&current,CCREG);
7699             alloc_reg(&current,i,dops[i].rs1);
7700             alloc_reg(&current,i,dops[i].rs2);
7701           }
7702           else
7703           if((dops[i].opcode&0x3E)==0x16) // BLEZL/BGTZL
7704           {
7705             current.isconst=0;
7706             current.wasconst=0;
7707             regs[i].wasconst=0;
7708             alloc_cc(&current,i);
7709             dirty_reg(&current,CCREG);
7710             alloc_reg(&current,i,dops[i].rs1);
7711           }
7712           ds=1;
7713           //current.isconst=0;
7714           break;
7715         case SJUMP:
7716           //current.isconst=0;
7717           //current.wasconst=0;
7718           //regs[i].wasconst=0;
7719           clear_const(&current,dops[i].rs1);
7720           clear_const(&current,dops[i].rt1);
7721           //if((dops[i].opcode2&0x1E)==0x0) // BLTZ/BGEZ
7722           if((dops[i].opcode2&0x0E)==0x0) // BLTZ/BGEZ
7723           {
7724             alloc_cc(&current,i);
7725             dirty_reg(&current,CCREG);
7726             alloc_reg(&current,i,dops[i].rs1);
7727             if (dops[i].rt1==31) { // BLTZAL/BGEZAL
7728               alloc_reg(&current,i,31);
7729               dirty_reg(&current,31);
7730               //#ifdef REG_PREFETCH
7731               //alloc_reg(&current,i,PTEMP);
7732               //#endif
7733             }
7734             if((dops[i].rs1&&(dops[i].rs1==dops[i+1].rt1||dops[i].rs1==dops[i+1].rt2)) // The delay slot overwrites the branch condition.
7735                ||(dops[i].rt1==31&&(dops[i+1].rs1==31||dops[i+1].rs2==31||dops[i+1].rt1==31||dops[i+1].rt2==31))) { // DS touches $ra
7736               // Allocate the branch condition registers instead.
7737               current.isconst=0;
7738               current.wasconst=0;
7739               regs[i].wasconst=0;
7740               if(dops[i].rs1) alloc_reg(&current,i,dops[i].rs1);
7741             }
7742             else
7743             {
7744               dops[i].ooo=1;
7745               delayslot_alloc(&current,i+1);
7746             }
7747           }
7748           else
7749           // Don't alloc the delay slot yet because we might not execute it
7750           if((dops[i].opcode2&0x1E)==0x2) // BLTZL/BGEZL
7751           {
7752             current.isconst=0;
7753             current.wasconst=0;
7754             regs[i].wasconst=0;
7755             alloc_cc(&current,i);
7756             dirty_reg(&current,CCREG);
7757             alloc_reg(&current,i,dops[i].rs1);
7758           }
7759           ds=1;
7760           //current.isconst=0;
7761           break;
7762         case IMM16:
7763           imm16_alloc(&current,i);
7764           break;
7765         case LOAD:
7766         case LOADLR:
7767           load_alloc(&current,i);
7768           break;
7769         case STORE:
7770         case STORELR:
7771           store_alloc(&current,i);
7772           break;
7773         case ALU:
7774           alu_alloc(&current,i);
7775           break;
7776         case SHIFT:
7777           shift_alloc(&current,i);
7778           break;
7779         case MULTDIV:
7780           multdiv_alloc(&current,i);
7781           break;
7782         case SHIFTIMM:
7783           shiftimm_alloc(&current,i);
7784           break;
7785         case MOV:
7786           mov_alloc(&current,i);
7787           break;
7788         case COP0:
7789           cop0_alloc(&current,i);
7790           break;
7791         case COP1:
7792           break;
7793         case COP2:
7794           cop2_alloc(&current,i);
7795           break;
7796         case C1LS:
7797           c1ls_alloc(&current,i);
7798           break;
7799         case C2LS:
7800           c2ls_alloc(&current,i);
7801           break;
7802         case C2OP:
7803           c2op_alloc(&current,i);
7804           break;
7805         case SYSCALL:
7806         case HLECALL:
7807         case INTCALL:
7808           syscall_alloc(&current,i);
7809           break;
7810         case SPAN:
7811           pagespan_alloc(&current,i);
7812           break;
7813       }
7814
7815       // Create entry (branch target) regmap
7816       for(hr=0;hr<HOST_REGS;hr++)
7817       {
7818         int r,or;
7819         r=current.regmap[hr];
7820         if(r>=0) {
7821           if(r!=regmap_pre[i][hr]) {
7822             // TODO: delay slot (?)
7823             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
7824             if(or<0||(r&63)>=TEMPREG){
7825               regs[i].regmap_entry[hr]=-1;
7826             }
7827             else
7828             {
7829               // Just move it to a different register
7830               regs[i].regmap_entry[hr]=r;
7831               // If it was dirty before, it's still dirty
7832               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
7833             }
7834           }
7835           else
7836           {
7837             // Unneeded
7838             if(r==0){
7839               regs[i].regmap_entry[hr]=0;
7840             }
7841             else
7842             {
7843               assert(r<64);
7844               if((current.u>>r)&1) {
7845                 regs[i].regmap_entry[hr]=-1;
7846                 //regs[i].regmap[hr]=-1;
7847                 current.regmap[hr]=-1;
7848               }else
7849                 regs[i].regmap_entry[hr]=r;
7850             }
7851           }
7852         } else {
7853           // Branches expect CCREG to be allocated at the target
7854           if(regmap_pre[i][hr]==CCREG)
7855             regs[i].regmap_entry[hr]=CCREG;
7856           else
7857             regs[i].regmap_entry[hr]=-1;
7858         }
7859       }
7860       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
7861     }
7862
7863     if(i>0&&(dops[i-1].itype==STORE||dops[i-1].itype==STORELR||(dops[i-1].itype==C2LS&&dops[i-1].opcode==0x3a))&&(u_int)imm[i-1]<0x800)
7864       current.waswritten|=1<<dops[i-1].rs1;
7865     current.waswritten&=~(1<<dops[i].rt1);
7866     current.waswritten&=~(1<<dops[i].rt2);
7867     if((dops[i].itype==STORE||dops[i].itype==STORELR||(dops[i].itype==C2LS&&dops[i].opcode==0x3a))&&(u_int)imm[i]>=0x800)
7868       current.waswritten&=~(1<<dops[i].rs1);
7869
7870     /* Branch post-alloc */
7871     if(i>0)
7872     {
7873       current.wasdirty=current.dirty;
7874       switch(dops[i-1].itype) {
7875         case UJUMP:
7876           memcpy(&branch_regs[i-1],&current,sizeof(current));
7877           branch_regs[i-1].isconst=0;
7878           branch_regs[i-1].wasconst=0;
7879           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<dops[i-1].rs1)|(1LL<<dops[i-1].rs2));
7880           alloc_cc(&branch_regs[i-1],i-1);
7881           dirty_reg(&branch_regs[i-1],CCREG);
7882           if(dops[i-1].rt1==31) { // JAL
7883             alloc_reg(&branch_regs[i-1],i-1,31);
7884             dirty_reg(&branch_regs[i-1],31);
7885           }
7886           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7887           memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
7888           break;
7889         case RJUMP:
7890           memcpy(&branch_regs[i-1],&current,sizeof(current));
7891           branch_regs[i-1].isconst=0;
7892           branch_regs[i-1].wasconst=0;
7893           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<dops[i-1].rs1)|(1LL<<dops[i-1].rs2));
7894           alloc_cc(&branch_regs[i-1],i-1);
7895           dirty_reg(&branch_regs[i-1],CCREG);
7896           alloc_reg(&branch_regs[i-1],i-1,dops[i-1].rs1);
7897           if(dops[i-1].rt1!=0) { // JALR
7898             alloc_reg(&branch_regs[i-1],i-1,dops[i-1].rt1);
7899             dirty_reg(&branch_regs[i-1],dops[i-1].rt1);
7900           }
7901           #ifdef USE_MINI_HT
7902           if(dops[i-1].rs1==31) { // JALR
7903             alloc_reg(&branch_regs[i-1],i-1,RHASH);
7904             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
7905           }
7906           #endif
7907           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7908           memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
7909           break;
7910         case CJUMP:
7911           if((dops[i-1].opcode&0x3E)==4) // BEQ/BNE
7912           {
7913             alloc_cc(&current,i-1);
7914             dirty_reg(&current,CCREG);
7915             if((dops[i-1].rs1&&(dops[i-1].rs1==dops[i].rt1||dops[i-1].rs1==dops[i].rt2))||
7916                (dops[i-1].rs2&&(dops[i-1].rs2==dops[i].rt1||dops[i-1].rs2==dops[i].rt2))) {
7917               // The delay slot overwrote one of our conditions
7918               // Delay slot goes after the test (in order)
7919               current.u=branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
7920               current.u|=1;
7921               delayslot_alloc(&current,i);
7922               current.isconst=0;
7923             }
7924             else
7925             {
7926               current.u=branch_unneeded_reg[i-1]&~((1LL<<dops[i-1].rs1)|(1LL<<dops[i-1].rs2));
7927               // Alloc the branch condition registers
7928               if(dops[i-1].rs1) alloc_reg(&current,i-1,dops[i-1].rs1);
7929               if(dops[i-1].rs2) alloc_reg(&current,i-1,dops[i-1].rs2);
7930             }
7931             memcpy(&branch_regs[i-1],&current,sizeof(current));
7932             branch_regs[i-1].isconst=0;
7933             branch_regs[i-1].wasconst=0;
7934             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
7935             memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
7936           }
7937           else
7938           if((dops[i-1].opcode&0x3E)==6) // BLEZ/BGTZ
7939           {
7940             alloc_cc(&current,i-1);
7941             dirty_reg(&current,CCREG);
7942             if(dops[i-1].rs1==dops[i].rt1||dops[i-1].rs1==dops[i].rt2) {
7943               // The delay slot overwrote the branch condition
7944               // Delay slot goes after the test (in order)
7945               current.u=branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
7946               current.u|=1;
7947               delayslot_alloc(&current,i);
7948               current.isconst=0;
7949             }
7950             else
7951             {
7952               current.u=branch_unneeded_reg[i-1]&~(1LL<<dops[i-1].rs1);
7953               // Alloc the branch condition register
7954               alloc_reg(&current,i-1,dops[i-1].rs1);
7955             }
7956             memcpy(&branch_regs[i-1],&current,sizeof(current));
7957             branch_regs[i-1].isconst=0;
7958             branch_regs[i-1].wasconst=0;
7959             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
7960             memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
7961           }
7962           else
7963           // Alloc the delay slot in case the branch is taken
7964           if((dops[i-1].opcode&0x3E)==0x14) // BEQL/BNEL
7965           {
7966             memcpy(&branch_regs[i-1],&current,sizeof(current));
7967             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2)|(1LL<<dops[i].rt1)|(1LL<<dops[i].rt2)))|1;
7968             alloc_cc(&branch_regs[i-1],i);
7969             dirty_reg(&branch_regs[i-1],CCREG);
7970             delayslot_alloc(&branch_regs[i-1],i);
7971             branch_regs[i-1].isconst=0;
7972             alloc_reg(&current,i,CCREG); // Not taken path
7973             dirty_reg(&current,CCREG);
7974             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7975           }
7976           else
7977           if((dops[i-1].opcode&0x3E)==0x16) // BLEZL/BGTZL
7978           {
7979             memcpy(&branch_regs[i-1],&current,sizeof(current));
7980             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2)|(1LL<<dops[i].rt1)|(1LL<<dops[i].rt2)))|1;
7981             alloc_cc(&branch_regs[i-1],i);
7982             dirty_reg(&branch_regs[i-1],CCREG);
7983             delayslot_alloc(&branch_regs[i-1],i);
7984             branch_regs[i-1].isconst=0;
7985             alloc_reg(&current,i,CCREG); // Not taken path
7986             dirty_reg(&current,CCREG);
7987             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7988           }
7989           break;
7990         case SJUMP:
7991           //if((dops[i-1].opcode2&0x1E)==0) // BLTZ/BGEZ
7992           if((dops[i-1].opcode2&0x0E)==0) // BLTZ/BGEZ
7993           {
7994             alloc_cc(&current,i-1);
7995             dirty_reg(&current,CCREG);
7996             if(dops[i-1].rs1==dops[i].rt1||dops[i-1].rs1==dops[i].rt2) {
7997               // The delay slot overwrote the branch condition
7998               // Delay slot goes after the test (in order)
7999               current.u=branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2));
8000               current.u|=1;
8001               delayslot_alloc(&current,i);
8002               current.isconst=0;
8003             }
8004             else
8005             {
8006               current.u=branch_unneeded_reg[i-1]&~(1LL<<dops[i-1].rs1);
8007               // Alloc the branch condition register
8008               alloc_reg(&current,i-1,dops[i-1].rs1);
8009             }
8010             memcpy(&branch_regs[i-1],&current,sizeof(current));
8011             branch_regs[i-1].isconst=0;
8012             branch_regs[i-1].wasconst=0;
8013             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8014             memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
8015           }
8016           else
8017           // Alloc the delay slot in case the branch is taken
8018           if((dops[i-1].opcode2&0x1E)==2) // BLTZL/BGEZL
8019           {
8020             memcpy(&branch_regs[i-1],&current,sizeof(current));
8021             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<dops[i].rs1)|(1LL<<dops[i].rs2)|(1LL<<dops[i].rt1)|(1LL<<dops[i].rt2)))|1;
8022             alloc_cc(&branch_regs[i-1],i);
8023             dirty_reg(&branch_regs[i-1],CCREG);
8024             delayslot_alloc(&branch_regs[i-1],i);
8025             branch_regs[i-1].isconst=0;
8026             alloc_reg(&current,i,CCREG); // Not taken path
8027             dirty_reg(&current,CCREG);
8028             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8029           }
8030           // FIXME: BLTZAL/BGEZAL
8031           if(dops[i-1].opcode2&0x10) { // BxxZAL
8032             alloc_reg(&branch_regs[i-1],i-1,31);
8033             dirty_reg(&branch_regs[i-1],31);
8034           }
8035           break;
8036       }
8037
      // After an unconditional jump, decide what register state the
      // fall-through address (start+i*4+4) should assume.
      if (dops[i-1].is_ujump)
      {
        if(dops[i-1].rt1==31) // JAL/JALR
        {
          // Subroutine call will return here, don't alloc any registers
          current.dirty=0;
          clear_all_regs(current.regmap);
          alloc_reg(&current,i,CCREG);
          dirty_reg(&current,CCREG);
        }
        else if(i+1<slen)
        {
          // Internal branch will jump here, match registers to caller
          current.dirty=0;
          clear_all_regs(current.regmap);
          alloc_reg(&current,i,CCREG);
          dirty_reg(&current,CCREG);
          // First, adopt the register map of the nearest earlier branch
          // that targets this address.
          for(j=i-1;j>=0;j--)
          {
            if(ba[j]==start+i*4+4) {
              memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
              current.dirty=branch_regs[j].dirty;
              break;
            }
          }
          // Then intersect with every other earlier branch targeting the
          // same address: drop mappings that disagree, and keep only the
          // dirty bits common to all of them.
          while(j>=0) {
            if(ba[j]==start+i*4+4) {
              for(hr=0;hr<HOST_REGS;hr++) {
                if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
                  current.regmap[hr]=-1;
                }
                current.dirty&=branch_regs[j].dirty;
              }
            }
            j--;
          }
        }
      }
8076     }
8077
    // Count cycles in between branches
    // ccadj[i] records the cycles accumulated so far; the counter is
    // reset right after a branch/SYSCALL/HLECALL so counts are relative
    // to the preceding control-flow event.
    ccadj[i]=cc;
    if (i > 0 && (dops[i-1].is_jump || dops[i].itype == SYSCALL || dops[i].itype == HLECALL))
    {
      cc=0;
    }
#if !defined(DRC_DBG)
    else if(dops[i].itype==C2OP&&gte_cycletab[source[i]&0x3f]>2)
    {
      // this should really be removed since the real stalls have been implemented,
      // but doing so causes sizeable perf regression against the older version
      u_int gtec = gte_cycletab[source[i] & 0x3f];
      cc += HACK_ENABLED(NDHACK_NO_STALLS) ? gtec/2 : 2;
    }
    else if(i>1&&dops[i].itype==STORE&&dops[i-1].itype==STORE&&dops[i-2].itype==STORE&&!dops[i].bt)
    {
      // Third consecutive store (not at a branch target): charge extra
      // cycles — presumably modelling write-buffer saturation; confirm
      // against timing notes elsewhere in the project.
      cc+=4;
    }
    else if(dops[i].itype==C2LS)
    {
      // same as with C2OP
      cc += HACK_ENABLED(NDHACK_NO_STALLS) ? 4 : 2;
    }
#endif
    else
    {
      cc++;
    }
8106
    // Commit the working allocation state into the per-instruction
    // records for later passes (delay slots were already handled by the
    // branch that owns them, so they are skipped here).
    if(!dops[i].is_ds) {
      regs[i].dirty=current.dirty;
      regs[i].isconst=current.isconst;
      memcpy(constmap[i],current_constmap,sizeof(constmap[i]));
    }
    // A register that changed its mapping at this instruction cannot
    // have held a known-constant value on entry.
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
        if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
          regs[i].wasconst&=~(1<<hr);
        }
      }
    }
    // BTREG does not survive to the next instruction.
    if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
    regs[i].waswritten=current.waswritten;
  }
8122
  /* Pass 4 - Cull unused host registers */
  // Backward liveness analysis: 'nr' is a bitmask with one bit per host
  // register, set when that register's current mapping is still needed.
  // Registers whose bit ends up clear are deallocated below.
  // NOTE(review): the recurring '&63' masks appear to strip a flag bit
  // from the mapped register number — confirm against the register
  // numbering used elsewhere in this file.

  uint64_t nr=0;

  for (i=slen-1;i>=0;i--)
  {
    int hr;
    if(dops[i].is_jump)
    {
      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, don't need anything
        nr=0;
      }
      else
      {
        // Internal branch
        // Need whatever matches the target
        nr=0;
        int t=(ba[i]-start)>>2;
        for(hr=0;hr<HOST_REGS;hr++)
        {
          if(regs[i].regmap_entry[hr]>=0) {
            if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
          }
        }
      }
      // Conditional branch may need registers for following instructions
      if (!dops[i].is_ujump)
      {
        if(i<slen-2) {
          nr|=needed_reg[i+2];
          for(hr=0;hr<HOST_REGS;hr++)
          {
            if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
            //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
          }
        }
      }
      // Don't need stuff which is overwritten
      //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
      //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
      // Merge in delay slot
      for(hr=0;hr<HOST_REGS;hr++)
      {
        if(dops[i+1].rt1&&dops[i+1].rt1==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
        if(dops[i+1].rt2&&dops[i+1].rt2==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
        if(dops[i+1].rs1==regmap_pre[i][hr]) nr|=1<<hr;
        if(dops[i+1].rs2==regmap_pre[i][hr]) nr|=1<<hr;
        if(dops[i+1].rs1==regs[i].regmap_entry[hr]) nr|=1<<hr;
        if(dops[i+1].rs2==regs[i].regmap_entry[hr]) nr|=1<<hr;
        if(dops[i+1].itype==STORE || dops[i+1].itype==STORELR || (dops[i+1].opcode&0x3b)==0x39 || (dops[i+1].opcode&0x3b)==0x3a) {
          // Stores keep the invalidation check pointer (INVCP) live.
          if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
          if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
        }
      }
    }
    else if(dops[i].itype==SYSCALL||dops[i].itype==HLECALL||dops[i].itype==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      nr=0;
    }
    else if(dops[i].itype==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      nr=0;
    }
    else // Non-branch
    {
      if(i<slen-1) {
        for(hr=0;hr<HOST_REGS;hr++) {
          if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
          if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
          if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
          if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
        }
      }
    }
    for(hr=0;hr<HOST_REGS;hr++)
    {
      // Overwritten registers are not needed
      if(dops[i].rt1&&dops[i].rt1==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
      if(dops[i].rt2&&dops[i].rt2==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
      if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
      // Source registers are needed
      if(dops[i].rs1==regmap_pre[i][hr]) nr|=1<<hr;
      if(dops[i].rs2==regmap_pre[i][hr]) nr|=1<<hr;
      if(dops[i].rs1==regs[i].regmap_entry[hr]) nr|=1<<hr;
      if(dops[i].rs2==regs[i].regmap_entry[hr]) nr|=1<<hr;
      if(dops[i].itype==STORE || dops[i].itype==STORELR || (dops[i].opcode&0x3b)==0x39 || (dops[i].opcode&0x3b)==0x3a) {
        if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
        if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
      }
      // Don't store a register immediately after writing it,
      // may prevent dual-issue.
      // But do so if this is a branch target, otherwise we
      // might have to load the register before the branch.
      if(i>0&&!dops[i].bt&&((regs[i].wasdirty>>hr)&1)) {
        if((regmap_pre[i][hr]>0&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1))) {
          if(dops[i-1].rt1==(regmap_pre[i][hr]&63)) nr|=1<<hr;
          if(dops[i-1].rt2==(regmap_pre[i][hr]&63)) nr|=1<<hr;
        }
        if((regs[i].regmap_entry[hr]>0&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) {
          if(dops[i-1].rt1==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
          if(dops[i-1].rt2==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
        }
      }
    }
    // Cycle count is needed at branches.  Assume it is needed at the target too.
    if(i==0||dops[i].bt||dops[i].itype==CJUMP||dops[i].itype==SPAN) {
      if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
      if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
    }
    // Save it
    needed_reg[i]=nr;

    // Deallocate unneeded registers
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(!((nr>>hr)&1)) {
        if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
        if(dops[i].is_jump)
        {
          // For a branch, only free the register if it is not one of the
          // branch's or delay slot's operands, nor one of the special
          // temporaries the generated branch code relies on.
          int map=0,temp=0;
          if(dops[i+1].itype==STORE || dops[i+1].itype==STORELR ||
             (dops[i+1].opcode&0x3b)==0x39 || (dops[i+1].opcode&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
            map=INVCP;
          }
          if(dops[i+1].itype==LOADLR || dops[i+1].itype==STORELR ||
             dops[i+1].itype==C1LS || dops[i+1].itype==C2LS)
            temp=FTEMP;
          if((regs[i].regmap[hr]&63)!=dops[i].rs1 && (regs[i].regmap[hr]&63)!=dops[i].rs2 &&
             (regs[i].regmap[hr]&63)!=dops[i].rt1 && (regs[i].regmap[hr]&63)!=dops[i].rt2 &&
             (regs[i].regmap[hr]&63)!=dops[i+1].rt1 && (regs[i].regmap[hr]&63)!=dops[i+1].rt2 &&
             regs[i].regmap[hr]!=dops[i+1].rs1 && regs[i].regmap[hr]!=dops[i+1].rs2 &&
             (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
             regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
             regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
             regs[i].regmap[hr]!=map )
          {
            regs[i].regmap[hr]=-1;
            regs[i].isconst&=~(1<<hr);
            if((branch_regs[i].regmap[hr]&63)!=dops[i].rs1 && (branch_regs[i].regmap[hr]&63)!=dops[i].rs2 &&
               (branch_regs[i].regmap[hr]&63)!=dops[i].rt1 && (branch_regs[i].regmap[hr]&63)!=dops[i].rt2 &&
               (branch_regs[i].regmap[hr]&63)!=dops[i+1].rt1 && (branch_regs[i].regmap[hr]&63)!=dops[i+1].rt2 &&
               branch_regs[i].regmap[hr]!=dops[i+1].rs1 && branch_regs[i].regmap[hr]!=dops[i+1].rs2 &&
               (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
               branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
               branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
               branch_regs[i].regmap[hr]!=map)
            {
              branch_regs[i].regmap[hr]=-1;
              branch_regs[i].regmap_entry[hr]=-1;
              if (!dops[i].is_ujump)
              {
                // Also clear the fall-through entry state (i+2 skips the
                // delay slot at i+1).
                if (i < slen-2) {
                  regmap_pre[i+2][hr]=-1;
                  regs[i+2].wasconst&=~(1<<hr);
                }
              }
            }
          }
        }
        else
        {
          // Non-branch
          if(i>0)
          {
            int map=-1,temp=-1;
            if(dops[i].itype==STORE || dops[i].itype==STORELR ||
                      (dops[i].opcode&0x3b)==0x39 || (dops[i].opcode&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
              map=INVCP;
            }
            if(dops[i].itype==LOADLR || dops[i].itype==STORELR ||
               dops[i].itype==C1LS || dops[i].itype==C2LS)
              temp=FTEMP;
            if((regs[i].regmap[hr]&63)!=dops[i].rt1 && (regs[i].regmap[hr]&63)!=dops[i].rt2 &&
               regs[i].regmap[hr]!=dops[i].rs1 && regs[i].regmap[hr]!=dops[i].rs2 &&
               (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
               (dops[i].itype!=SPAN||regs[i].regmap[hr]!=CCREG))
            {
              if(i<slen-1&&!dops[i].is_ds) {
                // Sanity check: the next instruction's entry state must
                // agree with our exit state before we clear it.
                assert(regs[i].regmap[hr]<64);
                if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]>0)
                if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
                {
                  SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
                  assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
                }
                regmap_pre[i+1][hr]=-1;
                if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
                regs[i+1].wasconst&=~(1<<hr);
              }
              regs[i].regmap[hr]=-1;
              regs[i].isconst&=~(1<<hr);
            }
          }
        }
      } // if needed
    } // for hr
  }
8324
  /* Pass 5 - Pre-allocate registers */

  // If a register is allocated during a loop, try to allocate it for the
  // entire loop, if possible.  This avoids loading/storing registers
  // inside of the loop.
  // f_regmap is the candidate whole-loop mapping: f_regmap[hr] is the
  // guest register we would like host register hr to hold across the loop.

  signed char f_regmap[HOST_REGS];
  clear_all_regs(f_regmap);
  for(i=0;i<slen-1;i++)
  {
    if(dops[i].itype==UJUMP||dops[i].itype==CJUMP||dops[i].itype==SJUMP)
    {
      // Only consider backward branches (ba[i] before this instruction)
      // with a "simple" delay slot instruction type.
      if(ba[i]>=start && ba[i]<(start+i*4))
      if(dops[i+1].itype==NOP||dops[i+1].itype==MOV||dops[i+1].itype==ALU
      ||dops[i+1].itype==SHIFTIMM||dops[i+1].itype==IMM16||dops[i+1].itype==LOAD
      ||dops[i+1].itype==STORE||dops[i+1].itype==STORELR||dops[i+1].itype==C1LS
      ||dops[i+1].itype==SHIFT||dops[i+1].itype==COP1
      ||dops[i+1].itype==COP2||dops[i+1].itype==C2LS||dops[i+1].itype==C2OP)
      {
        int t=(ba[i]-start)>>2;
        if(t > 0 && !dops[t-1].is_jump) // loop_preload can't handle jumps into delay slots
        if(t<2||(dops[t-2].itype!=UJUMP&&dops[t-2].itype!=RJUMP)||dops[t-2].rt1!=31) // call/ret assumes no registers allocated
        for(hr=0;hr<HOST_REGS;hr++)
        {
          // Seed f_regmap from the current allocation at the branch.
          if(regs[i].regmap[hr]>=0) {
            if(f_regmap[hr]!=regs[i].regmap[hr]) {
              // dealloc old register
              int n;
              for(n=0;n<HOST_REGS;n++)
              {
                if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
              }
              // and alloc new one
              f_regmap[hr]=regs[i].regmap[hr];
            }
          }
          if(branch_regs[i].regmap[hr]>=0) {
            if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
              // dealloc old register
              int n;
              for(n=0;n<HOST_REGS;n++)
              {
                if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
              }
              // and alloc new one
              f_regmap[hr]=branch_regs[i].regmap[hr];
            }
          }
          if(dops[i].ooo) {
            if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
              f_regmap[hr]=branch_regs[i].regmap[hr];
          }else{
            if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
              f_regmap[hr]=branch_regs[i].regmap[hr];
          }
          // Avoid dirty->clean transition
          #ifdef DESTRUCTIVE_WRITEBACK
          if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
          #endif
          // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
          // case above, however it's always a good idea.  We can't hoist the
          // load if the register was already allocated, so there's no point
          // wasting time analyzing most of these cases.  It only "succeeds"
          // when the mapping was different and the load can be replaced with
          // a mov, which is of negligible benefit.  So such cases are
          // skipped below.
          if(f_regmap[hr]>0) {
            if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
              int r=f_regmap[hr];
              // Walk the loop body from target t up to the branch i and
              // try to extend hr's mapping over the whole range.
              for(j=t;j<=i;j++)
              {
                //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
                if(r<34&&((unneeded_reg[j]>>r)&1)) break;
                assert(r < 64);
                if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
                  //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
                  int k;
                  if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
                    if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
                    if(r>63) {
                      if(get_reg(regs[i].regmap,r&63)<0) break;
                      if(get_reg(branch_regs[i].regmap,r&63)<0) break;
                    }
                    // Walk backwards from the branch to find where the
                    // mapping can safely start.
                    k=i;
                    while(k>1&&regs[k-1].regmap[hr]==-1) {
                      if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
                        //printf("no free regs for store %x\n",start+(k-1)*4);
                        break;
                      }
                      if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
                        //printf("no-match due to different register\n");
                        break;
                      }
                      if (dops[k-2].is_jump) {
                        //printf("no-match due to branch\n");
                        break;
                      }
                      // call/ret fast path assumes no registers allocated
                      if(k>2&&(dops[k-3].itype==UJUMP||dops[k-3].itype==RJUMP)&&dops[k-3].rt1==31) {
                        break;
                      }
                      assert(r < 64);
                      k--;
                    }
                    if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
                      //printf("Extend r%d, %x ->\n",hr,start+k*4);
                      while(k<i) {
                        regs[k].regmap_entry[hr]=f_regmap[hr];
                        regs[k].regmap[hr]=f_regmap[hr];
                        regmap_pre[k+1][hr]=f_regmap[hr];
                        regs[k].wasdirty&=~(1<<hr);
                        regs[k].dirty&=~(1<<hr);
                        regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
                        regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
                        regs[k].wasconst&=~(1<<hr);
                        regs[k].isconst&=~(1<<hr);
                        k++;
                      }
                    }
                    else {
                      //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
                      break;
                    }
                    assert(regs[i-1].regmap[hr]==f_regmap[hr]);
                    if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
                      //printf("OK fill %x (r%d)\n",start+i*4,hr);
                      regs[i].regmap_entry[hr]=f_regmap[hr];
                      regs[i].regmap[hr]=f_regmap[hr];
                      regs[i].wasdirty&=~(1<<hr);
                      regs[i].dirty&=~(1<<hr);
                      regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
                      regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
                      regs[i].wasconst&=~(1<<hr);
                      regs[i].isconst&=~(1<<hr);
                      branch_regs[i].regmap_entry[hr]=f_regmap[hr];
                      branch_regs[i].wasdirty&=~(1<<hr);
                      branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
                      branch_regs[i].regmap[hr]=f_regmap[hr];
                      branch_regs[i].dirty&=~(1<<hr);
                      branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
                      branch_regs[i].wasconst&=~(1<<hr);
                      branch_regs[i].isconst&=~(1<<hr);
                      if (!dops[i].is_ujump) {
                        regmap_pre[i+2][hr]=f_regmap[hr];
                        regs[i+2].wasdirty&=~(1<<hr);
                        regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
                      }
                    }
                  }
                  for(k=t;k<j;k++) {
                    // Alloc register clean at beginning of loop,
                    // but may dirty it in pass 6
                    regs[k].regmap_entry[hr]=f_regmap[hr];
                    regs[k].regmap[hr]=f_regmap[hr];
                    regs[k].dirty&=~(1<<hr);
                    regs[k].wasconst&=~(1<<hr);
                    regs[k].isconst&=~(1<<hr);
                    if (dops[k].is_jump) {
                      branch_regs[k].regmap_entry[hr]=f_regmap[hr];
                      branch_regs[k].regmap[hr]=f_regmap[hr];
                      branch_regs[k].dirty&=~(1<<hr);
                      branch_regs[k].wasconst&=~(1<<hr);
                      branch_regs[k].isconst&=~(1<<hr);
                      if (!dops[k].is_ujump) {
                        regmap_pre[k+2][hr]=f_regmap[hr];
                        regs[k+2].wasdirty&=~(1<<hr);
                      }
                    }
                    else
                    {
                      regmap_pre[k+1][hr]=f_regmap[hr];
                      regs[k+1].wasdirty&=~(1<<hr);
                    }
                  }
                  if(regs[j].regmap[hr]==f_regmap[hr])
                    regs[j].regmap_entry[hr]=f_regmap[hr];
                  break;
                }
                if(j==i) break;
                if(regs[j].regmap[hr]>=0)
                  break;
                if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
                  //printf("no-match due to different register\n");
                  break;
                }
                if (dops[j].is_ujump)
                {
                  // Stop on unconditional branch
                  break;
                }
                if(dops[j].itype==CJUMP||dops[j].itype==SJUMP)
                {
                  if(dops[j].ooo) {
                    if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
                      break;
                  }else{
                    if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
                      break;
                  }
                  if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
                    //printf("no-match due to different register (branch)\n");
                    break;
                  }
                }
                if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
                  //printf("No free regs for store %x\n",start+j*4);
                  break;
                }
                assert(f_regmap[hr]<64);
              }
            }
          }
        }
      }
    }else{
      // Non branch or undetermined branch target
      for(hr=0;hr<HOST_REGS;hr++)
      {
        if(hr!=EXCLUDE_REG) {
          if(regs[i].regmap[hr]>=0) {
            if(f_regmap[hr]!=regs[i].regmap[hr]) {
              // dealloc old register
              int n;
              for(n=0;n<HOST_REGS;n++)
              {
                if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
              }
              // and alloc new one
              f_regmap[hr]=regs[i].regmap[hr];
            }
          }
        }
      }
      // Try to restore cycle count at branch targets
      if(dops[i].bt) {
        // Find the next instruction j that already holds something in
        // HOST_CCREG (or where extending would run out of registers).
        for(j=i;j<slen-1;j++) {
          if(regs[j].regmap[HOST_CCREG]!=-1) break;
          if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
            //printf("no free regs for store %x\n",start+j*4);
            break;
          }
        }
        if(regs[j].regmap[HOST_CCREG]==CCREG) {
          int k=i;
          //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
          while(k<j) {
            regs[k].regmap_entry[HOST_CCREG]=CCREG;
            regs[k].regmap[HOST_CCREG]=CCREG;
            regmap_pre[k+1][HOST_CCREG]=CCREG;
            regs[k+1].wasdirty|=1<<HOST_CCREG;
            regs[k].dirty|=1<<HOST_CCREG;
            regs[k].wasconst&=~(1<<HOST_CCREG);
            regs[k].isconst&=~(1<<HOST_CCREG);
            k++;
          }
          regs[j].regmap_entry[HOST_CCREG]=CCREG;
        }
        // Work backwards from the branch target
        if(j>i&&f_regmap[HOST_CCREG]==CCREG)
        {
          //printf("Extend backwards\n");
          int k;
          k=i;
          while(regs[k-1].regmap[HOST_CCREG]==-1) {
            if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
              //printf("no free regs for store %x\n",start+(k-1)*4);
              break;
            }
            k--;
          }
          if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
            //printf("Extend CC, %x ->\n",start+k*4);
            while(k<=i) {
              regs[k].regmap_entry[HOST_CCREG]=CCREG;
              regs[k].regmap[HOST_CCREG]=CCREG;
              regmap_pre[k+1][HOST_CCREG]=CCREG;
              regs[k+1].wasdirty|=1<<HOST_CCREG;
              regs[k].dirty|=1<<HOST_CCREG;
              regs[k].wasconst&=~(1<<HOST_CCREG);
              regs[k].isconst&=~(1<<HOST_CCREG);
              k++;
            }
          }
          else {
            //printf("Fail Extend CC, %x ->\n",start+k*4);
          }
        }
      }
      // Instructions outside this "simple" set invalidate the candidate
      // map; restart it from the current allocation.
      if(dops[i].itype!=STORE&&dops[i].itype!=STORELR&&dops[i].itype!=C1LS&&dops[i].itype!=SHIFT&&
         dops[i].itype!=NOP&&dops[i].itype!=MOV&&dops[i].itype!=ALU&&dops[i].itype!=SHIFTIMM&&
         dops[i].itype!=IMM16&&dops[i].itype!=LOAD&&dops[i].itype!=COP1)
      {
        memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
      }
    }
  }
8621
  // This allocates registers (if possible) one instruction prior
  // to use, which can avoid a load-use penalty on certain CPUs.
  // Pattern throughout: if instruction i+1 maps guest register X into
  // host register hr, and hr is free at i with no conflicting entry
  // state, pull the mapping back to i (and fix up regmap_pre/entry,
  // const and dirty bits accordingly).
  for(i=0;i<slen-1;i++)
  {
    if (!i || !dops[i-1].is_jump)
    {
      if(!dops[i+1].bt)
      {
        if(dops[i].itype==ALU||dops[i].itype==MOV||dops[i].itype==LOAD||dops[i].itype==SHIFTIMM||dops[i].itype==IMM16
           ||((dops[i].itype==COP1||dops[i].itype==COP2)&&dops[i].opcode2<3))
        {
          // Hoist the mapping of the next instruction's first source.
          if(dops[i+1].rs1) {
            if((hr=get_reg(regs[i+1].regmap,dops[i+1].rs1))>=0)
            {
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=regs[i+1].regmap[hr];
                regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
                regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
          // Same for the second source.
          if(dops[i+1].rs2) {
            if((hr=get_reg(regs[i+1].regmap,dops[i+1].rs2))>=0)
            {
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=regs[i+1].regmap[hr];
                regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
                regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
          // Preload target address for load instruction (non-constant)
          if(dops[i+1].itype==LOAD&&dops[i+1].rs1&&get_reg(regs[i+1].regmap,dops[i+1].rs1)<0) {
            if((hr=get_reg(regs[i+1].regmap,dops[i+1].rt1))>=0)
            {
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=dops[i+1].rs1;
                regmap_pre[i+1][hr]=dops[i+1].rs1;
                regs[i+1].regmap_entry[hr]=dops[i+1].rs1;
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
          // Load source into target register
          if(dops[i+1].lt1&&get_reg(regs[i+1].regmap,dops[i+1].rs1)<0) {
            if((hr=get_reg(regs[i+1].regmap,dops[i+1].rt1))>=0)
            {
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=dops[i+1].rs1;
                regmap_pre[i+1][hr]=dops[i+1].rs1;
                regs[i+1].regmap_entry[hr]=dops[i+1].rs1;
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
          // Address for store instruction (non-constant)
          if(dops[i+1].itype==STORE||dops[i+1].itype==STORELR
             ||(dops[i+1].opcode&0x3b)==0x39||(dops[i+1].opcode&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
            if(get_reg(regs[i+1].regmap,dops[i+1].rs1)<0) {
              hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
              if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
              else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
              assert(hr>=0);
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=dops[i+1].rs1;
                regmap_pre[i+1][hr]=dops[i+1].rs1;
                regs[i+1].regmap_entry[hr]=dops[i+1].rs1;
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
          if(dops[i+1].itype==LOADLR||(dops[i+1].opcode&0x3b)==0x31||(dops[i+1].opcode&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
            if(get_reg(regs[i+1].regmap,dops[i+1].rs1)<0) {
              int nr;
              hr=get_reg(regs[i+1].regmap,FTEMP);
              assert(hr>=0);
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=dops[i+1].rs1;
                regmap_pre[i+1][hr]=dops[i+1].rs1;
                regs[i+1].regmap_entry[hr]=dops[i+1].rs1;
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
              else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
              {
                // move it to another register
                regs[i+1].regmap[hr]=-1;
                regmap_pre[i+2][hr]=-1;
                regs[i+1].regmap[nr]=FTEMP;
                regmap_pre[i+2][nr]=FTEMP;
                regs[i].regmap[nr]=dops[i+1].rs1;
                regmap_pre[i+1][nr]=dops[i+1].rs1;
                regs[i+1].regmap_entry[nr]=dops[i+1].rs1;
                regs[i].isconst&=~(1<<nr);
                regs[i+1].isconst&=~(1<<nr);
                regs[i].dirty&=~(1<<nr);
                regs[i+1].wasdirty&=~(1<<nr);
                regs[i+1].dirty&=~(1<<nr);
                regs[i+2].wasdirty&=~(1<<nr);
              }
            }
          }
          if(dops[i+1].itype==LOAD||dops[i+1].itype==LOADLR||dops[i+1].itype==STORE||dops[i+1].itype==STORELR/*||dops[i+1].itype==C1LS||||dops[i+1].itype==C2LS*/) {
            // When the next access has a constant base, pre-allocate the
            // address generation temporary (AGEN1) one slot early.
            if(dops[i+1].itype==LOAD)
              hr=get_reg(regs[i+1].regmap,dops[i+1].rt1);
            if(dops[i+1].itype==LOADLR||(dops[i+1].opcode&0x3b)==0x31||(dops[i+1].opcode&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
              hr=get_reg(regs[i+1].regmap,FTEMP);
            if(dops[i+1].itype==STORE||dops[i+1].itype==STORELR||(dops[i+1].opcode&0x3b)==0x39||(dops[i+1].opcode&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
              hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
              if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
            }
            if(hr>=0&&regs[i].regmap[hr]<0) {
              int rs=get_reg(regs[i+1].regmap,dops[i+1].rs1);
              if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
                regs[i].regmap[hr]=AGEN1+((i+1)&1);
                regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
                regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
                regs[i].isconst&=~(1<<hr);
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
        }
      }
    }
  }
8780
8781   /* Pass 6 - Optimize clean/dirty state */
8782   clean_registers(0,slen-1,1);
8783
8784   /* Pass 7 - Identify 32-bit registers */
8785   for (i=slen-1;i>=0;i--)
8786   {
8787     if(dops[i].itype==CJUMP||dops[i].itype==SJUMP)
8788     {
8789       // Conditional branch
8790       if((source[i]>>16)!=0x1000&&i<slen-2) {
8791         // Mark this address as a branch target since it may be called
8792         // upon return from interrupt
8793         dops[i+2].bt=1;
8794       }
8795     }
8796   }
8797
8798   if(dops[slen-1].itype==SPAN) {
8799     dops[slen-1].bt=1; // Mark as a branch target so instruction can restart after exception
8800   }
8801
8802 #ifdef DISASM
8803   /* Debug/disassembly */
8804   for(i=0;i<slen;i++)
8805   {
8806     printf("U:");
8807     int r;
8808     for(r=1;r<=CCREG;r++) {
8809       if((unneeded_reg[i]>>r)&1) {
8810         if(r==HIREG) printf(" HI");
8811         else if(r==LOREG) printf(" LO");
8812         else printf(" r%d",r);
8813       }
8814     }
8815     printf("\n");
8816     #if defined(__i386__) || defined(__x86_64__)
8817     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
8818     #endif
8819     #ifdef __arm__
8820     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
8821     #endif
8822     #if defined(__i386__) || defined(__x86_64__)
8823     printf("needs: ");
8824     if(needed_reg[i]&1) printf("eax ");
8825     if((needed_reg[i]>>1)&1) printf("ecx ");
8826     if((needed_reg[i]>>2)&1) printf("edx ");
8827     if((needed_reg[i]>>3)&1) printf("ebx ");
8828     if((needed_reg[i]>>5)&1) printf("ebp ");
8829     if((needed_reg[i]>>6)&1) printf("esi ");
8830     if((needed_reg[i]>>7)&1) printf("edi ");
8831     printf("\n");
8832     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
8833     printf("dirty: ");
8834     if(regs[i].wasdirty&1) printf("eax ");
8835     if((regs[i].wasdirty>>1)&1) printf("ecx ");
8836     if((regs[i].wasdirty>>2)&1) printf("edx ");
8837     if((regs[i].wasdirty>>3)&1) printf("ebx ");
8838     if((regs[i].wasdirty>>5)&1) printf("ebp ");
8839     if((regs[i].wasdirty>>6)&1) printf("esi ");
8840     if((regs[i].wasdirty>>7)&1) printf("edi ");
8841     #endif
8842     #ifdef __arm__
8843     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
8844     printf("dirty: ");
8845     if(regs[i].wasdirty&1) printf("r0 ");
8846     if((regs[i].wasdirty>>1)&1) printf("r1 ");
8847     if((regs[i].wasdirty>>2)&1) printf("r2 ");
8848     if((regs[i].wasdirty>>3)&1) printf("r3 ");
8849     if((regs[i].wasdirty>>4)&1) printf("r4 ");
8850     if((regs[i].wasdirty>>5)&1) printf("r5 ");
8851     if((regs[i].wasdirty>>6)&1) printf("r6 ");
8852     if((regs[i].wasdirty>>7)&1) printf("r7 ");
8853     if((regs[i].wasdirty>>8)&1) printf("r8 ");
8854     if((regs[i].wasdirty>>9)&1) printf("r9 ");
8855     if((regs[i].wasdirty>>10)&1) printf("r10 ");
8856     if((regs[i].wasdirty>>12)&1) printf("r12 ");
8857     #endif
8858     printf("\n");
8859     disassemble_inst(i);
8860     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
8861     #if defined(__i386__) || defined(__x86_64__)
8862     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
8863     if(regs[i].dirty&1) printf("eax ");
8864     if((regs[i].dirty>>1)&1) printf("ecx ");
8865     if((regs[i].dirty>>2)&1) printf("edx ");
8866     if((regs[i].dirty>>3)&1) printf("ebx ");
8867     if((regs[i].dirty>>5)&1) printf("ebp ");
8868     if((regs[i].dirty>>6)&1) printf("esi ");
8869     if((regs[i].dirty>>7)&1) printf("edi ");
8870     #endif
8871     #ifdef __arm__
8872     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
8873     if(regs[i].dirty&1) printf("r0 ");
8874     if((regs[i].dirty>>1)&1) printf("r1 ");
8875     if((regs[i].dirty>>2)&1) printf("r2 ");
8876     if((regs[i].dirty>>3)&1) printf("r3 ");
8877     if((regs[i].dirty>>4)&1) printf("r4 ");
8878     if((regs[i].dirty>>5)&1) printf("r5 ");
8879     if((regs[i].dirty>>6)&1) printf("r6 ");
8880     if((regs[i].dirty>>7)&1) printf("r7 ");
8881     if((regs[i].dirty>>8)&1) printf("r8 ");
8882     if((regs[i].dirty>>9)&1) printf("r9 ");
8883     if((regs[i].dirty>>10)&1) printf("r10 ");
8884     if((regs[i].dirty>>12)&1) printf("r12 ");
8885     #endif
8886     printf("\n");
8887     if(regs[i].isconst) {
8888       printf("constants: ");
8889       #if defined(__i386__) || defined(__x86_64__)
8890       if(regs[i].isconst&1) printf("eax=%x ",(u_int)constmap[i][0]);
8891       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(u_int)constmap[i][1]);
8892       if((regs[i].isconst>>2)&1) printf("edx=%x ",(u_int)constmap[i][2]);
8893       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(u_int)constmap[i][3]);
8894       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(u_int)constmap[i][5]);
8895       if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]);
8896       if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]);
8897       #endif
8898       #if defined(__arm__) || defined(__aarch64__)
8899       int r;
8900       for (r = 0; r < ARRAY_SIZE(constmap[i]); r++)
8901         if ((regs[i].isconst >> r) & 1)
8902           printf(" r%d=%x", r, (u_int)constmap[i][r]);
8903       #endif
8904       printf("\n");
8905     }
8906     if(dops[i].is_jump) {
8907       #if defined(__i386__) || defined(__x86_64__)
8908       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
8909       if(branch_regs[i].dirty&1) printf("eax ");
8910       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
8911       if((branch_regs[i].dirty>>2)&1) printf("edx ");
8912       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
8913       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
8914       if((branch_regs[i].dirty>>6)&1) printf("esi ");
8915       if((branch_regs[i].dirty>>7)&1) printf("edi ");
8916       #endif
8917       #ifdef __arm__
8918       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
8919       if(branch_regs[i].dirty&1) printf("r0 ");
8920       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
8921       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
8922       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
8923       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
8924       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
8925       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
8926       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
8927       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
8928       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
8929       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
8930       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
8931       #endif
8932     }
8933   }
8934 #endif // DISASM
8935
8936   /* Pass 8 - Assembly */
8937   linkcount=0;stubcount=0;
8938   ds=0;is_delayslot=0;
8939   u_int dirty_pre=0;
8940   void *beginning=start_block();
8941   if((u_int)addr&1) {
8942     ds=1;
8943     pagespan_ds();
8944   }
8945   void *instr_addr0_override = NULL;
8946
8947   if (start == 0x80030000) {
8948     // nasty hack for the fastbios thing
8949     // override block entry to this code
8950     instr_addr0_override = out;
8951     emit_movimm(start,0);
8952     // abuse io address var as a flag that we
8953     // have already returned here once
8954     emit_readword(&address,1);
8955     emit_writeword(0,&pcaddr);
8956     emit_writeword(0,&address);
8957     emit_cmp(0,1);
8958     #ifdef __aarch64__
8959     emit_jeq(out + 4*2);
8960     emit_far_jump(new_dyna_leave);
8961     #else
8962     emit_jne(new_dyna_leave);
8963     #endif
8964   }
8965   for(i=0;i<slen;i++)
8966   {
8967     //if(ds) printf("ds: ");
8968     disassemble_inst(i);
8969     if(ds) {
8970       ds=0; // Skip delay slot
8971       if(dops[i].bt) assem_debug("OOPS - branch into delay slot\n");
8972       instr_addr[i] = NULL;
8973     } else {
8974       speculate_register_values(i);
8975       #ifndef DESTRUCTIVE_WRITEBACK
8976       if (i < 2 || !dops[i-2].is_ujump)
8977       {
8978         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,unneeded_reg[i]);
8979       }
8980       if((dops[i].itype==CJUMP||dops[i].itype==SJUMP)) {
8981         dirty_pre=branch_regs[i].dirty;
8982       }else{
8983         dirty_pre=regs[i].dirty;
8984       }
8985       #endif
8986       // write back
8987       if (i < 2 || !dops[i-2].is_ujump)
8988       {
8989         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,unneeded_reg[i]);
8990         loop_preload(regmap_pre[i],regs[i].regmap_entry);
8991       }
8992       // branch target entry point
8993       instr_addr[i] = out;
8994       assem_debug("<->\n");
8995       drc_dbg_emit_do_cmp(i);
8996
8997       // load regs
8998       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
8999         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty);
9000       load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i].rs1,dops[i].rs2);
9001       address_generation(i,&regs[i],regs[i].regmap_entry);
9002       load_consts(regmap_pre[i],regs[i].regmap,i);
9003       if(dops[i].is_jump)
9004       {
9005         // Load the delay slot registers if necessary
9006         if(dops[i+1].rs1!=dops[i].rs1&&dops[i+1].rs1!=dops[i].rs2&&(dops[i+1].rs1!=dops[i].rt1||dops[i].rt1==0))
9007           load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i+1].rs1,dops[i+1].rs1);
9008         if(dops[i+1].rs2!=dops[i+1].rs1&&dops[i+1].rs2!=dops[i].rs1&&dops[i+1].rs2!=dops[i].rs2&&(dops[i+1].rs2!=dops[i].rt1||dops[i].rt1==0))
9009           load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i+1].rs2,dops[i+1].rs2);
9010         if(dops[i+1].itype==STORE||dops[i+1].itype==STORELR||(dops[i+1].opcode&0x3b)==0x39||(dops[i+1].opcode&0x3b)==0x3a)
9011           load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
9012       }
9013       else if(i+1<slen)
9014       {
9015         // Preload registers for following instruction
9016         if(dops[i+1].rs1!=dops[i].rs1&&dops[i+1].rs1!=dops[i].rs2)
9017           if(dops[i+1].rs1!=dops[i].rt1&&dops[i+1].rs1!=dops[i].rt2)
9018             load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i+1].rs1,dops[i+1].rs1);
9019         if(dops[i+1].rs2!=dops[i+1].rs1&&dops[i+1].rs2!=dops[i].rs1&&dops[i+1].rs2!=dops[i].rs2)
9020           if(dops[i+1].rs2!=dops[i].rt1&&dops[i+1].rs2!=dops[i].rt2)
9021             load_regs(regs[i].regmap_entry,regs[i].regmap,dops[i+1].rs2,dops[i+1].rs2);
9022       }
9023       // TODO: if(is_ooo(i)) address_generation(i+1);
9024       if(dops[i].itype==CJUMP)
9025         load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
9026       if(dops[i].itype==STORE||dops[i].itype==STORELR||(dops[i].opcode&0x3b)==0x39||(dops[i].opcode&0x3b)==0x3a)
9027         load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
9028       // assemble
9029       switch(dops[i].itype) {
9030         case ALU:
9031           alu_assemble(i,&regs[i]);break;
9032         case IMM16:
9033           imm16_assemble(i,&regs[i]);break;
9034         case SHIFT:
9035           shift_assemble(i,&regs[i]);break;
9036         case SHIFTIMM:
9037           shiftimm_assemble(i,&regs[i]);break;
9038         case LOAD:
9039           load_assemble(i,&regs[i]);break;
9040         case LOADLR:
9041           loadlr_assemble(i,&regs[i]);break;
9042         case STORE:
9043           store_assemble(i,&regs[i]);break;
9044         case STORELR:
9045           storelr_assemble(i,&regs[i]);break;
9046         case COP0:
9047           cop0_assemble(i,&regs[i]);break;
9048         case COP1:
9049           cop1_assemble(i,&regs[i]);break;
9050         case C1LS:
9051           c1ls_assemble(i,&regs[i]);break;
9052         case COP2:
9053           cop2_assemble(i,&regs[i]);break;
9054         case C2LS:
9055           c2ls_assemble(i,&regs[i]);break;
9056         case C2OP:
9057           c2op_assemble(i,&regs[i]);break;
9058         case MULTDIV:
9059           multdiv_assemble(i,&regs[i]);
9060           multdiv_prepare_stall(i,&regs[i]);
9061           break;
9062         case MOV:
9063           mov_assemble(i,&regs[i]);break;
9064         case SYSCALL:
9065           syscall_assemble(i,&regs[i]);break;
9066         case HLECALL:
9067           hlecall_assemble(i,&regs[i]);break;
9068         case INTCALL:
9069           intcall_assemble(i,&regs[i]);break;
9070         case UJUMP:
9071           ujump_assemble(i,&regs[i]);ds=1;break;
9072         case RJUMP:
9073           rjump_assemble(i,&regs[i]);ds=1;break;
9074         case CJUMP:
9075           cjump_assemble(i,&regs[i]);ds=1;break;
9076         case SJUMP:
9077           sjump_assemble(i,&regs[i]);ds=1;break;
9078         case SPAN:
9079           pagespan_assemble(i,&regs[i]);break;
9080       }
9081       if (dops[i].is_ujump)
9082         literal_pool(1024);
9083       else
9084         literal_pool_jumpover(256);
9085     }
9086   }
9087
9088   assert(slen > 0);
9089   if (slen > 0 && dops[slen-1].itype == INTCALL) {
9090     // no ending needed for this block since INTCALL never returns
9091   }
9092   // If the block did not end with an unconditional branch,
9093   // add a jump to the next instruction.
9094   else if (i > 1) {
9095     if (!dops[i-2].is_ujump && dops[i-1].itype != SPAN) {
9096       assert(!dops[i-1].is_jump);
9097       assert(i==slen);
9098       if(dops[i-2].itype!=CJUMP&&dops[i-2].itype!=SJUMP) {
9099         store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
9100         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
9101           emit_loadreg(CCREG,HOST_CCREG);
9102         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
9103       }
9104       else
9105       {
9106         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].dirty,start+i*4);
9107         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
9108       }
9109       add_to_linker(out,start+i*4,0);
9110       emit_jmp(0);
9111     }
9112   }
9113   else
9114   {
9115     assert(i>0);
9116     assert(!dops[i-1].is_jump);
9117     store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
9118     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
9119       emit_loadreg(CCREG,HOST_CCREG);
9120     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
9121     add_to_linker(out,start+i*4,0);
9122     emit_jmp(0);
9123   }
9124
9125   // TODO: delay slot stubs?
9126   // Stubs
9127   for(i=0;i<stubcount;i++)
9128   {
9129     switch(stubs[i].type)
9130     {
9131       case LOADB_STUB:
9132       case LOADH_STUB:
9133       case LOADW_STUB:
9134       case LOADD_STUB:
9135       case LOADBU_STUB:
9136       case LOADHU_STUB:
9137         do_readstub(i);break;
9138       case STOREB_STUB:
9139       case STOREH_STUB:
9140       case STOREW_STUB:
9141       case STORED_STUB:
9142         do_writestub(i);break;
9143       case CC_STUB:
9144         do_ccstub(i);break;
9145       case INVCODE_STUB:
9146         do_invstub(i);break;
9147       case FP_STUB:
9148         do_cop1stub(i);break;
9149       case STORELR_STUB:
9150         do_unalignedwritestub(i);break;
9151     }
9152   }
9153
9154   if (instr_addr0_override)
9155     instr_addr[0] = instr_addr0_override;
9156
9157   /* Pass 9 - Linker */
9158   for(i=0;i<linkcount;i++)
9159   {
9160     assem_debug("%p -> %8x\n",link_addr[i].addr,link_addr[i].target);
9161     literal_pool(64);
9162     if (!link_addr[i].ext)
9163     {
9164       void *stub = out;
9165       void *addr = check_addr(link_addr[i].target);
9166       emit_extjump(link_addr[i].addr, link_addr[i].target);
9167       if (addr) {
9168         set_jump_target(link_addr[i].addr, addr);
9169         add_jump_out(link_addr[i].target,stub);
9170       }
9171       else
9172         set_jump_target(link_addr[i].addr, stub);
9173     }
9174     else
9175     {
9176       // Internal branch
9177       int target=(link_addr[i].target-start)>>2;
9178       assert(target>=0&&target<slen);
9179       assert(instr_addr[target]);
9180       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9181       //set_jump_target_fillslot(link_addr[i].addr,instr_addr[target],link_addr[i].ext>>1);
9182       //#else
9183       set_jump_target(link_addr[i].addr, instr_addr[target]);
9184       //#endif
9185     }
9186   }
9187
9188   u_int source_len = slen*4;
9189   if (dops[slen-1].itype == INTCALL && source_len > 4)
9190     // no need to treat the last instruction as compiled
9191     // as interpreter fully handles it
9192     source_len -= 4;
9193
9194   if ((u_char *)copy + source_len > (u_char *)shadow + sizeof(shadow))
9195     copy = shadow;
9196
9197   // External Branch Targets (jump_in)
9198   for(i=0;i<slen;i++)
9199   {
9200     if(dops[i].bt||i==0)
9201     {
9202       if(instr_addr[i]) // TODO - delay slots (=null)
9203       {
9204         u_int vaddr=start+i*4;
9205         u_int page=get_page(vaddr);
9206         u_int vpage=get_vpage(vaddr);
9207         literal_pool(256);
9208         {
9209           assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
9210           assem_debug("jump_in: %x\n",start+i*4);
9211           ll_add(jump_dirty+vpage,vaddr,out);
9212           void *entry_point = do_dirty_stub(i, source_len);
9213           ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
9214           // If there was an existing entry in the hash table,
9215           // replace it with the new address.
9216           // Don't add new entries.  We'll insert the
9217           // ones that actually get used in check_addr().
9218           struct ht_entry *ht_bin = hash_table_get(vaddr);
9219           if (ht_bin->vaddr[0] == vaddr)
9220             ht_bin->tcaddr[0] = entry_point;
9221           if (ht_bin->vaddr[1] == vaddr)
9222             ht_bin->tcaddr[1] = entry_point;
9223         }
9224       }
9225     }
9226   }
9227   // Write out the literal pool if necessary
9228   literal_pool(0);
9229   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9230   // Align code
9231   if(((u_int)out)&7) emit_addnop(13);
9232   #endif
9233   assert(out - (u_char *)beginning < MAX_OUTPUT_BLOCK_SIZE);
9234   //printf("shadow buffer: %p-%p\n",copy,(u_char *)copy+slen*4);
9235   memcpy(copy, source, source_len);
9236   copy += source_len;
9237
9238   end_block(beginning);
9239
9240   // If we're within 256K of the end of the buffer,
9241   // start over from the beginning. (Is 256K enough?)
9242   if (out > ndrc->translation_cache + sizeof(ndrc->translation_cache) - MAX_OUTPUT_BLOCK_SIZE)
9243     out = ndrc->translation_cache;
9244
9245   // Trap writes to any of the pages we compiled
9246   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
9247     invalid_code[i]=0;
9248   }
9249   inv_code_start=inv_code_end=~0;
9250
9251   // for PCSX we need to mark all mirrors too
9252   if(get_page(start)<(RAM_SIZE>>12))
9253     for(i=start>>12;i<=(start+slen*4)>>12;i++)
9254       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
9255       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
9256       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
9257
9258   /* Pass 10 - Free memory by expiring oldest blocks */
9259
9260   int end=(((out-ndrc->translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535;
9261   while(expirep!=end)
9262   {
9263     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
9264     uintptr_t base_offs = ((uintptr_t)(expirep >> 13) << shift); // Base offset of this block
9265     uintptr_t base_offs_s = base_offs >> shift;
9266     inv_debug("EXP: Phase %d\n",expirep);
9267     switch((expirep>>11)&3)
9268     {
9269       case 0:
9270         // Clear jump_in and jump_dirty
9271         ll_remove_matching_addrs(jump_in+(expirep&2047),base_offs_s,shift);
9272         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base_offs_s,shift);
9273         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base_offs_s,shift);
9274         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base_offs_s,shift);
9275         break;
9276       case 1:
9277         // Clear pointers
9278         ll_kill_pointers(jump_out[expirep&2047],base_offs_s,shift);
9279         ll_kill_pointers(jump_out[(expirep&2047)+2048],base_offs_s,shift);
9280         break;
9281       case 2:
9282         // Clear hash table
9283         for(i=0;i<32;i++) {
9284           struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
9285           uintptr_t o1 = (u_char *)ht_bin->tcaddr[1] - ndrc->translation_cache;
9286           uintptr_t o2 = o1 - MAX_OUTPUT_BLOCK_SIZE;
9287           if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s) {
9288             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
9289             ht_bin->vaddr[1] = -1;
9290             ht_bin->tcaddr[1] = NULL;
9291           }
9292           o1 = (u_char *)ht_bin->tcaddr[0] - ndrc->translation_cache;
9293           o2 = o1 - MAX_OUTPUT_BLOCK_SIZE;
9294           if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s) {
9295             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
9296             ht_bin->vaddr[0] = ht_bin->vaddr[1];
9297             ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
9298             ht_bin->vaddr[1] = -1;
9299             ht_bin->tcaddr[1] = NULL;
9300           }
9301         }
9302         break;
9303       case 3:
9304         // Clear jump_out
9305         if((expirep&2047)==0)
9306           do_clear_cache();
9307         ll_remove_matching_addrs(jump_out+(expirep&2047),base_offs_s,shift);
9308         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base_offs_s,shift);
9309         break;
9310     }
9311     expirep=(expirep+1)&65535;
9312   }
9313   return 0;
9314 }
9315
9316 // vim:shiftwidth=2:expandtab