drc: fix block expire
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h"
39 #include "../psxinterpreter.h"
40 #include "../gte.h"
41 #include "emu_if.h" // emulator interface
42
// Keep a function out of line (GCC noinline/noclone function attributes).
#define noinline __attribute__((noinline,noclone))

// Element count of a real array (meaningless for pointers).
// The argument is parenthesised so that any expression can be passed.
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif

// NOTE: min()/max() evaluate their arguments more than once,
// so do not pass expressions with side effects.
#ifndef min
#define min(a, b) ((b) < (a) ? (b) : (a))
#endif
#ifndef max
#define max(a, b) ((b) > (a) ? (b) : (a))
#endif
53
54 //#define DISASM
55 //#define ASSEM_PRINT
56
57 #ifdef ASSEM_PRINT
58 #define assem_debug printf
59 #else
60 #define assem_debug(...)
61 #endif
62 //#define inv_debug printf
63 #define inv_debug(...)
64
65 #ifdef __i386__
66 #include "assem_x86.h"
67 #endif
68 #ifdef __x86_64__
69 #include "assem_x64.h"
70 #endif
71 #ifdef __arm__
72 #include "assem_arm.h"
73 #endif
74 #ifdef __aarch64__
75 #include "assem_arm64.h"
76 #endif
77
78 #define RAM_SIZE 0x200000
79 #define MAXBLOCK 4096
80 #define MAX_OUTPUT_BLOCK_SIZE 262144
81
82 struct ndrc_mem
83 {
84   u_char translation_cache[1 << TARGET_SIZE_2];
85   struct
86   {
87     struct tramp_insns ops[2048 / sizeof(struct tramp_insns)];
88     const void *f[2048 / sizeof(void *)];
89   } tramp;
90 };
91
92 #ifdef BASE_ADDR_DYNAMIC
93 static struct ndrc_mem *ndrc;
94 #else
95 static struct ndrc_mem ndrc_ __attribute__((aligned(4096)));
96 static struct ndrc_mem *ndrc = &ndrc_;
97 #endif
98
// Kinds of out-of-line stubs.
// Numbering starts at 1 and is consecutive, so the last entry is 14.
enum stub_type {
  CC_STUB = 1,
  FP_STUB,
  LOADB_STUB,
  LOADH_STUB,
  LOADW_STUB,
  LOADD_STUB,
  LOADBU_STUB,
  LOADHU_STUB,
  STOREB_STUB,
  STOREH_STUB,
  STOREW_STUB,
  STORED_STUB,
  STORELR_STUB,
  INVCODE_STUB,   // == 14
};
116
117 struct regstat
118 {
119   signed char regmap_entry[HOST_REGS];
120   signed char regmap[HOST_REGS];
121   uint64_t wasdirty;
122   uint64_t dirty;
123   uint64_t u;
124   u_int wasconst;
125   u_int isconst;
126   u_int loadedconst;             // host regs that have constants loaded
127   u_int waswritten;              // MIPS regs that were used as store base before
128 };
129
130 // note: asm depends on this layout
131 struct ll_entry
132 {
133   u_int vaddr;
134   u_int reg_sv_flags;
135   void *addr;
136   struct ll_entry *next;
137 };
138
139 struct ht_entry
140 {
141   u_int vaddr[2];
142   void *tcaddr[2];
143 };
144
145 struct code_stub
146 {
147   enum stub_type type;
148   void *addr;
149   void *retaddr;
150   u_int a;
151   uintptr_t b;
152   uintptr_t c;
153   u_int d;
154   u_int e;
155 };
156
157 struct link_entry
158 {
159   void *addr;
160   u_int target;
161   u_int ext;
162 };
163
164   // used by asm:
165   u_char *out;
166   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
167   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
168   struct ll_entry *jump_dirty[4096];
169
170   static struct ll_entry *jump_out[4096];
171   static u_int start;
172   static u_int *source;
173   static char insn[MAXBLOCK][10];
174   static u_char itype[MAXBLOCK];
175   static u_char opcode[MAXBLOCK];
176   static u_char opcode2[MAXBLOCK];
177   static u_char bt[MAXBLOCK];
178   static u_char rs1[MAXBLOCK];
179   static u_char rs2[MAXBLOCK];
180   static u_char rt1[MAXBLOCK];
181   static u_char rt2[MAXBLOCK];
182   static u_char dep1[MAXBLOCK];
183   static u_char dep2[MAXBLOCK];
184   static u_char lt1[MAXBLOCK];
185   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
186   static uint64_t gte_rt[MAXBLOCK];
187   static uint64_t gte_unneeded[MAXBLOCK];
188   static u_int smrv[32]; // speculated MIPS register values
189   static u_int smrv_strong; // mask or regs that are likely to have correct values
190   static u_int smrv_weak; // same, but somewhat less likely
191   static u_int smrv_strong_next; // same, but after current insn executes
192   static u_int smrv_weak_next;
193   static int imm[MAXBLOCK];
194   static u_int ba[MAXBLOCK];
195   static char likely[MAXBLOCK];
196   static char is_ds[MAXBLOCK];
197   static char ooo[MAXBLOCK];
198   static uint64_t unneeded_reg[MAXBLOCK];
199   static uint64_t branch_unneeded_reg[MAXBLOCK];
200   static signed char regmap_pre[MAXBLOCK][HOST_REGS]; // pre-instruction i?
201   // contains 'real' consts at [i] insn, but may differ from what's actually
202   // loaded in host reg as 'final' value is always loaded, see get_final_value()
203   static uint32_t current_constmap[HOST_REGS];
204   static uint32_t constmap[MAXBLOCK][HOST_REGS];
205   static struct regstat regs[MAXBLOCK];
206   static struct regstat branch_regs[MAXBLOCK];
207   static signed char minimum_free_regs[MAXBLOCK];
208   static u_int needed_reg[MAXBLOCK];
209   static u_int wont_dirty[MAXBLOCK];
210   static u_int will_dirty[MAXBLOCK];
211   static int ccadj[MAXBLOCK];
212   static int slen;
213   static void *instr_addr[MAXBLOCK];
214   static struct link_entry link_addr[MAXBLOCK];
215   static int linkcount;
216   static struct code_stub stubs[MAXBLOCK*3];
217   static int stubcount;
218   static u_int literals[1024][2];
219   static int literalcount;
220   static int is_delayslot;
221   static char shadow[1048576]  __attribute__((aligned(16)));
222   static void *copy;
223   static int expirep;
224   static u_int stop_after_jal;
225 #ifndef RAM_FIXED
226   static uintptr_t ram_offset;
227 #else
228   static const uintptr_t ram_offset=0;
229 #endif
230
231   int new_dynarec_hacks;
232   int new_dynarec_hacks_pergame;
233   int new_dynarec_hacks_old;
234   int new_dynarec_did_compile;
235
236   #define HACK_ENABLED(x) ((new_dynarec_hacks | new_dynarec_hacks_pergame) & (x))
237
238   extern int cycle_count; // ... until end of the timeslice, counts -N -> 0
239   extern int last_count;  // last absolute target, often = next_interupt
240   extern int pcaddr;
241   extern int pending_exception;
242   extern int branch_target;
243   extern uintptr_t mini_ht[32][2];
244   extern u_char restore_candidate[512];
245
/*
 * Registers that may be allocated.
 * 1-31 are the MIPS general purpose registers.  Slots that are commented
 * out below are not used by this build but keep their numbers.
 */
#define LOREG     32  // lo
#define HIREG     33  // hi
//#define FSREG   34  // FPU status (FCSR)
#define CSREG     35  // Coprocessor status
#define CCREG     36  // Cycle count
#define INVCP     37  // Pointer to invalid_code
//#define MMREG   38  // Pointer to memory_map
//#define ROREG   39  // ram offset (if rdram!=0x80000000)
#define TEMPREG   40
#define FTEMP     40  // FPU temporary register
#define PTEMP     41  // Prefetch temporary register
//#define TLREG   42  // TLB mapping offset
#define RHASH     43  // Return address hash
#define RHTBL     44  // Return address hash table address
#define RTEMP     45  // JR/JALR address register
#define MAXREG    45
#define AGEN1     46  // Address generation temporary register
//#define AGEN2   47  // Address generation temporary register
//#define MGEN1   48  // Maptable address generation temporary register
//#define MGEN2   49  // Maptable address generation temporary register
#define BTREG     50  // Branch target temporary register

/* Instruction types (values stored in itype[]) */
#define NOP        0  // No operation
#define LOAD       1  // Load
#define STORE      2  // Store
#define LOADLR     3  // Unaligned load
#define STORELR    4  // Unaligned store
#define MOV        5  // Move
#define ALU        6  // Arithmetic/logic
#define MULTDIV    7  // Multiply/divide
#define SHIFT      8  // Shift by register
#define SHIFTIMM   9  // Shift by immediate
#define IMM16     10  // 16-bit immediate
#define RJUMP     11  // Unconditional jump to register
#define UJUMP     12  // Unconditional jump
#define CJUMP     13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
#define SJUMP     14  // Conditional branch (regimm format)
#define COP0      15  // Coprocessor 0
#define COP1      16  // Coprocessor 1
#define C1LS      17  // Coprocessor 1 load/store
//#define FJUMP   18  // Conditional branch (floating point)
//#define FLOAT   19  // Floating point unit
//#define FCONV   20  // Convert integer to float
//#define FCOMP   21  // Floating point compare (sets FSREG)
#define SYSCALL   22  // SYSCALL
#define OTHER     23  // Other
#define SPAN      24  // Branch/delay slot spans 2 pages
#define NI        25  // Not implemented
#define HLECALL   26  // PCSX fake opcodes for HLE
#define COP2      27  // Coprocessor 2 move
#define C2LS      28  // Coprocessor 2 load/store
#define C2OP      29  // Coprocessor 2 operation
#define INTCALL   30  // Call interpreter to handle rare corner cases

/* Branch codes */
#define TAKEN      1
#define NOTTAKEN   2
#define NULLDS     3

// Not real functions, just labels that show up in the assem_debug log
#define DJT_1 (void *)1l
#define DJT_2 (void *)2l
310
311 // asm linkage
312 int new_recompile_block(u_int addr);
313 void *get_addr_ht(u_int vaddr);
314 void invalidate_block(u_int block);
315 void invalidate_addr(u_int addr);
316 void remove_hash(int vaddr);
317 void dyna_linker();
318 void dyna_linker_ds();
319 void verify_code();
320 void verify_code_ds();
321 void cc_interrupt();
322 void fp_exception();
323 void fp_exception_ds();
324 void jump_to_new_pc();
325 void call_gteStall();
326 void new_dyna_leave();
327
328 // Needed by assembler
329 static void wb_register(signed char r,signed char regmap[],uint64_t dirty);
330 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty);
331 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr);
332 static void load_all_regs(signed char i_regmap[]);
333 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
334 static void load_regs_entry(int t);
335 static void load_all_consts(signed char regmap[],u_int dirty,int i);
336 static u_int get_host_reglist(const signed char *regmap);
337
338 static int verify_dirty(const u_int *ptr);
339 static int get_final_value(int hr, int i, int *value);
340 static void add_stub(enum stub_type type, void *addr, void *retaddr,
341   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
342 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
343   int i, int addr_reg, const struct regstat *i_regs, int ccadj, u_int reglist);
344 static void add_to_linker(void *addr, u_int target, int ext);
345 static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override);
346 static void *get_direct_memhandler(void *table, u_int addr,
347   enum stub_type type, uintptr_t *addr_host);
348 static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u_int reglist);
349 static void pass_args(int a0, int a1);
350 static void emit_far_jump(const void *f);
351 static void emit_far_call(const void *f);
352
/*
 * Switch the memory range [start, end) between "read + write" (is_x == 0)
 * and "read + execute" (is_x != 0).
 * Compiles to nothing unless NO_WRITE_EXEC is defined.  A failing
 * mprotect() is only reported, not treated as fatal.
 */
static void mprotect_w_x(void *start, void *end, int is_x)
{
#ifdef NO_WRITE_EXEC
  #if defined(VITA)
  // *Open* enables write on all memory that was
  // allocated by sceKernelAllocMemBlockForVM()?
  if (!is_x)
    sceKernelOpenVMDomain();
  else
    sceKernelCloseVMDomain();
  #else
  // mprotect() wants a page aligned start address
  const u_long page_start = (u_long)start & ~4095ul;
  const u_long length = (u_long)end - page_start;
  const int prot = is_x ? (PROT_READ | PROT_EXEC) : (PROT_READ | PROT_WRITE);

  if (mprotect((void *)page_start, length, prot) != 0)
    SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
  #endif
#endif
}
372
// Make [start, end) writable before code in it is generated or patched.
static void start_tcache_write(void *start, void *end)
{
  const int make_executable = 0;

  mprotect_w_x(start, end, make_executable);
}
377
// Finish writing code to [start, end): on ARM hosts run the platform's
// cache maintenance call for the range, then switch the range back to
// executable.
static void end_tcache_write(void *start, void *end)
{
#if defined(__arm__) || defined(__aarch64__)
  const size_t nbytes = (char *)end - (char *)start;
  (void)nbytes; // not every variant below takes a length

  #if   defined(__BLACKBERRY_QNX__)
  msync(start, nbytes, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
  #elif defined(__MACH__)
  sys_cache_control(kCacheFunctionPrepareForExecution, start, nbytes);
  #elif defined(VITA)
  sceKernelSyncVMDomain(sceBlock, start, nbytes);
  #elif defined(_3DS)
  ctr_flush_invalidate_cache();
  #elif defined(__aarch64__)
  // as of 2021, __clear_cache() is still broken on arm64
  // so here is a custom one :(
  clear_cache_arm64(start, end);
  #else
  __clear_cache(start, end);
  #endif
#endif

  mprotect_w_x(start, end, 1);
}
402
403 static void *start_block(void)
404 {
405   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
406   if (end > ndrc->translation_cache + sizeof(ndrc->translation_cache))
407     end = ndrc->translation_cache + sizeof(ndrc->translation_cache);
408   start_tcache_write(out, end);
409   return out;
410 }
411
412 static void end_block(void *start)
413 {
414   end_tcache_write(start, out);
415 }
416
417 // also takes care of w^x mappings when patching code
418 static u_int needs_clear_cache[1<<(TARGET_SIZE_2-17)];
419
420 static void mark_clear_cache(void *target)
421 {
422   uintptr_t offset = (u_char *)target - ndrc->translation_cache;
423   u_int mask = 1u << ((offset >> 12) & 31);
424   if (!(needs_clear_cache[offset >> 17] & mask)) {
425     char *start = (char *)((uintptr_t)target & ~4095l);
426     start_tcache_write(start, start + 4095);
427     needs_clear_cache[offset >> 17] |= mask;
428   }
429 }
430
431 // Clearing the cache is rather slow on ARM Linux, so mark the areas
432 // that need to be cleared, and then only clear these areas once.
433 static void do_clear_cache(void)
434 {
435   int i, j;
436   for (i = 0; i < (1<<(TARGET_SIZE_2-17)); i++)
437   {
438     u_int bitmap = needs_clear_cache[i];
439     if (!bitmap)
440       continue;
441     for (j = 0; j < 32; j++)
442     {
443       u_char *start, *end;
444       if (!(bitmap & (1<<j)))
445         continue;
446
447       start = ndrc->translation_cache + i*131072 + j*4096;
448       end = start + 4095;
449       for (j++; j < 32; j++) {
450         if (!(bitmap & (1<<j)))
451           break;
452         end += 4096;
453       }
454       end_tcache_write(start, end);
455     }
456     needs_clear_cache[i] = 0;
457   }
458 }
459
460 //#define DEBUG_CYCLE_COUNT 1
461
462 #define NO_CYCLE_PENALTY_THR 12
463
int cycle_multiplier; // 100 for 1.0
int cycle_multiplier_override;
int cycle_multiplier_old;

// Scale a cycle count by the active multiplier (a percentage), rounding
// to nearest with halves going away from zero.  A non-zero
// cycle_multiplier_override takes precedence over cycle_multiplier.
static int CLOCK_ADJUST(int x)
{
  const int percent = cycle_multiplier_override != 0
                    ? cycle_multiplier_override
                    : cycle_multiplier;
  const int half = (x < 0) ? -50 : 50;

  return (x * percent + half) / 100;
}
475
476 // is the op an unconditional jump?
477 static int is_ujump(int i)
478 {
479   return itype[i] == UJUMP || itype[i] == RJUMP
480     || (source[i] >> 16) == 0x1000; // beq r0, r0, offset // b offset
481 }
482
483 static int is_jump(int i)
484 {
485   return itype[i] == RJUMP || itype[i] == UJUMP || itype[i] == CJUMP || itype[i] == SJUMP;
486 }
487
// Map a guest address to an index for the jump_in/jump_dirty tables.
// The top three address bits are ignored, the RAM mirrors below 16 MiB
// fold onto the first 2 MiB, and every 4 KiB page index above 2048 is
// wrapped into the range 2048..4095.
static u_int get_page(u_int vaddr)
{
  u_int addr = vaddr & 0x1fffffff;
  u_int page;

  if (addr < 0x1000000)
    addr &= 0x001fffff; // RAM mirrors
  page = addr >> 12;
  return (page > 2048) ? 2048 + (page & 2047) : page;
}
497
498 // no virtual mem in PCSX
499 static u_int get_vpage(u_int vaddr)
500 {
501   return get_page(vaddr);
502 }
503
504 static struct ht_entry *hash_table_get(u_int vaddr)
505 {
506   return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
507 }
508
509 static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
510 {
511   ht_bin->vaddr[1] = ht_bin->vaddr[0];
512   ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
513   ht_bin->vaddr[0] = vaddr;
514   ht_bin->tcaddr[0] = tcaddr;
515 }
516
517 // some messy ari64's code, seems to rely on unsigned 32bit overflow
518 static int doesnt_expire_soon(void *tcaddr)
519 {
520   u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
521   return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
522 }
523
524 // Get address from virtual address
525 // This is called from the recompiled JR/JALR instructions
526 void noinline *get_addr(u_int vaddr)
527 {
528   u_int page=get_page(vaddr);
529   u_int vpage=get_vpage(vaddr);
530   struct ll_entry *head;
531   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
532   head=jump_in[page];
533   while(head!=NULL) {
534     if(head->vaddr==vaddr) {
535   //printf("TRACE: count=%d next=%d (get_addr match %x: %p)\n",Count,next_interupt,vaddr,head->addr);
536       hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
537       return head->addr;
538     }
539     head=head->next;
540   }
541   head=jump_dirty[vpage];
542   while(head!=NULL) {
543     if(head->vaddr==vaddr) {
544       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %p)\n",Count,next_interupt,vaddr,head->addr);
545       // Don't restore blocks which are about to expire from the cache
546       if (doesnt_expire_soon(head->addr))
547       if (verify_dirty(head->addr)) {
548         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
549         invalid_code[vaddr>>12]=0;
550         inv_code_start=inv_code_end=~0;
551         if(vpage<2048) {
552           restore_candidate[vpage>>3]|=1<<(vpage&7);
553         }
554         else restore_candidate[page>>3]|=1<<(page&7);
555         struct ht_entry *ht_bin = hash_table_get(vaddr);
556         if (ht_bin->vaddr[0] == vaddr)
557           ht_bin->tcaddr[0] = head->addr; // Replace existing entry
558         else
559           hash_table_add(ht_bin, vaddr, head->addr);
560
561         return head->addr;
562       }
563     }
564     head=head->next;
565   }
566   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
567   int r=new_recompile_block(vaddr);
568   if(r==0) return get_addr(vaddr);
569   // Execute in unmapped page, generate pagefault execption
570   Status|=2;
571   Cause=(vaddr<<31)|0x8;
572   EPC=(vaddr&1)?vaddr-5:vaddr;
573   BadVAddr=(vaddr&~1);
574   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
575   EntryHi=BadVAddr&0xFFFFE000;
576   return get_addr_ht(0x80000000);
577 }
578 // Look up address in hash table first
579 void *get_addr_ht(u_int vaddr)
580 {
581   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
582   const struct ht_entry *ht_bin = hash_table_get(vaddr);
583   if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
584   if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
585   return get_addr(vaddr);
586 }
587
588 void clear_all_regs(signed char regmap[])
589 {
590   int hr;
591   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
592 }
593
594 static signed char get_reg(const signed char regmap[],int r)
595 {
596   int hr;
597   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
598   return -1;
599 }
600
601 // Find a register that is available for two consecutive cycles
602 static signed char get_reg2(signed char regmap1[], const signed char regmap2[], int r)
603 {
604   int hr;
605   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
606   return -1;
607 }
608
609 int count_free_regs(signed char regmap[])
610 {
611   int count=0;
612   int hr;
613   for(hr=0;hr<HOST_REGS;hr++)
614   {
615     if(hr!=EXCLUDE_REG) {
616       if(regmap[hr]<0) count++;
617     }
618   }
619   return count;
620 }
621
622 void dirty_reg(struct regstat *cur,signed char reg)
623 {
624   int hr;
625   if(!reg) return;
626   for (hr=0;hr<HOST_REGS;hr++) {
627     if((cur->regmap[hr]&63)==reg) {
628       cur->dirty|=1<<hr;
629     }
630   }
631 }
632
633 static void set_const(struct regstat *cur, signed char reg, uint32_t value)
634 {
635   int hr;
636   if(!reg) return;
637   for (hr=0;hr<HOST_REGS;hr++) {
638     if(cur->regmap[hr]==reg) {
639       cur->isconst|=1<<hr;
640       current_constmap[hr]=value;
641     }
642   }
643 }
644
645 static void clear_const(struct regstat *cur, signed char reg)
646 {
647   int hr;
648   if(!reg) return;
649   for (hr=0;hr<HOST_REGS;hr++) {
650     if((cur->regmap[hr]&63)==reg) {
651       cur->isconst&=~(1<<hr);
652     }
653   }
654 }
655
656 static int is_const(struct regstat *cur, signed char reg)
657 {
658   int hr;
659   if(reg<0) return 0;
660   if(!reg) return 1;
661   for (hr=0;hr<HOST_REGS;hr++) {
662     if((cur->regmap[hr]&63)==reg) {
663       return (cur->isconst>>hr)&1;
664     }
665   }
666   return 0;
667 }
668
669 static uint32_t get_const(struct regstat *cur, signed char reg)
670 {
671   int hr;
672   if(!reg) return 0;
673   for (hr=0;hr<HOST_REGS;hr++) {
674     if(cur->regmap[hr]==reg) {
675       return current_constmap[hr];
676     }
677   }
678   SysPrintf("Unknown constant in r%d\n",reg);
679   abort();
680 }
681
682 // Least soon needed registers
683 // Look at the next ten instructions and see which registers
684 // will be used.  Try not to reallocate these.
685 void lsn(u_char hsn[], int i, int *preferred_reg)
686 {
687   int j;
688   int b=-1;
689   for(j=0;j<9;j++)
690   {
691     if(i+j>=slen) {
692       j=slen-i-1;
693       break;
694     }
695     if (is_ujump(i+j))
696     {
697       // Don't go past an unconditonal jump
698       j++;
699       break;
700     }
701   }
702   for(;j>=0;j--)
703   {
704     if(rs1[i+j]) hsn[rs1[i+j]]=j;
705     if(rs2[i+j]) hsn[rs2[i+j]]=j;
706     if(rt1[i+j]) hsn[rt1[i+j]]=j;
707     if(rt2[i+j]) hsn[rt2[i+j]]=j;
708     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
709       // Stores can allocate zero
710       hsn[rs1[i+j]]=j;
711       hsn[rs2[i+j]]=j;
712     }
713     // On some architectures stores need invc_ptr
714     #if defined(HOST_IMM8)
715     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
716       hsn[INVCP]=j;
717     }
718     #endif
719     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
720     {
721       hsn[CCREG]=j;
722       b=j;
723     }
724   }
725   if(b>=0)
726   {
727     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
728     {
729       // Follow first branch
730       int t=(ba[i+b]-start)>>2;
731       j=7-b;if(t+j>=slen) j=slen-t-1;
732       for(;j>=0;j--)
733       {
734         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
735         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
736         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
737         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
738       }
739     }
740     // TODO: preferred register based on backward branch
741   }
742   // Delay slot should preferably not overwrite branch conditions or cycle count
743   if (i > 0 && is_jump(i-1)) {
744     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
745     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
746     hsn[CCREG]=1;
747     // ...or hash tables
748     hsn[RHASH]=1;
749     hsn[RHTBL]=1;
750   }
751   // Coprocessor load/store needs FTEMP, even if not declared
752   if(itype[i]==C1LS||itype[i]==C2LS) {
753     hsn[FTEMP]=0;
754   }
755   // Load L/R also uses FTEMP as a temporary register
756   if(itype[i]==LOADLR) {
757     hsn[FTEMP]=0;
758   }
759   // Also SWL/SWR/SDL/SDR
760   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
761     hsn[FTEMP]=0;
762   }
763   // Don't remove the miniht registers
764   if(itype[i]==UJUMP||itype[i]==RJUMP)
765   {
766     hsn[RHASH]=0;
767     hsn[RHTBL]=0;
768   }
769 }
770
771 // We only want to allocate registers if we're going to use them again soon
772 int needed_again(int r, int i)
773 {
774   int j;
775   int b=-1;
776   int rn=10;
777
778   if (i > 0 && is_ujump(i-1))
779   {
780     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
781       return 0; // Don't need any registers if exiting the block
782   }
783   for(j=0;j<9;j++)
784   {
785     if(i+j>=slen) {
786       j=slen-i-1;
787       break;
788     }
789     if (is_ujump(i+j))
790     {
791       // Don't go past an unconditonal jump
792       j++;
793       break;
794     }
795     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
796     {
797       break;
798     }
799   }
800   for(;j>=1;j--)
801   {
802     if(rs1[i+j]==r) rn=j;
803     if(rs2[i+j]==r) rn=j;
804     if((unneeded_reg[i+j]>>r)&1) rn=10;
805     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
806     {
807       b=j;
808     }
809   }
810   /*
811   if(b>=0)
812   {
813     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
814     {
815       // Follow first branch
816       int o=rn;
817       int t=(ba[i+b]-start)>>2;
818       j=7-b;if(t+j>=slen) j=slen-t-1;
819       for(;j>=0;j--)
820       {
821         if(!((unneeded_reg[t+j]>>r)&1)) {
822           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
823           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
824         }
825         else rn=o;
826       }
827     }
828   }*/
829   if(rn<10) return 1;
830   (void)b;
831   return 0;
832 }
833
834 // Try to match register allocations at the end of a loop with those
835 // at the beginning
836 int loop_reg(int i, int r, int hr)
837 {
838   int j,k;
839   for(j=0;j<9;j++)
840   {
841     if(i+j>=slen) {
842       j=slen-i-1;
843       break;
844     }
845     if (is_ujump(i+j))
846     {
847       // Don't go past an unconditonal jump
848       j++;
849       break;
850     }
851   }
852   k=0;
853   if(i>0){
854     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)
855       k--;
856   }
857   for(;k<j;k++)
858   {
859     assert(r < 64);
860     if((unneeded_reg[i+k]>>r)&1) return hr;
861     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP))
862     {
863       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
864       {
865         int t=(ba[i+k]-start)>>2;
866         int reg=get_reg(regs[t].regmap_entry,r);
867         if(reg>=0) return reg;
868         //reg=get_reg(regs[t+1].regmap_entry,r);
869         //if(reg>=0) return reg;
870       }
871     }
872   }
873   return hr;
874 }
875
876
877 // Allocate every register, preserving source/target regs
878 void alloc_all(struct regstat *cur,int i)
879 {
880   int hr;
881
882   for(hr=0;hr<HOST_REGS;hr++) {
883     if(hr!=EXCLUDE_REG) {
884       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
885          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
886       {
887         cur->regmap[hr]=-1;
888         cur->dirty&=~(1<<hr);
889       }
890       // Don't need zeros
891       if((cur->regmap[hr]&63)==0)
892       {
893         cur->regmap[hr]=-1;
894         cur->dirty&=~(1<<hr);
895       }
896     }
897   }
898 }
899
// Debug aid that tracks use of the host temporary register: acquiring it
// while it is already marked as in use trips an assertion.  With NDEBUG
// both functions are empty.
#ifdef NDEBUG
static void host_tempreg_acquire(void) {}
static void host_tempreg_release(void) {}
#else
static int host_tempreg_in_use;

static void host_tempreg_acquire(void)
{
  assert(!host_tempreg_in_use);
  host_tempreg_in_use = 1;
}

static void host_tempreg_release(void)
{
  host_tempreg_in_use = 0;
}
#endif
917
// func_name(): translate the address of a known helper function into a
// printable name for the assembler debug output.  The name is prefixed
// with a space; unknown addresses give "".  Without ASSEM_PRINT it is a
// macro that always yields "".
#ifdef ASSEM_PRINT
extern void gen_interupt();
extern void do_insn_cmp();

#define FUNCNAME(f) { f, " " #f }
static const struct {
  void *addr;
  const char *name;
} function_names[] = {
  FUNCNAME(cc_interrupt),
  FUNCNAME(gen_interupt),
  FUNCNAME(get_addr_ht),
  FUNCNAME(get_addr),
  FUNCNAME(jump_handler_read8),
  FUNCNAME(jump_handler_read16),
  FUNCNAME(jump_handler_read32),
  FUNCNAME(jump_handler_write8),
  FUNCNAME(jump_handler_write16),
  FUNCNAME(jump_handler_write32),
  FUNCNAME(invalidate_addr),
  FUNCNAME(jump_to_new_pc),
  FUNCNAME(call_gteStall),
  FUNCNAME(new_dyna_leave),
  FUNCNAME(pcsx_mtc0),
  FUNCNAME(pcsx_mtc0_ds),
#ifdef DRC_DBG
  FUNCNAME(do_insn_cmp),
#endif
#ifdef __arm__
  FUNCNAME(verify_code),
#endif
};

static const char *func_name(const void *a)
{
  size_t idx = 0;

  while (idx < ARRAY_SIZE(function_names)) {
    if (function_names[idx].addr == a)
      return function_names[idx].name;
    idx++;
  }
  return "";
}
#else
#define func_name(x) ""
#endif
961
962 #ifdef __i386__
963 #include "assem_x86.c"
964 #endif
965 #ifdef __x86_64__
966 #include "assem_x64.c"
967 #endif
968 #ifdef __arm__
969 #include "assem_arm.c"
970 #endif
971 #ifdef __aarch64__
972 #include "assem_arm64.c"
973 #endif
974
975 static void *get_trampoline(const void *f)
976 {
977   size_t i;
978
979   for (i = 0; i < ARRAY_SIZE(ndrc->tramp.f); i++) {
980     if (ndrc->tramp.f[i] == f || ndrc->tramp.f[i] == NULL)
981       break;
982   }
983   if (i == ARRAY_SIZE(ndrc->tramp.f)) {
984     SysPrintf("trampoline table is full, last func %p\n", f);
985     abort();
986   }
987   if (ndrc->tramp.f[i] == NULL) {
988     start_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
989     ndrc->tramp.f[i] = f;
990     end_tcache_write(&ndrc->tramp.f[i], &ndrc->tramp.f[i + 1]);
991   }
992   return &ndrc->tramp.ops[i];
993 }
994
// Emit a jump to f.  If can_jump_or_call() rejects f as a direct target,
// the jump goes to f's trampoline instead.
static void emit_far_jump(const void *f)
{
  const void *target = can_jump_or_call(f) ? f : get_trampoline(f);

  emit_jmp(target);
}
1005
// Emit a call to f.  If can_jump_or_call() rejects f as a direct target,
// the call goes to f's trampoline instead.
static void emit_far_call(const void *f)
{
  const void *target = can_jump_or_call(f) ? f : get_trampoline(f);

  emit_call(target);
}
1016
1017 // Add virtual address mapping to linked list
1018 void ll_add(struct ll_entry **head,int vaddr,void *addr)
1019 {
1020   struct ll_entry *new_entry;
1021   new_entry=malloc(sizeof(struct ll_entry));
1022   assert(new_entry!=NULL);
1023   new_entry->vaddr=vaddr;
1024   new_entry->reg_sv_flags=0;
1025   new_entry->addr=addr;
1026   new_entry->next=*head;
1027   *head=new_entry;
1028 }
1029
1030 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
1031 {
1032   ll_add(head,vaddr,addr);
1033   (*head)->reg_sv_flags=reg_sv_flags;
1034 }
1035
1036 // Check if an address is already compiled
1037 // but don't return addresses which are about to expire from the cache
1038 void *check_addr(u_int vaddr)
1039 {
1040   struct ht_entry *ht_bin = hash_table_get(vaddr);
1041   size_t i;
1042   for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
1043     if (ht_bin->vaddr[i] == vaddr)
1044       if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
1045         if (isclean(ht_bin->tcaddr[i]))
1046           return ht_bin->tcaddr[i];
1047   }
1048   u_int page=get_page(vaddr);
1049   struct ll_entry *head;
1050   head=jump_in[page];
1051   while (head != NULL) {
1052     if (head->vaddr == vaddr) {
1053       if (doesnt_expire_soon(head->addr)) {
1054         // Update existing entry with current address
1055         if (ht_bin->vaddr[0] == vaddr) {
1056           ht_bin->tcaddr[0] = head->addr;
1057           return head->addr;
1058         }
1059         if (ht_bin->vaddr[1] == vaddr) {
1060           ht_bin->tcaddr[1] = head->addr;
1061           return head->addr;
1062         }
1063         // Insert into hash table with low priority.
1064         // Don't evict existing entries, as they are probably
1065         // addresses that are being accessed frequently.
1066         if (ht_bin->vaddr[0] == -1) {
1067           ht_bin->vaddr[0] = vaddr;
1068           ht_bin->tcaddr[0] = head->addr;
1069         }
1070         else if (ht_bin->vaddr[1] == -1) {
1071           ht_bin->vaddr[1] = vaddr;
1072           ht_bin->tcaddr[1] = head->addr;
1073         }
1074         return head->addr;
1075       }
1076     }
1077     head=head->next;
1078   }
1079   return 0;
1080 }
1081
1082 void remove_hash(int vaddr)
1083 {
1084   //printf("remove hash: %x\n",vaddr);
1085   struct ht_entry *ht_bin = hash_table_get(vaddr);
1086   if (ht_bin->vaddr[1] == vaddr) {
1087     ht_bin->vaddr[1] = -1;
1088     ht_bin->tcaddr[1] = NULL;
1089   }
1090   if (ht_bin->vaddr[0] == vaddr) {
1091     ht_bin->vaddr[0] = ht_bin->vaddr[1];
1092     ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
1093     ht_bin->vaddr[1] = -1;
1094     ht_bin->tcaddr[1] = NULL;
1095   }
1096 }
1097
1098 static void ll_remove_matching_addrs(struct ll_entry **head,
1099   uintptr_t base_offs_s, int shift)
1100 {
1101   struct ll_entry *next;
1102   while(*head) {
1103     uintptr_t o1 = (u_char *)(*head)->addr - ndrc->translation_cache;
1104     uintptr_t o2 = o1 - MAX_OUTPUT_BLOCK_SIZE;
1105     if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s)
1106     {
1107       inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr);
1108       remove_hash((*head)->vaddr);
1109       next=(*head)->next;
1110       free(*head);
1111       *head=next;
1112     }
1113     else
1114     {
1115       head=&((*head)->next);
1116     }
1117   }
1118 }
1119
1120 // Remove all entries from linked list
1121 void ll_clear(struct ll_entry **head)
1122 {
1123   struct ll_entry *cur;
1124   struct ll_entry *next;
1125   if((cur=*head)) {
1126     *head=0;
1127     while(cur) {
1128       next=cur->next;
1129       free(cur);
1130       cur=next;
1131     }
1132   }
1133 }
1134
1135 // Dereference the pointers and remove if it matches
1136 static void ll_kill_pointers(struct ll_entry *head,
1137   uintptr_t base_offs_s, int shift)
1138 {
1139   while(head) {
1140     u_char *ptr = get_pointer(head->addr);
1141     uintptr_t o1 = ptr - ndrc->translation_cache;
1142     uintptr_t o2 = o1 - MAX_OUTPUT_BLOCK_SIZE;
1143     inv_debug("EXP: Lookup pointer to %p at %p (%x)\n",ptr,head->addr,head->vaddr);
1144     if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s)
1145     {
1146       inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr);
1147       void *host_addr=find_extjump_insn(head->addr);
1148       mark_clear_cache(host_addr);
1149       set_jump_target(host_addr, head->addr);
1150     }
1151     head=head->next;
1152   }
1153 }
1154
1155 // This is called when we write to a compiled block (see do_invstub)
1156 static void invalidate_page(u_int page)
1157 {
1158   struct ll_entry *head;
1159   struct ll_entry *next;
1160   head=jump_in[page];
1161   jump_in[page]=0;
1162   while(head!=NULL) {
1163     inv_debug("INVALIDATE: %x\n",head->vaddr);
1164     remove_hash(head->vaddr);
1165     next=head->next;
1166     free(head);
1167     head=next;
1168   }
1169   head=jump_out[page];
1170   jump_out[page]=0;
1171   while(head!=NULL) {
1172     inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr);
1173     void *host_addr=find_extjump_insn(head->addr);
1174     mark_clear_cache(host_addr);
1175     set_jump_target(host_addr, head->addr);
1176     next=head->next;
1177     free(head);
1178     head=next;
1179   }
1180 }
1181
1182 static void invalidate_block_range(u_int block, u_int first, u_int last)
1183 {
1184   u_int page=get_page(block<<12);
1185   //printf("first=%d last=%d\n",first,last);
1186   invalidate_page(page);
1187   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1188   assert(last<page+5);
1189   // Invalidate the adjacent pages if a block crosses a 4K boundary
1190   while(first<page) {
1191     invalidate_page(first);
1192     first++;
1193   }
1194   for(first=page+1;first<last;first++) {
1195     invalidate_page(first);
1196   }
1197   do_clear_cache();
1198
1199   // Don't trap writes
1200   invalid_code[block]=1;
1201
1202   #ifdef USE_MINI_HT
1203   memset(mini_ht,-1,sizeof(mini_ht));
1204   #endif
1205 }
1206
1207 void invalidate_block(u_int block)
1208 {
1209   u_int page=get_page(block<<12);
1210   u_int vpage=get_vpage(block<<12);
1211   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1212   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1213   u_int first,last;
1214   first=last=page;
1215   struct ll_entry *head;
1216   head=jump_dirty[vpage];
1217   //printf("page=%d vpage=%d\n",page,vpage);
1218   while(head!=NULL) {
1219     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1220       u_char *start, *end;
1221       get_bounds(head->addr, &start, &end);
1222       //printf("start: %p end: %p\n", start, end);
1223       if (page < 2048 && start >= rdram && end < rdram+RAM_SIZE) {
1224         if (((start-rdram)>>12) <= page && ((end-1-rdram)>>12) >= page) {
1225           if ((((start-rdram)>>12)&2047) < first) first = ((start-rdram)>>12)&2047;
1226           if ((((end-1-rdram)>>12)&2047) > last)  last = ((end-1-rdram)>>12)&2047;
1227         }
1228       }
1229     }
1230     head=head->next;
1231   }
1232   invalidate_block_range(block,first,last);
1233 }
1234
1235 void invalidate_addr(u_int addr)
1236 {
1237   //static int rhits;
1238   // this check is done by the caller
1239   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1240   u_int page=get_vpage(addr);
1241   if(page<2048) { // RAM
1242     struct ll_entry *head;
1243     u_int addr_min=~0, addr_max=0;
1244     u_int mask=RAM_SIZE-1;
1245     u_int addr_main=0x80000000|(addr&mask);
1246     int pg1;
1247     inv_code_start=addr_main&~0xfff;
1248     inv_code_end=addr_main|0xfff;
1249     pg1=page;
1250     if (pg1>0) {
1251       // must check previous page too because of spans..
1252       pg1--;
1253       inv_code_start-=0x1000;
1254     }
1255     for(;pg1<=page;pg1++) {
1256       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1257         u_char *start_h, *end_h;
1258         u_int start, end;
1259         get_bounds(head->addr, &start_h, &end_h);
1260         start = (uintptr_t)start_h - ram_offset;
1261         end = (uintptr_t)end_h - ram_offset;
1262         if(start<=addr_main&&addr_main<end) {
1263           if(start<addr_min) addr_min=start;
1264           if(end>addr_max) addr_max=end;
1265         }
1266         else if(addr_main<start) {
1267           if(start<inv_code_end)
1268             inv_code_end=start-1;
1269         }
1270         else {
1271           if(end>inv_code_start)
1272             inv_code_start=end;
1273         }
1274       }
1275     }
1276     if (addr_min!=~0) {
1277       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1278       inv_code_start=inv_code_end=~0;
1279       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1280       return;
1281     }
1282     else {
1283       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1284       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1285       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1286       return;
1287     }
1288   }
1289   invalidate_block(addr>>12);
1290 }
1291
1292 // This is called when loading a save state.
1293 // Anything could have changed, so invalidate everything.
1294 void invalidate_all_pages(void)
1295 {
1296   u_int page;
1297   for(page=0;page<4096;page++)
1298     invalidate_page(page);
1299   for(page=0;page<1048576;page++)
1300     if(!invalid_code[page]) {
1301       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1302       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1303     }
1304   #ifdef USE_MINI_HT
1305   memset(mini_ht,-1,sizeof(mini_ht));
1306   #endif
1307   do_clear_cache();
1308 }
1309
1310 static void do_invstub(int n)
1311 {
1312   literal_pool(20);
1313   u_int reglist=stubs[n].a;
1314   set_jump_target(stubs[n].addr, out);
1315   save_regs(reglist);
1316   if(stubs[n].b!=0) emit_mov(stubs[n].b,0);
1317   emit_far_call(invalidate_addr);
1318   restore_regs(reglist);
1319   emit_jmp(stubs[n].retaddr); // return address
1320 }
1321
1322 // Add an entry to jump_out after making a link
1323 // src should point to code by emit_extjump2()
1324 void add_link(u_int vaddr,void *src)
1325 {
1326   u_int page=get_page(vaddr);
1327   inv_debug("add_link: %p -> %x (%d)\n",src,vaddr,page);
1328   check_extjump2(src);
1329   ll_add(jump_out+page,vaddr,src);
1330   //void *ptr=get_pointer(src);
1331   //inv_debug("add_link: Pointer is to %p\n",ptr);
1332 }
1333
1334 // If a code block was found to be unmodified (bit was set in
1335 // restore_candidate) and it remains unmodified (bit is clear
1336 // in invalid_code) then move the entries for that 4K page from
1337 // the dirty list to the clean list.
1338 void clean_blocks(u_int page)
1339 {
1340   struct ll_entry *head;
1341   inv_debug("INV: clean_blocks page=%d\n",page);
1342   head=jump_dirty[page];
1343   while(head!=NULL) {
1344     if(!invalid_code[head->vaddr>>12]) {
1345       // Don't restore blocks which are about to expire from the cache
1346       if (doesnt_expire_soon(head->addr)) {
1347         if(verify_dirty(head->addr)) {
1348           u_char *start, *end;
1349           //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr);
1350           u_int i;
1351           u_int inv=0;
1352           get_bounds(head->addr, &start, &end);
1353           if (start - rdram < RAM_SIZE) {
1354             for (i = (start-rdram+0x80000000)>>12; i <= (end-1-rdram+0x80000000)>>12; i++) {
1355               inv|=invalid_code[i];
1356             }
1357           }
1358           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1359             inv=1;
1360           }
1361           if(!inv) {
1362             void *clean_addr = get_clean_addr(head->addr);
1363             if (doesnt_expire_soon(clean_addr)) {
1364               u_int ppage=page;
1365               inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr);
1366               //printf("page=%x, addr=%x\n",page,head->vaddr);
1367               //assert(head->vaddr>>12==(page|0x80000));
1368               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1369               struct ht_entry *ht_bin = hash_table_get(head->vaddr);
1370               if (ht_bin->vaddr[0] == head->vaddr)
1371                 ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
1372               if (ht_bin->vaddr[1] == head->vaddr)
1373                 ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
1374             }
1375           }
1376         }
1377       }
1378     }
1379     head=head->next;
1380   }
1381 }
1382
1383 /* Register allocation */
1384
1385 // Note: registers are allocated clean (unmodified state)
1386 // if you intend to modify the register, you must call dirty_reg().
1387 static void alloc_reg(struct regstat *cur,int i,signed char reg)
1388 {
1389   int r,hr;
1390   int preferred_reg = (reg&7);
1391   if(reg==CCREG) preferred_reg=HOST_CCREG;
1392   if(reg==PTEMP||reg==FTEMP) preferred_reg=12;
1393
1394   // Don't allocate unused registers
1395   if((cur->u>>reg)&1) return;
1396
1397   // see if it's already allocated
1398   for(hr=0;hr<HOST_REGS;hr++)
1399   {
1400     if(cur->regmap[hr]==reg) return;
1401   }
1402
1403   // Keep the same mapping if the register was already allocated in a loop
1404   preferred_reg = loop_reg(i,reg,preferred_reg);
1405
1406   // Try to allocate the preferred register
1407   if(cur->regmap[preferred_reg]==-1) {
1408     cur->regmap[preferred_reg]=reg;
1409     cur->dirty&=~(1<<preferred_reg);
1410     cur->isconst&=~(1<<preferred_reg);
1411     return;
1412   }
1413   r=cur->regmap[preferred_reg];
1414   assert(r < 64);
1415   if((cur->u>>r)&1) {
1416     cur->regmap[preferred_reg]=reg;
1417     cur->dirty&=~(1<<preferred_reg);
1418     cur->isconst&=~(1<<preferred_reg);
1419     return;
1420   }
1421
1422   // Clear any unneeded registers
1423   // We try to keep the mapping consistent, if possible, because it
1424   // makes branches easier (especially loops).  So we try to allocate
1425   // first (see above) before removing old mappings.  If this is not
1426   // possible then go ahead and clear out the registers that are no
1427   // longer needed.
1428   for(hr=0;hr<HOST_REGS;hr++)
1429   {
1430     r=cur->regmap[hr];
1431     if(r>=0) {
1432       assert(r < 64);
1433       if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;}
1434     }
1435   }
1436   // Try to allocate any available register, but prefer
1437   // registers that have not been used recently.
1438   if(i>0) {
1439     for(hr=0;hr<HOST_REGS;hr++) {
1440       if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
1441         if(regs[i-1].regmap[hr]!=rs1[i-1]&&regs[i-1].regmap[hr]!=rs2[i-1]&&regs[i-1].regmap[hr]!=rt1[i-1]&&regs[i-1].regmap[hr]!=rt2[i-1]) {
1442           cur->regmap[hr]=reg;
1443           cur->dirty&=~(1<<hr);
1444           cur->isconst&=~(1<<hr);
1445           return;
1446         }
1447       }
1448     }
1449   }
1450   // Try to allocate any available register
1451   for(hr=0;hr<HOST_REGS;hr++) {
1452     if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
1453       cur->regmap[hr]=reg;
1454       cur->dirty&=~(1<<hr);
1455       cur->isconst&=~(1<<hr);
1456       return;
1457     }
1458   }
1459
1460   // Ok, now we have to evict someone
1461   // Pick a register we hopefully won't need soon
1462   u_char hsn[MAXREG+1];
1463   memset(hsn,10,sizeof(hsn));
1464   int j;
1465   lsn(hsn,i,&preferred_reg);
1466   //printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",cur->regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]);
1467   //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
1468   if(i>0) {
1469     // Don't evict the cycle count at entry points, otherwise the entry
1470     // stub will have to write it.
1471     if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
1472     if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP)) hsn[CCREG]=2;
1473     for(j=10;j>=3;j--)
1474     {
1475       // Alloc preferred register if available
1476       if(hsn[r=cur->regmap[preferred_reg]&63]==j) {
1477         for(hr=0;hr<HOST_REGS;hr++) {
1478           // Evict both parts of a 64-bit register
1479           if((cur->regmap[hr]&63)==r) {
1480             cur->regmap[hr]=-1;
1481             cur->dirty&=~(1<<hr);
1482             cur->isconst&=~(1<<hr);
1483           }
1484         }
1485         cur->regmap[preferred_reg]=reg;
1486         return;
1487       }
1488       for(r=1;r<=MAXREG;r++)
1489       {
1490         if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) {
1491           for(hr=0;hr<HOST_REGS;hr++) {
1492             if(hr!=HOST_CCREG||j<hsn[CCREG]) {
1493               if(cur->regmap[hr]==r) {
1494                 cur->regmap[hr]=reg;
1495                 cur->dirty&=~(1<<hr);
1496                 cur->isconst&=~(1<<hr);
1497                 return;
1498               }
1499             }
1500           }
1501         }
1502       }
1503     }
1504   }
1505   for(j=10;j>=0;j--)
1506   {
1507     for(r=1;r<=MAXREG;r++)
1508     {
1509       if(hsn[r]==j) {
1510         for(hr=0;hr<HOST_REGS;hr++) {
1511           if(cur->regmap[hr]==r) {
1512             cur->regmap[hr]=reg;
1513             cur->dirty&=~(1<<hr);
1514             cur->isconst&=~(1<<hr);
1515             return;
1516           }
1517         }
1518       }
1519     }
1520   }
1521   SysPrintf("This shouldn't happen (alloc_reg)");abort();
1522 }
1523
1524 // Allocate a temporary register.  This is done without regard to
1525 // dirty status or whether the register we request is on the unneeded list
1526 // Note: This will only allocate one register, even if called multiple times
1527 static void alloc_reg_temp(struct regstat *cur,int i,signed char reg)
1528 {
1529   int r,hr;
1530   int preferred_reg = -1;
1531
1532   // see if it's already allocated
1533   for(hr=0;hr<HOST_REGS;hr++)
1534   {
1535     if(hr!=EXCLUDE_REG&&cur->regmap[hr]==reg) return;
1536   }
1537
1538   // Try to allocate any available register
1539   for(hr=HOST_REGS-1;hr>=0;hr--) {
1540     if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) {
1541       cur->regmap[hr]=reg;
1542       cur->dirty&=~(1<<hr);
1543       cur->isconst&=~(1<<hr);
1544       return;
1545     }
1546   }
1547
1548   // Find an unneeded register
1549   for(hr=HOST_REGS-1;hr>=0;hr--)
1550   {
1551     r=cur->regmap[hr];
1552     if(r>=0) {
1553       assert(r < 64);
1554       if((cur->u>>r)&1) {
1555         if(i==0||((unneeded_reg[i-1]>>r)&1)) {
1556           cur->regmap[hr]=reg;
1557           cur->dirty&=~(1<<hr);
1558           cur->isconst&=~(1<<hr);
1559           return;
1560         }
1561       }
1562     }
1563   }
1564
1565   // Ok, now we have to evict someone
1566   // Pick a register we hopefully won't need soon
1567   // TODO: we might want to follow unconditional jumps here
1568   // TODO: get rid of dupe code and make this into a function
1569   u_char hsn[MAXREG+1];
1570   memset(hsn,10,sizeof(hsn));
1571   int j;
1572   lsn(hsn,i,&preferred_reg);
1573   //printf("hsn: %d %d %d %d %d %d %d\n",hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]);
1574   if(i>0) {
1575     // Don't evict the cycle count at entry points, otherwise the entry
1576     // stub will have to write it.
1577     if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2;
1578     if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP)) hsn[CCREG]=2;
1579     for(j=10;j>=3;j--)
1580     {
1581       for(r=1;r<=MAXREG;r++)
1582       {
1583         if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) {
1584           for(hr=0;hr<HOST_REGS;hr++) {
1585             if(hr!=HOST_CCREG||hsn[CCREG]>2) {
1586               if(cur->regmap[hr]==r) {
1587                 cur->regmap[hr]=reg;
1588                 cur->dirty&=~(1<<hr);
1589                 cur->isconst&=~(1<<hr);
1590                 return;
1591               }
1592             }
1593           }
1594         }
1595       }
1596     }
1597   }
1598   for(j=10;j>=0;j--)
1599   {
1600     for(r=1;r<=MAXREG;r++)
1601     {
1602       if(hsn[r]==j) {
1603         for(hr=0;hr<HOST_REGS;hr++) {
1604           if(cur->regmap[hr]==r) {
1605             cur->regmap[hr]=reg;
1606             cur->dirty&=~(1<<hr);
1607             cur->isconst&=~(1<<hr);
1608             return;
1609           }
1610         }
1611       }
1612     }
1613   }
1614   SysPrintf("This shouldn't happen");abort();
1615 }
1616
1617 static void mov_alloc(struct regstat *current,int i)
1618 {
1619   if (rs1[i] == HIREG || rs1[i] == LOREG) {
1620     // logically this is needed but just won't work, no idea why
1621     //alloc_cc(current,i); // for stalls
1622     //dirty_reg(current,CCREG);
1623   }
1624
1625   // Note: Don't need to actually alloc the source registers
1626   //alloc_reg(current,i,rs1[i]);
1627   alloc_reg(current,i,rt1[i]);
1628
1629   clear_const(current,rs1[i]);
1630   clear_const(current,rt1[i]);
1631   dirty_reg(current,rt1[i]);
1632 }
1633
1634 static void shiftimm_alloc(struct regstat *current,int i)
1635 {
1636   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1637   {
1638     if(rt1[i]) {
1639       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1640       else lt1[i]=rs1[i];
1641       alloc_reg(current,i,rt1[i]);
1642       dirty_reg(current,rt1[i]);
1643       if(is_const(current,rs1[i])) {
1644         int v=get_const(current,rs1[i]);
1645         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1646         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1647         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1648       }
1649       else clear_const(current,rt1[i]);
1650     }
1651   }
1652   else
1653   {
1654     clear_const(current,rs1[i]);
1655     clear_const(current,rt1[i]);
1656   }
1657
1658   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1659   {
1660     assert(0);
1661   }
1662   if(opcode2[i]==0x3c) // DSLL32
1663   {
1664     assert(0);
1665   }
1666   if(opcode2[i]==0x3e) // DSRL32
1667   {
1668     assert(0);
1669   }
1670   if(opcode2[i]==0x3f) // DSRA32
1671   {
1672     assert(0);
1673   }
1674 }
1675
1676 static void shift_alloc(struct regstat *current,int i)
1677 {
1678   if(rt1[i]) {
1679     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1680     {
1681       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1682       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1683       alloc_reg(current,i,rt1[i]);
1684       if(rt1[i]==rs2[i]) {
1685         alloc_reg_temp(current,i,-1);
1686         minimum_free_regs[i]=1;
1687       }
1688     } else { // DSLLV/DSRLV/DSRAV
1689       assert(0);
1690     }
1691     clear_const(current,rs1[i]);
1692     clear_const(current,rs2[i]);
1693     clear_const(current,rt1[i]);
1694     dirty_reg(current,rt1[i]);
1695   }
1696 }
1697
1698 static void alu_alloc(struct regstat *current,int i)
1699 {
1700   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1701     if(rt1[i]) {
1702       if(rs1[i]&&rs2[i]) {
1703         alloc_reg(current,i,rs1[i]);
1704         alloc_reg(current,i,rs2[i]);
1705       }
1706       else {
1707         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1708         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1709       }
1710       alloc_reg(current,i,rt1[i]);
1711     }
1712   }
1713   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1714     if(rt1[i]) {
1715       alloc_reg(current,i,rs1[i]);
1716       alloc_reg(current,i,rs2[i]);
1717       alloc_reg(current,i,rt1[i]);
1718     }
1719   }
1720   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1721     if(rt1[i]) {
1722       if(rs1[i]&&rs2[i]) {
1723         alloc_reg(current,i,rs1[i]);
1724         alloc_reg(current,i,rs2[i]);
1725       }
1726       else
1727       {
1728         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1729         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1730       }
1731       alloc_reg(current,i,rt1[i]);
1732     }
1733   }
1734   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1735     assert(0);
1736   }
1737   clear_const(current,rs1[i]);
1738   clear_const(current,rs2[i]);
1739   clear_const(current,rt1[i]);
1740   dirty_reg(current,rt1[i]);
1741 }
1742
1743 static void imm16_alloc(struct regstat *current,int i)
1744 {
1745   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1746   else lt1[i]=rs1[i];
1747   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1748   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1749     assert(0);
1750   }
1751   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1752     clear_const(current,rs1[i]);
1753     clear_const(current,rt1[i]);
1754   }
1755   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1756     if(is_const(current,rs1[i])) {
1757       int v=get_const(current,rs1[i]);
1758       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1759       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1760       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1761     }
1762     else clear_const(current,rt1[i]);
1763   }
1764   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1765     if(is_const(current,rs1[i])) {
1766       int v=get_const(current,rs1[i]);
1767       set_const(current,rt1[i],v+imm[i]);
1768     }
1769     else clear_const(current,rt1[i]);
1770   }
1771   else {
1772     set_const(current,rt1[i],imm[i]<<16); // LUI
1773   }
1774   dirty_reg(current,rt1[i]);
1775 }
1776
1777 static void load_alloc(struct regstat *current,int i)
1778 {
1779   clear_const(current,rt1[i]);
1780   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1781   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1782   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1783   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1784     alloc_reg(current,i,rt1[i]);
1785     assert(get_reg(current->regmap,rt1[i])>=0);
1786     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1787     {
1788       assert(0);
1789     }
1790     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1791     {
1792       assert(0);
1793     }
1794     dirty_reg(current,rt1[i]);
1795     // LWL/LWR need a temporary register for the old value
1796     if(opcode[i]==0x22||opcode[i]==0x26)
1797     {
1798       alloc_reg(current,i,FTEMP);
1799       alloc_reg_temp(current,i,-1);
1800       minimum_free_regs[i]=1;
1801     }
1802   }
1803   else
1804   {
1805     // Load to r0 or unneeded register (dummy load)
1806     // but we still need a register to calculate the address
1807     if(opcode[i]==0x22||opcode[i]==0x26)
1808     {
1809       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1810     }
1811     alloc_reg_temp(current,i,-1);
1812     minimum_free_regs[i]=1;
1813     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1814     {
1815       assert(0);
1816     }
1817   }
1818 }
1819
1820 void store_alloc(struct regstat *current,int i)
1821 {
1822   clear_const(current,rs2[i]);
1823   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1824   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1825   alloc_reg(current,i,rs2[i]);
1826   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1827     assert(0);
1828   }
1829   #if defined(HOST_IMM8)
1830   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1831   else alloc_reg(current,i,INVCP);
1832   #endif
1833   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWL/SDL/SDR
1834     alloc_reg(current,i,FTEMP);
1835   }
1836   // We need a temporary register for address generation
1837   alloc_reg_temp(current,i,-1);
1838   minimum_free_regs[i]=1;
1839 }
1840
1841 void c1ls_alloc(struct regstat *current,int i)
1842 {
1843   //clear_const(current,rs1[i]); // FIXME
1844   clear_const(current,rt1[i]);
1845   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1846   alloc_reg(current,i,CSREG); // Status
1847   alloc_reg(current,i,FTEMP);
1848   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1849     assert(0);
1850   }
1851   #if defined(HOST_IMM8)
1852   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1853   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1854     alloc_reg(current,i,INVCP);
1855   #endif
1856   // We need a temporary register for address generation
1857   alloc_reg_temp(current,i,-1);
1858 }
1859
1860 void c2ls_alloc(struct regstat *current,int i)
1861 {
1862   clear_const(current,rt1[i]);
1863   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1864   alloc_reg(current,i,FTEMP);
1865   #if defined(HOST_IMM8)
1866   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1867   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1868     alloc_reg(current,i,INVCP);
1869   #endif
1870   // We need a temporary register for address generation
1871   alloc_reg_temp(current,i,-1);
1872   minimum_free_regs[i]=1;
1873 }
1874
1875 #ifndef multdiv_alloc
1876 void multdiv_alloc(struct regstat *current,int i)
1877 {
1878   //  case 0x18: MULT
1879   //  case 0x19: MULTU
1880   //  case 0x1A: DIV
1881   //  case 0x1B: DIVU
1882   //  case 0x1C: DMULT
1883   //  case 0x1D: DMULTU
1884   //  case 0x1E: DDIV
1885   //  case 0x1F: DDIVU
1886   clear_const(current,rs1[i]);
1887   clear_const(current,rs2[i]);
1888   alloc_cc(current,i); // for stalls
1889   if(rs1[i]&&rs2[i])
1890   {
1891     if((opcode2[i]&4)==0) // 32-bit
1892     {
1893       current->u&=~(1LL<<HIREG);
1894       current->u&=~(1LL<<LOREG);
1895       alloc_reg(current,i,HIREG);
1896       alloc_reg(current,i,LOREG);
1897       alloc_reg(current,i,rs1[i]);
1898       alloc_reg(current,i,rs2[i]);
1899       dirty_reg(current,HIREG);
1900       dirty_reg(current,LOREG);
1901     }
1902     else // 64-bit
1903     {
1904       assert(0);
1905     }
1906   }
1907   else
1908   {
1909     // Multiply by zero is zero.
1910     // MIPS does not have a divide by zero exception.
1911     // The result is undefined, we return zero.
1912     alloc_reg(current,i,HIREG);
1913     alloc_reg(current,i,LOREG);
1914     dirty_reg(current,HIREG);
1915     dirty_reg(current,LOREG);
1916   }
1917 }
1918 #endif
1919
1920 void cop0_alloc(struct regstat *current,int i)
1921 {
1922   if(opcode2[i]==0) // MFC0
1923   {
1924     if(rt1[i]) {
1925       clear_const(current,rt1[i]);
1926       alloc_all(current,i);
1927       alloc_reg(current,i,rt1[i]);
1928       dirty_reg(current,rt1[i]);
1929     }
1930   }
1931   else if(opcode2[i]==4) // MTC0
1932   {
1933     if(rs1[i]){
1934       clear_const(current,rs1[i]);
1935       alloc_reg(current,i,rs1[i]);
1936       alloc_all(current,i);
1937     }
1938     else {
1939       alloc_all(current,i); // FIXME: Keep r0
1940       current->u&=~1LL;
1941       alloc_reg(current,i,0);
1942     }
1943   }
1944   else
1945   {
1946     // TLBR/TLBWI/TLBWR/TLBP/ERET
1947     assert(opcode2[i]==0x10);
1948     alloc_all(current,i);
1949   }
1950   minimum_free_regs[i]=HOST_REGS;
1951 }
1952
1953 static void cop2_alloc(struct regstat *current,int i)
1954 {
1955   if (opcode2[i] < 3) // MFC2/CFC2
1956   {
1957     alloc_cc(current,i); // for stalls
1958     dirty_reg(current,CCREG);
1959     if(rt1[i]){
1960       clear_const(current,rt1[i]);
1961       alloc_reg(current,i,rt1[i]);
1962       dirty_reg(current,rt1[i]);
1963     }
1964   }
1965   else if (opcode2[i] > 3) // MTC2/CTC2
1966   {
1967     if(rs1[i]){
1968       clear_const(current,rs1[i]);
1969       alloc_reg(current,i,rs1[i]);
1970     }
1971     else {
1972       current->u&=~1LL;
1973       alloc_reg(current,i,0);
1974     }
1975   }
1976   alloc_reg_temp(current,i,-1);
1977   minimum_free_regs[i]=1;
1978 }
1979
1980 void c2op_alloc(struct regstat *current,int i)
1981 {
1982   alloc_cc(current,i); // for stalls
1983   dirty_reg(current,CCREG);
1984   alloc_reg_temp(current,i,-1);
1985 }
1986
1987 void syscall_alloc(struct regstat *current,int i)
1988 {
1989   alloc_cc(current,i);
1990   dirty_reg(current,CCREG);
1991   alloc_all(current,i);
1992   minimum_free_regs[i]=HOST_REGS;
1993   current->isconst=0;
1994 }
1995
1996 void delayslot_alloc(struct regstat *current,int i)
1997 {
1998   switch(itype[i]) {
1999     case UJUMP:
2000     case CJUMP:
2001     case SJUMP:
2002     case RJUMP:
2003     case SYSCALL:
2004     case HLECALL:
2005     case SPAN:
2006       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//abort();
2007       SysPrintf("Disabled speculative precompilation\n");
2008       stop_after_jal=1;
2009       break;
2010     case IMM16:
2011       imm16_alloc(current,i);
2012       break;
2013     case LOAD:
2014     case LOADLR:
2015       load_alloc(current,i);
2016       break;
2017     case STORE:
2018     case STORELR:
2019       store_alloc(current,i);
2020       break;
2021     case ALU:
2022       alu_alloc(current,i);
2023       break;
2024     case SHIFT:
2025       shift_alloc(current,i);
2026       break;
2027     case MULTDIV:
2028       multdiv_alloc(current,i);
2029       break;
2030     case SHIFTIMM:
2031       shiftimm_alloc(current,i);
2032       break;
2033     case MOV:
2034       mov_alloc(current,i);
2035       break;
2036     case COP0:
2037       cop0_alloc(current,i);
2038       break;
2039     case COP1:
2040       break;
2041     case COP2:
2042       cop2_alloc(current,i);
2043       break;
2044     case C1LS:
2045       c1ls_alloc(current,i);
2046       break;
2047     case C2LS:
2048       c2ls_alloc(current,i);
2049       break;
2050     case C2OP:
2051       c2op_alloc(current,i);
2052       break;
2053   }
2054 }
2055
2056 // Special case where a branch and delay slot span two pages in virtual memory
2057 static void pagespan_alloc(struct regstat *current,int i)
2058 {
2059   current->isconst=0;
2060   current->wasconst=0;
2061   regs[i].wasconst=0;
2062   minimum_free_regs[i]=HOST_REGS;
2063   alloc_all(current,i);
2064   alloc_cc(current,i);
2065   dirty_reg(current,CCREG);
2066   if(opcode[i]==3) // JAL
2067   {
2068     alloc_reg(current,i,31);
2069     dirty_reg(current,31);
2070   }
2071   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
2072   {
2073     alloc_reg(current,i,rs1[i]);
2074     if (rt1[i]!=0) {
2075       alloc_reg(current,i,rt1[i]);
2076       dirty_reg(current,rt1[i]);
2077     }
2078   }
2079   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
2080   {
2081     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2082     if(rs2[i]) alloc_reg(current,i,rs2[i]);
2083   }
2084   else
2085   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
2086   {
2087     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2088   }
2089   //else ...
2090 }
2091
2092 static void add_stub(enum stub_type type, void *addr, void *retaddr,
2093   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
2094 {
2095   assert(stubcount < ARRAY_SIZE(stubs));
2096   stubs[stubcount].type = type;
2097   stubs[stubcount].addr = addr;
2098   stubs[stubcount].retaddr = retaddr;
2099   stubs[stubcount].a = a;
2100   stubs[stubcount].b = b;
2101   stubs[stubcount].c = c;
2102   stubs[stubcount].d = d;
2103   stubs[stubcount].e = e;
2104   stubcount++;
2105 }
2106
2107 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
2108   int i, int addr_reg, const struct regstat *i_regs, int ccadj, u_int reglist)
2109 {
2110   add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
2111 }
2112
2113 // Write out a single register
2114 static void wb_register(signed char r,signed char regmap[],uint64_t dirty)
2115 {
2116   int hr;
2117   for(hr=0;hr<HOST_REGS;hr++) {
2118     if(hr!=EXCLUDE_REG) {
2119       if((regmap[hr]&63)==r) {
2120         if((dirty>>hr)&1) {
2121           assert(regmap[hr]<64);
2122           emit_storereg(r,hr);
2123         }
2124       }
2125     }
2126   }
2127 }
2128
2129 static void wb_valid(signed char pre[],signed char entry[],u_int dirty_pre,u_int dirty,uint64_t u)
2130 {
2131   //if(dirty_pre==dirty) return;
2132   int hr,reg;
2133   for(hr=0;hr<HOST_REGS;hr++) {
2134     if(hr!=EXCLUDE_REG) {
2135       reg=pre[hr];
2136       if(((~u)>>(reg&63))&1) {
2137         if(reg>0) {
2138           if(((dirty_pre&~dirty)>>hr)&1) {
2139             if(reg>0&&reg<34) {
2140               emit_storereg(reg,hr);
2141             }
2142             else if(reg>=64) {
2143               assert(0);
2144             }
2145           }
2146         }
2147       }
2148     }
2149   }
2150 }
2151
// Emit the moves needed to place a0 in host register 0 and a1 in host
// register 1. A negative a0/a1 means "no move" for that argument
// (a1 is known to be 0 whenever the swap/ordered path is taken).
// trashes r2 (used as scratch when the two must be swapped)
static void pass_args(int a0, int a1)
{
  if (a1 == 0 && a0 != 0) {
    // a1 currently lives in r0, so it has to leave before a0 arrives
    if (a0 == 1) {
      // must swap
      emit_mov(a0,2); emit_mov(a1,1); emit_mov(2,0);
      return;
    }
    emit_mov(a1,1);
    if (a0 >= 0)
      emit_mov(a0,0);
    return;
  }

  // no ordering hazard: move each argument only if it is not in place
  if (a0 > 0)
    emit_mov(a0,0);
  if (a1 >= 0 && a1 != 1)
    emit_mov(a1,1);
}
2168
2169 static void alu_assemble(int i,struct regstat *i_regs)
2170 {
2171   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2172     if(rt1[i]) {
2173       signed char s1,s2,t;
2174       t=get_reg(i_regs->regmap,rt1[i]);
2175       if(t>=0) {
2176         s1=get_reg(i_regs->regmap,rs1[i]);
2177         s2=get_reg(i_regs->regmap,rs2[i]);
2178         if(rs1[i]&&rs2[i]) {
2179           assert(s1>=0);
2180           assert(s2>=0);
2181           if(opcode2[i]&2) emit_sub(s1,s2,t);
2182           else emit_add(s1,s2,t);
2183         }
2184         else if(rs1[i]) {
2185           if(s1>=0) emit_mov(s1,t);
2186           else emit_loadreg(rs1[i],t);
2187         }
2188         else if(rs2[i]) {
2189           if(s2>=0) {
2190             if(opcode2[i]&2) emit_neg(s2,t);
2191             else emit_mov(s2,t);
2192           }
2193           else {
2194             emit_loadreg(rs2[i],t);
2195             if(opcode2[i]&2) emit_neg(t,t);
2196           }
2197         }
2198         else emit_zeroreg(t);
2199       }
2200     }
2201   }
2202   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2203     assert(0);
2204   }
2205   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2206     if(rt1[i]) {
2207       signed char s1l,s2l,t;
2208       {
2209         t=get_reg(i_regs->regmap,rt1[i]);
2210         //assert(t>=0);
2211         if(t>=0) {
2212           s1l=get_reg(i_regs->regmap,rs1[i]);
2213           s2l=get_reg(i_regs->regmap,rs2[i]);
2214           if(rs2[i]==0) // rx<r0
2215           {
2216             if(opcode2[i]==0x2a&&rs1[i]!=0) { // SLT
2217               assert(s1l>=0);
2218               emit_shrimm(s1l,31,t);
2219             }
2220             else // SLTU (unsigned can not be less than zero, 0<0)
2221               emit_zeroreg(t);
2222           }
2223           else if(rs1[i]==0) // r0<rx
2224           {
2225             assert(s2l>=0);
2226             if(opcode2[i]==0x2a) // SLT
2227               emit_set_gz32(s2l,t);
2228             else // SLTU (set if not zero)
2229               emit_set_nz32(s2l,t);
2230           }
2231           else{
2232             assert(s1l>=0);assert(s2l>=0);
2233             if(opcode2[i]==0x2a) // SLT
2234               emit_set_if_less32(s1l,s2l,t);
2235             else // SLTU
2236               emit_set_if_carry32(s1l,s2l,t);
2237           }
2238         }
2239       }
2240     }
2241   }
2242   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2243     if(rt1[i]) {
2244       signed char s1l,s2l,tl;
2245       tl=get_reg(i_regs->regmap,rt1[i]);
2246       {
2247         if(tl>=0) {
2248           s1l=get_reg(i_regs->regmap,rs1[i]);
2249           s2l=get_reg(i_regs->regmap,rs2[i]);
2250           if(rs1[i]&&rs2[i]) {
2251             assert(s1l>=0);
2252             assert(s2l>=0);
2253             if(opcode2[i]==0x24) { // AND
2254               emit_and(s1l,s2l,tl);
2255             } else
2256             if(opcode2[i]==0x25) { // OR
2257               emit_or(s1l,s2l,tl);
2258             } else
2259             if(opcode2[i]==0x26) { // XOR
2260               emit_xor(s1l,s2l,tl);
2261             } else
2262             if(opcode2[i]==0x27) { // NOR
2263               emit_or(s1l,s2l,tl);
2264               emit_not(tl,tl);
2265             }
2266           }
2267           else
2268           {
2269             if(opcode2[i]==0x24) { // AND
2270               emit_zeroreg(tl);
2271             } else
2272             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2273               if(rs1[i]){
2274                 if(s1l>=0) emit_mov(s1l,tl);
2275                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2276               }
2277               else
2278               if(rs2[i]){
2279                 if(s2l>=0) emit_mov(s2l,tl);
2280                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2281               }
2282               else emit_zeroreg(tl);
2283             } else
2284             if(opcode2[i]==0x27) { // NOR
2285               if(rs1[i]){
2286                 if(s1l>=0) emit_not(s1l,tl);
2287                 else {
2288                   emit_loadreg(rs1[i],tl);
2289                   emit_not(tl,tl);
2290                 }
2291               }
2292               else
2293               if(rs2[i]){
2294                 if(s2l>=0) emit_not(s2l,tl);
2295                 else {
2296                   emit_loadreg(rs2[i],tl);
2297                   emit_not(tl,tl);
2298                 }
2299               }
2300               else emit_movimm(-1,tl);
2301             }
2302           }
2303         }
2304       }
2305     }
2306   }
2307 }
2308
2309 void imm16_assemble(int i,struct regstat *i_regs)
2310 {
2311   if (opcode[i]==0x0f) { // LUI
2312     if(rt1[i]) {
2313       signed char t;
2314       t=get_reg(i_regs->regmap,rt1[i]);
2315       //assert(t>=0);
2316       if(t>=0) {
2317         if(!((i_regs->isconst>>t)&1))
2318           emit_movimm(imm[i]<<16,t);
2319       }
2320     }
2321   }
2322   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2323     if(rt1[i]) {
2324       signed char s,t;
2325       t=get_reg(i_regs->regmap,rt1[i]);
2326       s=get_reg(i_regs->regmap,rs1[i]);
2327       if(rs1[i]) {
2328         //assert(t>=0);
2329         //assert(s>=0);
2330         if(t>=0) {
2331           if(!((i_regs->isconst>>t)&1)) {
2332             if(s<0) {
2333               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2334               emit_addimm(t,imm[i],t);
2335             }else{
2336               if(!((i_regs->wasconst>>s)&1))
2337                 emit_addimm(s,imm[i],t);
2338               else
2339                 emit_movimm(constmap[i][s]+imm[i],t);
2340             }
2341           }
2342         }
2343       } else {
2344         if(t>=0) {
2345           if(!((i_regs->isconst>>t)&1))
2346             emit_movimm(imm[i],t);
2347         }
2348       }
2349     }
2350   }
2351   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2352     if(rt1[i]) {
2353       signed char sl,tl;
2354       tl=get_reg(i_regs->regmap,rt1[i]);
2355       sl=get_reg(i_regs->regmap,rs1[i]);
2356       if(tl>=0) {
2357         if(rs1[i]) {
2358           assert(sl>=0);
2359           emit_addimm(sl,imm[i],tl);
2360         } else {
2361           emit_movimm(imm[i],tl);
2362         }
2363       }
2364     }
2365   }
2366   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2367     if(rt1[i]) {
2368       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2369       signed char sl,t;
2370       t=get_reg(i_regs->regmap,rt1[i]);
2371       sl=get_reg(i_regs->regmap,rs1[i]);
2372       //assert(t>=0);
2373       if(t>=0) {
2374         if(rs1[i]>0) {
2375             if(opcode[i]==0x0a) { // SLTI
2376               if(sl<0) {
2377                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2378                 emit_slti32(t,imm[i],t);
2379               }else{
2380                 emit_slti32(sl,imm[i],t);
2381               }
2382             }
2383             else { // SLTIU
2384               if(sl<0) {
2385                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2386                 emit_sltiu32(t,imm[i],t);
2387               }else{
2388                 emit_sltiu32(sl,imm[i],t);
2389               }
2390             }
2391         }else{
2392           // SLTI(U) with r0 is just stupid,
2393           // nonetheless examples can be found
2394           if(opcode[i]==0x0a) // SLTI
2395             if(0<imm[i]) emit_movimm(1,t);
2396             else emit_zeroreg(t);
2397           else // SLTIU
2398           {
2399             if(imm[i]) emit_movimm(1,t);
2400             else emit_zeroreg(t);
2401           }
2402         }
2403       }
2404     }
2405   }
2406   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2407     if(rt1[i]) {
2408       signed char sl,tl;
2409       tl=get_reg(i_regs->regmap,rt1[i]);
2410       sl=get_reg(i_regs->regmap,rs1[i]);
2411       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2412         if(opcode[i]==0x0c) //ANDI
2413         {
2414           if(rs1[i]) {
2415             if(sl<0) {
2416               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2417               emit_andimm(tl,imm[i],tl);
2418             }else{
2419               if(!((i_regs->wasconst>>sl)&1))
2420                 emit_andimm(sl,imm[i],tl);
2421               else
2422                 emit_movimm(constmap[i][sl]&imm[i],tl);
2423             }
2424           }
2425           else
2426             emit_zeroreg(tl);
2427         }
2428         else
2429         {
2430           if(rs1[i]) {
2431             if(sl<0) {
2432               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2433             }
2434             if(opcode[i]==0x0d) { // ORI
2435               if(sl<0) {
2436                 emit_orimm(tl,imm[i],tl);
2437               }else{
2438                 if(!((i_regs->wasconst>>sl)&1))
2439                   emit_orimm(sl,imm[i],tl);
2440                 else
2441                   emit_movimm(constmap[i][sl]|imm[i],tl);
2442               }
2443             }
2444             if(opcode[i]==0x0e) { // XORI
2445               if(sl<0) {
2446                 emit_xorimm(tl,imm[i],tl);
2447               }else{
2448                 if(!((i_regs->wasconst>>sl)&1))
2449                   emit_xorimm(sl,imm[i],tl);
2450                 else
2451                   emit_movimm(constmap[i][sl]^imm[i],tl);
2452               }
2453             }
2454           }
2455           else {
2456             emit_movimm(imm[i],tl);
2457           }
2458         }
2459       }
2460     }
2461   }
2462 }
2463
2464 void shiftimm_assemble(int i,struct regstat *i_regs)
2465 {
2466   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2467   {
2468     if(rt1[i]) {
2469       signed char s,t;
2470       t=get_reg(i_regs->regmap,rt1[i]);
2471       s=get_reg(i_regs->regmap,rs1[i]);
2472       //assert(t>=0);
2473       if(t>=0&&!((i_regs->isconst>>t)&1)){
2474         if(rs1[i]==0)
2475         {
2476           emit_zeroreg(t);
2477         }
2478         else
2479         {
2480           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2481           if(imm[i]) {
2482             if(opcode2[i]==0) // SLL
2483             {
2484               emit_shlimm(s<0?t:s,imm[i],t);
2485             }
2486             if(opcode2[i]==2) // SRL
2487             {
2488               emit_shrimm(s<0?t:s,imm[i],t);
2489             }
2490             if(opcode2[i]==3) // SRA
2491             {
2492               emit_sarimm(s<0?t:s,imm[i],t);
2493             }
2494           }else{
2495             // Shift by zero
2496             if(s>=0 && s!=t) emit_mov(s,t);
2497           }
2498         }
2499       }
2500       //emit_storereg(rt1[i],t); //DEBUG
2501     }
2502   }
2503   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2504   {
2505     assert(0);
2506   }
2507   if(opcode2[i]==0x3c) // DSLL32
2508   {
2509     assert(0);
2510   }
2511   if(opcode2[i]==0x3e) // DSRL32
2512   {
2513     assert(0);
2514   }
2515   if(opcode2[i]==0x3f) // DSRA32
2516   {
2517     assert(0);
2518   }
2519 }
2520
2521 #ifndef shift_assemble
2522 static void shift_assemble(int i,struct regstat *i_regs)
2523 {
2524   signed char s,t,shift;
2525   if (rt1[i] == 0)
2526     return;
2527   assert(opcode2[i]<=0x07); // SLLV/SRLV/SRAV
2528   t = get_reg(i_regs->regmap, rt1[i]);
2529   s = get_reg(i_regs->regmap, rs1[i]);
2530   shift = get_reg(i_regs->regmap, rs2[i]);
2531   if (t < 0)
2532     return;
2533
2534   if(rs1[i]==0)
2535     emit_zeroreg(t);
2536   else if(rs2[i]==0) {
2537     assert(s>=0);
2538     if(s!=t) emit_mov(s,t);
2539   }
2540   else {
2541     host_tempreg_acquire();
2542     emit_andimm(shift,31,HOST_TEMPREG);
2543     switch(opcode2[i]) {
2544     case 4: // SLLV
2545       emit_shl(s,HOST_TEMPREG,t);
2546       break;
2547     case 6: // SRLV
2548       emit_shr(s,HOST_TEMPREG,t);
2549       break;
2550     case 7: // SRAV
2551       emit_sar(s,HOST_TEMPREG,t);
2552       break;
2553     default:
2554       assert(0);
2555     }
2556     host_tempreg_release();
2557   }
2558 }
2559
2560 #endif
2561
// Memory region classes returned by get_ptr_mem_type().
enum {
  MTYPE_8000 = 0, // default class (everything not matched below)
  MTYPE_8020 = 1, // RAM 80200000+ mirror
  MTYPE_0000 = 2, // RAM 0 mirror
  MTYPE_A000 = 3, // RAM A mirror
  MTYPE_1F80 = 4, // scratchpad
};
2569
2570 static int get_ptr_mem_type(u_int a)
2571 {
2572   if(a < 0x00200000) {
2573     if(a<0x1000&&((start>>20)==0xbfc||(start>>24)==0xa0))
2574       // return wrong, must use memhandler for BIOS self-test to pass
2575       // 007 does similar stuff from a00 mirror, weird stuff
2576       return MTYPE_8000;
2577     return MTYPE_0000;
2578   }
2579   if(0x1f800000 <= a && a < 0x1f801000)
2580     return MTYPE_1F80;
2581   if(0x80200000 <= a && a < 0x80800000)
2582     return MTYPE_8020;
2583   if(0xa0000000 <= a && a < 0xa0200000)
2584     return MTYPE_A000;
2585   return MTYPE_8000;
2586 }
2587
2588 static void *emit_fastpath_cmp_jump(int i,int addr,int *addr_reg_override)
2589 {
2590   void *jaddr = NULL;
2591   int type=0;
2592   int mr=rs1[i];
2593   if(((smrv_strong|smrv_weak)>>mr)&1) {
2594     type=get_ptr_mem_type(smrv[mr]);
2595     //printf("set %08x @%08x r%d %d\n", smrv[mr], start+i*4, mr, type);
2596   }
2597   else {
2598     // use the mirror we are running on
2599     type=get_ptr_mem_type(start);
2600     //printf("set nospec   @%08x r%d %d\n", start+i*4, mr, type);
2601   }
2602
2603   if(type==MTYPE_8020) { // RAM 80200000+ mirror
2604     host_tempreg_acquire();
2605     emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
2606     addr=*addr_reg_override=HOST_TEMPREG;
2607     type=0;
2608   }
2609   else if(type==MTYPE_0000) { // RAM 0 mirror
2610     host_tempreg_acquire();
2611     emit_orimm(addr,0x80000000,HOST_TEMPREG);
2612     addr=*addr_reg_override=HOST_TEMPREG;
2613     type=0;
2614   }
2615   else if(type==MTYPE_A000) { // RAM A mirror
2616     host_tempreg_acquire();
2617     emit_andimm(addr,~0x20000000,HOST_TEMPREG);
2618     addr=*addr_reg_override=HOST_TEMPREG;
2619     type=0;
2620   }
2621   else if(type==MTYPE_1F80) { // scratchpad
2622     if (psxH == (void *)0x1f800000) {
2623       host_tempreg_acquire();
2624       emit_xorimm(addr,0x1f800000,HOST_TEMPREG);
2625       emit_cmpimm(HOST_TEMPREG,0x1000);
2626       host_tempreg_release();
2627       jaddr=out;
2628       emit_jc(0);
2629     }
2630     else {
2631       // do the usual RAM check, jump will go to the right handler
2632       type=0;
2633     }
2634   }
2635
2636   if(type==0)
2637   {
2638     emit_cmpimm(addr,RAM_SIZE);
2639     jaddr=out;
2640     #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2641     // Hint to branch predictor that the branch is unlikely to be taken
2642     if(rs1[i]>=28)
2643       emit_jno_unlikely(0);
2644     else
2645     #endif
2646       emit_jno(0);
2647     if(ram_offset!=0) {
2648       host_tempreg_acquire();
2649       emit_addimm(addr,ram_offset,HOST_TEMPREG);
2650       addr=*addr_reg_override=HOST_TEMPREG;
2651     }
2652   }
2653
2654   return jaddr;
2655 }
2656
2657 // return memhandler, or get directly accessable address and return 0
2658 static void *get_direct_memhandler(void *table, u_int addr,
2659   enum stub_type type, uintptr_t *addr_host)
2660 {
2661   uintptr_t l1, l2 = 0;
2662   l1 = ((uintptr_t *)table)[addr>>12];
2663   if ((l1 & (1ul << (sizeof(l1)*8-1))) == 0) {
2664     uintptr_t v = l1 << 1;
2665     *addr_host = v + addr;
2666     return NULL;
2667   }
2668   else {
2669     l1 <<= 1;
2670     if (type == LOADB_STUB || type == LOADBU_STUB || type == STOREB_STUB)
2671       l2 = ((uintptr_t *)l1)[0x1000/4 + 0x1000/2 + (addr&0xfff)];
2672     else if (type == LOADH_STUB || type == LOADHU_STUB || type == STOREH_STUB)
2673       l2=((uintptr_t *)l1)[0x1000/4 + (addr&0xfff)/2];
2674     else
2675       l2=((uintptr_t *)l1)[(addr&0xfff)/4];
2676     if ((l2 & (1<<31)) == 0) {
2677       uintptr_t v = l2 << 1;
2678       *addr_host = v + (addr&0xfff);
2679       return NULL;
2680     }
2681     return (void *)(l2 << 1);
2682   }
2683 }
2684
2685 static u_int get_host_reglist(const signed char *regmap)
2686 {
2687   u_int reglist = 0, hr;
2688   for (hr = 0; hr < HOST_REGS; hr++) {
2689     if (hr != EXCLUDE_REG && regmap[hr] >= 0)
2690       reglist |= 1 << hr;
2691   }
2692   return reglist;
2693 }
2694
// Return reglist with the bits for host registers r1 and r2 cleared.
// A negative register number means "none" and is ignored.
static u_int reglist_exclude(u_int reglist, int r1, int r2)
{
  u_int drop = 0;

  if (r1 >= 0)
    drop |= 1u << r1;
  if (r2 >= 0)
    drop |= 1u << r2;
  return reglist & ~drop;
}
2703
2704 // find a temp caller-saved register not in reglist (so assumed to be free)
2705 static int reglist_find_free(u_int reglist)
2706 {
2707   u_int free_regs = ~reglist & CALLER_SAVE_REGS;
2708   if (free_regs == 0)
2709     return -1;
2710   return __builtin_ctz(free_regs);
2711 }
2712
2713 static void load_assemble(int i, const struct regstat *i_regs)
2714 {
2715   int s,tl,addr;
2716   int offset;
2717   void *jaddr=0;
2718   int memtarget=0,c=0;
2719   int fastio_reg_override=-1;
2720   u_int reglist=get_host_reglist(i_regs->regmap);
2721   tl=get_reg(i_regs->regmap,rt1[i]);
2722   s=get_reg(i_regs->regmap,rs1[i]);
2723   offset=imm[i];
2724   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2725   if(s>=0) {
2726     c=(i_regs->wasconst>>s)&1;
2727     if (c) {
2728       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2729     }
2730   }
2731   //printf("load_assemble: c=%d\n",c);
2732   //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
2733   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2734   if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
2735     ||rt1[i]==0) {
2736       // could be FIFO, must perform the read
2737       // ||dummy read
2738       assem_debug("(forced read)\n");
2739       tl=get_reg(i_regs->regmap,-1);
2740       assert(tl>=0);
2741   }
2742   if(offset||s<0||c) addr=tl;
2743   else addr=s;
2744   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2745  if(tl>=0) {
2746   //printf("load_assemble: c=%d\n",c);
2747   //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
2748   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2749   reglist&=~(1<<tl);
2750   if(!c) {
2751     #ifdef R29_HACK
2752     // Strmnnrmn's speed hack
2753     if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2754     #endif
2755     {
2756       jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
2757     }
2758   }
2759   else if(ram_offset&&memtarget) {
2760     host_tempreg_acquire();
2761     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2762     fastio_reg_override=HOST_TEMPREG;
2763   }
2764   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2765   if (opcode[i]==0x20) { // LB
2766     if(!c||memtarget) {
2767       if(!dummy) {
2768         {
2769           int x=0,a=tl;
2770           if(!c) a=addr;
2771           if(fastio_reg_override>=0) a=fastio_reg_override;
2772
2773           emit_movsbl_indexed(x,a,tl);
2774         }
2775       }
2776       if(jaddr)
2777         add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2778     }
2779     else
2780       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2781   }
2782   if (opcode[i]==0x21) { // LH
2783     if(!c||memtarget) {
2784       if(!dummy) {
2785         int x=0,a=tl;
2786         if(!c) a=addr;
2787         if(fastio_reg_override>=0) a=fastio_reg_override;
2788         emit_movswl_indexed(x,a,tl);
2789       }
2790       if(jaddr)
2791         add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2792     }
2793     else
2794       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2795   }
2796   if (opcode[i]==0x23) { // LW
2797     if(!c||memtarget) {
2798       if(!dummy) {
2799         int a=addr;
2800         if(fastio_reg_override>=0) a=fastio_reg_override;
2801         emit_readword_indexed(0,a,tl);
2802       }
2803       if(jaddr)
2804         add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2805     }
2806     else
2807       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2808   }
2809   if (opcode[i]==0x24) { // LBU
2810     if(!c||memtarget) {
2811       if(!dummy) {
2812         int x=0,a=tl;
2813         if(!c) a=addr;
2814         if(fastio_reg_override>=0) a=fastio_reg_override;
2815
2816         emit_movzbl_indexed(x,a,tl);
2817       }
2818       if(jaddr)
2819         add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2820     }
2821     else
2822       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2823   }
2824   if (opcode[i]==0x25) { // LHU
2825     if(!c||memtarget) {
2826       if(!dummy) {
2827         int x=0,a=tl;
2828         if(!c) a=addr;
2829         if(fastio_reg_override>=0) a=fastio_reg_override;
2830         emit_movzwl_indexed(x,a,tl);
2831       }
2832       if(jaddr)
2833         add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2834     }
2835     else
2836       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2837   }
2838   if (opcode[i]==0x27) { // LWU
2839     assert(0);
2840   }
2841   if (opcode[i]==0x37) { // LD
2842     assert(0);
2843   }
2844  }
2845  if (fastio_reg_override == HOST_TEMPREG)
2846    host_tempreg_release();
2847 }
2848
2849 #ifndef loadlr_assemble
2850 static void loadlr_assemble(int i, const struct regstat *i_regs)
2851 {
2852   int s,tl,temp,temp2,addr;
2853   int offset;
2854   void *jaddr=0;
2855   int memtarget=0,c=0;
2856   int fastio_reg_override=-1;
2857   u_int reglist=get_host_reglist(i_regs->regmap);
2858   tl=get_reg(i_regs->regmap,rt1[i]);
2859   s=get_reg(i_regs->regmap,rs1[i]);
2860   temp=get_reg(i_regs->regmap,-1);
2861   temp2=get_reg(i_regs->regmap,FTEMP);
2862   addr=get_reg(i_regs->regmap,AGEN1+(i&1));
2863   assert(addr<0);
2864   offset=imm[i];
2865   reglist|=1<<temp;
2866   if(offset||s<0||c) addr=temp2;
2867   else addr=s;
2868   if(s>=0) {
2869     c=(i_regs->wasconst>>s)&1;
2870     if(c) {
2871       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2872     }
2873   }
2874   if(!c) {
2875     emit_shlimm(addr,3,temp);
2876     if (opcode[i]==0x22||opcode[i]==0x26) {
2877       emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR
2878     }else{
2879       emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR
2880     }
2881     jaddr=emit_fastpath_cmp_jump(i,temp2,&fastio_reg_override);
2882   }
2883   else {
2884     if(ram_offset&&memtarget) {
2885       host_tempreg_acquire();
2886       emit_addimm(temp2,ram_offset,HOST_TEMPREG);
2887       fastio_reg_override=HOST_TEMPREG;
2888     }
2889     if (opcode[i]==0x22||opcode[i]==0x26) {
2890       emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR
2891     }else{
2892       emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR
2893     }
2894   }
2895   if (opcode[i]==0x22||opcode[i]==0x26) { // LWL/LWR
2896     if(!c||memtarget) {
2897       int a=temp2;
2898       if(fastio_reg_override>=0) a=fastio_reg_override;
2899       emit_readword_indexed(0,a,temp2);
2900       if(fastio_reg_override==HOST_TEMPREG) host_tempreg_release();
2901       if(jaddr) add_stub_r(LOADW_STUB,jaddr,out,i,temp2,i_regs,ccadj[i],reglist);
2902     }
2903     else
2904       inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj[i],reglist);
2905     if(rt1[i]) {
2906       assert(tl>=0);
2907       emit_andimm(temp,24,temp);
2908       if (opcode[i]==0x22) // LWL
2909         emit_xorimm(temp,24,temp);
2910       host_tempreg_acquire();
2911       emit_movimm(-1,HOST_TEMPREG);
2912       if (opcode[i]==0x26) {
2913         emit_shr(temp2,temp,temp2);
2914         emit_bic_lsr(tl,HOST_TEMPREG,temp,tl);
2915       }else{
2916         emit_shl(temp2,temp,temp2);
2917         emit_bic_lsl(tl,HOST_TEMPREG,temp,tl);
2918       }
2919       host_tempreg_release();
2920       emit_or(temp2,tl,tl);
2921     }
2922     //emit_storereg(rt1[i],tl); // DEBUG
2923   }
2924   if (opcode[i]==0x1A||opcode[i]==0x1B) { // LDL/LDR
2925     assert(0);
2926   }
2927 }
2928 #endif
2929
2930 void store_assemble(int i, const struct regstat *i_regs)
2931 {
2932   int s,tl;
2933   int addr,temp;
2934   int offset;
2935   void *jaddr=0;
2936   enum stub_type type;
2937   int memtarget=0,c=0;
2938   int agr=AGEN1+(i&1);
2939   int fastio_reg_override=-1;
2940   u_int reglist=get_host_reglist(i_regs->regmap);
2941   tl=get_reg(i_regs->regmap,rs2[i]);
2942   s=get_reg(i_regs->regmap,rs1[i]);
2943   temp=get_reg(i_regs->regmap,agr);
2944   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2945   offset=imm[i];
2946   if(s>=0) {
2947     c=(i_regs->wasconst>>s)&1;
2948     if(c) {
2949       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2950     }
2951   }
2952   assert(tl>=0);
2953   assert(temp>=0);
2954   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2955   if(offset||s<0||c) addr=temp;
2956   else addr=s;
2957   if(!c) {
2958     jaddr=emit_fastpath_cmp_jump(i,addr,&fastio_reg_override);
2959   }
2960   else if(ram_offset&&memtarget) {
2961     host_tempreg_acquire();
2962     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2963     fastio_reg_override=HOST_TEMPREG;
2964   }
2965
2966   if (opcode[i]==0x28) { // SB
2967     if(!c||memtarget) {
2968       int x=0,a=temp;
2969       if(!c) a=addr;
2970       if(fastio_reg_override>=0) a=fastio_reg_override;
2971       emit_writebyte_indexed(tl,x,a);
2972     }
2973     type=STOREB_STUB;
2974   }
2975   if (opcode[i]==0x29) { // SH
2976     if(!c||memtarget) {
2977       int x=0,a=temp;
2978       if(!c) a=addr;
2979       if(fastio_reg_override>=0) a=fastio_reg_override;
2980       emit_writehword_indexed(tl,x,a);
2981     }
2982     type=STOREH_STUB;
2983   }
2984   if (opcode[i]==0x2B) { // SW
2985     if(!c||memtarget) {
2986       int a=addr;
2987       if(fastio_reg_override>=0) a=fastio_reg_override;
2988       emit_writeword_indexed(tl,0,a);
2989     }
2990     type=STOREW_STUB;
2991   }
2992   if (opcode[i]==0x3F) { // SD
2993     assert(0);
2994     type=STORED_STUB;
2995   }
2996   if(fastio_reg_override==HOST_TEMPREG)
2997     host_tempreg_release();
2998   if(jaddr) {
2999     // PCSX store handlers don't check invcode again
3000     reglist|=1<<addr;
3001     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
3002     jaddr=0;
3003   }
3004   if(!(i_regs->waswritten&(1<<rs1[i])) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
3005     if(!c||memtarget) {
3006       #ifdef DESTRUCTIVE_SHIFT
3007       // The x86 shift operation is 'destructive'; it overwrites the
3008       // source register, so we need to make a copy first and use that.
3009       addr=temp;
3010       #endif
3011       #if defined(HOST_IMM8)
3012       int ir=get_reg(i_regs->regmap,INVCP);
3013       assert(ir>=0);
3014       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3015       #else
3016       emit_cmpmem_indexedsr12_imm(invalid_code,addr,1);
3017       #endif
3018       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3019       emit_callne(invalidate_addr_reg[addr]);
3020       #else
3021       void *jaddr2 = out;
3022       emit_jne(0);
3023       add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3024       #endif
3025     }
3026   }
3027   u_int addr_val=constmap[i][s]+offset;
3028   if(jaddr) {
3029     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
3030   } else if(c&&!memtarget) {
3031     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
3032   }
3033   // basic current block modification detection..
3034   // not looking back as that should be in mips cache already
3035   // (see Spyro2 title->attract mode)
3036   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
3037     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
3038     assert(i_regs->regmap==regs[i].regmap); // not delay slot
3039     if(i_regs->regmap==regs[i].regmap) {
3040       load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
3041       wb_dirtys(regs[i].regmap_entry,regs[i].wasdirty);
3042       emit_movimm(start+i*4+4,0);
3043       emit_writeword(0,&pcaddr);
3044       emit_addimm(HOST_CCREG,2,HOST_CCREG);
3045       emit_far_call(get_addr_ht);
3046       emit_jmpreg(0);
3047     }
3048   }
3049 }
3050
3051 static void storelr_assemble(int i, const struct regstat *i_regs)
3052 {
3053   int s,tl;
3054   int temp;
3055   int offset;
3056   void *jaddr=0;
3057   void *case1, *case2, *case3;
3058   void *done0, *done1, *done2;
3059   int memtarget=0,c=0;
3060   int agr=AGEN1+(i&1);
3061   u_int reglist=get_host_reglist(i_regs->regmap);
3062   tl=get_reg(i_regs->regmap,rs2[i]);
3063   s=get_reg(i_regs->regmap,rs1[i]);
3064   temp=get_reg(i_regs->regmap,agr);
3065   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3066   offset=imm[i];
3067   if(s>=0) {
3068     c=(i_regs->isconst>>s)&1;
3069     if(c) {
3070       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3071     }
3072   }
3073   assert(tl>=0);
3074   assert(temp>=0);
3075   if(!c) {
3076     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3077     if(!offset&&s!=temp) emit_mov(s,temp);
3078     jaddr=out;
3079     emit_jno(0);
3080   }
3081   else
3082   {
3083     if(!memtarget||!rs1[i]) {
3084       jaddr=out;
3085       emit_jmp(0);
3086     }
3087   }
3088   if(ram_offset)
3089     emit_addimm_no_flags(ram_offset,temp);
3090
3091   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3092     assert(0);
3093   }
3094
3095   emit_xorimm(temp,3,temp);
3096   emit_testimm(temp,2);
3097   case2=out;
3098   emit_jne(0);
3099   emit_testimm(temp,1);
3100   case1=out;
3101   emit_jne(0);
3102   // 0
3103   if (opcode[i]==0x2A) { // SWL
3104     emit_writeword_indexed(tl,0,temp);
3105   }
3106   else if (opcode[i]==0x2E) { // SWR
3107     emit_writebyte_indexed(tl,3,temp);
3108   }
3109   else
3110     assert(0);
3111   done0=out;
3112   emit_jmp(0);
3113   // 1
3114   set_jump_target(case1, out);
3115   if (opcode[i]==0x2A) { // SWL
3116     // Write 3 msb into three least significant bytes
3117     if(rs2[i]) emit_rorimm(tl,8,tl);
3118     emit_writehword_indexed(tl,-1,temp);
3119     if(rs2[i]) emit_rorimm(tl,16,tl);
3120     emit_writebyte_indexed(tl,1,temp);
3121     if(rs2[i]) emit_rorimm(tl,8,tl);
3122   }
3123   else if (opcode[i]==0x2E) { // SWR
3124     // Write two lsb into two most significant bytes
3125     emit_writehword_indexed(tl,1,temp);
3126   }
3127   done1=out;
3128   emit_jmp(0);
3129   // 2
3130   set_jump_target(case2, out);
3131   emit_testimm(temp,1);
3132   case3=out;
3133   emit_jne(0);
3134   if (opcode[i]==0x2A) { // SWL
3135     // Write two msb into two least significant bytes
3136     if(rs2[i]) emit_rorimm(tl,16,tl);
3137     emit_writehword_indexed(tl,-2,temp);
3138     if(rs2[i]) emit_rorimm(tl,16,tl);
3139   }
3140   else if (opcode[i]==0x2E) { // SWR
3141     // Write 3 lsb into three most significant bytes
3142     emit_writebyte_indexed(tl,-1,temp);
3143     if(rs2[i]) emit_rorimm(tl,8,tl);
3144     emit_writehword_indexed(tl,0,temp);
3145     if(rs2[i]) emit_rorimm(tl,24,tl);
3146   }
3147   done2=out;
3148   emit_jmp(0);
3149   // 3
3150   set_jump_target(case3, out);
3151   if (opcode[i]==0x2A) { // SWL
3152     // Write msb into least significant byte
3153     if(rs2[i]) emit_rorimm(tl,24,tl);
3154     emit_writebyte_indexed(tl,-3,temp);
3155     if(rs2[i]) emit_rorimm(tl,8,tl);
3156   }
3157   else if (opcode[i]==0x2E) { // SWR
3158     // Write entire word
3159     emit_writeword_indexed(tl,-3,temp);
3160   }
3161   set_jump_target(done0, out);
3162   set_jump_target(done1, out);
3163   set_jump_target(done2, out);
3164   if(!c||!memtarget)
3165     add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
3166   if(!(i_regs->waswritten&(1<<rs1[i])) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
3167     emit_addimm_no_flags(-ram_offset,temp);
3168     #if defined(HOST_IMM8)
3169     int ir=get_reg(i_regs->regmap,INVCP);
3170     assert(ir>=0);
3171     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3172     #else
3173     emit_cmpmem_indexedsr12_imm(invalid_code,temp,1);
3174     #endif
3175     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3176     emit_callne(invalidate_addr_reg[temp]);
3177     #else
3178     void *jaddr2 = out;
3179     emit_jne(0);
3180     add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3181     #endif
3182   }
3183 }
3184
3185 static void cop0_assemble(int i,struct regstat *i_regs)
3186 {
3187   if(opcode2[i]==0) // MFC0
3188   {
3189     signed char t=get_reg(i_regs->regmap,rt1[i]);
3190     u_int copr=(source[i]>>11)&0x1f;
3191     //assert(t>=0); // Why does this happen?  OOT is weird
3192     if(t>=0&&rt1[i]!=0) {
3193       emit_readword(&reg_cop0[copr],t);
3194     }
3195   }
3196   else if(opcode2[i]==4) // MTC0
3197   {
3198     signed char s=get_reg(i_regs->regmap,rs1[i]);
3199     char copr=(source[i]>>11)&0x1f;
3200     assert(s>=0);
3201     wb_register(rs1[i],i_regs->regmap,i_regs->dirty);
3202     if(copr==9||copr==11||copr==12||copr==13) {
3203       emit_readword(&last_count,HOST_TEMPREG);
3204       emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc
3205       emit_add(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
3206       emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3207       emit_writeword(HOST_CCREG,&Count);
3208     }
3209     // What a mess.  The status register (12) can enable interrupts,
3210     // so needs a special case to handle a pending interrupt.
3211     // The interrupt must be taken immediately, because a subsequent
3212     // instruction might disable interrupts again.
3213     if(copr==12||copr==13) {
3214       if (is_delayslot) {
3215         // burn cycles to cause cc_interrupt, which will
3216         // reschedule next_interupt. Relies on CCREG from above.
3217         assem_debug("MTC0 DS %d\n", copr);
3218         emit_writeword(HOST_CCREG,&last_count);
3219         emit_movimm(0,HOST_CCREG);
3220         emit_storereg(CCREG,HOST_CCREG);
3221         emit_loadreg(rs1[i],1);
3222         emit_movimm(copr,0);
3223         emit_far_call(pcsx_mtc0_ds);
3224         emit_loadreg(rs1[i],s);
3225         return;
3226       }
3227       emit_movimm(start+i*4+4,HOST_TEMPREG);
3228       emit_writeword(HOST_TEMPREG,&pcaddr);
3229       emit_movimm(0,HOST_TEMPREG);
3230       emit_writeword(HOST_TEMPREG,&pending_exception);
3231     }
3232     if(s==HOST_CCREG)
3233       emit_loadreg(rs1[i],1);
3234     else if(s!=1)
3235       emit_mov(s,1);
3236     emit_movimm(copr,0);
3237     emit_far_call(pcsx_mtc0);
3238     if(copr==9||copr==11||copr==12||copr==13) {
3239       emit_readword(&Count,HOST_CCREG);
3240       emit_readword(&next_interupt,HOST_TEMPREG);
3241       emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3242       emit_sub(HOST_CCREG,HOST_TEMPREG,HOST_CCREG);
3243       emit_writeword(HOST_TEMPREG,&last_count);
3244       emit_storereg(CCREG,HOST_CCREG);
3245     }
3246     if(copr==12||copr==13) {
3247       assert(!is_delayslot);
3248       emit_readword(&pending_exception,14);
3249       emit_test(14,14);
3250       void *jaddr = out;
3251       emit_jeq(0);
3252       emit_readword(&pcaddr, 0);
3253       emit_addimm(HOST_CCREG,2,HOST_CCREG);
3254       emit_far_call(get_addr_ht);
3255       emit_jmpreg(0);
3256       set_jump_target(jaddr, out);
3257     }
3258     emit_loadreg(rs1[i],s);
3259   }
3260   else
3261   {
3262     assert(opcode2[i]==0x10);
3263     //if((source[i]&0x3f)==0x10) // RFE
3264     {
3265       emit_readword(&Status,0);
3266       emit_andimm(0,0x3c,1);
3267       emit_andimm(0,~0xf,0);
3268       emit_orrshr_imm(1,2,0);
3269       emit_writeword(0,&Status);
3270     }
3271   }
3272 }
3273
3274 static void cop1_unusable(int i,struct regstat *i_regs)
3275 {
3276   // XXX: should just just do the exception instead
3277   //if(!cop1_usable)
3278   {
3279     void *jaddr=out;
3280     emit_jmp(0);
3281     add_stub_r(FP_STUB,jaddr,out,i,0,i_regs,is_delayslot,0);
3282   }
3283 }
3284
// COP1 instructions are not assembled; they are routed to the
// coprocessor-unusable path.
static void cop1_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i,i_regs);
}
3289
// COP1 load/store instructions take the same coprocessor-unusable path
// as cop1_assemble().
static void c1ls_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i,i_regs);
}
3294
3295 // FP_STUB
3296 static void do_cop1stub(int n)
3297 {
3298   literal_pool(256);
3299   assem_debug("do_cop1stub %x\n",start+stubs[n].a*4);
3300   set_jump_target(stubs[n].addr, out);
3301   int i=stubs[n].a;
3302 //  int rs=stubs[n].b;
3303   struct regstat *i_regs=(struct regstat *)stubs[n].c;
3304   int ds=stubs[n].d;
3305   if(!ds) {
3306     load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
3307     //if(i_regs!=&regs[i]) printf("oops: regs[i]=%x i_regs=%x",(int)&regs[i],(int)i_regs);
3308   }
3309   //else {printf("fp exception in delay slot\n");}
3310   wb_dirtys(i_regs->regmap_entry,i_regs->wasdirty);
3311   if(regs[i].regmap_entry[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
3312   emit_movimm(start+(i-ds)*4,EAX); // Get PC
3313   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3314   emit_far_jump(ds?fp_exception_ds:fp_exception);
3315 }
3316
3317 static int cop2_is_stalling_op(int i, int *cycles)
3318 {
3319   if (opcode[i] == 0x3a) { // SWC2
3320     *cycles = 0;
3321     return 1;
3322   }
3323   if (itype[i] == COP2 && (opcode2[i] == 0 || opcode2[i] == 2)) { // MFC2/CFC2
3324     *cycles = 0;
3325     return 1;
3326   }
3327   if (itype[i] == C2OP) {
3328     *cycles = gte_cycletab[source[i] & 0x3f];
3329     return 1;
3330   }
3331   // ... what about MTC2/CTC2/LWC2?
3332   return 0;
3333 }
3334
#if 0
// Disabled debugging aid: print a GTE stall value together with the
// absolute cycle (cycle + last_count). Values above 44 are not printed;
// negative values are skipped as well because of the unsigned compare.
static void log_gte_stall(int stall, u_int cycle)
{
  if ((u_int)stall <= 44)
    printf("x    stall %2d %u\n", stall, cycle + last_count);
}

// Disabled debugging aid: emit a call to log_gte_stall() from generated
// code, preserving the host registers in reglist around the call.
// A positive stall is passed as an immediate; otherwise the value is
// taken from HOST_TEMPREG at run time.
static void emit_log_gte_stall(int i, int stall, u_int reglist)
{
  save_regs(reglist);
  if (stall > 0)
    emit_movimm(stall, 0);
  else
    emit_mov(HOST_TEMPREG, 0);
  // second argument: cycle counter adjusted for this instruction
  emit_addimm(HOST_CCREG, CLOCK_ADJUST(ccadj[i]), 1);
  emit_far_call(log_gte_stall);
  restore_regs(reglist);
}
#endif
3354
3355 static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u_int reglist)
3356 {
3357   int j = i, other_gte_op_cycles = -1, stall = -MAXBLOCK, cycles_passed;
3358   int rtmp = reglist_find_free(reglist);
3359
3360   if (HACK_ENABLED(NDHACK_NO_STALLS))
3361     return;
3362   if (get_reg(i_regs->regmap, CCREG) != HOST_CCREG) {
3363     // happens occasionally... cc evicted? Don't bother then
3364     //printf("no cc %08x\n", start + i*4);
3365     return;
3366   }
3367   if (!bt[i]) {
3368     for (j = i - 1; j >= 0; j--) {
3369       //if (is_ds[j]) break;
3370       if (cop2_is_stalling_op(j, &other_gte_op_cycles) || bt[j])
3371         break;
3372     }
3373     j = max(j, 0);
3374   }
3375   cycles_passed = CLOCK_ADJUST(ccadj[i] - ccadj[j]);
3376   if (other_gte_op_cycles >= 0)
3377     stall = other_gte_op_cycles - cycles_passed;
3378   else if (cycles_passed >= 44)
3379     stall = 0; // can't stall
3380   if (stall == -MAXBLOCK && rtmp >= 0) {
3381     // unknown stall, do the expensive runtime check
3382     assem_debug("; cop2_do_stall_check\n");
3383 #if 0 // too slow
3384     save_regs(reglist);
3385     emit_movimm(gte_cycletab[op], 0);
3386     emit_addimm(HOST_CCREG, CLOCK_ADJUST(ccadj[i]), 1);
3387     emit_far_call(call_gteStall);
3388     restore_regs(reglist);
3389 #else
3390     host_tempreg_acquire();
3391     emit_readword(&psxRegs.gteBusyCycle, rtmp);
3392     emit_addimm(rtmp, -CLOCK_ADJUST(ccadj[i]), rtmp);
3393     emit_sub(rtmp, HOST_CCREG, HOST_TEMPREG);
3394     emit_cmpimm(HOST_TEMPREG, 44);
3395     emit_cmovb_reg(rtmp, HOST_CCREG);
3396     //emit_log_gte_stall(i, 0, reglist);
3397     host_tempreg_release();
3398 #endif
3399   }
3400   else if (stall > 0) {
3401     //emit_log_gte_stall(i, stall, reglist);
3402     emit_addimm(HOST_CCREG, stall, HOST_CCREG);
3403   }
3404
3405   // save gteBusyCycle, if needed
3406   if (gte_cycletab[op] == 0)
3407     return;
3408   other_gte_op_cycles = -1;
3409   for (j = i + 1; j < slen; j++) {
3410     if (cop2_is_stalling_op(j, &other_gte_op_cycles))
3411       break;
3412     if (is_jump(j)) {
3413       // check ds
3414       if (j + 1 < slen && cop2_is_stalling_op(j + 1, &other_gte_op_cycles))
3415         j++;
3416       break;
3417     }
3418   }
3419   if (other_gte_op_cycles >= 0)
3420     // will handle stall when assembling that op
3421     return;
3422   cycles_passed = CLOCK_ADJUST(ccadj[min(j, slen -1)] - ccadj[i]);
3423   if (cycles_passed >= 44)
3424     return;
3425   assem_debug("; save gteBusyCycle\n");
3426   host_tempreg_acquire();
3427 #if 0
3428   emit_readword(&last_count, HOST_TEMPREG);
3429   emit_add(HOST_TEMPREG, HOST_CCREG, HOST_TEMPREG);
3430   emit_addimm(HOST_TEMPREG, CLOCK_ADJUST(ccadj[i]), HOST_TEMPREG);
3431   emit_addimm(HOST_TEMPREG, gte_cycletab[op]), HOST_TEMPREG);
3432   emit_writeword(HOST_TEMPREG, &psxRegs.gteBusyCycle);
3433 #else
3434   emit_addimm(HOST_CCREG, CLOCK_ADJUST(ccadj[i]) + gte_cycletab[op], HOST_TEMPREG);
3435   emit_writeword(HOST_TEMPREG, &psxRegs.gteBusyCycle);
3436 #endif
3437   host_tempreg_release();
3438 }
3439
3440 static int is_mflohi(int i)
3441 {
3442   return (itype[i] == MOV && (rs1[i] == HIREG || rs1[i] == LOREG));
3443 }
3444
3445 static int check_multdiv(int i, int *cycles)
3446 {
3447   if (itype[i] != MULTDIV)
3448     return 0;
3449   if (opcode2[i] == 0x18 || opcode2[i] == 0x19) // MULT(U)
3450     *cycles = 11; // approx from 7 11 14
3451   else
3452     *cycles = 37;
3453   return 1;
3454 }
3455
3456 static void multdiv_prepare_stall(int i, const struct regstat *i_regs)
3457 {
3458   int j, found = 0, c = 0;
3459   if (HACK_ENABLED(NDHACK_NO_STALLS))
3460     return;
3461   if (get_reg(i_regs->regmap, CCREG) != HOST_CCREG) {
3462     // happens occasionally... cc evicted? Don't bother then
3463     return;
3464   }
3465   for (j = i + 1; j < slen; j++) {
3466     if (bt[j])
3467       break;
3468     if ((found = is_mflohi(j)))
3469       break;
3470     if (is_jump(j)) {
3471       // check ds
3472       if (j + 1 < slen && (found = is_mflohi(j + 1)))
3473         j++;
3474       break;
3475     }
3476   }
3477   if (found)
3478     // handle all in multdiv_do_stall()
3479     return;
3480   check_multdiv(i, &c);
3481   assert(c > 0);
3482   assem_debug("; muldiv prepare stall %d\n", c);
3483   host_tempreg_acquire();
3484   emit_addimm(HOST_CCREG, CLOCK_ADJUST(ccadj[i]) + c, HOST_TEMPREG);
3485   emit_writeword(HOST_TEMPREG, &psxRegs.muldivBusyCycle);
3486   host_tempreg_release();
3487 }
3488
3489 static void multdiv_do_stall(int i, const struct regstat *i_regs)
3490 {
3491   int j, known_cycles = 0;
3492   u_int reglist = get_host_reglist(i_regs->regmap);
3493   int rtmp = get_reg(i_regs->regmap, -1);
3494   if (rtmp < 0)
3495     rtmp = reglist_find_free(reglist);
3496   if (HACK_ENABLED(NDHACK_NO_STALLS))
3497     return;
3498   if (get_reg(i_regs->regmap, CCREG) != HOST_CCREG || rtmp < 0) {
3499     // happens occasionally... cc evicted? Don't bother then
3500     //printf("no cc/rtmp %08x\n", start + i*4);
3501     return;
3502   }
3503   if (!bt[i]) {
3504     for (j = i - 1; j >= 0; j--) {
3505       if (is_ds[j]) break;
3506       if (check_multdiv(j, &known_cycles) || bt[j])
3507         break;
3508       if (is_mflohi(j))
3509         // already handled by this op
3510         return;
3511     }
3512     j = max(j, 0);
3513   }
3514   if (known_cycles > 0) {
3515     known_cycles -= CLOCK_ADJUST(ccadj[i] - ccadj[j]);
3516     assem_debug("; muldiv stall resolved %d\n", known_cycles);
3517     if (known_cycles > 0)
3518       emit_addimm(HOST_CCREG, known_cycles, HOST_CCREG);
3519     return;
3520   }
3521   assem_debug("; muldiv stall unresolved\n");
3522   host_tempreg_acquire();
3523   emit_readword(&psxRegs.muldivBusyCycle, rtmp);
3524   emit_addimm(rtmp, -CLOCK_ADJUST(ccadj[i]), rtmp);
3525   emit_sub(rtmp, HOST_CCREG, HOST_TEMPREG);
3526   emit_cmpimm(HOST_TEMPREG, 37);
3527   emit_cmovb_reg(rtmp, HOST_CCREG);
3528   //emit_log_gte_stall(i, 0, reglist);
3529   host_tempreg_release();
3530 }
3531
3532 static void cop2_get_dreg(u_int copr,signed char tl,signed char temp)
3533 {
3534   switch (copr) {
3535     case 1:
3536     case 3:
3537     case 5:
3538     case 8:
3539     case 9:
3540     case 10:
3541     case 11:
3542       emit_readword(&reg_cop2d[copr],tl);
3543       emit_signextend16(tl,tl);
3544       emit_writeword(tl,&reg_cop2d[copr]); // hmh
3545       break;
3546     case 7:
3547     case 16:
3548     case 17:
3549     case 18:
3550     case 19:
3551       emit_readword(&reg_cop2d[copr],tl);
3552       emit_andimm(tl,0xffff,tl);
3553       emit_writeword(tl,&reg_cop2d[copr]);
3554       break;
3555     case 15:
3556       emit_readword(&reg_cop2d[14],tl); // SXY2
3557       emit_writeword(tl,&reg_cop2d[copr]);
3558       break;
3559     case 28:
3560     case 29:
3561       c2op_mfc2_29_assemble(tl,temp);
3562       break;
3563     default:
3564       emit_readword(&reg_cop2d[copr],tl);
3565       break;
3566   }
3567 }
3568
3569 static void cop2_put_dreg(u_int copr,signed char sl,signed char temp)
3570 {
3571   switch (copr) {
3572     case 15:
3573       emit_readword(&reg_cop2d[13],temp);  // SXY1
3574       emit_writeword(sl,&reg_cop2d[copr]);
3575       emit_writeword(temp,&reg_cop2d[12]); // SXY0
3576       emit_readword(&reg_cop2d[14],temp);  // SXY2
3577       emit_writeword(sl,&reg_cop2d[14]);
3578       emit_writeword(temp,&reg_cop2d[13]); // SXY1
3579       break;
3580     case 28:
3581       emit_andimm(sl,0x001f,temp);
3582       emit_shlimm(temp,7,temp);
3583       emit_writeword(temp,&reg_cop2d[9]);
3584       emit_andimm(sl,0x03e0,temp);
3585       emit_shlimm(temp,2,temp);
3586       emit_writeword(temp,&reg_cop2d[10]);
3587       emit_andimm(sl,0x7c00,temp);
3588       emit_shrimm(temp,3,temp);
3589       emit_writeword(temp,&reg_cop2d[11]);
3590       emit_writeword(sl,&reg_cop2d[28]);
3591       break;
3592     case 30:
3593       emit_xorsar_imm(sl,sl,31,temp);
3594 #if defined(HAVE_ARMV5) || defined(__aarch64__)
3595       emit_clz(temp,temp);
3596 #else
3597       emit_movs(temp,HOST_TEMPREG);
3598       emit_movimm(0,temp);
3599       emit_jeq((int)out+4*4);
3600       emit_addpl_imm(temp,1,temp);
3601       emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG);
3602       emit_jns((int)out-2*4);
3603 #endif
3604       emit_writeword(sl,&reg_cop2d[30]);
3605       emit_writeword(temp,&reg_cop2d[31]);
3606       break;
3607     case 31:
3608       break;
3609     default:
3610       emit_writeword(sl,&reg_cop2d[copr]);
3611       break;
3612   }
3613 }
3614
3615 static void c2ls_assemble(int i, const struct regstat *i_regs)
3616 {
3617   int s,tl;
3618   int ar;
3619   int offset;
3620   int memtarget=0,c=0;
3621   void *jaddr2=NULL;
3622   enum stub_type type;
3623   int agr=AGEN1+(i&1);
3624   int fastio_reg_override=-1;
3625   u_int reglist=get_host_reglist(i_regs->regmap);
3626   u_int copr=(source[i]>>16)&0x1f;
3627   s=get_reg(i_regs->regmap,rs1[i]);
3628   tl=get_reg(i_regs->regmap,FTEMP);
3629   offset=imm[i];
3630   assert(rs1[i]>0);
3631   assert(tl>=0);
3632
3633   if(i_regs->regmap[HOST_CCREG]==CCREG)
3634     reglist&=~(1<<HOST_CCREG);
3635
3636   // get the address
3637   if (opcode[i]==0x3a) { // SWC2
3638     ar=get_reg(i_regs->regmap,agr);
3639     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3640     reglist|=1<<ar;
3641   } else { // LWC2
3642     ar=tl;
3643   }
3644   if(s>=0) c=(i_regs->wasconst>>s)&1;
3645   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3646   if (!offset&&!c&&s>=0) ar=s;
3647   assert(ar>=0);
3648
3649   cop2_do_stall_check(0, i, i_regs, reglist);
3650
3651   if (opcode[i]==0x3a) { // SWC2
3652     cop2_get_dreg(copr,tl,-1);
3653     type=STOREW_STUB;
3654   }
3655   else
3656     type=LOADW_STUB;
3657
3658   if(c&&!memtarget) {
3659     jaddr2=out;
3660     emit_jmp(0); // inline_readstub/inline_writestub?
3661   }
3662   else {
3663     if(!c) {
3664       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3665     }
3666     else if(ram_offset&&memtarget) {
3667       host_tempreg_acquire();
3668       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3669       fastio_reg_override=HOST_TEMPREG;
3670     }
3671     if (opcode[i]==0x32) { // LWC2
3672       int a=ar;
3673       if(fastio_reg_override>=0) a=fastio_reg_override;
3674       emit_readword_indexed(0,a,tl);
3675     }
3676     if (opcode[i]==0x3a) { // SWC2
3677       #ifdef DESTRUCTIVE_SHIFT
3678       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3679       #endif
3680       int a=ar;
3681       if(fastio_reg_override>=0) a=fastio_reg_override;
3682       emit_writeword_indexed(tl,0,a);
3683     }
3684   }
3685   if(fastio_reg_override==HOST_TEMPREG)
3686     host_tempreg_release();
3687   if(jaddr2)
3688     add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
3689   if(opcode[i]==0x3a) // SWC2
3690   if(!(i_regs->waswritten&(1<<rs1[i])) && !HACK_ENABLED(NDHACK_NO_SMC_CHECK)) {
3691 #if defined(HOST_IMM8)
3692     int ir=get_reg(i_regs->regmap,INVCP);
3693     assert(ir>=0);
3694     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3695 #else
3696     emit_cmpmem_indexedsr12_imm(invalid_code,ar,1);
3697 #endif
3698     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3699     emit_callne(invalidate_addr_reg[ar]);
3700     #else
3701     void *jaddr3 = out;
3702     emit_jne(0);
3703     add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3704     #endif
3705   }
3706   if (opcode[i]==0x32) { // LWC2
3707     host_tempreg_acquire();
3708     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3709     host_tempreg_release();
3710   }
3711 }
3712
3713 static void cop2_assemble(int i, const struct regstat *i_regs)
3714 {
3715   u_int copr = (source[i]>>11) & 0x1f;
3716   signed char temp = get_reg(i_regs->regmap, -1);
3717
3718   if (!HACK_ENABLED(NDHACK_NO_STALLS)) {
3719     u_int reglist = reglist_exclude(get_host_reglist(i_regs->regmap), temp, -1);
3720     if (opcode2[i] == 0 || opcode2[i] == 2) { // MFC2/CFC2
3721       signed char tl = get_reg(i_regs->regmap, rt1[i]);
3722       reglist = reglist_exclude(reglist, tl, -1);
3723     }
3724     cop2_do_stall_check(0, i, i_regs, reglist);
3725   }
3726   if (opcode2[i]==0) { // MFC2
3727     signed char tl=get_reg(i_regs->regmap,rt1[i]);
3728     if(tl>=0&&rt1[i]!=0)
3729       cop2_get_dreg(copr,tl,temp);
3730   }
3731   else if (opcode2[i]==4) { // MTC2
3732     signed char sl=get_reg(i_regs->regmap,rs1[i]);
3733     cop2_put_dreg(copr,sl,temp);
3734   }
3735   else if (opcode2[i]==2) // CFC2
3736   {
3737     signed char tl=get_reg(i_regs->regmap,rt1[i]);
3738     if(tl>=0&&rt1[i]!=0)
3739       emit_readword(&reg_cop2c[copr],tl);
3740   }
3741   else if (opcode2[i]==6) // CTC2
3742   {
3743     signed char sl=get_reg(i_regs->regmap,rs1[i]);
3744     switch(copr) {
3745       case 4:
3746       case 12:
3747       case 20:
3748       case 26:
3749       case 27:
3750       case 29:
3751       case 30:
3752         emit_signextend16(sl,temp);
3753         break;
3754       case 31:
3755         c2op_ctc2_31_assemble(sl,temp);
3756         break;
3757       default:
3758         temp=sl;
3759         break;
3760     }
3761     emit_writeword(temp,&reg_cop2c[copr]);
3762     assert(sl>=0);
3763   }
3764 }
3765
// Emit the out-of-line code for stub n created for an SWL/SWR store.
// The stub data is read from stubs[n]: .a is the instruction index,
// .b the host register holding the address, .c the regstat pointer,
// .d a cycle adjustment and .e the list of host registers to preserve.
// The active implementation (#if 1) calls jump_handle_swl or
// jump_handle_swr; the #else branch is an older, disabled implementation.
static void do_unalignedwritestub(int n)
{
  assem_debug("do_unalignedwritestub %x\n",start+stubs[n].a*4);
  literal_pool(256);
  set_jump_target(stubs[n].addr, out);

  int i=stubs[n].a;
  struct regstat *i_regs=(struct regstat *)stubs[n].c;
  int addr=stubs[n].b;
  u_int reglist=stubs[n].e;
  signed char *i_regmap=i_regs->regmap;
  int temp2=get_reg(i_regmap,FTEMP);
  int rt;
  rt=get_reg(i_regmap,rs2[i]);
  assert(rt>=0);
  assert(addr>=0);
  assert(opcode[i]==0x2a||opcode[i]==0x2e); // SWL/SWR only implemented
  // preserve the address register, but not the register mapped to FTEMP
  reglist|=(1<<addr);
  reglist&=~(1<<temp2);

#if 1
  // don't bother with it and call write handler
  save_regs(reglist);
  pass_args(addr,rt);
  int cc=get_reg(i_regmap,CCREG);
  // if the cycle count is not in a host register, load it into register 2
  if(cc<0)
    emit_loadreg(CCREG,2);
  // register 2 = cycle count plus the adjusted stub cycles
  emit_addimm(cc<0?2:cc,CLOCK_ADJUST((int)stubs[n].d+1),2);
  emit_far_call((opcode[i]==0x2a?jump_handle_swl:jump_handle_swr));
  // NOTE(review): presumably the handler leaves the updated cycle count in
  // register 0; the adjustment is removed again here - confirm.
  emit_addimm(0,-CLOCK_ADJUST((int)stubs[n].d+1),cc<0?2:cc);
  if(cc<0)
    emit_storereg(CCREG,2);
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
#else
  // Disabled legacy implementation: read-modify-write of the aligned word
  // through the readmem/writemem tables.
  emit_andimm(addr,0xfffffffc,temp2);
  emit_writeword(temp2,&address);

  save_regs(reglist);
  emit_shrimm(addr,16,1);
  int cc=get_reg(i_regmap,CCREG);
  if(cc<0) {
    emit_loadreg(CCREG,2);
  }
  emit_movimm((u_int)readmem,0);
  emit_addimm(cc<0?2:cc,2*stubs[n].d+2,2);
  emit_call((int)&indirect_jump_indexed);
  restore_regs(reglist);

  emit_readword(&readmem_dword,temp2);
  int temp=addr; //hmh
  emit_shlimm(addr,3,temp);
  emit_andimm(temp,24,temp);
  if (opcode[i]==0x2a) // SWL
    emit_xorimm(temp,24,temp);
  emit_movimm(-1,HOST_TEMPREG);
  if (opcode[i]==0x2a) { // SWL
    emit_bic_lsr(temp2,HOST_TEMPREG,temp,temp2);
    emit_orrshr(rt,temp,temp2);
  }else{
    emit_bic_lsl(temp2,HOST_TEMPREG,temp,temp2);
    emit_orrshl(rt,temp,temp2);
  }
  emit_readword(&address,addr);
  emit_writeword(temp2,&word);
  //save_regs(reglist); // don't need to, no state changes
  emit_shrimm(addr,16,1);
  emit_movimm((u_int)writemem,0);
  //emit_call((int)&indirect_jump_indexed);
  emit_mov(15,14);
  emit_readword_dualindexedx4(0,1,15);
  emit_readword(&Count,HOST_TEMPREG);
  emit_readword(&next_interupt,2);
  emit_addimm(HOST_TEMPREG,-2*stubs[n].d-2,HOST_TEMPREG);
  emit_writeword(2,&last_count);
  emit_sub(HOST_TEMPREG,2,cc<0?HOST_TEMPREG:cc);
  if(cc<0) {
    emit_storereg(CCREG,HOST_TEMPREG);
  }
  restore_regs(reglist);
  emit_jmp(stubs[n].retaddr); // return address
#endif
}
3849
#ifndef multdiv_assemble
// Fallback used when multdiv_assemble is not defined as a macro:
// report the missing implementation and abort.
void multdiv_assemble(int i,struct regstat *i_regs)
{
  fputs("Need multdiv_assemble for this architecture.\n",stdout);
  abort();
}
#endif
3857
3858 static void mov_assemble(int i,struct regstat *i_regs)
3859 {
3860   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3861   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3862   if(rt1[i]) {
3863     signed char sl,tl;
3864     tl=get_reg(i_regs->regmap,rt1[i]);
3865     //assert(tl>=0);
3866     if(tl>=0) {
3867       sl=get_reg(i_regs->regmap,rs1[i]);
3868       if(sl>=0) emit_mov(sl,tl);
3869       else emit_loadreg(rs1[i],tl);
3870     }
3871   }
3872   if (rs1[i] == HIREG || rs1[i] == LOREG) // MFHI/MFLO
3873     multdiv_do_stall(i, i_regs);
3874 }
3875
3876 // call interpreter, exception handler, things that change pc/regs/cycles ...
3877 static void call_c_cpu_handler(int i, const struct regstat *i_regs, u_int pc, void *func)
3878 {
3879   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3880   assert(ccreg==HOST_CCREG);
3881   assert(!is_delayslot);
3882   (void)ccreg;
3883
3884   emit_movimm(pc,3); // Get PC
3885   emit_readword(&last_count,2);
3886   emit_writeword(3,&psxRegs.pc);
3887   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3888   emit_add(2,HOST_CCREG,2);
3889   emit_writeword(2,&psxRegs.cycle);
3890   emit_far_call(func);
3891   emit_far_jump(jump_to_new_pc);
3892 }
3893
3894 static void syscall_assemble(int i,struct regstat *i_regs)
3895 {
3896   emit_movimm(0x20,0); // cause code
3897   emit_movimm(0,1);    // not in delay slot
3898   call_c_cpu_handler(i,i_regs,start+i*4,psxException);
3899 }
3900
3901 static void hlecall_assemble(int i,struct regstat *i_regs)
3902 {
3903   void *hlefunc = psxNULL;
3904   uint32_t hleCode = source[i] & 0x03ffffff;
3905   if (hleCode < ARRAY_SIZE(psxHLEt))
3906     hlefunc = psxHLEt[hleCode];
3907
3908   call_c_cpu_handler(i,i_regs,start+i*4+4,hlefunc);
3909 }
3910
3911 static void intcall_assemble(int i,struct regstat *i_regs)
3912 {
3913   call_c_cpu_handler(i,i_regs,start+i*4,execI);
3914 }
3915
3916 static void speculate_mov(int rs,int rt)
3917 {
3918   if(rt!=0) {
3919     smrv_strong_next|=1<<rt;
3920     smrv[rt]=smrv[rs];
3921   }
3922 }
3923
3924 static void speculate_mov_weak(int rs,int rt)
3925 {
3926   if(rt!=0) {
3927     smrv_weak_next|=1<<rt;
3928     smrv[rt]=smrv[rs];
3929   }
3930 }
3931
3932 static void speculate_register_values(int i)
3933 {
3934   if(i==0) {
3935     memcpy(smrv,psxRegs.GPR.r,sizeof(smrv));
3936     // gp,sp are likely to stay the same throughout the block
3937     smrv_strong_next=(1<<28)|(1<<29)|(1<<30);
3938     smrv_weak_next=~smrv_strong_next;
3939     //printf(" llr %08x\n", smrv[4]);
3940   }
3941   smrv_strong=smrv_strong_next;
3942   smrv_weak=smrv_weak_next;
3943   switch(itype[i]) {
3944     case ALU:
3945       if     ((smrv_strong>>rs1[i])&1) speculate_mov(rs1[i],rt1[i]);
3946       else if((smrv_strong>>rs2[i])&1) speculate_mov(rs2[i],rt1[i]);
3947       else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]);
3948       else if((smrv_weak>>rs2[i])&1) speculate_mov_weak(rs2[i],rt1[i]);
3949       else {
3950         smrv_strong_next&=~(1<<rt1[i]);
3951         smrv_weak_next&=~(1<<rt1[i]);
3952       }
3953       break;
3954     case SHIFTIMM:
3955       smrv_strong_next&=~(1<<rt1[i]);
3956       smrv_weak_next&=~(1<<rt1[i]);
3957       // fallthrough
3958     case IMM16:
3959       if(rt1[i]&&is_const(&regs[i],rt1[i])) {
3960         int value,hr=get_reg(regs[i].regmap,rt1[i]);
3961         if(hr>=0) {
3962           if(get_final_value(hr,i,&value))
3963                smrv[rt1[i]]=value;
3964           else smrv[rt1[i]]=constmap[i][hr];
3965           smrv_strong_next|=1<<rt1[i];
3966         }
3967       }
3968       else {
3969         if     ((smrv_strong>>rs1[i])&1) speculate_mov(rs1[i],rt1[i]);
3970         else if((smrv_weak>>rs1[i])&1) speculate_mov_weak(rs1[i],rt1[i]);
3971       }
3972       break;
3973     case LOAD:
3974       if(start<0x2000&&(rt1[i]==26||(smrv[rt1[i]]>>24)==0xa0)) {
3975         // special case for BIOS
3976         smrv[rt1[i]]=0xa0000000;
3977         smrv_strong_next|=1<<rt1[i];
3978         break;
3979       }
3980       // fallthrough
3981     case SHIFT:
3982     case LOADLR:
3983     case MOV:
3984       smrv_strong_next&=~(1<<rt1[i]);
3985       smrv_weak_next&=~(1<<rt1[i]);
3986       break;
3987     case COP0:
3988     case COP2:
3989       if(opcode2[i]==0||opcode2[i]==2) { // MFC/CFC
3990         smrv_strong_next&=~(1<<rt1[i]);
3991         smrv_weak_next&=~(1<<rt1[i]);
3992       }
3993       break;
3994     case C2LS:
3995       if (opcode[i]==0x32) { // LWC2
3996         smrv_strong_next&=~(1<<rt1[i]);
3997         smrv_weak_next&=~(1<<rt1[i]);
3998       }
3999       break;
4000   }
4001 #if 0
4002   int r=4;
4003   printf("x %08x %08x %d %d c %08x %08x\n",smrv[r],start+i*4,
4004     ((smrv_strong>>r)&1),(smrv_weak>>r)&1,regs[i].isconst,regs[i].wasconst);
4005 #endif
4006 }
4007
4008 static void ds_assemble(int i,struct regstat *i_regs)
4009 {
4010   speculate_register_values(i);
4011   is_delayslot=1;
4012   switch(itype[i]) {
4013     case ALU:
4014       alu_assemble(i,i_regs);break;
4015     case IMM16:
4016       imm16_assemble(i,i_regs);break;
4017     case SHIFT:
4018       shift_assemble(i,i_regs);break;
4019     case SHIFTIMM:
4020       shiftimm_assemble(i,i_regs);break;
4021     case LOAD:
4022       load_assemble(i,i_regs);break;
4023     case LOADLR:
4024       loadlr_assemble(i,i_regs);break;
4025     case STORE:
4026       store_assemble(i,i_regs);break;
4027     case STORELR:
4028       storelr_assemble(i,i_regs);break;
4029     case COP0:
4030       cop0_assemble(i,i_regs);break;
4031     case COP1:
4032       cop1_assemble(i,i_regs);break;
4033     case C1LS:
4034       c1ls_assemble(i,i_regs);break;
4035     case COP2:
4036       cop2_assemble(i,i_regs);break;
4037     case C2LS:
4038       c2ls_assemble(i,i_regs);break;
4039     case C2OP:
4040       c2op_assemble(i,i_regs);break;
4041     case MULTDIV:
4042       multdiv_assemble(i,i_regs);
4043       multdiv_prepare_stall(i,i_regs);
4044       break;
4045     case MOV:
4046       mov_assemble(i,i_regs);break;
4047     case SYSCALL:
4048     case HLECALL:
4049     case INTCALL:
4050     case SPAN:
4051     case UJUMP:
4052     case RJUMP:
4053     case CJUMP:
4054     case SJUMP:
4055       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4056   }
4057   is_delayslot=0;
4058 }
4059
4060 // Is the branch target a valid internal jump?
4061 static int internal_branch(int addr)
4062 {
4063   if(addr&1) return 0; // Indirect (register) jump
4064   if(addr>=start && addr<start+slen*4-4)
4065   {
4066     return 1;
4067   }
4068   return 0;
4069 }
4070
4071 static void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t u)
4072 {
4073   int hr;
4074   for(hr=0;hr<HOST_REGS;hr++) {
4075     if(hr!=EXCLUDE_REG) {
4076       if(pre[hr]!=entry[hr]) {
4077         if(pre[hr]>=0) {
4078           if((dirty>>hr)&1) {
4079             if(get_reg(entry,pre[hr])<0) {
4080               assert(pre[hr]<64);
4081               if(!((u>>pre[hr])&1))
4082                 emit_storereg(pre[hr],hr);
4083             }
4084           }
4085         }
4086       }
4087     }
4088   }
4089   // Move from one register to another (no writeback)
4090   for(hr=0;hr<HOST_REGS;hr++) {
4091     if(hr!=EXCLUDE_REG) {
4092       if(pre[hr]!=entry[hr]) {
4093         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
4094           int nr;
4095           if((nr=get_reg(entry,pre[hr]))>=0) {
4096             emit_mov(hr,nr);
4097           }
4098         }
4099       }
4100     }
4101   }
4102 }
4103
4104 // Load the specified registers
4105 // This only loads the registers given as arguments because
4106 // we don't want to load things that will be overwritten
4107 static void load_regs(signed char entry[],signed char regmap[],int rs1,int rs2)
4108 {
4109   int hr;
4110   // Load 32-bit regs
4111   for(hr=0;hr<HOST_REGS;hr++) {
4112     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4113       if(entry[hr]!=regmap[hr]) {
4114         if(regmap[hr]==rs1||regmap[hr]==rs2)
4115         {
4116           if(regmap[hr]==0) {
4117             emit_zeroreg(hr);
4118           }
4119           else
4120           {
4121             emit_loadreg(regmap[hr],hr);
4122           }
4123         }
4124       }
4125     }
4126   }
4127 }
4128
4129 // Load registers prior to the start of a loop
4130 // so that they are not loaded within the loop
4131 static void loop_preload(signed char pre[],signed char entry[])
4132 {
4133   int hr;
4134   for(hr=0;hr<HOST_REGS;hr++) {
4135     if(hr!=EXCLUDE_REG) {
4136       if(pre[hr]!=entry[hr]) {
4137         if(entry[hr]>=0) {
4138           if(get_reg(pre,entry[hr])<0) {
4139             assem_debug("loop preload:\n");
4140             //printf("loop preload: %d\n",hr);
4141             if(entry[hr]==0) {
4142               emit_zeroreg(hr);
4143             }
4144             else if(entry[hr]<TEMPREG)
4145             {
4146               emit_loadreg(entry[hr],hr);
4147             }
4148             else if(entry[hr]-64<TEMPREG)
4149             {
4150               emit_loadreg(entry[hr],hr);
4151             }
4152           }
4153         }
4154       }
4155     }
4156   }
4157 }
4158
4159 // Generate address for load/store instruction
4160 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
4161 void address_generation(int i,struct regstat *i_regs,signed char entry[])
4162 {
4163   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
4164     int ra=-1;
4165     int agr=AGEN1+(i&1);
4166     if(itype[i]==LOAD) {
4167       ra=get_reg(i_regs->regmap,rt1[i]);
4168       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4169       assert(ra>=0);
4170     }
4171     if(itype[i]==LOADLR) {
4172       ra=get_reg(i_regs->regmap,FTEMP);
4173     }
4174     if(itype[i]==STORE||itype[i]==STORELR) {
4175       ra=get_reg(i_regs->regmap,agr);
4176       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4177     }
4178     if(itype[i]==C1LS||itype[i]==C2LS) {
4179       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
4180         ra=get_reg(i_regs->regmap,FTEMP);
4181       else { // SWC1/SDC1/SWC2/SDC2
4182         ra=get_reg(i_regs->regmap,agr);
4183         if(ra<0) ra=get_reg(i_regs->regmap,-1);
4184       }
4185     }
4186     int rs=get_reg(i_regs->regmap,rs1[i]);
4187     if(ra>=0) {
4188       int offset=imm[i];
4189       int c=(i_regs->wasconst>>rs)&1;
4190       if(rs1[i]==0) {
4191         // Using r0 as a base address
4192         if(!entry||entry[ra]!=agr) {
4193           if (opcode[i]==0x22||opcode[i]==0x26) {
4194             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4195           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4196             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4197           }else{
4198             emit_movimm(offset,ra);
4199           }
4200         } // else did it in the previous cycle
4201       }
4202       else if(rs<0) {
4203         if(!entry||entry[ra]!=rs1[i])
4204           emit_loadreg(rs1[i],ra);
4205         //if(!entry||entry[ra]!=rs1[i])
4206         //  printf("poor load scheduling!\n");
4207       }
4208       else if(c) {
4209         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4210           if(!entry||entry[ra]!=agr) {
4211             if (opcode[i]==0x22||opcode[i]==0x26) {
4212               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4213             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4214               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4215             }else{
4216               emit_movimm(constmap[i][rs]+offset,ra);
4217               regs[i].loadedconst|=1<<ra;
4218             }
4219           } // else did it in the previous cycle
4220         } // else load_consts already did it
4221       }
4222       if(offset&&!c&&rs1[i]) {
4223         if(rs>=0) {
4224           emit_addimm(rs,offset,ra);
4225         }else{
4226           emit_addimm(ra,offset,ra);
4227         }
4228       }
4229     }
4230   }
4231   // Preload constants for next instruction
4232   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4233     int agr,ra;
4234     // Actual address
4235     agr=AGEN1+((i+1)&1);
4236     ra=get_reg(i_regs->regmap,agr);
4237     if(ra>=0) {
4238       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4239       int offset=imm[i+1];
4240       int c=(regs[i+1].wasconst>>rs)&1;
4241       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4242         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4243           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4244         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4245           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4246         }else{
4247           emit_movimm(constmap[i+1][rs]+offset,ra);
4248           regs[i+1].loadedconst|=1<<ra;
4249         }
4250       }
4251       else if(rs1[i+1]==0) {
4252         // Using r0 as a base address
4253         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4254           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4255         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4256           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4257         }else{
4258           emit_movimm(offset,ra);
4259         }
4260       }
4261     }
4262   }
4263 }
4264
4265 static int get_final_value(int hr, int i, int *value)
4266 {
4267   int reg=regs[i].regmap[hr];
4268   while(i<slen-1) {
4269     if(regs[i+1].regmap[hr]!=reg) break;
4270     if(!((regs[i+1].isconst>>hr)&1)) break;
4271     if(bt[i+1]) break;
4272     i++;
4273   }
4274   if(i<slen-1) {
4275     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4276       *value=constmap[i][hr];
4277       return 1;
4278     }
4279     if(!bt[i+1]) {
4280       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4281         // Load in delay slot, out-of-order execution
4282         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4283         {
4284           // Precompute load address
4285           *value=constmap[i][hr]+imm[i+2];
4286           return 1;
4287         }
4288       }
4289       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4290       {
4291         // Precompute load address
4292         *value=constmap[i][hr]+imm[i+1];
4293         //printf("c=%x imm=%lx\n",(long)constmap[i][hr],imm[i+1]);
4294         return 1;
4295       }
4296     }
4297   }
4298   *value=constmap[i][hr];
4299   //printf("c=%lx\n",(long)constmap[i][hr]);
4300   if(i==slen-1) return 1;
4301   assert(reg < 64);
4302   return !((unneeded_reg[i+1]>>reg)&1);
4303 }
4304
4305 // Load registers with known constants
4306 static void load_consts(signed char pre[],signed char regmap[],int i)
4307 {
4308   int hr,hr2;
4309   // propagate loaded constant flags
4310   if(i==0||bt[i])
4311     regs[i].loadedconst=0;
4312   else {
4313     for(hr=0;hr<HOST_REGS;hr++) {
4314       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
4315          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
4316       {
4317         regs[i].loadedconst|=1<<hr;
4318       }
4319     }
4320   }
4321   // Load 32-bit regs
4322   for(hr=0;hr<HOST_REGS;hr++) {
4323     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4324       //if(entry[hr]!=regmap[hr]) {
4325       if(!((regs[i].loadedconst>>hr)&1)) {
4326         assert(regmap[hr]<64);
4327         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
4328           int value,similar=0;
4329           if(get_final_value(hr,i,&value)) {
4330             // see if some other register has similar value
4331             for(hr2=0;hr2<HOST_REGS;hr2++) {
4332               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
4333                 if(is_similar_value(value,constmap[i][hr2])) {
4334                   similar=1;
4335                   break;
4336                 }
4337               }
4338             }
4339             if(similar) {
4340               int value2;
4341               if(get_final_value(hr2,i,&value2)) // is this needed?
4342                 emit_movimm_from(value2,hr2,value,hr);
4343               else
4344                 emit_movimm(value,hr);
4345             }
4346             else if(value==0) {
4347               emit_zeroreg(hr);
4348             }
4349             else {
4350               emit_movimm(value,hr);
4351             }
4352           }
4353           regs[i].loadedconst|=1<<hr;
4354         }
4355       }
4356     }
4357   }
4358 }
4359
4360 void load_all_consts(signed char regmap[], u_int dirty, int i)
4361 {
4362   int hr;
4363   // Load 32-bit regs
4364   for(hr=0;hr<HOST_REGS;hr++) {
4365     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4366       assert(regmap[hr] < 64);
4367       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
4368         int value=constmap[i][hr];
4369         if(value==0) {
4370           emit_zeroreg(hr);
4371         }
4372         else {
4373           emit_movimm(value,hr);
4374         }
4375       }
4376     }
4377   }
4378 }
4379
4380 // Write out all dirty registers (except cycle count)
4381 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty)
4382 {
4383   int hr;
4384   for(hr=0;hr<HOST_REGS;hr++) {
4385     if(hr!=EXCLUDE_REG) {
4386       if(i_regmap[hr]>0) {
4387         if(i_regmap[hr]!=CCREG) {
4388           if((i_dirty>>hr)&1) {
4389             assert(i_regmap[hr]<64);
4390             emit_storereg(i_regmap[hr],hr);
4391           }
4392         }
4393       }
4394     }
4395   }
4396 }
4397
4398 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4399 // This writes the registers not written by store_regs_bt
4400 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr)
4401 {
4402   int hr;
4403   int t=(addr-start)>>2;
4404   for(hr=0;hr<HOST_REGS;hr++) {
4405     if(hr!=EXCLUDE_REG) {
4406       if(i_regmap[hr]>0) {
4407         if(i_regmap[hr]!=CCREG) {
4408           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1)) {
4409             if((i_dirty>>hr)&1) {
4410               assert(i_regmap[hr]<64);
4411               emit_storereg(i_regmap[hr],hr);
4412             }
4413           }
4414         }
4415       }
4416     }
4417   }
4418 }
4419
4420 // Load all registers (except cycle count)
4421 void load_all_regs(signed char i_regmap[])
4422 {
4423   int hr;
4424   for(hr=0;hr<HOST_REGS;hr++) {
4425     if(hr!=EXCLUDE_REG) {
4426       if(i_regmap[hr]==0) {
4427         emit_zeroreg(hr);
4428       }
4429       else
4430       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4431       {
4432         emit_loadreg(i_regmap[hr],hr);
4433       }
4434     }
4435   }
4436 }
4437
4438 // Load all current registers also needed by next instruction
4439 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4440 {
4441   int hr;
4442   for(hr=0;hr<HOST_REGS;hr++) {
4443     if(hr!=EXCLUDE_REG) {
4444       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4445         if(i_regmap[hr]==0) {
4446           emit_zeroreg(hr);
4447         }
4448         else
4449         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4450         {
4451           emit_loadreg(i_regmap[hr],hr);
4452         }
4453       }
4454     }
4455   }
4456 }
4457
4458 // Load all regs, storing cycle count if necessary
4459 void load_regs_entry(int t)
4460 {
4461   int hr;
4462   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4463   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4464   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4465     emit_storereg(CCREG,HOST_CCREG);
4466   }
4467   // Load 32-bit regs
4468   for(hr=0;hr<HOST_REGS;hr++) {
4469     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4470       if(regs[t].regmap_entry[hr]==0) {
4471         emit_zeroreg(hr);
4472       }
4473       else if(regs[t].regmap_entry[hr]!=CCREG)
4474       {
4475         emit_loadreg(regs[t].regmap_entry[hr],hr);
4476       }
4477     }
4478   }
4479 }
4480
4481 // Store dirty registers prior to branch
4482 void store_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4483 {
4484   if(internal_branch(addr))
4485   {
4486     int t=(addr-start)>>2;
4487     int hr;
4488     for(hr=0;hr<HOST_REGS;hr++) {
4489       if(hr!=EXCLUDE_REG) {
4490         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4491           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1)) {
4492             if((i_dirty>>hr)&1) {
4493               assert(i_regmap[hr]<64);
4494               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4495                 emit_storereg(i_regmap[hr],hr);
4496             }
4497           }
4498         }
4499       }
4500     }
4501   }
4502   else
4503   {
4504     // Branch out of this block, write out all dirty regs
4505     wb_dirtys(i_regmap,i_dirty);
4506   }
4507 }
4508
4509 // Load all needed registers for branch target
4510 static void load_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4511 {
4512   //if(addr>=start && addr<(start+slen*4))
4513   if(internal_branch(addr))
4514   {
4515     int t=(addr-start)>>2;
4516     int hr;
4517     // Store the cycle count before loading something else
4518     if(i_regmap[HOST_CCREG]!=CCREG) {
4519       assert(i_regmap[HOST_CCREG]==-1);
4520     }
4521     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4522       emit_storereg(CCREG,HOST_CCREG);
4523     }
4524     // Load 32-bit regs
4525     for(hr=0;hr<HOST_REGS;hr++) {
4526       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4527         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4528           if(regs[t].regmap_entry[hr]==0) {
4529             emit_zeroreg(hr);
4530           }
4531           else if(regs[t].regmap_entry[hr]!=CCREG)
4532           {
4533             emit_loadreg(regs[t].regmap_entry[hr],hr);
4534           }
4535         }
4536       }
4537     }
4538   }
4539 }
4540
4541 static int match_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
4542 {
4543   if(addr>=start && addr<start+slen*4-4)
4544   {
4545     int t=(addr-start)>>2;
4546     int hr;
4547     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4548     for(hr=0;hr<HOST_REGS;hr++)
4549     {
4550       if(hr!=EXCLUDE_REG)
4551       {
4552         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4553         {
4554           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4555           {
4556             return 0;
4557           }
4558           else
4559           if((i_dirty>>hr)&1)
4560           {
4561             if(i_regmap[hr]<TEMPREG)
4562             {
4563               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4564                 return 0;
4565             }
4566             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4567             {
4568               assert(0);
4569             }
4570           }
4571         }
4572         else // Same register but is it 32-bit or dirty?
4573         if(i_regmap[hr]>=0)
4574         {
4575           if(!((regs[t].dirty>>hr)&1))
4576           {
4577             if((i_dirty>>hr)&1)
4578             {
4579               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4580               {
4581                 //printf("%x: dirty no match\n",addr);
4582                 return 0;
4583               }
4584             }
4585           }
4586         }
4587       }
4588     }
4589     // Delay slots are not valid branch targets
4590     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP)) return 0;
4591     // Delay slots require additional processing, so do not match
4592     if(is_ds[t]) return 0;
4593   }
4594   else
4595   {
4596     int hr;
4597     for(hr=0;hr<HOST_REGS;hr++)
4598     {
4599       if(hr!=EXCLUDE_REG)
4600       {
4601         if(i_regmap[hr]>=0)
4602         {
4603           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4604           {
4605             if((i_dirty>>hr)&1)
4606             {
4607               return 0;
4608             }
4609           }
4610         }
4611       }
4612     }
4613   }
4614   return 1;
4615 }
4616
#ifdef DRC_DBG
// Debug aid: emit a call to do_insn_cmp() before instruction i.
// Host registers are saved around the call, constants that changed in the
// previous instruction are written out first, and pcaddr is set to the
// address of instruction i.
static void drc_dbg_emit_do_cmp(int i)
{
  extern void do_insn_cmp();
  //extern int cycle;
  u_int hr, reglist = get_host_reglist(regs[i].regmap);

  assem_debug("//do_insn_cmp %08x\n", start+i*4);
  save_regs(reglist);
  // write out changed consts to match the interpreter
  if (i > 0 && !bt[i]) {
    for (hr = 0; hr < HOST_REGS; hr++) {
      int reg = regs[i-1].regmap[hr];
      if (hr != EXCLUDE_REG && reg >= 0 && ((regs[i-1].isconst >> hr) & 1)) {
        // skip constants that were already the same one instruction earlier
        int unchanged = i > 1 && reg == regs[i-2].regmap[hr]
          && constmap[i-1][hr] == constmap[i-2][hr];
        if (!unchanged) {
          emit_movimm(constmap[i-1][hr],0);
          emit_storereg(reg, 0);
        }
      }
    }
  }
  emit_movimm(start+i*4,0);
  emit_writeword(0,&pcaddr);
  emit_far_call(do_insn_cmp);
  //emit_readword(&cycle,0);
  //emit_addimm(0,2,0);
  //emit_writeword(0,&cycle);
  (void)get_reg2;
  restore_regs(reglist);
  assem_debug("\\\\do_insn_cmp\n");
}
#else
#define drc_dbg_emit_do_cmp(x)
#endif
4653
4654 // Used when a branch jumps into the delay slot of another branch
4655 static void ds_assemble_entry(int i)
4656 {
4657   int t=(ba[i]-start)>>2;
4658   if (!instr_addr[t])
4659     instr_addr[t] = out;
4660   assem_debug("Assemble delay slot at %x\n",ba[i]);
4661   assem_debug("<->\n");
4662   drc_dbg_emit_do_cmp(t);
4663   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4664     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty);
4665   load_regs(regs[t].regmap_entry,regs[t].regmap,rs1[t],rs2[t]);
4666   address_generation(t,&regs[t],regs[t].regmap_entry);
4667   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4668     load_regs(regs[t].regmap_entry,regs[t].regmap,INVCP,INVCP);
4669   is_delayslot=0;
4670   switch(itype[t]) {
4671     case ALU:
4672       alu_assemble(t,&regs[t]);break;
4673     case IMM16:
4674       imm16_assemble(t,&regs[t]);break;
4675     case SHIFT:
4676       shift_assemble(t,&regs[t]);break;
4677     case SHIFTIMM:
4678       shiftimm_assemble(t,&regs[t]);break;
4679     case LOAD:
4680       load_assemble(t,&regs[t]);break;
4681     case LOADLR:
4682       loadlr_assemble(t,&regs[t]);break;
4683     case STORE:
4684       store_assemble(t,&regs[t]);break;
4685     case STORELR:
4686       storelr_assemble(t,&regs[t]);break;
4687     case COP0:
4688       cop0_assemble(t,&regs[t]);break;
4689     case COP1:
4690       cop1_assemble(t,&regs[t]);break;
4691     case C1LS:
4692       c1ls_assemble(t,&regs[t]);break;
4693     case COP2:
4694       cop2_assemble(t,&regs[t]);break;
4695     case C2LS:
4696       c2ls_assemble(t,&regs[t]);break;
4697     case C2OP:
4698       c2op_assemble(t,&regs[t]);break;
4699     case MULTDIV:
4700       multdiv_assemble(t,&regs[t]);
4701       multdiv_prepare_stall(i,&regs[t]);
4702       break;
4703     case MOV:
4704       mov_assemble(t,&regs[t]);break;
4705     case SYSCALL:
4706     case HLECALL:
4707     case INTCALL:
4708     case SPAN:
4709     case UJUMP:
4710     case RJUMP:
4711     case CJUMP:
4712     case SJUMP:
4713       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4714   }
4715   store_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
4716   load_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
4717   if(internal_branch(ba[i]+4))
4718     assem_debug("branch: internal\n");
4719   else
4720     assem_debug("branch: external\n");
4721   assert(internal_branch(ba[i]+4));
4722   add_to_linker(out,ba[i]+4,internal_branch(ba[i]+4));
4723   emit_jmp(0);
4724 }
4725
// Emit an external jump for `target` at `addr` by forwarding to
// emit_extjump2() with dyna_linker as the linker routine.
4726 static void emit_extjump(void *addr, u_int target)
4727 {
4728   emit_extjump2(addr, target, dyna_linker);
4729 }
4730
// Same as emit_extjump(), but passes dyna_linker_ds to emit_extjump2()
// (presumably the linker variant for delay slot targets - confirm there).
4731 static void emit_extjump_ds(void *addr, u_int target)
4732 {
4733   emit_extjump2(addr, target, dyna_linker_ds);
4734 }
4735
4736 // Load 2 immediates optimizing for small code size
// imm1 is loaded into rt1 first; imm2 is then loaded into rt2 through
// emit_movimm_from(), which is given imm1/rt1 as the already loaded value
// (presumably so imm2 can be derived from rt1 - see emit_movimm_from).
4737 static void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2)
4738 {
4739   emit_movimm(imm1,rt1);
4740   emit_movimm_from(imm1,rt1,imm2,rt2);
4741 }
4742
4743 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4744 {
4745   int count;
4746   void *jaddr;
4747   void *idle=NULL;
4748   int t=0;
4749   if(itype[i]==RJUMP)
4750   {
4751     *adj=0;
4752   }
4753   //if(ba[i]>=start && ba[i]<(start+slen*4))
4754   if(internal_branch(ba[i]))
4755   {
4756     t=(ba[i]-start)>>2;
4757     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4758     else *adj=ccadj[t];
4759   }
4760   else
4761   {
4762     *adj=0;
4763   }
4764   count=ccadj[i];
4765   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4766     // Idle loop
4767     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4768     idle=out;
4769     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4770     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4771     jaddr=out;
4772     emit_jmp(0);
4773   }
4774   else if(*adj==0||invert) {
4775     int cycles=CLOCK_ADJUST(count+2);
4776     // faster loop HACK
4777 #if 0
4778     if (t&&*adj) {
4779       int rel=t-i;
4780       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4781         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4782     }
4783 #endif
4784     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4785     jaddr=out;
4786     emit_jns(0);
4787   }
4788   else
4789   {
4790     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4791     jaddr=out;
4792     emit_jns(0);
4793   }
4794   add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4795 }
4796
4797 static void do_ccstub(int n)
4798 {
4799   literal_pool(256);
4800   assem_debug("do_ccstub %x\n",start+(u_int)stubs[n].b*4);
4801   set_jump_target(stubs[n].addr, out);
4802   int i=stubs[n].b;
4803   if(stubs[n].d==NULLDS) {
4804     // Delay slot instruction is nullified ("likely" branch)
4805     wb_dirtys(regs[i].regmap,regs[i].dirty);
4806   }
4807   else if(stubs[n].d!=TAKEN) {
4808     wb_dirtys(branch_regs[i].regmap,branch_regs[i].dirty);
4809   }
4810   else {
4811     if(internal_branch(ba[i]))
4812       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4813   }
4814   if(stubs[n].c!=-1)
4815   {
4816     // Save PC as return address
4817     emit_movimm(stubs[n].c,EAX);
4818     emit_writeword(EAX,&pcaddr);
4819   }
4820   else
4821   {
4822     // Return address depends on which way the branch goes
4823     if(itype[i]==CJUMP||itype[i]==SJUMP)
4824     {
4825       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4826       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4827       if(rs1[i]==0)
4828       {
4829         s1l=s2l;
4830         s2l=-1;
4831       }
4832       else if(rs2[i]==0)
4833       {
4834         s2l=-1;
4835       }
4836       assert(s1l>=0);
4837       #ifdef DESTRUCTIVE_WRITEBACK
4838       if(rs1[i]) {
4839         if((branch_regs[i].dirty>>s1l)&&1)
4840           emit_loadreg(rs1[i],s1l);
4841       }
4842       else {
4843         if((branch_regs[i].dirty>>s1l)&1)
4844           emit_loadreg(rs2[i],s1l);
4845       }
4846       if(s2l>=0)
4847         if((branch_regs[i].dirty>>s2l)&1)
4848           emit_loadreg(rs2[i],s2l);
4849       #endif
4850       int hr=0;
4851       int addr=-1,alt=-1,ntaddr=-1;
4852       while(hr<HOST_REGS)
4853       {
4854         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4855            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4856            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4857         {
4858           addr=hr++;break;
4859         }
4860         hr++;
4861       }
4862       while(hr<HOST_REGS)
4863       {
4864         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4865            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4866            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4867         {
4868           alt=hr++;break;
4869         }
4870         hr++;
4871       }
4872       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4873       {
4874         while(hr<HOST_REGS)
4875         {
4876           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4877              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4878              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4879           {
4880             ntaddr=hr;break;
4881           }
4882           hr++;
4883         }
4884         assert(hr<HOST_REGS);
4885       }
4886       if((opcode[i]&0x2f)==4) // BEQ
4887       {
4888         #ifdef HAVE_CMOV_IMM
4889         if(s2l>=0) emit_cmp(s1l,s2l);
4890         else emit_test(s1l,s1l);
4891         emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4892         #else
4893         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4894         if(s2l>=0) emit_cmp(s1l,s2l);
4895         else emit_test(s1l,s1l);
4896         emit_cmovne_reg(alt,addr);
4897         #endif
4898       }
4899       if((opcode[i]&0x2f)==5) // BNE
4900       {
4901         #ifdef HAVE_CMOV_IMM
4902         if(s2l>=0) emit_cmp(s1l,s2l);
4903         else emit_test(s1l,s1l);
4904         emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4905         #else
4906         emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4907         if(s2l>=0) emit_cmp(s1l,s2l);
4908         else emit_test(s1l,s1l);
4909         emit_cmovne_reg(alt,addr);
4910         #endif
4911       }
4912       if((opcode[i]&0x2f)==6) // BLEZ
4913       {
4914         //emit_movimm(ba[i],alt);
4915         //emit_movimm(start+i*4+8,addr);
4916         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4917         emit_cmpimm(s1l,1);
4918         emit_cmovl_reg(alt,addr);
4919       }
4920       if((opcode[i]&0x2f)==7) // BGTZ
4921       {
4922         //emit_movimm(ba[i],addr);
4923         //emit_movimm(start+i*4+8,ntaddr);
4924         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4925         emit_cmpimm(s1l,1);
4926         emit_cmovl_reg(ntaddr,addr);
4927       }
4928       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4929       {
4930         //emit_movimm(ba[i],alt);
4931         //emit_movimm(start+i*4+8,addr);
4932         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4933         emit_test(s1l,s1l);
4934         emit_cmovs_reg(alt,addr);
4935       }
4936       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4937       {
4938         //emit_movimm(ba[i],addr);
4939         //emit_movimm(start+i*4+8,alt);
4940         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4941         emit_test(s1l,s1l);
4942         emit_cmovs_reg(alt,addr);
4943       }
4944       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4945         if(source[i]&0x10000) // BC1T
4946         {
4947           //emit_movimm(ba[i],alt);
4948           //emit_movimm(start+i*4+8,addr);
4949           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4950           emit_testimm(s1l,0x800000);
4951           emit_cmovne_reg(alt,addr);
4952         }
4953         else // BC1F
4954         {
4955           //emit_movimm(ba[i],addr);
4956           //emit_movimm(start+i*4+8,alt);
4957           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4958           emit_testimm(s1l,0x800000);
4959           emit_cmovne_reg(alt,addr);
4960         }
4961       }
4962       emit_writeword(addr,&pcaddr);
4963     }
4964     else
4965     if(itype[i]==RJUMP)
4966     {
4967       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4968       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4969         r=get_reg(branch_regs[i].regmap,RTEMP);
4970       }
4971       emit_writeword(r,&pcaddr);
4972     }
4973     else {SysPrintf("Unknown branch type in do_ccstub\n");abort();}
4974   }
4975   // Update cycle count
4976   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4977   if(stubs[n].a) emit_addimm(HOST_CCREG,CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4978   emit_far_call(cc_interrupt);
4979   if(stubs[n].a) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4980   if(stubs[n].d==TAKEN) {
4981     if(internal_branch(ba[i]))
4982       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4983     else if(itype[i]==RJUMP) {
4984       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4985         emit_readword(&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4986       else
4987         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4988     }
4989   }else if(stubs[n].d==NOTTAKEN) {
4990     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4991     else load_all_regs(branch_regs[i].regmap);
4992   }else if(stubs[n].d==NULLDS) {
4993     // Delay slot instruction is nullified ("likely" branch)
4994     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4995     else load_all_regs(regs[i].regmap);
4996   }else{
4997     load_all_regs(branch_regs[i].regmap);
4998   }
4999   if (stubs[n].retaddr)
5000     emit_jmp(stubs[n].retaddr);
5001   else
5002     do_jump_vaddr(stubs[n].e);
5003 }
5004
5005 static void add_to_linker(void *addr, u_int target, int ext)
5006 {
5007   assert(linkcount < ARRAY_SIZE(link_addr));
5008   link_addr[linkcount].addr = addr;
5009   link_addr[linkcount].target = target;
5010   link_addr[linkcount].ext = ext;
5011   linkcount++;
5012 }
5013
5014 static void ujump_assemble_write_ra(int i)
5015 {
5016   int rt;
5017   unsigned int return_address;
5018   rt=get_reg(branch_regs[i].regmap,31);
5019   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5020   //assert(rt>=0);
5021   return_address=start+i*4+8;
5022   if(rt>=0) {
5023     #ifdef USE_MINI_HT
5024     if(internal_branch(return_address)&&rt1[i+1]!=31) {
5025       int temp=-1; // note: must be ds-safe
5026       #ifdef HOST_TEMPREG
5027       temp=HOST_TEMPREG;
5028       #endif
5029       if(temp>=0) do_miniht_insert(return_address,rt,temp);
5030       else emit_movimm(return_address,rt);
5031     }
5032     else
5033     #endif
5034     {
5035       #ifdef REG_PREFETCH
5036       if(temp>=0)
5037       {
5038         if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
5039       }
5040       #endif
5041       emit_movimm(return_address,rt); // PC into link register
5042       #ifdef IMM_PREFETCH
5043       emit_prefetch(hash_table_get(return_address));
5044       #endif
5045     }
5046   }
5047 }
5048
5049 static void ujump_assemble(int i,struct regstat *i_regs)
5050 {
5051   int ra_done=0;
5052   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5053   address_generation(i+1,i_regs,regs[i].regmap_entry);
5054   #ifdef REG_PREFETCH
5055   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5056   if(rt1[i]==31&&temp>=0)
5057   {
5058     signed char *i_regmap=i_regs->regmap;
5059     int return_address=start+i*4+8;
5060     if(get_reg(branch_regs[i].regmap,31)>0)
5061     if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
5062   }
5063   #endif
5064   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5065     ujump_assemble_write_ra(i); // writeback ra for DS
5066     ra_done=1;
5067   }
5068   ds_assemble(i+1,i_regs);
5069   uint64_t bc_unneeded=branch_regs[i].u;
5070   bc_unneeded|=1|(1LL<<rt1[i]);
5071   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
5072   load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
5073   if(!ra_done&&rt1[i]==31)
5074     ujump_assemble_write_ra(i);
5075   int cc,adj;
5076   cc=get_reg(branch_regs[i].regmap,CCREG);
5077   assert(cc==HOST_CCREG);
5078   store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5079   #ifdef REG_PREFETCH
5080   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5081   #endif
5082   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5083   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5084   load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5085   if(internal_branch(ba[i]))
5086     assem_debug("branch: internal\n");
5087   else
5088     assem_debug("branch: external\n");
5089   if(internal_branch(ba[i])&&is_ds[(ba[i]-start)>>2]) {
5090     ds_assemble_entry(i);
5091   }
5092   else {
5093     add_to_linker(out,ba[i],internal_branch(ba[i]));
5094     emit_jmp(0);
5095   }
5096 }
5097
5098 static void rjump_assemble_write_ra(int i)
5099 {
5100   int rt,return_address;
5101   assert(rt1[i+1]!=rt1[i]);
5102   assert(rt2[i+1]!=rt1[i]);
5103   rt=get_reg(branch_regs[i].regmap,rt1[i]);
5104   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5105   assert(rt>=0);
5106   return_address=start+i*4+8;
5107   #ifdef REG_PREFETCH
5108   if(temp>=0)
5109   {
5110     if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
5111   }
5112   #endif
5113   emit_movimm(return_address,rt); // PC into link register
5114   #ifdef IMM_PREFETCH
5115   emit_prefetch(hash_table_get(return_address));
5116   #endif
5117 }
5118
5119 static void rjump_assemble(int i,struct regstat *i_regs)
5120 {
5121   int temp;
5122   int rs,cc;
5123   int ra_done=0;
5124   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5125   assert(rs>=0);
5126   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5127     // Delay slot abuse, make a copy of the branch address register
5128     temp=get_reg(branch_regs[i].regmap,RTEMP);
5129     assert(temp>=0);
5130     assert(regs[i].regmap[temp]==RTEMP);
5131     emit_mov(rs,temp);
5132     rs=temp;
5133   }
5134   address_generation(i+1,i_regs,regs[i].regmap_entry);
5135   #ifdef REG_PREFETCH
5136   if(rt1[i]==31)
5137   {
5138     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5139       signed char *i_regmap=i_regs->regmap;
5140       int return_address=start+i*4+8;
5141       if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
5142     }
5143   }
5144   #endif
5145   #ifdef USE_MINI_HT
5146   if(rs1[i]==31) {
5147     int rh=get_reg(regs[i].regmap,RHASH);
5148     if(rh>=0) do_preload_rhash(rh);
5149   }
5150   #endif
5151   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5152     rjump_assemble_write_ra(i);
5153     ra_done=1;
5154   }
5155   ds_assemble(i+1,i_regs);
5156   uint64_t bc_unneeded=branch_regs[i].u;
5157   bc_unneeded|=1|(1LL<<rt1[i]);
5158   bc_unneeded&=~(1LL<<rs1[i]);
5159   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
5160   load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],CCREG);
5161   if(!ra_done&&rt1[i]!=0)
5162     rjump_assemble_write_ra(i);
5163   cc=get_reg(branch_regs[i].regmap,CCREG);
5164   assert(cc==HOST_CCREG);
5165   (void)cc;
5166   #ifdef USE_MINI_HT
5167   int rh=get_reg(branch_regs[i].regmap,RHASH);
5168   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5169   if(rs1[i]==31) {
5170     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5171     do_preload_rhtbl(ht);
5172     do_rhash(rs,rh);
5173   }
5174   #endif
5175   store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
5176   #ifdef DESTRUCTIVE_WRITEBACK
5177   if((branch_regs[i].dirty>>rs)&1) {
5178     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5179       emit_loadreg(rs1[i],rs);
5180     }
5181   }
5182   #endif
5183   #ifdef REG_PREFETCH
5184   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5185   #endif
5186   #ifdef USE_MINI_HT
5187   if(rs1[i]==31) {
5188     do_miniht_load(ht,rh);
5189   }
5190   #endif
5191   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5192   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5193   //assert(adj==0);
5194   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5195   add_stub(CC_STUB,out,NULL,0,i,-1,TAKEN,rs);
5196   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
5197     // special case for RFE
5198     emit_jmp(0);
5199   else
5200     emit_jns(0);
5201   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
5202   #ifdef USE_MINI_HT
5203   if(rs1[i]==31) {
5204     do_miniht_jump(rs,rh,ht);
5205   }
5206   else
5207   #endif
5208   {
5209     do_jump_vaddr(rs);
5210   }
5211   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5212   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5213   #endif
5214 }
5215
5216 static void cjump_assemble(int i,struct regstat *i_regs)
5217 {
5218   signed char *i_regmap=i_regs->regmap;
5219   int cc;
5220   int match;
5221   match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5222   assem_debug("match=%d\n",match);
5223   int s1l,s2l;
5224   int unconditional=0,nop=0;
5225   int invert=0;
5226   int internal=internal_branch(ba[i]);
5227   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5228   if(!match) invert=1;
5229   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5230   if(i>(ba[i]-start)>>2) invert=1;
5231   #endif
5232   #ifdef __aarch64__
5233   invert=1; // because of near cond. branches
5234   #endif
5235
5236   if(ooo[i]) {
5237     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5238     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5239   }
5240   else {
5241     s1l=get_reg(i_regmap,rs1[i]);
5242     s2l=get_reg(i_regmap,rs2[i]);
5243   }
5244   if(rs1[i]==0&&rs2[i]==0)
5245   {
5246     if(opcode[i]&1) nop=1;
5247     else unconditional=1;
5248     //assert(opcode[i]!=5);
5249     //assert(opcode[i]!=7);
5250     //assert(opcode[i]!=0x15);
5251     //assert(opcode[i]!=0x17);
5252   }
5253   else if(rs1[i]==0)
5254   {
5255     s1l=s2l;
5256     s2l=-1;
5257   }
5258   else if(rs2[i]==0)
5259   {
5260     s2l=-1;
5261   }
5262
5263   if(ooo[i]) {
5264     // Out of order execution (delay slot first)
5265     //printf("OOOE\n");
5266     address_generation(i+1,i_regs,regs[i].regmap_entry);
5267     ds_assemble(i+1,i_regs);
5268     int adj;
5269     uint64_t bc_unneeded=branch_regs[i].u;
5270     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5271     bc_unneeded|=1;
5272     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
5273     load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],rs2[i]);
5274     load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
5275     cc=get_reg(branch_regs[i].regmap,CCREG);
5276     assert(cc==HOST_CCREG);
5277     if(unconditional)
5278       store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5279     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5280     //assem_debug("cycle count (adj)\n");
5281     if(unconditional) {
5282       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5283       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5284         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5285         load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5286         if(internal)
5287           assem_debug("branch: internal\n");
5288         else
5289           assem_debug("branch: external\n");
5290         if(internal&&is_ds[(ba[i]-start)>>2]) {
5291           ds_assemble_entry(i);
5292         }
5293         else {
5294           add_to_linker(out,ba[i],internal);
5295           emit_jmp(0);
5296         }
5297         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5298         if(((u_int)out)&7) emit_addnop(0);
5299         #endif
5300       }
5301     }
5302     else if(nop) {
5303       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5304       void *jaddr=out;
5305       emit_jns(0);
5306       add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5307     }
5308     else {
5309       void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
5310       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5311       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5312
5313       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5314       assert(s1l>=0);
5315       if(opcode[i]==4) // BEQ
5316       {
5317         if(s2l>=0) emit_cmp(s1l,s2l);
5318         else emit_test(s1l,s1l);
5319         if(invert){
5320           nottaken=out;
5321           emit_jne(DJT_1);
5322         }else{
5323           add_to_linker(out,ba[i],internal);
5324           emit_jeq(0);
5325         }
5326       }
5327       if(opcode[i]==5) // BNE
5328       {
5329         if(s2l>=0) emit_cmp(s1l,s2l);
5330         else emit_test(s1l,s1l);
5331         if(invert){
5332           nottaken=out;
5333           emit_jeq(DJT_1);
5334         }else{
5335           add_to_linker(out,ba[i],internal);
5336           emit_jne(0);
5337         }
5338       }
5339       if(opcode[i]==6) // BLEZ
5340       {
5341         emit_cmpimm(s1l,1);
5342         if(invert){
5343           nottaken=out;
5344           emit_jge(DJT_1);
5345         }else{
5346           add_to_linker(out,ba[i],internal);
5347           emit_jl(0);
5348         }
5349       }
5350       if(opcode[i]==7) // BGTZ
5351       {
5352         emit_cmpimm(s1l,1);
5353         if(invert){
5354           nottaken=out;
5355           emit_jl(DJT_1);
5356         }else{
5357           add_to_linker(out,ba[i],internal);
5358           emit_jge(0);
5359         }
5360       }
5361       if(invert) {
5362         if(taken) set_jump_target(taken, out);
5363         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5364         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5365           if(adj) {
5366             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5367             add_to_linker(out,ba[i],internal);
5368           }else{
5369             emit_addnop(13);
5370             add_to_linker(out,ba[i],internal*2);
5371           }
5372           emit_jmp(0);
5373         }else
5374         #endif
5375         {
5376           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5377           store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5378           load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5379           if(internal)
5380             assem_debug("branch: internal\n");
5381           else
5382             assem_debug("branch: external\n");
5383           if(internal&&is_ds[(ba[i]-start)>>2]) {
5384             ds_assemble_entry(i);
5385           }
5386           else {
5387             add_to_linker(out,ba[i],internal);
5388             emit_jmp(0);
5389           }
5390         }
5391         set_jump_target(nottaken, out);
5392       }
5393
5394       if(nottaken1) set_jump_target(nottaken1, out);
5395       if(adj) {
5396         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5397       }
5398     } // (!unconditional)
5399   } // if(ooo)
5400   else
5401   {
5402     // In-order execution (branch first)
5403     //if(likely[i]) printf("IOL\n");
5404     //else
5405     //printf("IOE\n");
5406     void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
5407     if(!unconditional&&!nop) {
5408       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5409       assert(s1l>=0);
5410       if((opcode[i]&0x2f)==4) // BEQ
5411       {
5412         if(s2l>=0) emit_cmp(s1l,s2l);
5413         else emit_test(s1l,s1l);
5414         nottaken=out;
5415         emit_jne(DJT_2);
5416       }
5417       if((opcode[i]&0x2f)==5) // BNE
5418       {
5419         if(s2l>=0) emit_cmp(s1l,s2l);
5420         else emit_test(s1l,s1l);
5421         nottaken=out;
5422         emit_jeq(DJT_2);
5423       }
5424       if((opcode[i]&0x2f)==6) // BLEZ
5425       {
5426         emit_cmpimm(s1l,1);
5427         nottaken=out;
5428         emit_jge(DJT_2);
5429       }
5430       if((opcode[i]&0x2f)==7) // BGTZ
5431       {
5432         emit_cmpimm(s1l,1);
5433         nottaken=out;
5434         emit_jl(DJT_2);
5435       }
5436     } // if(!unconditional)
5437     int adj;
5438     uint64_t ds_unneeded=branch_regs[i].u;
5439     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5440     ds_unneeded|=1;
5441     // branch taken
5442     if(!nop) {
5443       if(taken) set_jump_target(taken, out);
5444       assem_debug("1:\n");
5445       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
5446       // load regs
5447       load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
5448       address_generation(i+1,&branch_regs[i],0);
5449       load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
5450       ds_assemble(i+1,&branch_regs[i]);
5451       cc=get_reg(branch_regs[i].regmap,CCREG);
5452       if(cc==-1) {
5453         emit_loadreg(CCREG,cc=HOST_CCREG);
5454         // CHECK: Is the following instruction (fall thru) allocated ok?
5455       }
5456       assert(cc==HOST_CCREG);
5457       store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5458       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5459       assem_debug("cycle count (adj)\n");
5460       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5461       load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5462       if(internal)
5463         assem_debug("branch: internal\n");
5464       else
5465         assem_debug("branch: external\n");
5466       if(internal&&is_ds[(ba[i]-start)>>2]) {
5467         ds_assemble_entry(i);
5468       }
5469       else {
5470         add_to_linker(out,ba[i],internal);
5471         emit_jmp(0);
5472       }
5473     }
5474     // branch not taken
5475     if(!unconditional) {
5476       if(nottaken1) set_jump_target(nottaken1, out);
5477       set_jump_target(nottaken, out);
5478       assem_debug("2:\n");
5479       if(!likely[i]) {
5480         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
5481         load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
5482         address_generation(i+1,&branch_regs[i],0);
5483         load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
5484         ds_assemble(i+1,&branch_regs[i]);
5485       }
5486       cc=get_reg(branch_regs[i].regmap,CCREG);
5487       if(cc==-1&&!likely[i]) {
5488         // Cycle count isn't in a register, temporarily load it then write it out
5489         emit_loadreg(CCREG,HOST_CCREG);
5490         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5491         void *jaddr=out;
5492         emit_jns(0);
5493         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5494         emit_storereg(CCREG,HOST_CCREG);
5495       }
5496       else{
5497         cc=get_reg(i_regmap,CCREG);
5498         assert(cc==HOST_CCREG);
5499         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5500         void *jaddr=out;
5501         emit_jns(0);
5502         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5503       }
5504     }
5505   }
5506 }
5507
5508 static void sjump_assemble(int i,struct regstat *i_regs)
5509 {
5510   signed char *i_regmap=i_regs->regmap;
5511   int cc;
5512   int match;
5513   match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5514   assem_debug("smatch=%d\n",match);
5515   int s1l;
5516   int unconditional=0,nevertaken=0;
5517   int invert=0;
5518   int internal=internal_branch(ba[i]);
5519   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5520   if(!match) invert=1;
5521   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5522   if(i>(ba[i]-start)>>2) invert=1;
5523   #endif
5524   #ifdef __aarch64__
5525   invert=1; // because of near cond. branches
5526   #endif
5527
5528   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5529   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5530
5531   if(ooo[i]) {
5532     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5533   }
5534   else {
5535     s1l=get_reg(i_regmap,rs1[i]);
5536   }
5537   if(rs1[i]==0)
5538   {
5539     if(opcode2[i]&1) unconditional=1;
5540     else nevertaken=1;
5541     // These are never taken (r0 is never less than zero)
5542     //assert(opcode2[i]!=0);
5543     //assert(opcode2[i]!=2);
5544     //assert(opcode2[i]!=0x10);
5545     //assert(opcode2[i]!=0x12);
5546   }
5547
5548   if(ooo[i]) {
5549     // Out of order execution (delay slot first)
5550     //printf("OOOE\n");
5551     address_generation(i+1,i_regs,regs[i].regmap_entry);
5552     ds_assemble(i+1,i_regs);
5553     int adj;
5554     uint64_t bc_unneeded=branch_regs[i].u;
5555     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5556     bc_unneeded|=1;
5557     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
5558     load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],rs1[i]);
5559     load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
5560     if(rt1[i]==31) {
5561       int rt,return_address;
5562       rt=get_reg(branch_regs[i].regmap,31);
5563       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5564       if(rt>=0) {
5565         // Save the PC even if the branch is not taken
5566         return_address=start+i*4+8;
5567         emit_movimm(return_address,rt); // PC into link register
5568         #ifdef IMM_PREFETCH
5569         if(!nevertaken) emit_prefetch(hash_table_get(return_address));
5570         #endif
5571       }
5572     }
5573     cc=get_reg(branch_regs[i].regmap,CCREG);
5574     assert(cc==HOST_CCREG);
5575     if(unconditional)
5576       store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5577     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5578     assem_debug("cycle count (adj)\n");
5579     if(unconditional) {
5580       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5581       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5582         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5583         load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5584         if(internal)
5585           assem_debug("branch: internal\n");
5586         else
5587           assem_debug("branch: external\n");
5588         if(internal&&is_ds[(ba[i]-start)>>2]) {
5589           ds_assemble_entry(i);
5590         }
5591         else {
5592           add_to_linker(out,ba[i],internal);
5593           emit_jmp(0);
5594         }
5595         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5596         if(((u_int)out)&7) emit_addnop(0);
5597         #endif
5598       }
5599     }
5600     else if(nevertaken) {
5601       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5602       void *jaddr=out;
5603       emit_jns(0);
5604       add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5605     }
5606     else {
5607       void *nottaken = NULL;
5608       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5609       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5610       {
5611         assert(s1l>=0);
5612         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5613         {
5614           emit_test(s1l,s1l);
5615           if(invert){
5616             nottaken=out;
5617             emit_jns(DJT_1);
5618           }else{
5619             add_to_linker(out,ba[i],internal);
5620             emit_js(0);
5621           }
5622         }
5623         if((opcode2[i]&0xf)==1) // BGEZ/BLTZAL
5624         {
5625           emit_test(s1l,s1l);
5626           if(invert){
5627             nottaken=out;
5628             emit_js(DJT_1);
5629           }else{
5630             add_to_linker(out,ba[i],internal);
5631             emit_jns(0);
5632           }
5633         }
5634       }
5635
5636       if(invert) {
5637         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5638         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5639           if(adj) {
5640             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5641             add_to_linker(out,ba[i],internal);
5642           }else{
5643             emit_addnop(13);
5644             add_to_linker(out,ba[i],internal*2);
5645           }
5646           emit_jmp(0);
5647         }else
5648         #endif
5649         {
5650           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5651           store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5652           load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5653           if(internal)
5654             assem_debug("branch: internal\n");
5655           else
5656             assem_debug("branch: external\n");
5657           if(internal&&is_ds[(ba[i]-start)>>2]) {
5658             ds_assemble_entry(i);
5659           }
5660           else {
5661             add_to_linker(out,ba[i],internal);
5662             emit_jmp(0);
5663           }
5664         }
5665         set_jump_target(nottaken, out);
5666       }
5667
5668       if(adj) {
5669         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5670       }
5671     } // (!unconditional)
5672   } // if(ooo)
5673   else
5674   {
5675     // In-order execution (branch first)
5676     //printf("IOE\n");
5677     void *nottaken = NULL;
5678     if(rt1[i]==31) {
5679       int rt,return_address;
5680       rt=get_reg(branch_regs[i].regmap,31);
5681       if(rt>=0) {
5682         // Save the PC even if the branch is not taken
5683         return_address=start+i*4+8;
5684         emit_movimm(return_address,rt); // PC into link register
5685         #ifdef IMM_PREFETCH
5686         emit_prefetch(hash_table_get(return_address));
5687         #endif
5688       }
5689     }
5690     if(!unconditional) {
5691       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5692         assert(s1l>=0);
5693         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5694         {
5695           emit_test(s1l,s1l);
5696           nottaken=out;
5697           emit_jns(DJT_1);
5698         }
5699         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5700         {
5701           emit_test(s1l,s1l);
5702           nottaken=out;
5703           emit_js(DJT_1);
5704         }
5705     } // if(!unconditional)
5706     int adj;
5707     uint64_t ds_unneeded=branch_regs[i].u;
5708     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5709     ds_unneeded|=1;
5710     // branch taken
5711     if(!nevertaken) {
5712       //assem_debug("1:\n");
5713       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
5714       // load regs
5715       load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
5716       address_generation(i+1,&branch_regs[i],0);
5717       load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
5718       ds_assemble(i+1,&branch_regs[i]);
5719       cc=get_reg(branch_regs[i].regmap,CCREG);
5720       if(cc==-1) {
5721         emit_loadreg(CCREG,cc=HOST_CCREG);
5722         // CHECK: Is the following instruction (fall thru) allocated ok?
5723       }
5724       assert(cc==HOST_CCREG);
5725       store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5726       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5727       assem_debug("cycle count (adj)\n");
5728       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5729       load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
5730       if(internal)
5731         assem_debug("branch: internal\n");
5732       else
5733         assem_debug("branch: external\n");
5734       if(internal&&is_ds[(ba[i]-start)>>2]) {
5735         ds_assemble_entry(i);
5736       }
5737       else {
5738         add_to_linker(out,ba[i],internal);
5739         emit_jmp(0);
5740       }
5741     }
5742     // branch not taken
5743     if(!unconditional) {
5744       set_jump_target(nottaken, out);
5745       assem_debug("1:\n");
5746       if(!likely[i]) {
5747         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
5748         load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
5749         address_generation(i+1,&branch_regs[i],0);
5750         load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
5751         ds_assemble(i+1,&branch_regs[i]);
5752       }
5753       cc=get_reg(branch_regs[i].regmap,CCREG);
5754       if(cc==-1&&!likely[i]) {
5755         // Cycle count isn't in a register, temporarily load it then write it out
5756         emit_loadreg(CCREG,HOST_CCREG);
5757         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5758         void *jaddr=out;
5759         emit_jns(0);
5760         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5761         emit_storereg(CCREG,HOST_CCREG);
5762       }
5763       else{
5764         cc=get_reg(i_regmap,CCREG);
5765         assert(cc==HOST_CCREG);
5766         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5767         void *jaddr=out;
5768         emit_jns(0);
5769         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5770       }
5771     }
5772   }
5773 }
5774
5775 static void pagespan_assemble(int i,struct regstat *i_regs)
5776 {
       // Emit host code for the branch/jump at instruction index i without
       // assembling its delay slot inline.  The emitted code:
       //   - for register jumps and non-likely conditional branches, computes
       //     the address to continue at after the delay slot (ba[i] or the
       //     fall-through address start+i*4+8) into a scratch host register;
       //   - writes back dirty registers (wb_dirtys);
       //   - leaves that address in HOST_BTREG (the constant ba[i] is used for
       //     likely and unconditional branches);
       //   - jumps out through an external-jump stub to address start+i*4+5.
       // i      - index of the branch within the block being compiled.
       // i_regs - register state used to look up host register mappings.
       // NOTE(review): presumably this handles branches whose delay slot lies
       // across a page boundary (hence "pagespan") - confirm against the caller.
5777   int s1l=get_reg(i_regs->regmap,rs1[i]);
5778   int s2l=get_reg(i_regs->regmap,rs2[i]);
       // NOTE(review): 'taken' is never assigned in this function, so the
       // set_jump_target(taken, ...) call under BNEL below never executes.
5779   void *taken = NULL;
       // Location of the conditional jump emitted for branch-likely forms;
       // it is patched to the not-taken path at the end of this function.
5780   void *nottaken = NULL;
5781   int unconditional=0;
       // A comparison against r0 is emitted as a test of the other operand:
       // keep the remaining operand in s1l and mark s2l as absent (-1).
5782   if(rs1[i]==0)
5783   {
5784     s1l=s2l;
5785     s2l=-1;
5786   }
5787   else if(rs2[i]==0)
5788   {
5789     s2l=-1;
5790   }
5791   int hr=0;
5792   int addr=-1,alt=-1,ntaddr=-1;
       // Choose 'addr', the scratch host register that receives the computed
       // address: HOST_BTREG when it is currently unmapped, otherwise the first
       // host register that is not EXCLUDE_REG, not HOST_CCREG and does not
       // hold rs1[i] or rs2[i].
5793   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5794   else {
5795     while(hr<HOST_REGS)
5796     {
5797       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5798          (i_regs->regmap[hr]&63)!=rs1[i] &&
5799          (i_regs->regmap[hr]&63)!=rs2[i] )
5800       {
5801         addr=hr++;break;
5802       }
5803       hr++;
5804     }
5805   }
       // Choose 'alt', a second scratch register; the search continues from
       // the current value of hr and additionally skips HOST_BTREG.
5806   while(hr<HOST_REGS)
5807   {
5808     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5809        (i_regs->regmap[hr]&63)!=rs1[i] &&
5810        (i_regs->regmap[hr]&63)!=rs2[i] )
5811     {
5812       alt=hr++;break;
5813     }
5814     hr++;
5815   }
       // Third scratch register 'ntaddr'; only BGTZ below actually uses it.
5816   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5817   {
5818     while(hr<HOST_REGS)
5819     {
5820       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5821          (i_regs->regmap[hr]&63)!=rs1[i] &&
5822          (i_regs->regmap[hr]&63)!=rs2[i] )
5823       {
5824         ntaddr=hr;break;
5825       }
5826       hr++;
5827     }
5828   }
       // Sanity check on the scratch register search above.
5829   assert(hr<HOST_REGS);
5830   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
         // Only CCREG is requested from load_regs here.
5831     load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
5832   }
       // Add CLOCK_ADJUST(ccadj[i]+2) to the cycle counter register.
5833   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5834   if(opcode[i]==2) // J
5835   {
5836     unconditional=1;
5837   }
5838   if(opcode[i]==3) // JAL
5839   {
5840     // TODO: mini_ht
         // Link: store the return address (the instruction after the delay
         // slot, start+i*4+8) in the host register mapped to r31.
         // NOTE(review): the result of get_reg is not checked for -1.
5841     int rt=get_reg(i_regs->regmap,31);
5842     emit_movimm(start+i*4+8,rt);
5843     unconditional=1;
5844   }
5845   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
5846   {
         // Register jump: the target address is the value of rs1.
5847     emit_mov(s1l,addr);
5848     if(opcode2[i]==9) // JALR
5849     {
           // Link register is rt1[i]; get_reg result is not checked here either.
5850       int rt=get_reg(i_regs->regmap,rt1[i]);
5851       emit_movimm(start+i*4+8,rt);
5852     }
5853   }
       // For the non-likely conditional branches below, 'addr' ends up holding
       // either ba[i] (taken) or start+i*4+8 (not taken), chosen by a
       // conditional move on the comparison result.
       // NOTE(review): presumably emit_mov2imm_compact(a,ra,b,rb) loads a into
       // ra and b into rb - confirm against its definition.
5854   if((opcode[i]&0x3f)==4) // BEQ
5855   {
5856     if(rs1[i]==rs2[i])
5857     {
           // Comparing a register with itself: always taken.
5858       unconditional=1;
5859     }
5860     else
         // When HAVE_CMOV_IMM is defined the if(1) branch is always used and
         // the trailing else block is unreachable; otherwise only the trailing
         // block is compiled.
5861     #ifdef HAVE_CMOV_IMM
5862     if(1) {
5863       if(s2l>=0) emit_cmp(s1l,s2l);
5864       else emit_test(s1l,s1l);
5865       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5866     }
5867     else
5868     #endif
5869     {
5870       assert(s1l>=0);
5871       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5872       if(s2l>=0) emit_cmp(s1l,s2l);
5873       else emit_test(s1l,s1l);
5874       emit_cmovne_reg(alt,addr);
5875     }
5876   }
5877   if((opcode[i]&0x3f)==5) // BNE
5878   {
         // Same as BEQ with the two addresses swapped.
5879     #ifdef HAVE_CMOV_IMM
5880     if(s2l>=0) emit_cmp(s1l,s2l);
5881     else emit_test(s1l,s1l);
5882     emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5883     #else
5884     assert(s1l>=0);
5885     emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5886     if(s2l>=0) emit_cmp(s1l,s2l);
5887     else emit_test(s1l,s1l);
5888     emit_cmovne_reg(alt,addr);
5889     #endif
5890   }
5891   if((opcode[i]&0x3f)==0x14) // BEQL
5892   {
5893     if(s2l>=0) emit_cmp(s1l,s2l);
5894     else emit_test(s1l,s1l);
         // NOTE(review): nottaken is still NULL at this point (it is only
         // assigned below), so this set_jump_target call never executes.
5895     if(nottaken) set_jump_target(nottaken, out);
         // Remember where the conditional jump is emitted so that it can be
         // pointed at the not-taken path later.
5896     nottaken=out;
5897     emit_jne(0);
5898   }
5899   if((opcode[i]&0x3f)==0x15) // BNEL
5900   {
5901     if(s2l>=0) emit_cmp(s1l,s2l);
5902     else emit_test(s1l,s1l);
5903     nottaken=out;
5904     emit_jeq(0);
         // Never executes: 'taken' is always NULL (see its declaration).
5905     if(taken) set_jump_target(taken, out);
5906   }
5907   if((opcode[i]&0x3f)==6) // BLEZ
5908   {
         // Compares rs1 with 1 and conditionally moves alt into addr on
         // "less than".
5909     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5910     emit_cmpimm(s1l,1);
5911     emit_cmovl_reg(alt,addr);
5912   }
5913   if((opcode[i]&0x3f)==7) // BGTZ
5914   {
         // Compares rs1 with 1 and conditionally moves ntaddr into addr on
         // "less than".
5915     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5916     emit_cmpimm(s1l,1);
5917     emit_cmovl_reg(ntaddr,addr);
5918   }
       // BLEZL/BGTZL are not handled here: the asserts below always fail if
       // one of these opcodes is reached.
5919   if((opcode[i]&0x3f)==0x16) // BLEZL
5920   {
5921     assert((opcode[i]&0x3f)!=0x16);
5922   }
5923   if((opcode[i]&0x3f)==0x17) // BGTZL
5924   {
5925     assert((opcode[i]&0x3f)!=0x17);
5926   }
       // REGIMM branches (opcode 1) are not handled here either.
5927   assert(opcode[i]!=1); // BLTZ/BGEZ
5928
5929   //FIXME: Check CSREG
       // Coprocessor 1 condition branches: test bit 0x800000 of s1l; bits
       // 16-17 of the instruction word select the F/T/FL/TL variant.
5930   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5931     if((source[i]&0x30000)==0) // BC1F
5932     {
5933       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5934       emit_testimm(s1l,0x800000);
5935       emit_cmovne_reg(alt,addr);
5936     }
5937     if((source[i]&0x30000)==0x10000) // BC1T
5938     {
5939       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5940       emit_testimm(s1l,0x800000);
5941       emit_cmovne_reg(alt,addr);
5942     }
5943     if((source[i]&0x30000)==0x20000) // BC1FL
5944     {
5945       emit_testimm(s1l,0x800000);
5946       nottaken=out;
5947       emit_jne(0);
5948     }
5949     if((source[i]&0x30000)==0x30000) // BC1TL
5950     {
5951       emit_testimm(s1l,0x800000);
5952       nottaken=out;
5953       emit_jeq(0);
5954     }
5955   }
5956
       // The cycle counter must be in its host register before leaving.
5957   assert(i_regs->regmap[HOST_CCREG]==CCREG);
5958   wb_dirtys(regs[i].regmap,regs[i].dirty);
       // Leave the address to continue at after the delay slot in HOST_BTREG:
       // the constant target for likely/unconditional branches, otherwise the
       // value computed into 'addr' above.
5959   if(likely[i]||unconditional)
5960   {
5961     emit_movimm(ba[i],HOST_BTREG);
5962   }
5963   else if(addr!=HOST_BTREG)
5964   {
5965     emit_mov(addr,HOST_BTREG);
5966   }
       // Jump out of the block.  The destination is start+i*4+5, i.e. the
       // delay slot address (start+i*4+4) plus 1; pagespan_ds registers its
       // entry point under start+1 in the same way.
       // NOTE(review): presumably bit 0 distinguishes the delay-slot entry
       // from a normal entry at that address - confirm.
5967   void *branch_addr=out;
5968   emit_jmp(0);
5969   int target_addr=start+i*4+5;
5970   void *stub=out;
5971   void *compiled_target_addr=check_addr(target_addr);
5972   emit_extjump_ds(branch_addr, target_addr);
       // If check_addr returned code for the destination, jump to it directly
       // and record the link; otherwise go through the stub emitted above.
5973   if(compiled_target_addr) {
5974     set_jump_target(branch_addr, compiled_target_addr);
5975     add_link(target_addr,stub);
5976   }
5977   else set_jump_target(branch_addr, stub);
5978   if(likely[i]) {
5979     // Not-taken path
         // The conditional jump recorded in 'nottaken' lands here.  The
         // destination is start+i*4+8, the instruction after the delay slot.
5980     set_jump_target(nottaken, out);
5981     wb_dirtys(regs[i].regmap,regs[i].dirty);
5982     void *branch_addr=out;
5983     emit_jmp(0);
5984     int target_addr=start+i*4+8;
5985     void *stub=out;
5986     void *compiled_target_addr=check_addr(target_addr);
5987     emit_extjump_ds(branch_addr, target_addr);
5988     if(compiled_target_addr) {
5989       set_jump_target(branch_addr, compiled_target_addr);
5990       add_link(target_addr,stub);
5991     }
5992     else set_jump_target(branch_addr, stub);
5993   }
5994 }
5995
5996 // Assemble the delay slot for the above
     // The delay slot is instruction 0 of the block being compiled.  The code
     // emitted here is registered as an entry point under address start+1,
     // executes instruction 0, then compares the pending branch target with
     // start+4: if they are equal execution continues in this block, otherwise
     // it leaves through do_jump_vaddr with the branch target.
5997 static void pagespan_ds()
5998 {
5999   assem_debug("initial delay slot:\n");
       // start+1 is the address this entry point is registered under
       // (pagespan_assemble jumps to <delay slot address>+1).
6000   u_int vaddr=start+1;
6001   u_int page=get_page(vaddr);
6002   u_int vpage=get_vpage(vaddr);
       // Register the current output position in jump_dirty, emit the dirty
       // stub, then register the position that follows it in jump_in.
6003   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6004   do_dirty_stub_ds();
6005   ll_add(jump_in+page,vaddr,(void *)out);
       // On entry the cycle counter is expected in HOST_CCREG.
6006   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
       // If instruction 0 maps HOST_CCREG to something else, write the cycle
       // counter back first.
6007   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6008     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty);
       // Likewise save the incoming branch target from HOST_BTREG to the
       // branch_target variable when that register will not keep holding BTREG.
6009   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6010     emit_writeword(HOST_BTREG,&branch_target);
6011   load_regs(regs[0].regmap_entry,regs[0].regmap,rs1[0],rs2[0]);
6012   address_generation(0,&regs[0],regs[0].regmap_entry);
       // Stores (and opcodes matching 0x39/0x3a under mask 0x3b) also request
       // INVCP from load_regs.
6013   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6014     load_regs(regs[0].regmap_entry,regs[0].regmap,INVCP,INVCP);
6015   is_delayslot=0;
       // Dispatch to the assembler for the type of instruction 0.  There is no
       // default case; the jump/call types at the end only print a message.
6016   switch(itype[0]) {
6017     case ALU:
6018       alu_assemble(0,&regs[0]);break;
6019     case IMM16:
6020       imm16_assemble(0,&regs[0]);break;
6021     case SHIFT:
6022       shift_assemble(0,&regs[0]);break;
6023     case SHIFTIMM:
6024       shiftimm_assemble(0,&regs[0]);break;
6025     case LOAD:
6026       load_assemble(0,&regs[0]);break;
6027     case LOADLR:
6028       loadlr_assemble(0,&regs[0]);break;
6029     case STORE:
6030       store_assemble(0,&regs[0]);break;
6031     case STORELR:
6032       storelr_assemble(0,&regs[0]);break;
6033     case COP0:
6034       cop0_assemble(0,&regs[0]);break;
6035     case COP1:
6036       cop1_assemble(0,&regs[0]);break;
6037     case C1LS:
6038       c1ls_assemble(0,&regs[0]);break;
6039     case COP2:
6040       cop2_assemble(0,&regs[0]);break;
6041     case C2LS:
6042       c2ls_assemble(0,&regs[0]);break;
6043     case C2OP:
6044       c2op_assemble(0,&regs[0]);break;
6045     case MULTDIV:
6046       multdiv_assemble(0,&regs[0]);
6047       multdiv_prepare_stall(0,&regs[0]);
6048       break;
6049     case MOV:
6050       mov_assemble(0,&regs[0]);break;
6051     case SYSCALL:
6052     case HLECALL:
6053     case INTCALL:
6054     case SPAN:
6055     case UJUMP:
6056     case RJUMP:
6057     case CJUMP:
6058     case SJUMP:
6059       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6060   }
       // Locate the branch target: use the host register mapped to BTREG, or
       // reload it from branch_target into a host register mapped to -1.
       // NOTE(review): the second get_reg result is not checked for -1; only
       // the assert against HOST_CCREG follows.
6061   int btaddr=get_reg(regs[0].regmap,BTREG);
6062   if(btaddr<0) {
6063     btaddr=get_reg(regs[0].regmap,-1);
6064     emit_readword(&branch_target,btaddr);
6065   }
6066   assert(btaddr!=HOST_CCREG);
       // Bring the cycle counter back into HOST_CCREG if it was moved out.
6067   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
       // Compare the branch target with start+4, the address of instruction 1
       // of this block.  With HOST_IMM8 the constant goes through HOST_TEMPREG.
6068 #ifdef HOST_IMM8
6069   host_tempreg_acquire();
6070   emit_movimm(start+4,HOST_TEMPREG);
6071   emit_cmp(btaddr,HOST_TEMPREG);
6072   host_tempreg_release();
6073 #else
6074   emit_cmpimm(btaddr,start+4);
6075 #endif
6076   void *branch = out;
6077   emit_jeq(0);
       // Target is somewhere else: store registers and jump by address.
6078   store_regs_bt(regs[0].regmap,regs[0].dirty,-1);
6079   do_jump_vaddr(btaddr);
       // Target is start+4: the jeq above lands here and execution falls
       // through into the rest of this block.
6080   set_jump_target(branch, out);
6081   store_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
6082   load_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
6083 }
6084
6085 // Basic liveness analysis for MIPS registers
     // Walks instructions iend down to istart and fills in, for each index i:
     //   unneeded_reg[i]  - bitmask of registers whose value is not needed at
     //                      instruction i (bit n <-> register n; bit 0 is
     //                      always set),
     //   gte_unneeded[i]  - the same kind of mask built from gte_rt/gte_rs,
     //   branch_unneeded_reg[i] - for branches, the mask taken at the branch
     //                      target before the delay slot is merged in.
     // Also sets bt[] for internal branch targets and for the return address
     // of calls that write r31.
     // istart, iend - inclusive range of instruction indices to process.
     // r            - current recursion depth; backward branches recurse into
     //                the loop body only while r<2.
6086 void unneeded_registers(int istart,int iend,int r)
6087 {
6088   int i;
6089   uint64_t u,gte_u,b,gte_b;
6090   uint64_t temp_u,temp_gte_u=0;
       // Value used for the GTE mask where nothing is known about what
       // follows: 0 normally, all ones when the NDHACK_GTE_UNNEEDED hack is on.
6091   uint64_t gte_u_unknown=0;
6092   if (HACK_ENABLED(NDHACK_GTE_UNNEEDED))
6093     gte_u_unknown=~0ll;
       // Initial state after instruction iend: only bit 0 is set in u.
6094   if(iend==slen-1) {
6095     u=1;
6096     gte_u=gte_u_unknown;
6097   }else{
6098     //u=unneeded_reg[iend+1];
6099     u=1;
6100     gte_u=gte_unneeded[iend+1];
6101   }
6102
6103   for (i=iend;i>=istart;i--)
6104   {
6105     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6106     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
6107     {
6108       // If subroutine call, flag return address as a possible branch target
6109       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6110
6111       if(ba[i]<start || ba[i]>=(start+slen*4))
6112       {
6113         // Branch out of this block, flush all regs
6114         u=1;
6115         gte_u=gte_u_unknown;
6116         branch_unneeded_reg[i]=u;
6117         // Merge in delay slot
             // (registers written by instruction i+1 become unneeded, the
             // ones it reads become needed)
6118         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6119         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6120         u|=1;
6121         gte_u|=gte_rt[i+1];
6122         gte_u&=~gte_rs[i+1];
6123         // If branch is "likely" (and conditional)
6124         // then we skip the delay slot on the fall-thru path
6125         if(likely[i]) {
               // NOTE(review): the guard is i<slen-1 but index i+2 is read,
               // which equals slen when i==slen-2 - confirm that entry is
               // valid/meaningful.
6126           if(i<slen-1) {
6127             u&=unneeded_reg[i+2];
6128             gte_u&=gte_unneeded[i+2];
6129           }
6130           else
6131           {
6132             u=1;
6133             gte_u=gte_u_unknown;
6134           }
6135         }
6136       }
6137       else
6138       {
6139         // Internal branch, flag target
6140         bt[(ba[i]-start)>>2]=1;
6141         if(ba[i]<=start+i*4) {
6142           // Backward branch
               // A provisional result for instruction i is computed in
               // temp_u/temp_gte_u and stored, then the loop body
               // (target..i-1) is processed by the recursive call below.
6143           if(is_ujump(i))
6144           {
6145             // Unconditional branch
6146             temp_u=1;
6147             temp_gte_u=0;
6148           } else {
6149             // Conditional branch (not taken case)
6150             temp_u=unneeded_reg[i+2];
                 // NOTE(review): unlike temp_u this uses '&=', so the result
                 // is intersected with whatever temp_gte_u held before
                 // (initially 0) - confirm this is intended.
6151             temp_gte_u&=gte_unneeded[i+2];
6152           }
6153           // Merge in delay slot
6154           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6155           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6156           temp_u|=1;
6157           temp_gte_u|=gte_rt[i+1];
6158           temp_gte_u&=~gte_rs[i+1];
6159           // If branch is "likely" (and conditional)
6160           // then we skip the delay slot on the fall-thru path
6161           if(likely[i]) {
6162             if(i<slen-1) {
6163               temp_u&=unneeded_reg[i+2];
6164               temp_gte_u&=gte_unneeded[i+2];
6165             }
6166             else
6167             {
6168               temp_u=1;
6169               temp_gte_u=gte_u_unknown;
6170             }
6171           }
               // Apply the branch instruction itself (its writes and reads).
6172           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6173           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6174           temp_u|=1;
6175           temp_gte_u|=gte_rt[i];
6176           temp_gte_u&=~gte_rs[i];
6177           unneeded_reg[i]=temp_u;
6178           gte_unneeded[i]=temp_gte_u;
6179           // Only go three levels deep.  This recursion can take an
6180           // excessive amount of time if there are a lot of nested loops.
6181           if(r<2) {
6182             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6183           }else{
                 // Recursion limit reached: overwrite the entry at the branch
                 // target with a fixed value instead of analysing the loop.
6184             unneeded_reg[(ba[i]-start)>>2]=1;
6185             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6186           }
             // The block below runs for every internal branch, backward
             // ones included (the 'else' is commented out).
6187         } /*else*/ if(1) {
6188           if (is_ujump(i))
6189           {
6190             // Unconditional branch
                 // Start from the state recorded at the branch target.
6191             u=unneeded_reg[(ba[i]-start)>>2];
6192             gte_u=gte_unneeded[(ba[i]-start)>>2];
6193             branch_unneeded_reg[i]=u;
6194             // Merge in delay slot
6195             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6196             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6197             u|=1;
6198             gte_u|=gte_rt[i+1];
6199             gte_u&=~gte_rs[i+1];
6200           } else {
6201             // Conditional branch
                 // b/gte_b describe the taken path; they are intersected
                 // with the fall-through state so that a register counts as
                 // unneeded only if it is unneeded on both paths.
6202             b=unneeded_reg[(ba[i]-start)>>2];
6203             gte_b=gte_unneeded[(ba[i]-start)>>2];
6204             branch_unneeded_reg[i]=b;
6205             // Branch delay slot
6206             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6207             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6208             b|=1;
6209             gte_b|=gte_rt[i+1];
6210             gte_b&=~gte_rs[i+1];
6211             // If branch is "likely" then we skip the
6212             // delay slot on the fall-thru path
6213             if(likely[i]) {
6214               u=b;
6215               gte_u=gte_b;
6216               if(i<slen-1) {
6217                 u&=unneeded_reg[i+2];
6218                 gte_u&=gte_unneeded[i+2];
6219               }
6220             } else {
6221               u&=b;
6222               gte_u&=gte_b;
6223             }
6224             if(i<slen-1) {
6225               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6226             } else {
6227               branch_unneeded_reg[i]=1;
6228             }
6229           }
6230         }
6231       }
6232     }
6233     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6234     {
6235       // SYSCALL instruction (software interrupt)
           // Everything is treated as needed (gte_u is left unchanged).
6236       u=1;
6237     }
6238     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6239     {
6240       // ERET instruction (return from interrupt)
6241       u=1;
6242     }
6243     //u=1; // DEBUG
6244     // Written registers are unneeded
6245     u|=1LL<<rt1[i];
6246     u|=1LL<<rt2[i];
6247     gte_u|=gte_rt[i];
6248     // Accessed registers are needed
6249     u&=~(1LL<<rs1[i]);
6250     u&=~(1LL<<rs2[i]);
6251     gte_u&=~gte_rs[i];
         // NOTE(review): this reads unneeded_reg[i+1]/gte_unneeded[i+1] for
         // every i, including i==slen-1 - confirm index slen is valid there.
6252     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
6253       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
6254     // Source-target dependencies
6255     // R0 is always unneeded
6256     u|=1;
6257     // Save it
6258     unneeded_reg[i]=u;
6259     gte_unneeded[i]=gte_u;
6260     /*
6261     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6262     printf("U:");
6263     int r;
6264     for(r=1;r<=CCREG;r++) {
6265       if((unneeded_reg[i]>>r)&1) {
6266         if(r==HIREG) printf(" HI");
6267         else if(r==LOREG) printf(" LO");
6268         else printf(" r%d",r);
6269       }
6270     }
6271     printf("\n");
6272     */
6273   }
6274 }
6275
6276 // Write back dirty registers as soon as we will no longer modify them,
6277 // so that we don't end up with lots of writes at the branches.
6278 void clean_registers(int istart,int iend,int wr)
6279 {
6280   int i;
6281   int r;
6282   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6283   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6284   if(iend==slen-1) {
6285     will_dirty_i=will_dirty_next=0;
6286     wont_dirty_i=wont_dirty_next=0;
6287   }else{
6288     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6289     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6290   }
6291   for (i=iend;i>=istart;i--)
6292   {
6293     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
6294     {
6295       if(ba[i]<start || ba[i]>=(start+slen*4))
6296       {
6297         // Branch out of this block, flush all regs
6298         if (is_ujump(i))
6299         {
6300           // Unconditional branch
6301           will_dirty_i=0;
6302           wont_dirty_i=0;
6303           // Merge in delay slot (will dirty)
6304           for(r=0;r<HOST_REGS;r++) {
6305             if(r!=EXCLUDE_REG) {
6306               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6307               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6308               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6309               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6310               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6311               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6312               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6313               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6314               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6315               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6316               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6317               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6318               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6319               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6320             }
6321           }
6322         }
6323         else
6324         {
6325           // Conditional branch
6326           will_dirty_i=0;
6327           wont_dirty_i=wont_dirty_next;
6328           // Merge in delay slot (will dirty)
6329           for(r=0;r<HOST_REGS;r++) {
6330             if(r!=EXCLUDE_REG) {
6331               if(!likely[i]) {
6332                 // Might not dirty if likely branch is not taken
6333                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6334                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6335                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6336                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6337                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6338                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6339                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6340                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6341                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6342                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6343                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6344                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6345                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6346                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6347               }
6348             }
6349           }
6350         }
6351         // Merge in delay slot (wont dirty)
6352         for(r=0;r<HOST_REGS;r++) {
6353           if(r!=EXCLUDE_REG) {
6354             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6355             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6356             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6357             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6358             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6359             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6360             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6361             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6362             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6363             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6364           }
6365         }
6366         if(wr) {
6367           #ifndef DESTRUCTIVE_WRITEBACK
6368           branch_regs[i].dirty&=wont_dirty_i;
6369           #endif
6370           branch_regs[i].dirty|=will_dirty_i;
6371         }
6372       }
6373       else
6374       {
6375         // Internal branch
6376         if(ba[i]<=start+i*4) {
6377           // Backward branch
6378           if (is_ujump(i))
6379           {
6380             // Unconditional branch
6381             temp_will_dirty=0;
6382             temp_wont_dirty=0;
6383             // Merge in delay slot (will dirty)
6384             for(r=0;r<HOST_REGS;r++) {
6385               if(r!=EXCLUDE_REG) {
6386                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6387                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6388                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6389                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6390                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6391                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6392                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6393                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6394                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6395                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6396                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6397                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6398                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6399                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6400               }
6401             }
6402           } else {
6403             // Conditional branch (not taken case)
6404             temp_will_dirty=will_dirty_next;
6405             temp_wont_dirty=wont_dirty_next;
6406             // Merge in delay slot (will dirty)
6407             for(r=0;r<HOST_REGS;r++) {
6408               if(r!=EXCLUDE_REG) {
6409                 if(!likely[i]) {
6410                   // Will not dirty if likely branch is not taken
6411                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6412                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6413                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6414                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6415                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6416                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6417                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6418                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6419                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6420                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6421                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6422                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6423                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6424                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6425                 }
6426               }
6427             }
6428           }
6429           // Merge in delay slot (wont dirty)
6430           for(r=0;r<HOST_REGS;r++) {
6431             if(r!=EXCLUDE_REG) {
6432               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6433               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6434               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6435               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6436               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6437               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6438               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6439               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6440               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6441               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6442             }
6443           }
6444           // Deal with changed mappings
6445           if(i<iend) {
6446             for(r=0;r<HOST_REGS;r++) {
6447               if(r!=EXCLUDE_REG) {
6448                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6449                   temp_will_dirty&=~(1<<r);
6450                   temp_wont_dirty&=~(1<<r);
6451                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6452                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6453                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6454                   } else {
6455                     temp_will_dirty|=1<<r;
6456                     temp_wont_dirty|=1<<r;
6457                   }
6458                 }
6459               }
6460             }
6461           }
6462           if(wr) {
6463             will_dirty[i]=temp_will_dirty;
6464             wont_dirty[i]=temp_wont_dirty;
6465             clean_registers((ba[i]-start)>>2,i-1,0);
6466           }else{
6467             // Limit recursion.  It can take an excessive amount
6468             // of time if there are a lot of nested loops.
6469             will_dirty[(ba[i]-start)>>2]=0;
6470             wont_dirty[(ba[i]-start)>>2]=-1;
6471           }
6472         }
6473         /*else*/ if(1)
6474         {
6475           if (is_ujump(i))
6476           {
6477             // Unconditional branch
6478             will_dirty_i=0;
6479             wont_dirty_i=0;
6480           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6481             for(r=0;r<HOST_REGS;r++) {
6482               if(r!=EXCLUDE_REG) {
6483                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6484                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6485                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6486                 }
6487                 if(branch_regs[i].regmap[r]>=0) {
6488                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6489                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6490                 }
6491               }
6492             }
6493           //}
6494             // Merge in delay slot
6495             for(r=0;r<HOST_REGS;r++) {
6496               if(r!=EXCLUDE_REG) {
6497                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6498                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6499                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6500                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6501                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6502                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6503                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6504                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6505                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6506                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6507                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6508                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6509                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6510                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6511               }
6512             }
6513           } else {
6514             // Conditional branch
6515             will_dirty_i=will_dirty_next;
6516             wont_dirty_i=wont_dirty_next;
6517           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6518             for(r=0;r<HOST_REGS;r++) {
6519               if(r!=EXCLUDE_REG) {
6520                 signed char target_reg=branch_regs[i].regmap[r];
6521                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6522                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6523                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6524                 }
6525                 else if(target_reg>=0) {
6526                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6527                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6528                 }
6529                 // Treat delay slot as part of branch too
6530                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6531                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6532                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6533                 }
6534                 else
6535                 {
6536                   will_dirty[i+1]&=~(1<<r);
6537                 }*/
6538               }
6539             }
6540           //}
6541             // Merge in delay slot
6542             for(r=0;r<HOST_REGS;r++) {
6543               if(r!=EXCLUDE_REG) {
6544                 if(!likely[i]) {
6545                   // Might not dirty if likely branch is not taken
6546                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6547                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6548                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6549                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6550                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6551                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6552                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6553                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6554                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6555                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6556                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6557                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6558                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6559                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6560                 }
6561               }
6562             }
6563           }
6564           // Merge in delay slot (won't dirty)
6565           for(r=0;r<HOST_REGS;r++) {
6566             if(r!=EXCLUDE_REG) {
6567               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6568               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6569               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6570               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6571               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6572               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6573               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6574               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6575               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6576               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6577             }
6578           }
6579           if(wr) {
6580             #ifndef DESTRUCTIVE_WRITEBACK
6581             branch_regs[i].dirty&=wont_dirty_i;
6582             #endif
6583             branch_regs[i].dirty|=will_dirty_i;
6584           }
6585         }
6586       }
6587     }
6588     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6589     {
6590       // SYSCALL instruction (software interrupt)
6591       will_dirty_i=0;
6592       wont_dirty_i=0;
6593     }
6594     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6595     {
6596       // ERET instruction (return from interrupt)
6597       will_dirty_i=0;
6598       wont_dirty_i=0;
6599     }
6600     will_dirty_next=will_dirty_i;
6601     wont_dirty_next=wont_dirty_i;
6602     for(r=0;r<HOST_REGS;r++) {
6603       if(r!=EXCLUDE_REG) {
6604         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6605         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6606         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6607         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6608         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6609         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6610         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6611         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6612         if(i>istart) {
6613           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP)
6614           {
6615             // Don't store a register immediately after writing it,
6616             // may prevent dual-issue.
6617             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6618             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6619           }
6620         }
6621       }
6622     }
6623     // Save it
6624     will_dirty[i]=will_dirty_i;
6625     wont_dirty[i]=wont_dirty_i;
6626     // Mark registers that won't be dirtied as not dirty
6627     if(wr) {
6628       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6629       for(r=0;r<HOST_REGS;r++) {
6630         if((will_dirty_i>>r)&1) {
6631           printf(" r%d",r);
6632         }
6633       }
6634       printf("\n");*/
6635
6636       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP)) {
6637         regs[i].dirty|=will_dirty_i;
6638         #ifndef DESTRUCTIVE_WRITEBACK
6639         regs[i].dirty&=wont_dirty_i;
6640         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
6641         {
6642           if (i < iend-1 && !is_ujump(i)) {
6643             for(r=0;r<HOST_REGS;r++) {
6644               if(r!=EXCLUDE_REG) {
6645                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6646                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6647                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6648               }
6649             }
6650           }
6651         }
6652         else
6653         {
6654           if(i<iend) {
6655             for(r=0;r<HOST_REGS;r++) {
6656               if(r!=EXCLUDE_REG) {
6657                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6658                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6659                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6660               }
6661             }
6662           }
6663         }
6664         #endif
6665       //}
6666     }
6667     // Deal with changed mappings
6668     temp_will_dirty=will_dirty_i;
6669     temp_wont_dirty=wont_dirty_i;
6670     for(r=0;r<HOST_REGS;r++) {
6671       if(r!=EXCLUDE_REG) {
6672         int nr;
6673         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6674           if(wr) {
6675             #ifndef DESTRUCTIVE_WRITEBACK
6676             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6677             #endif
6678             regs[i].wasdirty|=will_dirty_i&(1<<r);
6679           }
6680         }
6681         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6682           // Register moved to a different register
6683           will_dirty_i&=~(1<<r);
6684           wont_dirty_i&=~(1<<r);
6685           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6686           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6687           if(wr) {
6688             #ifndef DESTRUCTIVE_WRITEBACK
6689             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6690             #endif
6691             regs[i].wasdirty|=will_dirty_i&(1<<r);
6692           }
6693         }
6694         else {
6695           will_dirty_i&=~(1<<r);
6696           wont_dirty_i&=~(1<<r);
6697           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6698             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6699             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6700           } else {
6701             wont_dirty_i|=1<<r;
6702             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
6703           }
6704         }
6705       }
6706     }
6707   }
6708 }
6709
#ifdef DISASM
/*
 * Print a one-line disassembly of decoded instruction i to stdout.
 *
 * The line starts with '*' when bt[i] is set and ' ' otherwise, followed by
 * the instruction address (start + i*4, in hex), the mnemonic from insn[i]
 * and the operands, whose layout is selected by itype[i].
 *
 * Only built when DISASM is defined; otherwise an empty static stub with the
 * same name is provided so callers need no conditional code.
 */
void disassemble_inst(int i)
{
    if (bt[i]) printf("*"); else printf(" ");
    switch(itype[i]) {
      case UJUMP:
        printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
      case CJUMP:
        // Target is the delay slot address plus the sign-extended 16-bit
        // offset scaled by 4; for i == 0 the stored ba[0] is printed instead.
        printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
      case SJUMP:
        printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
      case RJUMP:
        // JR and JALR share primary opcode 0 and are told apart by the
        // function code kept in opcode2[] (JALR == 0x9). Testing opcode[]
        // here could never match, so "JALR rd,rs" was never printed.
        if (opcode2[i]==0x9&&rt1[i]!=31)
          printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
        else
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        break;
      case SPAN:
        printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
      case IMM16:
        if(opcode[i]==0xf) //LUI
          printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
        else
          printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case LOAD:
      case LOADLR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case STORE:
      case STORELR:
        // Stores print the value register (rs2) first, then base+offset.
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
        break;
      case ALU:
      case SHIFT:
        printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
        break;
      case MULTDIV:
        printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
        break;
      case SHIFTIMM:
        printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case MOV:
        if((opcode2[i]&0x1d)==0x10) // MFHI/MFLO: show destination
          printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
        else if((opcode2[i]&0x1d)==0x11) // MTHI/MTLO: show source
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        else
          printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP0:
        if(opcode2[i]==0)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
        else if(opcode2[i]==4)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP1:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP2:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case C1LS:
        printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case C2LS:
        printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case INTCALL:
        printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
        break;
      default:
        // Anything else: address and mnemonic only.
        printf (" %x: %s\n",start+i*4,insn[i]);
        break;
    }
}
#else
/* Disassembly disabled at build time: keep the call sites valid, do nothing. */
static void disassemble_inst(int i) {}
#endif // DISASM
6800
6801 #define DRC_TEST_VAL 0x74657374
6802
6803 static void new_dynarec_test(void)
6804 {
6805   int (*testfunc)(void);
6806   void *beginning;
6807   int ret[2];
6808   size_t i;
6809
6810   // check structure linkage
6811   if ((u_char *)rcnts - (u_char *)&psxRegs != sizeof(psxRegs))
6812   {
6813     SysPrintf("linkage_arm* miscompilation/breakage detected.\n");
6814   }
6815
6816   SysPrintf("testing if we can run recompiled code...\n");
6817   ((volatile u_int *)out)[0]++; // make cache dirty
6818
6819   for (i = 0; i < ARRAY_SIZE(ret); i++) {
6820     out = ndrc->translation_cache;
6821     beginning = start_block();
6822     emit_movimm(DRC_TEST_VAL + i, 0); // test
6823     emit_ret();
6824     literal_pool(0);
6825     end_block(beginning);
6826     testfunc = beginning;
6827     ret[i] = testfunc();
6828   }
6829
6830   if (ret[0] == DRC_TEST_VAL && ret[1] == DRC_TEST_VAL + 1)
6831     SysPrintf("test passed.\n");
6832   else
6833     SysPrintf("test failed, will likely crash soon (r=%08x %08x)\n", ret[0], ret[1]);
6834   out = ndrc->translation_cache;
6835 }
6836
6837 // clear the state completely, instead of just marking
6838 // things invalid like invalidate_all_pages() does
6839 void new_dynarec_clear_full(void)
6840 {
6841   int n;
6842   out = ndrc->translation_cache;
6843   memset(invalid_code,1,sizeof(invalid_code));
6844   memset(hash_table,0xff,sizeof(hash_table));
6845   memset(mini_ht,-1,sizeof(mini_ht));
6846   memset(restore_candidate,0,sizeof(restore_candidate));
6847   memset(shadow,0,sizeof(shadow));
6848   copy=shadow;
6849   expirep=16384; // Expiry pointer, +2 blocks
6850   pending_exception=0;
6851   literalcount=0;
6852   stop_after_jal=0;
6853   inv_code_start=inv_code_end=~0;
6854   // TLB
6855   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6856   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6857   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6858
6859   cycle_multiplier_old = cycle_multiplier;
6860   new_dynarec_hacks_old = new_dynarec_hacks;
6861 }
6862
/*
 * One-time initialisation of the dynamic recompiler.
 *
 * Makes the ndrc area usable for generated code (allocating it when
 * BASE_ADDR_DYNAMIC is defined, otherwise changing its protection unless
 * NO_WRITE_EXEC is defined), resets all recompiler state, runs the
 * architecture-specific setup and the start-up self test, and finally warns
 * when ram_offset is non-zero.
 */
6863 void new_dynarec_init(void)
6864 {
6865   SysPrintf("Init new dynarec\n");
6866
6867 #ifdef BASE_ADDR_DYNAMIC
6868   #ifdef VITA
// NOTE(review): on this path failures are only logged; execution continues
// and ndrc is dereferenced below - confirm that this is intended.
6869   sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
6870   if (sceBlock < 0)
6871     SysPrintf("sceKernelAllocMemBlockForVM failed\n");
6872   int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&ndrc);
6873   if (ret < 0)
6874     SysPrintf("sceKernelGetMemBlockBase failed\n");
6875   #else
// Address hint for mmap(); 0 lets the kernel choose.
6876   uintptr_t desired_addr = 0;
6877   #ifdef __ELF__
// Hint: the end of the program image (_end) rounded up to a 16 MiB boundary.
// Presumably this keeps the cache close to the executable - TODO confirm.
6878   extern char _end;
6879   desired_addr = ((uintptr_t)&_end + 0xffffff) & ~0xffffffl;
6880   #endif
// MAP_FIXED is not used, so desired_addr is only a hint.
6881   ndrc = mmap((void *)desired_addr, sizeof(*ndrc),
6882             PROT_READ | PROT_WRITE | PROT_EXEC,
6883             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
6884   if (ndrc == MAP_FAILED) {
6885     SysPrintf("mmap() failed: %s\n", strerror(errno));
6886     abort();
6887   }
6888   #endif
6889 #else
6890   #ifndef NO_WRITE_EXEC
6891   // not all systems allow execute in data segment by default
// A failure here is reported but not fatal.
6892   if (mprotect(ndrc, sizeof(ndrc->translation_cache) + sizeof(ndrc->tramp.ops),
6893                PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
6894     SysPrintf("mprotect() failed: %s\n", strerror(errno));
6895   #endif
6896 #endif
// Start emitting at the beginning of the cache and set the default cycle
// multiplier before the full state reset records it.
6897   out = ndrc->translation_cache;
6898   cycle_multiplier=200;
6899   new_dynarec_clear_full();
6900 #ifdef HOST_IMM8
6901   // Copy this into local area so we don't have to put it in every literal pool
6902   invc_ptr=invalid_code;
6903 #endif
6904   arch_init();
6905   new_dynarec_test();
6906 #ifndef RAM_FIXED
// Difference between the host address of rdram and guest address 0x80000000.
6907   ram_offset=(uintptr_t)rdram-0x80000000;
6908 #endif
6909   if (ram_offset!=0)
6910     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
6911 }
6912
6913 void new_dynarec_cleanup(void)
6914 {
6915   int n;
6916 #ifdef BASE_ADDR_DYNAMIC
6917   #ifdef VITA
6918   sceKernelFreeMemBlock(sceBlock);
6919   sceBlock = -1;
6920   #else
6921   if (munmap(ndrc, sizeof(*ndrc)) < 0)
6922     SysPrintf("munmap() failed\n");
6923   #endif
6924 #endif
6925   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6926   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6927   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6928   #ifdef ROM_COPY
6929   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
6930   #endif
6931 }
6932
6933 static u_int *get_source_start(u_int addr, u_int *limit)
6934 {
6935   if (!HACK_ENABLED(NDHACK_OVERRIDE_CYCLE_M))
6936     cycle_multiplier_override = 0;
6937
6938   if (addr < 0x00200000 ||
6939     (0xa0000000 <= addr && addr < 0xa0200000))
6940   {
6941     // used for BIOS calls mostly?
6942     *limit = (addr&0xa0000000)|0x00200000;
6943     return (u_int *)(rdram + (addr&0x1fffff));
6944   }
6945   else if (!Config.HLE && (
6946     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
6947     (0xbfc00000 <= addr && addr < 0xbfc80000)))
6948   {
6949     // BIOS. The multiplier should be much higher as it's uncached 8bit mem,
6950     // but timings in PCSX are too tied to the interpreter's BIAS
6951     if (!HACK_ENABLED(NDHACK_OVERRIDE_CYCLE_M))
6952       cycle_multiplier_override = 200;
6953
6954     *limit = (addr & 0xfff00000) | 0x80000;
6955     return (u_int *)((u_char *)psxR + (addr&0x7ffff));
6956   }
6957   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
6958     *limit = (addr & 0x80600000) + 0x00200000;
6959     return (u_int *)(rdram + (addr&0x1fffff));
6960   }
6961   return NULL;
6962 }
6963
6964 static u_int scan_for_ret(u_int addr)
6965 {
6966   u_int limit = 0;
6967   u_int *mem;
6968
6969   mem = get_source_start(addr, &limit);
6970   if (mem == NULL)
6971     return addr;
6972
6973   if (limit > addr + 0x1000)
6974     limit = addr + 0x1000;
6975   for (; addr < limit; addr += 4, mem++) {
6976     if (*mem == 0x03e00008) // jr $ra
6977       return addr + 8;
6978   }
6979   return addr;
6980 }
6981
/* One compiled-block record as stored in a savestate. */
struct savestate_block {
  uint32_t addr;      // guest address of the block
  uint32_t regflags;  // per-register flag bits saved alongside the block
};

/*
 * qsort() comparator ordering savestate_block entries by ascending addr.
 *
 * Returns a negative, zero or positive value as required by qsort().
 * The addresses are compared explicitly rather than subtracted: a 32-bit
 * unsigned difference converted to int has the wrong sign whenever the two
 * addresses are 2^31 or more apart (e.g. 0x00001000 vs 0xa0001000).
 */
static int addr_cmp(const void *p1_, const void *p2_)
{
  const struct savestate_block *p1 = p1_, *p2 = p2_;
  return (p1->addr > p2->addr) - (p1->addr < p2->addr);
}
6992
6993 int new_dynarec_save_blocks(void *save, int size)
6994 {
6995   struct savestate_block *blocks = save;
6996   int maxcount = size / sizeof(blocks[0]);
6997   struct savestate_block tmp_blocks[1024];
6998   struct ll_entry *head;
6999   int p, s, d, o, bcnt;
7000   u_int addr;
7001
7002   o = 0;
7003   for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
7004     bcnt = 0;
7005     for (head = jump_in[p]; head != NULL; head = head->next) {
7006       tmp_blocks[bcnt].addr = head->vaddr;
7007       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
7008       bcnt++;
7009     }
7010     if (bcnt < 1)
7011       continue;
7012     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
7013
7014     addr = tmp_blocks[0].addr;
7015     for (s = d = 0; s < bcnt; s++) {
7016       if (tmp_blocks[s].addr < addr)
7017         continue;
7018       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
7019         tmp_blocks[d++] = tmp_blocks[s];
7020       addr = scan_for_ret(tmp_blocks[s].addr);
7021     }
7022
7023     if (o + d > maxcount)
7024       d = maxcount - o;
7025     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
7026     o += d;
7027   }
7028
7029   return o * sizeof(blocks[0]);
7030 }
7031
7032 void new_dynarec_load_blocks(const void *save, int size)
7033 {
7034   const struct savestate_block *blocks = save;
7035   int count = size / sizeof(blocks[0]);
7036   u_int regs_save[32];
7037   uint32_t f;
7038   int i, b;
7039
7040   get_addr(psxRegs.pc);
7041
7042   // change GPRs for speculation to at least partially work..
7043   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
7044   for (i = 1; i < 32; i++)
7045     psxRegs.GPR.r[i] = 0x80000000;
7046
7047   for (b = 0; b < count; b++) {
7048     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7049       if (f & 1)
7050         psxRegs.GPR.r[i] = 0x1f800000;
7051     }
7052
7053     get_addr(blocks[b].addr);
7054
7055     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7056       if (f & 1)
7057         psxRegs.GPR.r[i] = 0x80000000;
7058     }
7059   }
7060
7061   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
7062 }
7063
7064 int new_recompile_block(u_int addr)
7065 {
7066   u_int pagelimit = 0;
7067   u_int state_rflags = 0;
7068   int i;
7069
7070   assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out);
7071   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7072   //if(debug)
7073   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7074
7075   // this is just for speculation
7076   for (i = 1; i < 32; i++) {
7077     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
7078       state_rflags |= 1 << i;
7079   }
7080
7081   start = (u_int)addr&~3;
7082   //assert(((u_int)addr&1)==0); // start-in-delay-slot flag
7083   new_dynarec_did_compile=1;
7084   if (Config.HLE && start == 0x80001000) // hlecall
7085   {
7086     // XXX: is this enough? Maybe check hleSoftCall?
7087     void *beginning=start_block();
7088     u_int page=get_page(start);
7089
7090     invalid_code[start>>12]=0;
7091     emit_movimm(start,0);
7092     emit_writeword(0,&pcaddr);
7093     emit_far_jump(new_dyna_leave);
7094     literal_pool(0);
7095     end_block(beginning);
7096     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
7097     return 0;
7098   }
7099
7100   source = get_source_start(start, &pagelimit);
7101   if (source == NULL) {
7102     SysPrintf("Compile at bogus memory address: %08x\n", addr);
7103     abort();
7104   }
7105
7106   /* Pass 1: disassemble */
7107   /* Pass 2: register dependencies, branch targets */
7108   /* Pass 3: register allocation */
7109   /* Pass 4: branch dependencies */
7110   /* Pass 5: pre-alloc */
7111   /* Pass 6: optimize clean/dirty state */
7112   /* Pass 7: flag 32-bit registers */
7113   /* Pass 8: assembly */
7114   /* Pass 9: linker */
7115   /* Pass 10: garbage collection / free memory */
7116
7117   int j;
7118   int done=0;
7119   unsigned int type,op,op2;
7120
7121   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7122
7123   /* Pass 1 disassembly */
7124
7125   for(i=0;!done;i++) {
7126     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7127     minimum_free_regs[i]=0;
7128     opcode[i]=op=source[i]>>26;
7129     switch(op)
7130     {
7131       case 0x00: strcpy(insn[i],"special"); type=NI;
7132         op2=source[i]&0x3f;
7133         switch(op2)
7134         {
7135           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7136           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7137           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7138           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7139           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7140           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7141           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7142           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7143           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7144           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7145           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7146           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7147           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7148           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7149           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7150           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7151           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7152           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7153           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7154           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7155           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7156           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7157           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7158           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7159           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7160           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7161           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7162           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7163           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7164           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7165           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7166           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7167           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7168           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7169           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7170 #if 0
7171           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7172           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7173           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7174           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7175           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7176           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7177           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7178           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7179           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7180           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7181           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7182           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7183           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7184           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7185           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7186           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7187           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7188 #endif
7189         }
7190         break;
7191       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7192         op2=(source[i]>>16)&0x1f;
7193         switch(op2)
7194         {
7195           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7196           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7197           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7198           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7199           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7200           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7201           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7202           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7203           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7204           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7205           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7206           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7207           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7208           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7209         }
7210         break;
7211       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7212       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7213       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7214       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7215       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7216       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7217       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7218       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7219       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7220       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7221       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7222       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7223       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7224       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7225       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7226         op2=(source[i]>>21)&0x1f;
7227         switch(op2)
7228         {
7229           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7230           case 0x02: strcpy(insn[i],"CFC0"); type=COP0; break;
7231           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7232           case 0x06: strcpy(insn[i],"CTC0"); type=COP0; break;
7233           case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7234         }
7235         break;
7236       case 0x11: strcpy(insn[i],"cop1"); type=COP1;
7237         op2=(source[i]>>21)&0x1f;
7238         break;
7239 #if 0
7240       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7241       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7242       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7243       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7244       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7245       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7246       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7247       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7248 #endif
7249       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7250       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7251       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7252       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7253       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7254       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7255       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7256 #if 0
7257       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7258 #endif
7259       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7260       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7261       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7262       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7263 #if 0
7264       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7265       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7266 #endif
7267       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7268       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7269       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7270       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7271 #if 0
7272       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7273       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7274       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7275 #endif
7276       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7277       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7278 #if 0
7279       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7280       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7281       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7282 #endif
7283       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7284         op2=(source[i]>>21)&0x1f;
7285         //if (op2 & 0x10)
7286         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7287           if (gte_handlers[source[i]&0x3f]!=NULL) {
7288             if (gte_regnames[source[i]&0x3f]!=NULL)
7289               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7290             else
7291               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7292             type=C2OP;
7293           }
7294         }
7295         else switch(op2)
7296         {
7297           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7298           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7299           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7300           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7301         }
7302         break;
7303       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7304       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7305       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7306       default: strcpy(insn[i],"???"); type=NI;
7307         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7308         break;
7309     }
7310     itype[i]=type;
7311     opcode2[i]=op2;
7312     /* Get registers/immediates */
7313     lt1[i]=0;
7314     dep1[i]=0;
7315     dep2[i]=0;
7316     gte_rs[i]=gte_rt[i]=0;
7317     switch(type) {
7318       case LOAD:
7319         rs1[i]=(source[i]>>21)&0x1f;
7320         rs2[i]=0;
7321         rt1[i]=(source[i]>>16)&0x1f;
7322         rt2[i]=0;
7323         imm[i]=(short)source[i];
7324         break;
7325       case STORE:
7326       case STORELR:
7327         rs1[i]=(source[i]>>21)&0x1f;
7328         rs2[i]=(source[i]>>16)&0x1f;
7329         rt1[i]=0;
7330         rt2[i]=0;
7331         imm[i]=(short)source[i];
7332         break;
7333       case LOADLR:
7334         // LWL/LWR only load part of the register,
7335         // therefore the target register must be treated as a source too
7336         rs1[i]=(source[i]>>21)&0x1f;
7337         rs2[i]=(source[i]>>16)&0x1f;
7338         rt1[i]=(source[i]>>16)&0x1f;
7339         rt2[i]=0;
7340         imm[i]=(short)source[i];
7341         if(op==0x26) dep1[i]=rt1[i]; // LWR
7342         break;
7343       case IMM16:
7344         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7345         else rs1[i]=(source[i]>>21)&0x1f;
7346         rs2[i]=0;
7347         rt1[i]=(source[i]>>16)&0x1f;
7348         rt2[i]=0;
7349         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7350           imm[i]=(unsigned short)source[i];
7351         }else{
7352           imm[i]=(short)source[i];
7353         }
7354         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7355         break;
7356       case UJUMP:
7357         rs1[i]=0;
7358         rs2[i]=0;
7359         rt1[i]=0;
7360         rt2[i]=0;
7361         // The JAL instruction writes to r31.
7362         if (op&1) {
7363           rt1[i]=31;
7364         }
7365         rs2[i]=CCREG;
7366         break;
7367       case RJUMP:
7368         rs1[i]=(source[i]>>21)&0x1f;
7369         rs2[i]=0;
7370         rt1[i]=0;
7371         rt2[i]=0;
7372         // The JALR instruction writes to rd.
7373         if (op2&1) {
7374           rt1[i]=(source[i]>>11)&0x1f;
7375         }
7376         rs2[i]=CCREG;
7377         break;
7378       case CJUMP:
7379         rs1[i]=(source[i]>>21)&0x1f;
7380         rs2[i]=(source[i]>>16)&0x1f;
7381         rt1[i]=0;
7382         rt2[i]=0;
7383         if(op&2) { // BGTZ/BLEZ
7384           rs2[i]=0;
7385         }
7386         likely[i]=op>>4;
7387         break;
7388       case SJUMP:
7389         rs1[i]=(source[i]>>21)&0x1f;
7390         rs2[i]=CCREG;
7391         rt1[i]=0;
7392         rt2[i]=0;
7393         if(op2&0x10) { // BxxAL
7394           rt1[i]=31;
7395           // NOTE: If the branch is not taken, r31 is still overwritten
7396         }
7397         likely[i]=(op2&2)>>1;
7398         break;
7399       case ALU:
7400         rs1[i]=(source[i]>>21)&0x1f; // source
7401         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7402         rt1[i]=(source[i]>>11)&0x1f; // destination
7403         rt2[i]=0;
7404         if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7405           dep1[i]=rs1[i];dep2[i]=rs2[i];
7406         }
7407         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7408           dep1[i]=rs1[i];dep2[i]=rs2[i];
7409         }
7410         break;
7411       case MULTDIV:
7412         rs1[i]=(source[i]>>21)&0x1f; // source
7413         rs2[i]=(source[i]>>16)&0x1f; // divisor
7414         rt1[i]=HIREG;
7415         rt2[i]=LOREG;
7416         break;
7417       case MOV:
7418         rs1[i]=0;
7419         rs2[i]=0;
7420         rt1[i]=0;
7421         rt2[i]=0;
7422         if(op2==0x10) rs1[i]=HIREG; // MFHI
7423         if(op2==0x11) rt1[i]=HIREG; // MTHI
7424         if(op2==0x12) rs1[i]=LOREG; // MFLO
7425         if(op2==0x13) rt1[i]=LOREG; // MTLO
7426         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7427         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7428         dep1[i]=rs1[i];
7429         break;
7430       case SHIFT:
7431         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7432         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7433         rt1[i]=(source[i]>>11)&0x1f; // destination
7434         rt2[i]=0;
7435         break;
7436       case SHIFTIMM:
7437         rs1[i]=(source[i]>>16)&0x1f;
7438         rs2[i]=0;
7439         rt1[i]=(source[i]>>11)&0x1f;
7440         rt2[i]=0;
7441         imm[i]=(source[i]>>6)&0x1f;
7442         // DSxx32 instructions
7443         if(op2>=0x3c) imm[i]|=0x20;
7444         break;
7445       case COP0:
7446         rs1[i]=0;
7447         rs2[i]=0;
7448         rt1[i]=0;
7449         rt2[i]=0;
7450         if(op2==0||op2==2) rt1[i]=(source[i]>>16)&0x1F; // MFC0/CFC0
7451         if(op2==4||op2==6) rs1[i]=(source[i]>>16)&0x1F; // MTC0/CTC0
7452         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7453         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7454         break;
7455       case COP1:
7456         rs1[i]=0;
7457         rs2[i]=0;
7458         rt1[i]=0;
7459         rt2[i]=0;
7460         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7461         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7462         rs2[i]=CSREG;
7463         break;
7464       case COP2:
7465         rs1[i]=0;
7466         rs2[i]=0;
7467         rt1[i]=0;
7468         rt2[i]=0;
7469         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7470         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7471         rs2[i]=CSREG;
7472         int gr=(source[i]>>11)&0x1F;
7473         switch(op2)
7474         {
7475           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7476           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7477           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7478           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7479         }
7480         break;
7481       case C1LS:
7482         rs1[i]=(source[i]>>21)&0x1F;
7483         rs2[i]=CSREG;
7484         rt1[i]=0;
7485         rt2[i]=0;
7486         imm[i]=(short)source[i];
7487         break;
7488       case C2LS:
7489         rs1[i]=(source[i]>>21)&0x1F;
7490         rs2[i]=0;
7491         rt1[i]=0;
7492         rt2[i]=0;
7493         imm[i]=(short)source[i];
7494         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7495         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7496         break;
7497       case C2OP:
7498         rs1[i]=0;
7499         rs2[i]=0;
7500         rt1[i]=0;
7501         rt2[i]=0;
7502         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7503         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7504         gte_rt[i]|=1ll<<63; // every op changes flags
7505         if((source[i]&0x3f)==GTE_MVMVA) {
7506           int v = (source[i] >> 15) & 3;
7507           gte_rs[i]&=~0xe3fll;
7508           if(v==3) gte_rs[i]|=0xe00ll;
7509           else gte_rs[i]|=3ll<<(v*2);
7510         }
7511         break;
7512       case SYSCALL:
7513       case HLECALL:
7514       case INTCALL:
7515         rs1[i]=CCREG;
7516         rs2[i]=0;
7517         rt1[i]=0;
7518         rt2[i]=0;
7519         break;
7520       default:
7521         rs1[i]=0;
7522         rs2[i]=0;
7523         rt1[i]=0;
7524         rt2[i]=0;
7525     }
7526     /* Calculate branch target addresses */
7527     if(type==UJUMP)
7528       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7529     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7530       ba[i]=start+i*4+8; // Ignore never taken branch
7531     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7532       ba[i]=start+i*4+8; // Ignore never taken branch
7533     else if(type==CJUMP||type==SJUMP)
7534       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7535     else ba[i]=-1;
7536     if (i > 0 && is_jump(i-1)) {
7537       int do_in_intrp=0;
7538       // branch in delay slot?
7539       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP) {
7540         // don't handle first branch and call interpreter if it's hit
7541         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7542         do_in_intrp=1;
7543       }
7544       // basic load delay detection
7545       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7546         int t=(ba[i-1]-start)/4;
7547         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7548           // jump target wants DS result - potential load delay effect
7549           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7550           do_in_intrp=1;
7551           bt[t+1]=1; // expected return from interpreter
7552         }
7553         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7554               !(i>=3&&is_jump(i-3))) {
7555           // v0 overwrite like this is a sign of trouble, bail out
7556           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7557           do_in_intrp=1;
7558         }
7559       }
7560       if(do_in_intrp) {
7561         rs1[i-1]=CCREG;
7562         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7563         ba[i-1]=-1;
7564         itype[i-1]=INTCALL;
7565         done=2;
7566         i--; // don't compile the DS
7567       }
7568     }
7569     /* Is this the end of the block? */
7570     if (i > 0 && is_ujump(i-1)) {
7571       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7572         done=2;
7573       }
7574       else {
7575         if(stop_after_jal) done=1;
7576         // Stop on BREAK
7577         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7578       }
7579       // Don't recompile stuff that's already compiled
7580       if(check_addr(start+i*4+4)) done=1;
7581       // Don't get too close to the limit
7582       if(i>MAXBLOCK/2) done=1;
7583     }
7584     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7585     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7586     if(done==2) {
7587       // Does the block continue due to a branch?
7588       for(j=i-1;j>=0;j--)
7589       {
7590         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7591         if(ba[j]==start+i*4+4) done=j=0;
7592         if(ba[j]==start+i*4+8) done=j=0;
7593       }
7594     }
7595     //assert(i<MAXBLOCK-1);
7596     if(start+i*4==pagelimit-4) done=1;
7597     assert(start+i*4<pagelimit);
7598     if (i==MAXBLOCK-1) done=1;
7599     // Stop if we're compiling junk
7600     if(itype[i]==NI&&opcode[i]==0x11) {
7601       done=stop_after_jal=1;
7602       SysPrintf("Disabled speculative precompilation\n");
7603     }
7604   }
7605   slen=i;
7606   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP) {
7607     if(start+i*4==pagelimit) {
7608       itype[i-1]=SPAN;
7609     }
7610   }
7611   assert(slen>0);
7612
7613   /* Pass 2 - Register dependencies and branch targets */
7614
7615   unneeded_registers(0,slen-1,0);
7616
7617   /* Pass 3 - Register allocation */
7618
7619   struct regstat current; // Current register allocations/status
7620   current.dirty=0;
7621   current.u=unneeded_reg[0];
7622   clear_all_regs(current.regmap);
7623   alloc_reg(&current,0,CCREG);
7624   dirty_reg(&current,CCREG);
7625   current.isconst=0;
7626   current.wasconst=0;
7627   current.waswritten=0;
7628   int ds=0;
7629   int cc=0;
7630   int hr=-1;
7631
7632   if((u_int)addr&1) {
7633     // First instruction is delay slot
7634     cc=-1;
7635     bt[1]=1;
7636     ds=1;
7637     unneeded_reg[0]=1;
7638     current.regmap[HOST_BTREG]=BTREG;
7639   }
7640
7641   for(i=0;i<slen;i++)
7642   {
7643     if(bt[i])
7644     {
7645       int hr;
7646       for(hr=0;hr<HOST_REGS;hr++)
7647       {
7648         // Is this really necessary?
7649         if(current.regmap[hr]==0) current.regmap[hr]=-1;
7650       }
7651       current.isconst=0;
7652       current.waswritten=0;
7653     }
7654
7655     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
7656     regs[i].wasconst=current.isconst;
7657     regs[i].wasdirty=current.dirty;
7658     regs[i].loadedconst=0;
7659     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP) {
7660       if(i+1<slen) {
7661         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7662         current.u|=1;
7663       } else {
7664         current.u=1;
7665       }
7666     } else {
7667       if(i+1<slen) {
7668         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
7669         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7670         current.u|=1;
7671       } else { SysPrintf("oops, branch at end of block with no delay slot\n");abort(); }
7672     }
7673     is_ds[i]=ds;
7674     if(ds) {
7675       ds=0; // Skip delay slot, already allocated as part of branch
7676       // ...but we need to alloc it in case something jumps here
7677       if(i+1<slen) {
7678         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
7679       }else{
7680         current.u=branch_unneeded_reg[i-1];
7681       }
7682       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7683       current.u|=1;
7684       struct regstat temp;
7685       memcpy(&temp,&current,sizeof(current));
7686       temp.wasdirty=temp.dirty;
7687       // TODO: Take into account unconditional branches, as below
7688       delayslot_alloc(&temp,i);
7689       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
7690       regs[i].wasdirty=temp.wasdirty;
7691       regs[i].dirty=temp.dirty;
7692       regs[i].isconst=0;
7693       regs[i].wasconst=0;
7694       current.isconst=0;
7695       // Create entry (branch target) regmap
7696       for(hr=0;hr<HOST_REGS;hr++)
7697       {
7698         int r=temp.regmap[hr];
7699         if(r>=0) {
7700           if(r!=regmap_pre[i][hr]) {
7701             regs[i].regmap_entry[hr]=-1;
7702           }
7703           else
7704           {
7705               assert(r < 64);
7706               if((current.u>>r)&1) {
7707                 regs[i].regmap_entry[hr]=-1;
7708                 regs[i].regmap[hr]=-1;
7709                 //Don't clear regs in the delay slot as the branch might need them
7710                 //current.regmap[hr]=-1;
7711               }else
7712                 regs[i].regmap_entry[hr]=r;
7713           }
7714         } else {
7715           // First instruction expects CCREG to be allocated
7716           if(i==0&&hr==HOST_CCREG)
7717             regs[i].regmap_entry[hr]=CCREG;
7718           else
7719             regs[i].regmap_entry[hr]=-1;
7720         }
7721       }
7722     }
7723     else { // Not delay slot
7724       switch(itype[i]) {
7725         case UJUMP:
7726           //current.isconst=0; // DEBUG
7727           //current.wasconst=0; // DEBUG
7728           //regs[i].wasconst=0; // DEBUG
7729           clear_const(&current,rt1[i]);
7730           alloc_cc(&current,i);
7731           dirty_reg(&current,CCREG);
7732           if (rt1[i]==31) {
7733             alloc_reg(&current,i,31);
7734             dirty_reg(&current,31);
7735             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
7736             //assert(rt1[i+1]!=rt1[i]);
7737             #ifdef REG_PREFETCH
7738             alloc_reg(&current,i,PTEMP);
7739             #endif
7740           }
7741           ooo[i]=1;
7742           delayslot_alloc(&current,i+1);
7743           //current.isconst=0; // DEBUG
7744           ds=1;
7745           //printf("i=%d, isconst=%x\n",i,current.isconst);
7746           break;
7747         case RJUMP:
7748           //current.isconst=0;
7749           //current.wasconst=0;
7750           //regs[i].wasconst=0;
7751           clear_const(&current,rs1[i]);
7752           clear_const(&current,rt1[i]);
7753           alloc_cc(&current,i);
7754           dirty_reg(&current,CCREG);
7755           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
7756             alloc_reg(&current,i,rs1[i]);
7757             if (rt1[i]!=0) {
7758               alloc_reg(&current,i,rt1[i]);
7759               dirty_reg(&current,rt1[i]);
7760               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
7761               assert(rt1[i+1]!=rt1[i]);
7762               #ifdef REG_PREFETCH
7763               alloc_reg(&current,i,PTEMP);
7764               #endif
7765             }
7766             #ifdef USE_MINI_HT
7767             if(rs1[i]==31) { // JALR
7768               alloc_reg(&current,i,RHASH);
7769               alloc_reg(&current,i,RHTBL);
7770             }
7771             #endif
7772             delayslot_alloc(&current,i+1);
7773           } else {
7774             // The delay slot overwrites our source register,
7775             // allocate a temporary register to hold the old value.
7776             current.isconst=0;
7777             current.wasconst=0;
7778             regs[i].wasconst=0;
7779             delayslot_alloc(&current,i+1);
7780             current.isconst=0;
7781             alloc_reg(&current,i,RTEMP);
7782           }
7783           //current.isconst=0; // DEBUG
7784           ooo[i]=1;
7785           ds=1;
7786           break;
7787         case CJUMP:
7788           //current.isconst=0;
7789           //current.wasconst=0;
7790           //regs[i].wasconst=0;
7791           clear_const(&current,rs1[i]);
7792           clear_const(&current,rs2[i]);
7793           if((opcode[i]&0x3E)==4) // BEQ/BNE
7794           {
7795             alloc_cc(&current,i);
7796             dirty_reg(&current,CCREG);
7797             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7798             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
7799             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
7800                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
7801               // The delay slot overwrites one of our conditions.
7802               // Allocate the branch condition registers instead.
7803               current.isconst=0;
7804               current.wasconst=0;
7805               regs[i].wasconst=0;
7806               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7807               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
7808             }
7809             else
7810             {
7811               ooo[i]=1;
7812               delayslot_alloc(&current,i+1);
7813             }
7814           }
7815           else
7816           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
7817           {
7818             alloc_cc(&current,i);
7819             dirty_reg(&current,CCREG);
7820             alloc_reg(&current,i,rs1[i]);
7821             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
7822               // The delay slot overwrites one of our conditions.
7823               // Allocate the branch condition registers instead.
7824               current.isconst=0;
7825               current.wasconst=0;
7826               regs[i].wasconst=0;
7827               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7828             }
7829             else
7830             {
7831               ooo[i]=1;
7832               delayslot_alloc(&current,i+1);
7833             }
7834           }
7835           else
7836           // Don't alloc the delay slot yet because we might not execute it
7837           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
7838           {
7839             current.isconst=0;
7840             current.wasconst=0;
7841             regs[i].wasconst=0;
7842             alloc_cc(&current,i);
7843             dirty_reg(&current,CCREG);
7844             alloc_reg(&current,i,rs1[i]);
7845             alloc_reg(&current,i,rs2[i]);
7846           }
7847           else
7848           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
7849           {
7850             current.isconst=0;
7851             current.wasconst=0;
7852             regs[i].wasconst=0;
7853             alloc_cc(&current,i);
7854             dirty_reg(&current,CCREG);
7855             alloc_reg(&current,i,rs1[i]);
7856           }
7857           ds=1;
7858           //current.isconst=0;
7859           break;
7860         case SJUMP:
7861           //current.isconst=0;
7862           //current.wasconst=0;
7863           //regs[i].wasconst=0;
7864           clear_const(&current,rs1[i]);
7865           clear_const(&current,rt1[i]);
7866           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
7867           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
7868           {
7869             alloc_cc(&current,i);
7870             dirty_reg(&current,CCREG);
7871             alloc_reg(&current,i,rs1[i]);
7872             if (rt1[i]==31) { // BLTZAL/BGEZAL
7873               alloc_reg(&current,i,31);
7874               dirty_reg(&current,31);
7875               //#ifdef REG_PREFETCH
7876               //alloc_reg(&current,i,PTEMP);
7877               //#endif
7878             }
7879             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
7880                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
7881               // Allocate the branch condition registers instead.
7882               current.isconst=0;
7883               current.wasconst=0;
7884               regs[i].wasconst=0;
7885               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7886             }
7887             else
7888             {
7889               ooo[i]=1;
7890               delayslot_alloc(&current,i+1);
7891             }
7892           }
7893           else
7894           // Don't alloc the delay slot yet because we might not execute it
7895           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
7896           {
7897             current.isconst=0;
7898             current.wasconst=0;
7899             regs[i].wasconst=0;
7900             alloc_cc(&current,i);
7901             dirty_reg(&current,CCREG);
7902             alloc_reg(&current,i,rs1[i]);
7903           }
7904           ds=1;
7905           //current.isconst=0;
7906           break;
7907         case IMM16:
7908           imm16_alloc(&current,i);
7909           break;
7910         case LOAD:
7911         case LOADLR:
7912           load_alloc(&current,i);
7913           break;
7914         case STORE:
7915         case STORELR:
7916           store_alloc(&current,i);
7917           break;
7918         case ALU:
7919           alu_alloc(&current,i);
7920           break;
7921         case SHIFT:
7922           shift_alloc(&current,i);
7923           break;
7924         case MULTDIV:
7925           multdiv_alloc(&current,i);
7926           break;
7927         case SHIFTIMM:
7928           shiftimm_alloc(&current,i);
7929           break;
7930         case MOV:
7931           mov_alloc(&current,i);
7932           break;
7933         case COP0:
7934           cop0_alloc(&current,i);
7935           break;
7936         case COP1:
7937           break;
7938         case COP2:
7939           cop2_alloc(&current,i);
7940           break;
7941         case C1LS:
7942           c1ls_alloc(&current,i);
7943           break;
7944         case C2LS:
7945           c2ls_alloc(&current,i);
7946           break;
7947         case C2OP:
7948           c2op_alloc(&current,i);
7949           break;
7950         case SYSCALL:
7951         case HLECALL:
7952         case INTCALL:
7953           syscall_alloc(&current,i);
7954           break;
7955         case SPAN:
7956           pagespan_alloc(&current,i);
7957           break;
7958       }
7959
7960       // Create entry (branch target) regmap
7961       for(hr=0;hr<HOST_REGS;hr++)
7962       {
7963         int r,or;
7964         r=current.regmap[hr];
7965         if(r>=0) {
7966           if(r!=regmap_pre[i][hr]) {
7967             // TODO: delay slot (?)
7968             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
7969             if(or<0||(r&63)>=TEMPREG){
7970               regs[i].regmap_entry[hr]=-1;
7971             }
7972             else
7973             {
7974               // Just move it to a different register
7975               regs[i].regmap_entry[hr]=r;
7976               // If it was dirty before, it's still dirty
7977               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
7978             }
7979           }
7980           else
7981           {
7982             // Unneeded
7983             if(r==0){
7984               regs[i].regmap_entry[hr]=0;
7985             }
7986             else
7987             {
7988               assert(r<64);
7989               if((current.u>>r)&1) {
7990                 regs[i].regmap_entry[hr]=-1;
7991                 //regs[i].regmap[hr]=-1;
7992                 current.regmap[hr]=-1;
7993               }else
7994                 regs[i].regmap_entry[hr]=r;
7995             }
7996           }
7997         } else {
7998           // Branches expect CCREG to be allocated at the target
7999           if(regmap_pre[i][hr]==CCREG)
8000             regs[i].regmap_entry[hr]=CCREG;
8001           else
8002             regs[i].regmap_entry[hr]=-1;
8003         }
8004       }
8005       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8006     }
8007
8008     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
8009       current.waswritten|=1<<rs1[i-1];
8010     current.waswritten&=~(1<<rt1[i]);
8011     current.waswritten&=~(1<<rt2[i]);
8012     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8013       current.waswritten&=~(1<<rs1[i]);
8014
8015     /* Branch post-alloc */
8016     if(i>0)
8017     {
8018       current.wasdirty=current.dirty;
8019       switch(itype[i-1]) {
8020         case UJUMP:
8021           memcpy(&branch_regs[i-1],&current,sizeof(current));
8022           branch_regs[i-1].isconst=0;
8023           branch_regs[i-1].wasconst=0;
8024           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8025           alloc_cc(&branch_regs[i-1],i-1);
8026           dirty_reg(&branch_regs[i-1],CCREG);
8027           if(rt1[i-1]==31) { // JAL
8028             alloc_reg(&branch_regs[i-1],i-1,31);
8029             dirty_reg(&branch_regs[i-1],31);
8030           }
8031           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8032           memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
8033           break;
8034         case RJUMP:
8035           memcpy(&branch_regs[i-1],&current,sizeof(current));
8036           branch_regs[i-1].isconst=0;
8037           branch_regs[i-1].wasconst=0;
8038           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8039           alloc_cc(&branch_regs[i-1],i-1);
8040           dirty_reg(&branch_regs[i-1],CCREG);
8041           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8042           if(rt1[i-1]!=0) { // JALR
8043             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8044             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8045           }
8046           #ifdef USE_MINI_HT
8047           if(rs1[i-1]==31) { // JALR
8048             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8049             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8050           }
8051           #endif
8052           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8053           memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
8054           break;
8055         case CJUMP:
8056           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8057           {
8058             alloc_cc(&current,i-1);
8059             dirty_reg(&current,CCREG);
8060             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8061                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8062               // The delay slot overwrote one of our conditions
8063               // Delay slot goes after the test (in order)
8064               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8065               current.u|=1;
8066               delayslot_alloc(&current,i);
8067               current.isconst=0;
8068             }
8069             else
8070             {
8071               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8072               // Alloc the branch condition registers
8073               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8074               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8075             }
8076             memcpy(&branch_regs[i-1],&current,sizeof(current));
8077             branch_regs[i-1].isconst=0;
8078             branch_regs[i-1].wasconst=0;
8079             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8080             memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
8081           }
8082           else
8083           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8084           {
8085             alloc_cc(&current,i-1);
8086             dirty_reg(&current,CCREG);
8087             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8088               // The delay slot overwrote the branch condition
8089               // Delay slot goes after the test (in order)
8090               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8091               current.u|=1;
8092               delayslot_alloc(&current,i);
8093               current.isconst=0;
8094             }
8095             else
8096             {
8097               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8098               // Alloc the branch condition register
8099               alloc_reg(&current,i-1,rs1[i-1]);
8100             }
8101             memcpy(&branch_regs[i-1],&current,sizeof(current));
8102             branch_regs[i-1].isconst=0;
8103             branch_regs[i-1].wasconst=0;
8104             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8105             memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
8106           }
8107           else
8108           // Alloc the delay slot in case the branch is taken
8109           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8110           {
8111             memcpy(&branch_regs[i-1],&current,sizeof(current));
8112             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8113             alloc_cc(&branch_regs[i-1],i);
8114             dirty_reg(&branch_regs[i-1],CCREG);
8115             delayslot_alloc(&branch_regs[i-1],i);
8116             branch_regs[i-1].isconst=0;
8117             alloc_reg(&current,i,CCREG); // Not taken path
8118             dirty_reg(&current,CCREG);
8119             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8120           }
8121           else
8122           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8123           {
8124             memcpy(&branch_regs[i-1],&current,sizeof(current));
8125             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8126             alloc_cc(&branch_regs[i-1],i);
8127             dirty_reg(&branch_regs[i-1],CCREG);
8128             delayslot_alloc(&branch_regs[i-1],i);
8129             branch_regs[i-1].isconst=0;
8130             alloc_reg(&current,i,CCREG); // Not taken path
8131             dirty_reg(&current,CCREG);
8132             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8133           }
8134           break;
8135         case SJUMP:
8136           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8137           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8138           {
8139             alloc_cc(&current,i-1);
8140             dirty_reg(&current,CCREG);
8141             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8142               // The delay slot overwrote the branch condition
8143               // Delay slot goes after the test (in order)
8144               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8145               current.u|=1;
8146               delayslot_alloc(&current,i);
8147               current.isconst=0;
8148             }
8149             else
8150             {
8151               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8152               // Alloc the branch condition register
8153               alloc_reg(&current,i-1,rs1[i-1]);
8154             }
8155             memcpy(&branch_regs[i-1],&current,sizeof(current));
8156             branch_regs[i-1].isconst=0;
8157             branch_regs[i-1].wasconst=0;
8158             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8159             memcpy(constmap[i],constmap[i-1],sizeof(constmap[i]));
8160           }
8161           else
8162           // Alloc the delay slot in case the branch is taken
8163           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8164           {
8165             memcpy(&branch_regs[i-1],&current,sizeof(current));
8166             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8167             alloc_cc(&branch_regs[i-1],i);
8168             dirty_reg(&branch_regs[i-1],CCREG);
8169             delayslot_alloc(&branch_regs[i-1],i);
8170             branch_regs[i-1].isconst=0;
8171             alloc_reg(&current,i,CCREG); // Not taken path
8172             dirty_reg(&current,CCREG);
8173             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8174           }
8175           // FIXME: BLTZAL/BGEZAL
8176           if(opcode2[i-1]&0x10) { // BxxZAL
8177             alloc_reg(&branch_regs[i-1],i-1,31);
8178             dirty_reg(&branch_regs[i-1],31);
8179           }
8180           break;
8181       }
8182
8183       if (is_ujump(i-1))
8184       {
8185         if(rt1[i-1]==31) // JAL/JALR
8186         {
8187           // Subroutine call will return here, don't alloc any registers
8188           current.dirty=0;
8189           clear_all_regs(current.regmap);
8190           alloc_reg(&current,i,CCREG);
8191           dirty_reg(&current,CCREG);
8192         }
8193         else if(i+1<slen)
8194         {
8195           // Internal branch will jump here, match registers to caller
8196           current.dirty=0;
8197           clear_all_regs(current.regmap);
8198           alloc_reg(&current,i,CCREG);
8199           dirty_reg(&current,CCREG);
8200           for(j=i-1;j>=0;j--)
8201           {
8202             if(ba[j]==start+i*4+4) {
8203               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8204               current.dirty=branch_regs[j].dirty;
8205               break;
8206             }
8207           }
8208           while(j>=0) {
8209             if(ba[j]==start+i*4+4) {
8210               for(hr=0;hr<HOST_REGS;hr++) {
8211                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8212                   current.regmap[hr]=-1;
8213                 }
8214                 current.dirty&=branch_regs[j].dirty;
8215               }
8216             }
8217             j--;
8218           }
8219         }
8220       }
8221     }
8222
8223     // Count cycles in between branches
8224     ccadj[i]=cc;
8225     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8226     {
8227       cc=0;
8228     }
8229 #if !defined(DRC_DBG)
8230     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8231     {
8232       // this should really be removed since the real stalls have been implemented,
8233       // but doing so causes sizeable perf regression against the older version
8234       u_int gtec = gte_cycletab[source[i] & 0x3f];
8235       cc += HACK_ENABLED(NDHACK_NO_STALLS) ? gtec/2 : 2;
8236     }
8237     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8238     {
8239       cc+=4;
8240     }
8241     else if(itype[i]==C2LS)
8242     {
8243       // same as with C2OP
8244       cc += HACK_ENABLED(NDHACK_NO_STALLS) ? 4 : 2;
8245     }
8246 #endif
8247     else
8248     {
8249       cc++;
8250     }
8251
8252     if(!is_ds[i]) {
8253       regs[i].dirty=current.dirty;
8254       regs[i].isconst=current.isconst;
8255       memcpy(constmap[i],current_constmap,sizeof(constmap[i]));
8256     }
8257     for(hr=0;hr<HOST_REGS;hr++) {
8258       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8259         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8260           regs[i].wasconst&=~(1<<hr);
8261         }
8262       }
8263     }
8264     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8265     regs[i].waswritten=current.waswritten;
8266   }
8267
8268   /* Pass 4 - Cull unused host registers */
8269
8270   uint64_t nr=0;
8271
8272   for (i=slen-1;i>=0;i--)
8273   {
8274     int hr;
8275     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
8276     {
8277       if(ba[i]<start || ba[i]>=(start+slen*4))
8278       {
8279         // Branch out of this block, don't need anything
8280         nr=0;
8281       }
8282       else
8283       {
8284         // Internal branch
8285         // Need whatever matches the target
8286         nr=0;
8287         int t=(ba[i]-start)>>2;
8288         for(hr=0;hr<HOST_REGS;hr++)
8289         {
8290           if(regs[i].regmap_entry[hr]>=0) {
8291             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8292           }
8293         }
8294       }
8295       // Conditional branch may need registers for following instructions
8296       if (!is_ujump(i))
8297       {
8298         if(i<slen-2) {
8299           nr|=needed_reg[i+2];
8300           for(hr=0;hr<HOST_REGS;hr++)
8301           {
8302             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8303             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8304           }
8305         }
8306       }
8307       // Don't need stuff which is overwritten
8308       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8309       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8310       // Merge in delay slot
8311       for(hr=0;hr<HOST_REGS;hr++)
8312       {
8313         if(!likely[i]) {
8314           // These are overwritten unless the branch is "likely"
8315           // and the delay slot is nullified if not taken
8316           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8317           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8318         }
8319         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8320         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8321         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8322         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8323         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8324           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8325           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8326         }
8327       }
8328     }
8329     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8330     {
8331       // SYSCALL instruction (software interrupt)
8332       nr=0;
8333     }
8334     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8335     {
8336       // ERET instruction (return from interrupt)
8337       nr=0;
8338     }
8339     else // Non-branch
8340     {
8341       if(i<slen-1) {
8342         for(hr=0;hr<HOST_REGS;hr++) {
8343           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
8344           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
8345           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8346           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8347         }
8348       }
8349     }
8350     for(hr=0;hr<HOST_REGS;hr++)
8351     {
8352       // Overwritten registers are not needed
8353       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8354       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8355       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8356       // Source registers are needed
8357       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
8358       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
8359       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8360       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8361       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
8362         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8363         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8364       }
8365       // Don't store a register immediately after writing it,
8366       // may prevent dual-issue.
8367       // But do so if this is a branch target, otherwise we
8368       // might have to load the register before the branch.
8369       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
8370         if((regmap_pre[i][hr]>0&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1))) {
8371           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8372           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8373         }
8374         if((regs[i].regmap_entry[hr]>0&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) {
8375           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8376           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8377         }
8378       }
8379     }
8380     // Cycle count is needed at branches.  Assume it is needed at the target too.
8381     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==SPAN) {
8382       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8383       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8384     }
8385     // Save it
8386     needed_reg[i]=nr;
8387
8388     // Deallocate unneeded registers
8389     for(hr=0;hr<HOST_REGS;hr++)
8390     {
8391       if(!((nr>>hr)&1)) {
8392         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
8393         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8394            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8395            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
8396         {
8397           if (!is_ujump(i))
8398           {
8399             if(likely[i]) {
8400               regs[i].regmap[hr]=-1;
8401               regs[i].isconst&=~(1<<hr);
8402               if(i<slen-2) {
8403                 regmap_pre[i+2][hr]=-1;
8404                 regs[i+2].wasconst&=~(1<<hr);
8405               }
8406             }
8407           }
8408         }
8409         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
8410         {
8411           int map=0,temp=0;
8412           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
8413              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8414             map=INVCP;
8415           }
8416           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
8417              itype[i+1]==C1LS || itype[i+1]==C2LS)
8418             temp=FTEMP;
8419           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8420              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8421              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
8422              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
8423              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
8424              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
8425              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
8426              regs[i].regmap[hr]!=map )
8427           {
8428             regs[i].regmap[hr]=-1;
8429             regs[i].isconst&=~(1<<hr);
8430             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
8431                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
8432                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
8433                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
8434                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
8435                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
8436                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
8437                branch_regs[i].regmap[hr]!=map)
8438             {
8439               branch_regs[i].regmap[hr]=-1;
8440               branch_regs[i].regmap_entry[hr]=-1;
8441               if (!is_ujump(i))
8442               {
8443                 if(!likely[i]&&i<slen-2) {
8444                   regmap_pre[i+2][hr]=-1;
8445                   regs[i+2].wasconst&=~(1<<hr);
8446                 }
8447               }
8448             }
8449           }
8450         }
8451         else
8452         {
8453           // Non-branch
8454           if(i>0)
8455           {
8456             int map=-1,temp=-1;
8457             if(itype[i]==STORE || itype[i]==STORELR ||
8458                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8459               map=INVCP;
8460             }
8461             if(itype[i]==LOADLR || itype[i]==STORELR ||
8462                itype[i]==C1LS || itype[i]==C2LS)
8463               temp=FTEMP;
8464             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8465                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
8466                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
8467                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
8468             {
8469               if(i<slen-1&&!is_ds[i]) {
8470                 assert(regs[i].regmap[hr]<64);
8471                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]>0)
8472                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
8473                 {
8474                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
8475                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
8476                 }
8477                 regmap_pre[i+1][hr]=-1;
8478                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
8479                 regs[i+1].wasconst&=~(1<<hr);
8480               }
8481               regs[i].regmap[hr]=-1;
8482               regs[i].isconst&=~(1<<hr);
8483             }
8484           }
8485         }
8486       } // if needed
8487     } // for hr
8488   }
8489
8490   /* Pass 5 - Pre-allocate registers */
8491
8492   // If a register is allocated during a loop, try to allocate it for the
8493   // entire loop, if possible.  This avoids loading/storing registers
8494   // inside of the loop.
8495
8496   signed char f_regmap[HOST_REGS];
8497   clear_all_regs(f_regmap);
8498   for(i=0;i<slen-1;i++)
8499   {
8500     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
8501     {
8502       if(ba[i]>=start && ba[i]<(start+i*4))
8503       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
8504       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
8505       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
8506       ||itype[i+1]==SHIFT||itype[i+1]==COP1
8507       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
8508       {
8509         int t=(ba[i]-start)>>2;
8510         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP)) // loop_preload can't handle jumps into delay slots
8511         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
8512         for(hr=0;hr<HOST_REGS;hr++)
8513         {
8514           if(regs[i].regmap[hr]>=0) {
8515             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8516               // dealloc old register
8517               int n;
8518               for(n=0;n<HOST_REGS;n++)
8519               {
8520                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8521               }
8522               // and alloc new one
8523               f_regmap[hr]=regs[i].regmap[hr];
8524             }
8525           }
8526           if(branch_regs[i].regmap[hr]>=0) {
8527             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
8528               // dealloc old register
8529               int n;
8530               for(n=0;n<HOST_REGS;n++)
8531               {
8532                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
8533               }
8534               // and alloc new one
8535               f_regmap[hr]=branch_regs[i].regmap[hr];
8536             }
8537           }
8538           if(ooo[i]) {
8539             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
8540               f_regmap[hr]=branch_regs[i].regmap[hr];
8541           }else{
8542             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
8543               f_regmap[hr]=branch_regs[i].regmap[hr];
8544           }
8545           // Avoid dirty->clean transition
8546           #ifdef DESTRUCTIVE_WRITEBACK
8547           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
8548           #endif
8549           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
8550           // case above, however it's always a good idea.  We can't hoist the
8551           // load if the register was already allocated, so there's no point
8552           // wasting time analyzing most of these cases.  It only "succeeds"
8553           // when the mapping was different and the load can be replaced with
8554           // a mov, which is of negligible benefit.  So such cases are
8555           // skipped below.
8556           if(f_regmap[hr]>0) {
8557             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
8558               int r=f_regmap[hr];
8559               for(j=t;j<=i;j++)
8560               {
8561                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
8562                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
8563                 assert(r < 64);
8564                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
8565                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
8566                   int k;
8567                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
8568                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
8569                     if(r>63) {
8570                       if(get_reg(regs[i].regmap,r&63)<0) break;
8571                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
8572                     }
8573                     k=i;
8574                     while(k>1&&regs[k-1].regmap[hr]==-1) {
8575                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
8576                         //printf("no free regs for store %x\n",start+(k-1)*4);
8577                         break;
8578                       }
8579                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
8580                         //printf("no-match due to different register\n");
8581                         break;
8582                       }
8583                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP) {
8584                         //printf("no-match due to branch\n");
8585                         break;
8586                       }
8587                       // call/ret fast path assumes no registers allocated
8588                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
8589                         break;
8590                       }
8591                       assert(r < 64);
8592                       k--;
8593                     }
8594                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
8595                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
8596                       while(k<i) {
8597                         regs[k].regmap_entry[hr]=f_regmap[hr];
8598                         regs[k].regmap[hr]=f_regmap[hr];
8599                         regmap_pre[k+1][hr]=f_regmap[hr];
8600                         regs[k].wasdirty&=~(1<<hr);
8601                         regs[k].dirty&=~(1<<hr);
8602                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
8603                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
8604                         regs[k].wasconst&=~(1<<hr);
8605                         regs[k].isconst&=~(1<<hr);
8606                         k++;
8607                       }
8608                     }
8609                     else {
8610                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
8611                       break;
8612                     }
8613                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
8614                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
8615                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
8616                       regs[i].regmap_entry[hr]=f_regmap[hr];
8617                       regs[i].regmap[hr]=f_regmap[hr];
8618                       regs[i].wasdirty&=~(1<<hr);
8619                       regs[i].dirty&=~(1<<hr);
8620                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
8621                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
8622                       regs[i].wasconst&=~(1<<hr);
8623                       regs[i].isconst&=~(1<<hr);
8624                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
8625                       branch_regs[i].wasdirty&=~(1<<hr);
8626                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
8627                       branch_regs[i].regmap[hr]=f_regmap[hr];
8628                       branch_regs[i].dirty&=~(1<<hr);
8629                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
8630                       branch_regs[i].wasconst&=~(1<<hr);
8631                       branch_regs[i].isconst&=~(1<<hr);
8632                       if (!is_ujump(i)) {
8633                         regmap_pre[i+2][hr]=f_regmap[hr];
8634                         regs[i+2].wasdirty&=~(1<<hr);
8635                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
8636                       }
8637                     }
8638                   }
8639                   for(k=t;k<j;k++) {
8640                     // Alloc register clean at beginning of loop,
8641                     // but may dirty it in pass 6
8642                     regs[k].regmap_entry[hr]=f_regmap[hr];
8643                     regs[k].regmap[hr]=f_regmap[hr];
8644                     regs[k].dirty&=~(1<<hr);
8645                     regs[k].wasconst&=~(1<<hr);
8646                     regs[k].isconst&=~(1<<hr);
8647                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP) {
8648                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
8649                       branch_regs[k].regmap[hr]=f_regmap[hr];
8650                       branch_regs[k].dirty&=~(1<<hr);
8651                       branch_regs[k].wasconst&=~(1<<hr);
8652                       branch_regs[k].isconst&=~(1<<hr);
8653                       if (!is_ujump(k)) {
8654                         regmap_pre[k+2][hr]=f_regmap[hr];
8655                         regs[k+2].wasdirty&=~(1<<hr);
8656                       }
8657                     }
8658                     else
8659                     {
8660                       regmap_pre[k+1][hr]=f_regmap[hr];
8661                       regs[k+1].wasdirty&=~(1<<hr);
8662                     }
8663                   }
8664                   if(regs[j].regmap[hr]==f_regmap[hr])
8665                     regs[j].regmap_entry[hr]=f_regmap[hr];
8666                   break;
8667                 }
8668                 if(j==i) break;
8669                 if(regs[j].regmap[hr]>=0)
8670                   break;
8671                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
8672                   //printf("no-match due to different register\n");
8673                   break;
8674                 }
8675                 if (is_ujump(j))
8676                 {
8677                   // Stop on unconditional branch
8678                   break;
8679                 }
8680                 if(itype[j]==CJUMP||itype[j]==SJUMP)
8681                 {
8682                   if(ooo[j]) {
8683                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
8684                       break;
8685                   }else{
8686                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
8687                       break;
8688                   }
8689                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
8690                     //printf("no-match due to different register (branch)\n");
8691                     break;
8692                   }
8693                 }
8694                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
8695                   //printf("No free regs for store %x\n",start+j*4);
8696                   break;
8697                 }
8698                 assert(f_regmap[hr]<64);
8699               }
8700             }
8701           }
8702         }
8703       }
8704     }else{
8705       // Non branch or undetermined branch target
8706       for(hr=0;hr<HOST_REGS;hr++)
8707       {
8708         if(hr!=EXCLUDE_REG) {
8709           if(regs[i].regmap[hr]>=0) {
8710             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8711               // dealloc old register
8712               int n;
8713               for(n=0;n<HOST_REGS;n++)
8714               {
8715                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8716               }
8717               // and alloc new one
8718               f_regmap[hr]=regs[i].regmap[hr];
8719             }
8720           }
8721         }
8722       }
8723       // Try to restore cycle count at branch targets
8724       if(bt[i]) {
8725         for(j=i;j<slen-1;j++) {
8726           if(regs[j].regmap[HOST_CCREG]!=-1) break;
8727           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
8728             //printf("no free regs for store %x\n",start+j*4);
8729             break;
8730           }
8731         }
8732         if(regs[j].regmap[HOST_CCREG]==CCREG) {
8733           int k=i;
8734           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
8735           while(k<j) {
8736             regs[k].regmap_entry[HOST_CCREG]=CCREG;
8737             regs[k].regmap[HOST_CCREG]=CCREG;
8738             regmap_pre[k+1][HOST_CCREG]=CCREG;
8739             regs[k+1].wasdirty|=1<<HOST_CCREG;
8740             regs[k].dirty|=1<<HOST_CCREG;
8741             regs[k].wasconst&=~(1<<HOST_CCREG);
8742             regs[k].isconst&=~(1<<HOST_CCREG);
8743             k++;
8744           }
8745           regs[j].regmap_entry[HOST_CCREG]=CCREG;
8746         }
8747         // Work backwards from the branch target
8748         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
8749         {
8750           //printf("Extend backwards\n");
8751           int k;
8752           k=i;
8753           while(regs[k-1].regmap[HOST_CCREG]==-1) {
8754             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
8755               //printf("no free regs for store %x\n",start+(k-1)*4);
8756               break;
8757             }
8758             k--;
8759           }
8760           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
8761             //printf("Extend CC, %x ->\n",start+k*4);
8762             while(k<=i) {
8763               regs[k].regmap_entry[HOST_CCREG]=CCREG;
8764               regs[k].regmap[HOST_CCREG]=CCREG;
8765               regmap_pre[k+1][HOST_CCREG]=CCREG;
8766               regs[k+1].wasdirty|=1<<HOST_CCREG;
8767               regs[k].dirty|=1<<HOST_CCREG;
8768               regs[k].wasconst&=~(1<<HOST_CCREG);
8769               regs[k].isconst&=~(1<<HOST_CCREG);
8770               k++;
8771             }
8772           }
8773           else {
8774             //printf("Fail Extend CC, %x ->\n",start+k*4);
8775           }
8776         }
8777       }
8778       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
8779          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
8780          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1)
8781       {
8782         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
8783       }
8784     }
8785   }
8786
8787   // This allocates registers (if possible) one instruction prior
8788   // to use, which can avoid a load-use penalty on certain CPUs.
8789   for(i=0;i<slen-1;i++)
8790   {
8791     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP))
8792     {
8793       if(!bt[i+1])
8794       {
8795         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
8796            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
8797         {
8798           if(rs1[i+1]) {
8799             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
8800             {
8801               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8802               {
8803                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
8804                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
8805                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
8806                 regs[i].isconst&=~(1<<hr);
8807                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8808                 constmap[i][hr]=constmap[i+1][hr];
8809                 regs[i+1].wasdirty&=~(1<<hr);
8810                 regs[i].dirty&=~(1<<hr);
8811               }
8812             }
8813           }
8814           if(rs2[i+1]) {
8815             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
8816             {
8817               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8818               {
8819                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
8820                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
8821                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
8822                 regs[i].isconst&=~(1<<hr);
8823                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8824                 constmap[i][hr]=constmap[i+1][hr];
8825                 regs[i+1].wasdirty&=~(1<<hr);
8826                 regs[i].dirty&=~(1<<hr);
8827               }
8828             }
8829           }
8830           // Preload target address for load instruction (non-constant)
8831           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8832             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
8833             {
8834               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8835               {
8836                 regs[i].regmap[hr]=rs1[i+1];
8837                 regmap_pre[i+1][hr]=rs1[i+1];
8838                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8839                 regs[i].isconst&=~(1<<hr);
8840                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8841                 constmap[i][hr]=constmap[i+1][hr];
8842                 regs[i+1].wasdirty&=~(1<<hr);
8843                 regs[i].dirty&=~(1<<hr);
8844               }
8845             }
8846           }
8847           // Load source into target register
8848           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8849             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
8850             {
8851               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8852               {
8853                 regs[i].regmap[hr]=rs1[i+1];
8854                 regmap_pre[i+1][hr]=rs1[i+1];
8855                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8856                 regs[i].isconst&=~(1<<hr);
8857                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8858                 constmap[i][hr]=constmap[i+1][hr];
8859                 regs[i+1].wasdirty&=~(1<<hr);
8860                 regs[i].dirty&=~(1<<hr);
8861               }
8862             }
8863           }
8864           // Address for store instruction (non-constant)
8865           if(itype[i+1]==STORE||itype[i+1]==STORELR
8866              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
8867             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8868               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
8869               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
8870               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
8871               assert(hr>=0);
8872               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8873               {
8874                 regs[i].regmap[hr]=rs1[i+1];
8875                 regmap_pre[i+1][hr]=rs1[i+1];
8876                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8877                 regs[i].isconst&=~(1<<hr);
8878                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8879                 constmap[i][hr]=constmap[i+1][hr];
8880                 regs[i+1].wasdirty&=~(1<<hr);
8881                 regs[i].dirty&=~(1<<hr);
8882               }
8883             }
8884           }
8885           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
8886             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8887               int nr;
8888               hr=get_reg(regs[i+1].regmap,FTEMP);
8889               assert(hr>=0);
8890               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8891               {
8892                 regs[i].regmap[hr]=rs1[i+1];
8893                 regmap_pre[i+1][hr]=rs1[i+1];
8894                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8895                 regs[i].isconst&=~(1<<hr);
8896                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8897                 constmap[i][hr]=constmap[i+1][hr];
8898                 regs[i+1].wasdirty&=~(1<<hr);
8899                 regs[i].dirty&=~(1<<hr);
8900               }
8901               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
8902               {
8903                 // move it to another register
8904                 regs[i+1].regmap[hr]=-1;
8905                 regmap_pre[i+2][hr]=-1;
8906                 regs[i+1].regmap[nr]=FTEMP;
8907                 regmap_pre[i+2][nr]=FTEMP;
8908                 regs[i].regmap[nr]=rs1[i+1];
8909                 regmap_pre[i+1][nr]=rs1[i+1];
8910                 regs[i+1].regmap_entry[nr]=rs1[i+1];
8911                 regs[i].isconst&=~(1<<nr);
8912                 regs[i+1].isconst&=~(1<<nr);
8913                 regs[i].dirty&=~(1<<nr);
8914                 regs[i+1].wasdirty&=~(1<<nr);
8915                 regs[i+1].dirty&=~(1<<nr);
8916                 regs[i+2].wasdirty&=~(1<<nr);
8917               }
8918             }
8919           }
8920           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
8921             if(itype[i+1]==LOAD)
8922               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
8923             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
8924               hr=get_reg(regs[i+1].regmap,FTEMP);
8925             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
8926               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
8927               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
8928             }
8929             if(hr>=0&&regs[i].regmap[hr]<0) {
8930               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
8931               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
8932                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
8933                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
8934                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
8935                 regs[i].isconst&=~(1<<hr);
8936                 regs[i+1].wasdirty&=~(1<<hr);
8937                 regs[i].dirty&=~(1<<hr);
8938               }
8939             }
8940           }
8941         }
8942       }
8943     }
8944   }
8945
8946   /* Pass 6 - Optimize clean/dirty state */
8947   clean_registers(0,slen-1,1);
8948
8949   /* Pass 7 - Identify 32-bit registers */
8950   for (i=slen-1;i>=0;i--)
8951   {
8952     if(itype[i]==CJUMP||itype[i]==SJUMP)
8953     {
8954       // Conditional branch
8955       if((source[i]>>16)!=0x1000&&i<slen-2) {
8956         // Mark this address as a branch target since it may be called
8957         // upon return from interrupt
8958         bt[i+2]=1;
8959       }
8960     }
8961   }
8962
8963   if(itype[slen-1]==SPAN) {
8964     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
8965   }
8966
8967 #ifdef DISASM
8968   /* Debug/disassembly */
8969   for(i=0;i<slen;i++)
8970   {
8971     printf("U:");
8972     int r;
8973     for(r=1;r<=CCREG;r++) {
8974       if((unneeded_reg[i]>>r)&1) {
8975         if(r==HIREG) printf(" HI");
8976         else if(r==LOREG) printf(" LO");
8977         else printf(" r%d",r);
8978       }
8979     }
8980     printf("\n");
8981     #if defined(__i386__) || defined(__x86_64__)
8982     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
8983     #endif
8984     #ifdef __arm__
8985     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
8986     #endif
8987     #if defined(__i386__) || defined(__x86_64__)
8988     printf("needs: ");
8989     if(needed_reg[i]&1) printf("eax ");
8990     if((needed_reg[i]>>1)&1) printf("ecx ");
8991     if((needed_reg[i]>>2)&1) printf("edx ");
8992     if((needed_reg[i]>>3)&1) printf("ebx ");
8993     if((needed_reg[i]>>5)&1) printf("ebp ");
8994     if((needed_reg[i]>>6)&1) printf("esi ");
8995     if((needed_reg[i]>>7)&1) printf("edi ");
8996     printf("\n");
8997     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
8998     printf("dirty: ");
8999     if(regs[i].wasdirty&1) printf("eax ");
9000     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9001     if((regs[i].wasdirty>>2)&1) printf("edx ");
9002     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9003     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9004     if((regs[i].wasdirty>>6)&1) printf("esi ");
9005     if((regs[i].wasdirty>>7)&1) printf("edi ");
9006     #endif
9007     #ifdef __arm__
9008     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9009     printf("dirty: ");
9010     if(regs[i].wasdirty&1) printf("r0 ");
9011     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9012     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9013     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9014     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9015     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9016     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9017     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9018     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9019     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9020     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9021     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9022     #endif
9023     printf("\n");
9024     disassemble_inst(i);
9025     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9026     #if defined(__i386__) || defined(__x86_64__)
9027     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9028     if(regs[i].dirty&1) printf("eax ");
9029     if((regs[i].dirty>>1)&1) printf("ecx ");
9030     if((regs[i].dirty>>2)&1) printf("edx ");
9031     if((regs[i].dirty>>3)&1) printf("ebx ");
9032     if((regs[i].dirty>>5)&1) printf("ebp ");
9033     if((regs[i].dirty>>6)&1) printf("esi ");
9034     if((regs[i].dirty>>7)&1) printf("edi ");
9035     #endif
9036     #ifdef __arm__
9037     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9038     if(regs[i].dirty&1) printf("r0 ");
9039     if((regs[i].dirty>>1)&1) printf("r1 ");
9040     if((regs[i].dirty>>2)&1) printf("r2 ");
9041     if((regs[i].dirty>>3)&1) printf("r3 ");
9042     if((regs[i].dirty>>4)&1) printf("r4 ");
9043     if((regs[i].dirty>>5)&1) printf("r5 ");
9044     if((regs[i].dirty>>6)&1) printf("r6 ");
9045     if((regs[i].dirty>>7)&1) printf("r7 ");
9046     if((regs[i].dirty>>8)&1) printf("r8 ");
9047     if((regs[i].dirty>>9)&1) printf("r9 ");
9048     if((regs[i].dirty>>10)&1) printf("r10 ");
9049     if((regs[i].dirty>>12)&1) printf("r12 ");
9050     #endif
9051     printf("\n");
9052     if(regs[i].isconst) {
9053       printf("constants: ");
9054       #if defined(__i386__) || defined(__x86_64__)
9055       if(regs[i].isconst&1) printf("eax=%x ",(u_int)constmap[i][0]);
9056       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(u_int)constmap[i][1]);
9057       if((regs[i].isconst>>2)&1) printf("edx=%x ",(u_int)constmap[i][2]);
9058       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(u_int)constmap[i][3]);
9059       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(u_int)constmap[i][5]);
9060       if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]);
9061       if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]);
9062       #endif
9063       #if defined(__arm__) || defined(__aarch64__)
9064       int r;
9065       for (r = 0; r < ARRAY_SIZE(constmap[i]); r++)
9066         if ((regs[i].isconst >> r) & 1)
9067           printf(" r%d=%x", r, (u_int)constmap[i][r]);
9068       #endif
9069       printf("\n");
9070     }
9071     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
9072       #if defined(__i386__) || defined(__x86_64__)
9073       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
9074       if(branch_regs[i].dirty&1) printf("eax ");
9075       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
9076       if((branch_regs[i].dirty>>2)&1) printf("edx ");
9077       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
9078       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
9079       if((branch_regs[i].dirty>>6)&1) printf("esi ");
9080       if((branch_regs[i].dirty>>7)&1) printf("edi ");
9081       #endif
9082       #ifdef __arm__
9083       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
9084       if(branch_regs[i].dirty&1) printf("r0 ");
9085       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
9086       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
9087       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
9088       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
9089       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
9090       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
9091       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
9092       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
9093       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
9094       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
9095       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
9096       #endif
9097     }
9098   }
9099 #endif // DISASM
9100
9101   /* Pass 8 - Assembly */
9102   linkcount=0;stubcount=0;
9103   ds=0;is_delayslot=0;
9104   u_int dirty_pre=0;
9105   void *beginning=start_block();
9106   if((u_int)addr&1) {
9107     ds=1;
9108     pagespan_ds();
9109   }
9110   void *instr_addr0_override = NULL;
9111
9112   if (start == 0x80030000) {
9113     // nasty hack for the fastbios thing
9114     // override block entry to this code
9115     instr_addr0_override = out;
9116     emit_movimm(start,0);
9117     // abuse io address var as a flag that we
9118     // have already returned here once
9119     emit_readword(&address,1);
9120     emit_writeword(0,&pcaddr);
9121     emit_writeword(0,&address);
9122     emit_cmp(0,1);
9123     #ifdef __aarch64__
9124     emit_jeq(out + 4*2);
9125     emit_far_jump(new_dyna_leave);
9126     #else
9127     emit_jne(new_dyna_leave);
9128     #endif
9129   }
9130   for(i=0;i<slen;i++)
9131   {
9132     //if(ds) printf("ds: ");
9133     disassemble_inst(i);
9134     if(ds) {
9135       ds=0; // Skip delay slot
9136       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
9137       instr_addr[i] = NULL;
9138     } else {
9139       speculate_register_values(i);
9140       #ifndef DESTRUCTIVE_WRITEBACK
9141       if (i < 2 || !is_ujump(i-2))
9142       {
9143         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,unneeded_reg[i]);
9144       }
9145       if((itype[i]==CJUMP||itype[i]==SJUMP)&&!likely[i]) {
9146         dirty_pre=branch_regs[i].dirty;
9147       }else{
9148         dirty_pre=regs[i].dirty;
9149       }
9150       #endif
9151       // write back
9152       if (i < 2 || !is_ujump(i-2))
9153       {
9154         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,unneeded_reg[i]);
9155         loop_preload(regmap_pre[i],regs[i].regmap_entry);
9156       }
9157       // branch target entry point
9158       instr_addr[i] = out;
9159       assem_debug("<->\n");
9160       drc_dbg_emit_do_cmp(i);
9161
9162       // load regs
9163       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
9164         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty);
9165       load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i],rs2[i]);
9166       address_generation(i,&regs[i],regs[i].regmap_entry);
9167       load_consts(regmap_pre[i],regs[i].regmap,i);
9168       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
9169       {
9170         // Load the delay slot registers if necessary
9171         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
9172           load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1]);
9173         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
9174           load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1]);
9175         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
9176           load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
9177       }
9178       else if(i+1<slen)
9179       {
9180         // Preload registers for following instruction
9181         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
9182           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
9183             load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1]);
9184         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
9185           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
9186             load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1]);
9187       }
9188       // TODO: if(is_ooo(i)) address_generation(i+1);
9189       if(itype[i]==CJUMP)
9190         load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
9191       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
9192         load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
9193       // assemble
9194       switch(itype[i]) {
9195         case ALU:
9196           alu_assemble(i,&regs[i]);break;
9197         case IMM16:
9198           imm16_assemble(i,&regs[i]);break;
9199         case SHIFT:
9200           shift_assemble(i,&regs[i]);break;
9201         case SHIFTIMM:
9202           shiftimm_assemble(i,&regs[i]);break;
9203         case LOAD:
9204           load_assemble(i,&regs[i]);break;
9205         case LOADLR:
9206           loadlr_assemble(i,&regs[i]);break;
9207         case STORE:
9208           store_assemble(i,&regs[i]);break;
9209         case STORELR:
9210           storelr_assemble(i,&regs[i]);break;
9211         case COP0:
9212           cop0_assemble(i,&regs[i]);break;
9213         case COP1:
9214           cop1_assemble(i,&regs[i]);break;
9215         case C1LS:
9216           c1ls_assemble(i,&regs[i]);break;
9217         case COP2:
9218           cop2_assemble(i,&regs[i]);break;
9219         case C2LS:
9220           c2ls_assemble(i,&regs[i]);break;
9221         case C2OP:
9222           c2op_assemble(i,&regs[i]);break;
9223         case MULTDIV:
9224           multdiv_assemble(i,&regs[i]);
9225           multdiv_prepare_stall(i,&regs[i]);
9226           break;
9227         case MOV:
9228           mov_assemble(i,&regs[i]);break;
9229         case SYSCALL:
9230           syscall_assemble(i,&regs[i]);break;
9231         case HLECALL:
9232           hlecall_assemble(i,&regs[i]);break;
9233         case INTCALL:
9234           intcall_assemble(i,&regs[i]);break;
9235         case UJUMP:
9236           ujump_assemble(i,&regs[i]);ds=1;break;
9237         case RJUMP:
9238           rjump_assemble(i,&regs[i]);ds=1;break;
9239         case CJUMP:
9240           cjump_assemble(i,&regs[i]);ds=1;break;
9241         case SJUMP:
9242           sjump_assemble(i,&regs[i]);ds=1;break;
9243         case SPAN:
9244           pagespan_assemble(i,&regs[i]);break;
9245       }
9246       if (is_ujump(i))
9247         literal_pool(1024);
9248       else
9249         literal_pool_jumpover(256);
9250     }
9251   }
9252   //assert(is_ujump(i-2));
9253   // If the block did not end with an unconditional branch,
9254   // add a jump to the next instruction.
9255   if(i>1) {
9256     if(!is_ujump(i-2)&&itype[i-1]!=SPAN) {
9257       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP);
9258       assert(i==slen);
9259       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP) {
9260         store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
9261         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
9262           emit_loadreg(CCREG,HOST_CCREG);
9263         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
9264       }
9265       else if(!likely[i-2])
9266       {
9267         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].dirty,start+i*4);
9268         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
9269       }
9270       else
9271       {
9272         store_regs_bt(regs[i-2].regmap,regs[i-2].dirty,start+i*4);
9273         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
9274       }
9275       add_to_linker(out,start+i*4,0);
9276       emit_jmp(0);
9277     }
9278   }
9279   else
9280   {
9281     assert(i>0);
9282     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP);
9283     store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
9284     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
9285       emit_loadreg(CCREG,HOST_CCREG);
9286     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
9287     add_to_linker(out,start+i*4,0);
9288     emit_jmp(0);
9289   }
9290
9291   // TODO: delay slot stubs?
9292   // Stubs
9293   for(i=0;i<stubcount;i++)
9294   {
9295     switch(stubs[i].type)
9296     {
9297       case LOADB_STUB:
9298       case LOADH_STUB:
9299       case LOADW_STUB:
9300       case LOADD_STUB:
9301       case LOADBU_STUB:
9302       case LOADHU_STUB:
9303         do_readstub(i);break;
9304       case STOREB_STUB:
9305       case STOREH_STUB:
9306       case STOREW_STUB:
9307       case STORED_STUB:
9308         do_writestub(i);break;
9309       case CC_STUB:
9310         do_ccstub(i);break;
9311       case INVCODE_STUB:
9312         do_invstub(i);break;
9313       case FP_STUB:
9314         do_cop1stub(i);break;
9315       case STORELR_STUB:
9316         do_unalignedwritestub(i);break;
9317     }
9318   }
9319
9320   if (instr_addr0_override)
9321     instr_addr[0] = instr_addr0_override;
9322
9323   /* Pass 9 - Linker */
9324   for(i=0;i<linkcount;i++)
9325   {
9326     assem_debug("%p -> %8x\n",link_addr[i].addr,link_addr[i].target);
9327     literal_pool(64);
9328     if (!link_addr[i].ext)
9329     {
9330       void *stub = out;
9331       void *addr = check_addr(link_addr[i].target);
9332       emit_extjump(link_addr[i].addr, link_addr[i].target);
9333       if (addr) {
9334         set_jump_target(link_addr[i].addr, addr);
9335         add_link(link_addr[i].target,stub);
9336       }
9337       else
9338         set_jump_target(link_addr[i].addr, stub);
9339     }
9340     else
9341     {
9342       // Internal branch
9343       int target=(link_addr[i].target-start)>>2;
9344       assert(target>=0&&target<slen);
9345       assert(instr_addr[target]);
9346       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9347       //set_jump_target_fillslot(link_addr[i].addr,instr_addr[target],link_addr[i].ext>>1);
9348       //#else
9349       set_jump_target(link_addr[i].addr, instr_addr[target]);
9350       //#endif
9351     }
9352   }
9353   // External Branch Targets (jump_in)
9354   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
9355   for(i=0;i<slen;i++)
9356   {
9357     if(bt[i]||i==0)
9358     {
9359       if(instr_addr[i]) // TODO - delay slots (=null)
9360       {
9361         u_int vaddr=start+i*4;
9362         u_int page=get_page(vaddr);
9363         u_int vpage=get_vpage(vaddr);
9364         literal_pool(256);
9365         {
9366           assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
9367           assem_debug("jump_in: %x\n",start+i*4);
9368           ll_add(jump_dirty+vpage,vaddr,out);
9369           void *entry_point = do_dirty_stub(i);
9370           ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
9371           // If there was an existing entry in the hash table,
9372           // replace it with the new address.
9373           // Don't add new entries.  We'll insert the
9374           // ones that actually get used in check_addr().
9375           struct ht_entry *ht_bin = hash_table_get(vaddr);
9376           if (ht_bin->vaddr[0] == vaddr)
9377             ht_bin->tcaddr[0] = entry_point;
9378           if (ht_bin->vaddr[1] == vaddr)
9379             ht_bin->tcaddr[1] = entry_point;
9380         }
9381       }
9382     }
9383   }
9384   // Write out the literal pool if necessary
9385   literal_pool(0);
9386   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9387   // Align code
9388   if(((u_int)out)&7) emit_addnop(13);
9389   #endif
9390   assert(out - (u_char *)beginning < MAX_OUTPUT_BLOCK_SIZE);
9391   //printf("shadow buffer: %p-%p\n",copy,(u_char *)copy+slen*4);
9392   memcpy(copy,source,slen*4);
9393   copy+=slen*4;
9394
9395   end_block(beginning);
9396
9397   // If we're within 256K of the end of the buffer,
9398   // start over from the beginning. (Is 256K enough?)
9399   if (out > ndrc->translation_cache + sizeof(ndrc->translation_cache) - MAX_OUTPUT_BLOCK_SIZE)
9400     out = ndrc->translation_cache;
9401
9402   // Trap writes to any of the pages we compiled
9403   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
9404     invalid_code[i]=0;
9405   }
9406   inv_code_start=inv_code_end=~0;
9407
9408   // for PCSX we need to mark all mirrors too
9409   if(get_page(start)<(RAM_SIZE>>12))
9410     for(i=start>>12;i<=(start+slen*4)>>12;i++)
9411       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
9412       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
9413       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
9414
9415   /* Pass 10 - Free memory by expiring oldest blocks */
9416
9417   int end=(((out-ndrc->translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535;
9418   while(expirep!=end)
9419   {
9420     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
9421     uintptr_t base_offs = ((uintptr_t)(expirep >> 13) << shift); // Base offset of this block
9422     uintptr_t base_offs_s = base_offs >> shift;
9423     inv_debug("EXP: Phase %d\n",expirep);
9424     switch((expirep>>11)&3)
9425     {
9426       case 0:
9427         // Clear jump_in and jump_dirty
9428         ll_remove_matching_addrs(jump_in+(expirep&2047),base_offs_s,shift);
9429         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base_offs_s,shift);
9430         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base_offs_s,shift);
9431         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base_offs_s,shift);
9432         break;
9433       case 1:
9434         // Clear pointers
9435         ll_kill_pointers(jump_out[expirep&2047],base_offs_s,shift);
9436         ll_kill_pointers(jump_out[(expirep&2047)+2048],base_offs_s,shift);
9437         break;
9438       case 2:
9439         // Clear hash table
9440         for(i=0;i<32;i++) {
9441           struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
9442           uintptr_t o1 = (u_char *)ht_bin->tcaddr[1] - ndrc->translation_cache;
9443           uintptr_t o2 = o1 - MAX_OUTPUT_BLOCK_SIZE;
9444           if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s) {
9445             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
9446             ht_bin->vaddr[1] = -1;
9447             ht_bin->tcaddr[1] = NULL;
9448           }
9449           o1 = (u_char *)ht_bin->tcaddr[0] - ndrc->translation_cache;
9450           o2 = o1 - MAX_OUTPUT_BLOCK_SIZE;
9451           if ((o1 >> shift) == base_offs_s || (o2 >> shift) == base_offs_s) {
9452             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
9453             ht_bin->vaddr[0] = ht_bin->vaddr[1];
9454             ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
9455             ht_bin->vaddr[1] = -1;
9456             ht_bin->tcaddr[1] = NULL;
9457           }
9458         }
9459         break;
9460       case 3:
9461         // Clear jump_out
9462         if((expirep&2047)==0)
9463           do_clear_cache();
9464         ll_remove_matching_addrs(jump_out+(expirep&2047),base_offs_s,shift);
9465         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base_offs_s,shift);
9466         break;
9467     }
9468     expirep=(expirep+1)&65535;
9469   }
9470   return 0;
9471 }
9472
9473 // vim:shiftwidth=2:expandtab