drc: remove yet more n64 stuff
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h" //emulator interface
39 #include "emu_if.h" //emulator interface
40
41 #ifndef ARRAY_SIZE
42 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
43 #endif
44
45 //#define DISASM
46 //#define assem_debug printf
47 //#define inv_debug printf
48 #define assem_debug(...)
49 #define inv_debug(...)
50
51 #ifdef __i386__
52 #include "assem_x86.h"
53 #endif
54 #ifdef __x86_64__
55 #include "assem_x64.h"
56 #endif
57 #ifdef __arm__
58 #include "assem_arm.h"
59 #endif
60
61 #define MAXBLOCK 4096
62 #define MAX_OUTPUT_BLOCK_SIZE 262144
63
64 // stubs
// stubs
// Tags for out-of-line stub code emitted after a block; pending stubs are
// queued in stubs[] via add_stub()/add_stub_r() and emitted at block end.
enum stub_type {
  CC_STUB = 1,
  FP_STUB = 2,
  LOADB_STUB = 3,
  LOADH_STUB = 4,
  LOADW_STUB = 5,
  LOADD_STUB = 6,
  LOADBU_STUB = 7,
  LOADHU_STUB = 8,
  STOREB_STUB = 9,
  STOREH_STUB = 10,
  STOREW_STUB = 11,
  STORED_STUB = 12,
  STORELR_STUB = 13,
  INVCODE_STUB = 14,
};
81
// Per-instruction register allocation state.
// regmap[] is indexed by host register and holds the guest register
// number mapped there (-1 = free); the uint64_t fields indexed by guest
// register track 32/64-bit status, the u_int bitmasks are per host reg.
struct regstat
{
  signed char regmap_entry[HOST_REGS]; // mapping required on entry to this insn
  signed char regmap[HOST_REGS];       // host reg -> guest reg, -1 if free
  uint64_t was32;                      // guest regs 32-bit before the insn
  uint64_t is32;                       // guest regs 32-bit after the insn
  uint64_t wasdirty;                   // dirty host regs before the insn
  uint64_t dirty;                      // host regs not yet written back
  uint64_t u;                          // unneeded guest regs (bitmask)
  u_int wasconst;                      // host regs that were constants
  u_int isconst;                       // host regs that are constants now
  u_int loadedconst;             // host regs that have constants loaded
  u_int waswritten;              // MIPS regs that were used as store base before
};
96
// note: asm depends on this layout
// One compiled-block record, chained per 4K page (jump_in/jump_out/jump_dirty).
struct ll_entry
{
  u_int vaddr;             // guest start address of the block
  u_int reg_sv_flags;      // see ll_add_flags()
  void *addr;              // translated code entry point
  struct ll_entry *next;   // next block in the same page bucket
};
105
// 2-way hash table bin mapping guest address -> translated code.
// Slot 0 is the most recently inserted entry (see hash_table_add()).
struct ht_entry
{
  u_int vaddr[2];
  void *tcaddr[2];
};
111
// A pending out-of-line stub, queued in stubs[] during assembly.
struct code_stub
{
  enum stub_type type;
  void *addr;     // stub location (see add_stub())
  void *retaddr;  // where to resume in the main code path
  u_int a;        // a..e: type-specific arguments passed to add_stub()
  uintptr_t b;
  uintptr_t c;
  u_int d;
  u_int e;
};
123
// A branch in emitted code to be resolved later; queued in link_addr[]
// by add_to_linker() (fields mirror its parameters).
struct link_entry
{
  void *addr;   // host location of the branch to patch
  u_int target; // guest target address
  u_int ext;    // 'ext' flag from add_to_linker — NOTE(review): external jump? confirm
};
130
  // used by asm:
  u_char *out;   // next free byte in the translation cache
  struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
  struct ll_entry *jump_in[4096] __attribute__((aligned(16)));   // clean compiled blocks, per page
  struct ll_entry *jump_dirty[4096];   // possibly-stale blocks, per virtual page

  static struct ll_entry *jump_out[4096];   // outgoing patched jumps, per target page
  static u_int start;        // guest address of the block being compiled
  static u_int *source;      // guest code of the block being compiled
  static char insn[MAXBLOCK][10];      // disassembled mnemonics (debug)
  static u_char itype[MAXBLOCK];       // instruction type (LOAD, STORE, UJUMP, ...)
  static u_char opcode[MAXBLOCK];      // primary opcode field
  static u_char opcode2[MAXBLOCK];     // secondary opcode field
  static u_char bt[MAXBLOCK];
  static u_char rs1[MAXBLOCK];         // guest source register 1
  static u_char rs2[MAXBLOCK];         // guest source register 2
  static u_char rt1[MAXBLOCK];         // guest target register 1
  static u_char rt2[MAXBLOCK];         // guest target register 2
  static u_char us1[MAXBLOCK];
  static u_char us2[MAXBLOCK];
  static u_char dep1[MAXBLOCK];
  static u_char dep2[MAXBLOCK];
  static u_char lt1[MAXBLOCK];
  static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
  static uint64_t gte_rt[MAXBLOCK];
  static uint64_t gte_unneeded[MAXBLOCK];
  static u_int smrv[32]; // speculated MIPS register values
  static u_int smrv_strong; // mask or regs that are likely to have correct values
  static u_int smrv_weak; // same, but somewhat less likely
  static u_int smrv_strong_next; // same, but after current insn executes
  static u_int smrv_weak_next;
  static int imm[MAXBLOCK];            // immediate operand
  static u_int ba[MAXBLOCK];           // branch target address
  static char likely[MAXBLOCK];
  static char is_ds[MAXBLOCK];         // insn is in a delay slot
  static char ooo[MAXBLOCK];
  static uint64_t unneeded_reg[MAXBLOCK];        // guest regs not needed past this insn
  static uint64_t branch_unneeded_reg[MAXBLOCK];
  static signed char regmap_pre[MAXBLOCK][HOST_REGS];
  static uint64_t current_constmap[HOST_REGS];   // constant value per host reg (see set_const)
  static uint64_t constmap[MAXBLOCK][HOST_REGS];
  static struct regstat regs[MAXBLOCK];          // allocation state per insn
  static struct regstat branch_regs[MAXBLOCK];   // allocation state at branches
  static signed char minimum_free_regs[MAXBLOCK];
  static u_int needed_reg[MAXBLOCK];
  static u_int wont_dirty[MAXBLOCK];
  static u_int will_dirty[MAXBLOCK];
  static int ccadj[MAXBLOCK];
  static int slen;                     // number of insns in the current block
  static void *instr_addr[MAXBLOCK];   // emitted code address of each insn
  static struct link_entry link_addr[MAXBLOCK];  // branches pending linkage
  static int linkcount;
  static struct code_stub stubs[MAXBLOCK*3];     // pending out-of-line stubs
  static int stubcount;
  static u_int literals[1024][2];
  static int literalcount;
  static int is_delayslot;
  static char shadow[1048576]  __attribute__((aligned(16)));
  static void *copy;
  static int expirep;
  static u_int stop_after_jal;
#ifndef RAM_FIXED
  static uintptr_t ram_offset;         // host - guest RAM address delta
#else
  static const uintptr_t ram_offset=0;
#endif

  int new_dynarec_hacks;
  int new_dynarec_did_compile;
  extern u_char restore_candidate[512];
  extern int cycle_count;
202
  /* registers that may be allocated */
  /* 1-31 gpr */
  /* Numbers above 31 are pseudo-registers used internally by the
     allocator; the commented-out entries are apparently n64/TLB
     leftovers that no longer apply to PCSX. */
#define HIREG 32 // hi
#define LOREG 33 // lo
//#define FSREG 34 // FPU status (FCSR)
#define CSREG 35 // Coprocessor status
#define CCREG 36 // Cycle count
#define INVCP 37 // Pointer to invalid_code
//#define MMREG 38 // Pointer to memory_map
//#define ROREG 39 // ram offset (if rdram!=0x80000000)
#define TEMPREG 40
#define FTEMP 40 // FPU temporary register
#define PTEMP 41 // Prefetch temporary register
//#define TLREG 42 // TLB mapping offset
#define RHASH 43 // Return address hash
#define RHTBL 44 // Return address hash table address
#define RTEMP 45 // JR/JALR address register
#define MAXREG 45
#define AGEN1 46 // Address generation temporary register
//#define AGEN2 47 // Address generation temporary register
//#define MGEN1 48 // Maptable address generation temporary register
//#define MGEN2 49 // Maptable address generation temporary register
#define BTREG 50 // Branch target temporary register

  /* instruction types */
  /* Stored per-insn in itype[]; drives allocation and code emission. */
#define NOP 0     // No operation
#define LOAD 1    // Load
#define STORE 2   // Store
#define LOADLR 3  // Unaligned load
#define STORELR 4 // Unaligned store
#define MOV 5     // Move
#define ALU 6     // Arithmetic/logic
#define MULTDIV 7 // Multiply/divide
#define SHIFT 8   // Shift by register
#define SHIFTIMM 9// Shift by immediate
#define IMM16 10  // 16-bit immediate
#define RJUMP 11  // Unconditional jump to register
#define UJUMP 12  // Unconditional jump
#define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
#define SJUMP 14  // Conditional branch (regimm format)
#define COP0 15   // Coprocessor 0
#define COP1 16   // Coprocessor 1
#define C1LS 17   // Coprocessor 1 load/store
#define FJUMP 18  // Conditional branch (floating point)
//#define FLOAT 19  // Floating point unit
//#define FCONV 20  // Convert integer to float
//#define FCOMP 21  // Floating point compare (sets FSREG)
#define SYSCALL 22// SYSCALL
#define OTHER 23  // Other
#define SPAN 24   // Branch/delay slot spans 2 pages
#define NI 25     // Not implemented
#define HLECALL 26// PCSX fake opcodes for HLE
#define COP2 27   // Coprocessor 2 move
#define C2LS 28   // Coprocessor 2 load/store
#define C2OP 29   // Coprocessor 2 operation
#define INTCALL 30// Call interpreter to handle rare corner cases

  /* branch codes */
#define TAKEN 1
#define NOTTAKEN 2
#define NULLDS 3
264
// asm linkage
// Entry points shared with generated code / asm glue.
int new_recompile_block(int addr);
void *get_addr_ht(u_int vaddr);
void invalidate_block(u_int block);
void invalidate_addr(u_int addr);
void remove_hash(int vaddr);
void dyna_linker();
void dyna_linker_ds();
void verify_code();
void verify_code_vm();
void verify_code_ds();
void cc_interrupt();
void fp_exception();
void fp_exception_ds();
void jump_syscall_hle();
void jump_hlecall();
void jump_intcall();
void new_dyna_leave();

// Needed by assembler
static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
static void load_all_regs(signed char i_regmap[]);
static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
static void load_regs_entry(int t);
static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);

static int verify_dirty(u_int *ptr);
static int get_final_value(int hr, int i, int *value);
// Queue an out-of-line stub of the given type; a..e are type-specific.
static void add_stub(enum stub_type type, void *addr, void *retaddr,
  u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
  int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist);
// Record a branch at 'addr' to guest 'target' for later patching.
static void add_to_linker(void *addr, u_int target, int ext);
300
// Toggle [start,end) of the translation cache between writable and
// executable on platforms that forbid W+X mappings (NO_WRITE_EXEC).
// is_x != 0 -> make executable, else writable.  No-op otherwise.
static void mprotect_w_x(void *start, void *end, int is_x)
{
#ifdef NO_WRITE_EXEC
  #if defined(VITA)
  // *Open* enables write on all memory that was
  // allocated by sceKernelAllocMemBlockForVM()?
  if (is_x)
    sceKernelCloseVMDomain();
  else
    sceKernelOpenVMDomain();
  #else
  u_long mstart = (u_long)start & ~4095ul;  // round down to page boundary
  u_long mend = (u_long)end;
  if (mprotect((void *)mstart, mend - mstart,
               PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
    SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
  #endif
#endif
}
320
// Make [start,end) of the translation cache writable before emitting code.
static void start_tcache_write(void *start, void *end)
{
  mprotect_w_x(start, end, 0);
}
325
// Finish writing to [start,end): flush the instruction cache (ARM hosts
// only, using the platform's primitive) and restore execute permission.
static void end_tcache_write(void *start, void *end)
{
#ifdef __arm__
  size_t len = (char *)end - (char *)start;
  #if   defined(__BLACKBERRY_QNX__)
  msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
  #elif defined(__MACH__)
  sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
  #elif defined(VITA)
  sceKernelSyncVMDomain(sceBlock, start, len);
  #elif defined(_3DS)
  ctr_flush_invalidate_cache();
  #else
  __clear_cache(start, end);
  #endif
  (void)len;
#endif

  mprotect_w_x(start, end, 1);
}
346
347 static void *start_block(void)
348 {
349   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
350   if (end > translation_cache + (1<<TARGET_SIZE_2))
351     end = translation_cache + (1<<TARGET_SIZE_2);
352   start_tcache_write(out, end);
353   return out;
354 }
355
// Finish a block begun with start_block(): flush/protect [start, out).
static void end_block(void *start)
{
  end_tcache_write(start, out);
}
360
361 //#define DEBUG_CYCLE_COUNT 1
362
363 #define NO_CYCLE_PENALTY_THR 12
364
int cycle_multiplier; // 100 for 1.0

// Scale a cycle count by cycle_multiplier/100, rounding the magnitude
// (adds +/-50 before the divide so .5 rounds away from zero).
static int CLOCK_ADJUST(int x)
{
  int sign = (x < 0) ? -1 : 1;
  return (x * cycle_multiplier + sign * 50) / 100;
}
372
// Map a guest virtual address to a block-lookup page index.
// KSEG mirrors are folded by masking the top bits; RAM mirrors are
// folded on top of that.  RAM lands in pages 0-2047, everything else
// is folded into pages 2048-4095.
static u_int get_page(u_int vaddr)
{
  u_int masked = vaddr & ~0xe0000000;   // fold KSEG0/KSEG1 mirrors
  if (masked < 0x1000000)
    masked &= ~0x0e00000;               // RAM mirrors
  u_int page = masked >> 12;
  if (page > 2048)
    page = 2048 + (page & 2047);
  return page;
}
382
// no virtual mem in PCSX
// Virtual page equals physical page; kept as a separate function since
// it indexes the jump_dirty[] table (see get_addr/invalidate_block).
static u_int get_vpage(u_int vaddr)
{
  return get_page(vaddr);
}
388
389 static struct ht_entry *hash_table_get(u_int vaddr)
390 {
391   return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
392 }
393
394 static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
395 {
396   ht_bin->vaddr[1] = ht_bin->vaddr[0];
397   ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
398   ht_bin->vaddr[0] = vaddr;
399   ht_bin->tcaddr[0] = tcaddr;
400 }
401
// some messy ari64's code, seems to rely on unsigned 32bit overflow
// Nonzero if tcaddr is far enough "ahead" of the emit pointer 'out'
// (distance taken modulo the cache size via the shifted u_int wraparound)
// that the expiry sweep won't reclaim it soon.
static int doesnt_expire_soon(void *tcaddr)
{
  u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
  return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
}
408
// Get address from virtual address
// This is called from the recompiled JR/JALR instructions
// Lookup order: clean blocks (jump_in), then dirty blocks that can be
// revalidated (jump_dirty), else compile the block now.
void *get_addr(u_int vaddr)
{
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  struct ll_entry *head;
  //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
  // 1) Clean, already-compiled block in this page?
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
  //printf("TRACE: count=%d next=%d (get_addr match %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // Promote into the hash table for fast future lookups.
      hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
      return head->addr;
    }
    head=head->next;
  }
  // 2) Dirty block whose code still matches guest memory?
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr))
      if (verify_dirty(head->addr)) {
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        invalid_code[vaddr>>12]=0;
        inv_code_start=inv_code_end=~0;
        if(vpage<2048) {
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        struct ht_entry *ht_bin = hash_table_get(vaddr);
        if (ht_bin->vaddr[0] == vaddr)
          ht_bin->tcaddr[0] = head->addr; // Replace existing entry
        else
          hash_table_add(ht_bin, vaddr, head->addr);

        return head->addr;
      }
    }
    head=head->next;
  }
  //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
  // 3) Not compiled: compile the block now and retry.
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault execption
  // NOTE(review): looks like leftover MIPS exception plumbing; vectors
  // execution to 0x80000000 — confirm against new_recompile_block's
  // failure modes.
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}
463 // Look up address in hash table first
464 void *get_addr_ht(u_int vaddr)
465 {
466   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
467   const struct ht_entry *ht_bin = hash_table_get(vaddr);
468   if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
469   if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
470   return get_addr(vaddr);
471 }
472
473 void clear_all_regs(signed char regmap[])
474 {
475   int hr;
476   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
477 }
478
479 signed char get_reg(signed char regmap[],int r)
480 {
481   int hr;
482   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
483   return -1;
484 }
485
486 // Find a register that is available for two consecutive cycles
487 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
488 {
489   int hr;
490   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
491   return -1;
492 }
493
494 int count_free_regs(signed char regmap[])
495 {
496   int count=0;
497   int hr;
498   for(hr=0;hr<HOST_REGS;hr++)
499   {
500     if(hr!=EXCLUDE_REG) {
501       if(regmap[hr]<0) count++;
502     }
503   }
504   return count;
505 }
506
507 void dirty_reg(struct regstat *cur,signed char reg)
508 {
509   int hr;
510   if(!reg) return;
511   for (hr=0;hr<HOST_REGS;hr++) {
512     if((cur->regmap[hr]&63)==reg) {
513       cur->dirty|=1<<hr;
514     }
515   }
516 }
517
518 // If we dirty the lower half of a 64 bit register which is now being
519 // sign-extended, we need to dump the upper half.
520 // Note: Do this only after completion of the instruction, because
521 // some instructions may need to read the full 64-bit value even if
522 // overwriting it (eg SLTI, DSRA32).
523 static void flush_dirty_uppers(struct regstat *cur)
524 {
525   int hr,reg;
526   for (hr=0;hr<HOST_REGS;hr++) {
527     if((cur->dirty>>hr)&1) {
528       reg=cur->regmap[hr];
529       if(reg>=64)
530         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
531     }
532   }
533 }
534
535 void set_const(struct regstat *cur,signed char reg,uint64_t value)
536 {
537   int hr;
538   if(!reg) return;
539   for (hr=0;hr<HOST_REGS;hr++) {
540     if(cur->regmap[hr]==reg) {
541       cur->isconst|=1<<hr;
542       current_constmap[hr]=value;
543     }
544     else if((cur->regmap[hr]^64)==reg) {
545       cur->isconst|=1<<hr;
546       current_constmap[hr]=value>>32;
547     }
548   }
549 }
550
551 void clear_const(struct regstat *cur,signed char reg)
552 {
553   int hr;
554   if(!reg) return;
555   for (hr=0;hr<HOST_REGS;hr++) {
556     if((cur->regmap[hr]&63)==reg) {
557       cur->isconst&=~(1<<hr);
558     }
559   }
560 }
561
562 int is_const(struct regstat *cur,signed char reg)
563 {
564   int hr;
565   if(reg<0) return 0;
566   if(!reg) return 1;
567   for (hr=0;hr<HOST_REGS;hr++) {
568     if((cur->regmap[hr]&63)==reg) {
569       return (cur->isconst>>hr)&1;
570     }
571   }
572   return 0;
573 }
574 uint64_t get_const(struct regstat *cur,signed char reg)
575 {
576   int hr;
577   if(!reg) return 0;
578   for (hr=0;hr<HOST_REGS;hr++) {
579     if(cur->regmap[hr]==reg) {
580       return current_constmap[hr];
581     }
582   }
583   SysPrintf("Unknown constant in r%d\n",reg);
584   exit(1);
585 }
586
// Least soon needed registers
// Look at the next ten instructions and see which registers
// will be used.  Try not to reallocate these.
//
// On return, hsn[reg] holds the distance (in insns) to the next use of
// that register — smaller means needed sooner.  Special regs (CCREG,
// RHASH, RHTBL, FTEMP) are also pinned around branches and cop ops.
// preferred_reg is currently unused (see TODO below).
void lsn(u_char hsn[], int i, int *preferred_reg)
{
  int j;
  int b=-1;
  // Determine scan distance: stop at block end or an unconditional
  // jump (0x1000xxxx is beq $0,$0 — an unconditional branch encoding).
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
  }
  // Walk backwards so nearer uses overwrite farther distances.
  for(;j>=0;j--)
  {
    if(rs1[i+j]) hsn[rs1[i+j]]=j;
    if(rs2[i+j]) hsn[rs2[i+j]]=j;
    if(rt1[i+j]) hsn[rt1[i+j]]=j;
    if(rt2[i+j]) hsn[rt2[i+j]]=j;
    if(itype[i+j]==STORE || itype[i+j]==STORELR) {
      // Stores can allocate zero
      hsn[rs1[i+j]]=j;
      hsn[rs2[i+j]]=j;
    }
    // On some architectures stores need invc_ptr
    #if defined(HOST_IMM8)
    if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
      hsn[INVCP]=j;
    }
    #endif
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
    {
      hsn[CCREG]=j;
      b=j;  // remember the nearest branch
    }
  }
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
        if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
        //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
        //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
      }
    }
    // TODO: preferred register based on backward branch
  }
  // Delay slot should preferably not overwrite branch conditions or cycle count
  if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
    if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
    if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
    hsn[CCREG]=1;
    // ...or hash tables
    hsn[RHASH]=1;
    hsn[RHTBL]=1;
  }
  // Coprocessor load/store needs FTEMP, even if not declared
  if(itype[i]==C1LS||itype[i]==C2LS) {
    hsn[FTEMP]=0;
  }
  // Load L/R also uses FTEMP as a temporary register
  if(itype[i]==LOADLR) {
    hsn[FTEMP]=0;
  }
  // Also SWL/SWR/SDL/SDR
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
    hsn[FTEMP]=0;
  }
  // Don't remove the miniht registers
  if(itype[i]==UJUMP||itype[i]==RJUMP)
  {
    hsn[RHASH]=0;
    hsn[RHTBL]=0;
  }
}
675
// We only want to allocate registers if we're going to use them again soon
// Returns 1 if guest register r is read within the next few instructions
// after i (and not marked unneeded there), 0 otherwise.
int needed_again(int r, int i)
{
  int j;
  int b=-1;
  int rn=10;  // distance to next use; 10 = "not used in window"

  // If the previous insn exits the block, nothing is needed.
  if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
  {
    if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
      return 0; // Don't need any registers if exiting the block
  }
  // Determine scan distance, stopping at unconditional jumps and
  // syscall-like instructions (0x0d is SPECIAL/BREAK).
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
    if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
    {
      break;
    }
  }
  // Walk backwards; nearest read of r wins, unneeded-markers reset it.
  for(;j>=1;j--)
  {
    if(rs1[i+j]==r) rn=j;
    if(rs2[i+j]==r) rn=j;
    if((unneeded_reg[i+j]>>r)&1) rn=10;
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
    {
      b=j;
    }
  }
  /*
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int o=rn;
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(!((unneeded_reg[t+j]>>r)&1)) {
          if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
          if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
        }
        else rn=o;
      }
    }
  }*/
  if(rn<10) return 1;
  (void)b;
  return 0;
}
738
// Try to match register allocations at the end of a loop with those
// at the beginning
// If a nearby backward branch targets an earlier insn whose entry
// regmap already has r allocated, return that host register so the
// loop body can keep r in place; otherwise return hr unchanged.
int loop_reg(int i, int r, int hr)
{
  int j,k;
  // Scan forward to the next unconditional jump (or block end).
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
  }
  k=0;
  // Start one insn earlier if we're in a branch's delay slot.
  if(i>0){
    if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
      k--;
  }
  for(;k<j;k++)
  {
    assert(r < 64);
    if((unneeded_reg[i+k]>>r)&1) return hr;
    if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
    {
      // Backward branch within the block: prefer the target's mapping.
      if(ba[i+k]>=start && ba[i+k]<(start+i*4))
      {
        int t=(ba[i+k]-start)>>2;
        int reg=get_reg(regs[t].regmap_entry,r);
        if(reg>=0) return reg;
        //reg=get_reg(regs[t+1].regmap_entry,r);
        //if(reg>=0) return reg;
      }
    }
  }
  return hr;
}
780
781
782 // Allocate every register, preserving source/target regs
783 void alloc_all(struct regstat *cur,int i)
784 {
785   int hr;
786
787   for(hr=0;hr<HOST_REGS;hr++) {
788     if(hr!=EXCLUDE_REG) {
789       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
790          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
791       {
792         cur->regmap[hr]=-1;
793         cur->dirty&=~(1<<hr);
794       }
795       // Don't need zeros
796       if((cur->regmap[hr]&63)==0)
797       {
798         cur->regmap[hr]=-1;
799         cur->dirty&=~(1<<hr);
800       }
801     }
802   }
803 }
804
805 #ifdef __i386__
806 #include "assem_x86.c"
807 #endif
808 #ifdef __x86_64__
809 #include "assem_x64.c"
810 #endif
811 #ifdef __arm__
812 #include "assem_arm.c"
813 #endif
814
815 // Add virtual address mapping to linked list
816 void ll_add(struct ll_entry **head,int vaddr,void *addr)
817 {
818   struct ll_entry *new_entry;
819   new_entry=malloc(sizeof(struct ll_entry));
820   assert(new_entry!=NULL);
821   new_entry->vaddr=vaddr;
822   new_entry->reg_sv_flags=0;
823   new_entry->addr=addr;
824   new_entry->next=*head;
825   *head=new_entry;
826 }
827
// Same as ll_add(), but also tags the new head entry with reg_sv_flags.
void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
{
  ll_add(head,vaddr,addr);
  (*head)->reg_sv_flags=reg_sv_flags;
}
833
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
void *check_addr(u_int vaddr)
{
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  size_t i;
  // Fast path: hash-table hit that is clean and not near expiry.
  for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
    if (ht_bin->vaddr[i] == vaddr)
      if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
        if (isclean(ht_bin->tcaddr[i]))
          return ht_bin->tcaddr[i];
  }
  // Slow path: walk the per-page list of compiled blocks.
  u_int page=get_page(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while (head != NULL) {
    if (head->vaddr == vaddr) {
      if (doesnt_expire_soon(head->addr)) {
        // Update existing entry with current address
        if (ht_bin->vaddr[0] == vaddr) {
          ht_bin->tcaddr[0] = head->addr;
          return head->addr;
        }
        if (ht_bin->vaddr[1] == vaddr) {
          ht_bin->tcaddr[1] = head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        if (ht_bin->vaddr[0] == -1) {
          ht_bin->vaddr[0] = vaddr;
          ht_bin->tcaddr[0] = head->addr;
        }
        else if (ht_bin->vaddr[1] == -1) {
          ht_bin->vaddr[1] = vaddr;
          ht_bin->tcaddr[1] = head->addr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  // Not compiled, or too close to expiry to be worth returning.
  return 0;
}
879
// Remove vaddr from its hash bin.  Order matters: the secondary slot is
// cleared first, then (if the primary slot matches) the surviving entry
// is shifted down so slot 0 always holds the valid entry.
void remove_hash(int vaddr)
{
  //printf("remove hash: %x\n",vaddr);
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  if (ht_bin->vaddr[1] == vaddr) {
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
  if (ht_bin->vaddr[0] == vaddr) {
    ht_bin->vaddr[0] = ht_bin->vaddr[1];
    ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
}
895
896 void ll_remove_matching_addrs(struct ll_entry **head,uintptr_t addr,int shift)
897 {
898   struct ll_entry *next;
899   while(*head) {
900     if(((uintptr_t)((*head)->addr)>>shift)==(addr>>shift) ||
901        ((uintptr_t)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
902     {
903       inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr);
904       remove_hash((*head)->vaddr);
905       next=(*head)->next;
906       free(*head);
907       *head=next;
908     }
909     else
910     {
911       head=&((*head)->next);
912     }
913   }
914 }
915
916 // Remove all entries from linked list
917 void ll_clear(struct ll_entry **head)
918 {
919   struct ll_entry *cur;
920   struct ll_entry *next;
921   if((cur=*head)) {
922     *head=0;
923     while(cur) {
924       next=cur->next;
925       free(cur);
926       cur=next;
927     }
928   }
929 }
930
// Dereference the pointers and remove if it matches
// For each entry, follow the patched jump (get_pointer) and, if it lands
// in the (addr>>shift) region being expired, re-point it back at the
// entry's own address via set_jump_target — NOTE(review): presumably
// undoing direct block linking so the jump goes through the linker
// stub again; confirm against find_extjump_insn.
static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift)
{
  while(head) {
    uintptr_t ptr = (uintptr_t)get_pointer(head->addr);
    inv_debug("EXP: Lookup pointer to %lx at %p (%x)\n",(long)ptr,head->addr,head->vaddr);
    if(((ptr>>shift)==(addr>>shift)) ||
       (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
    {
      inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr);
      void *host_addr=find_extjump_insn(head->addr);
      #ifdef __arm__
        mark_clear_cache(host_addr);
      #endif
      set_jump_target(host_addr, head->addr);
    }
    head=head->next;
  }
}
950
951 // This is called when we write to a compiled block (see do_invstub)
952 void invalidate_page(u_int page)
953 {
954   struct ll_entry *head;
955   struct ll_entry *next;
956   head=jump_in[page];
957   jump_in[page]=0;
958   while(head!=NULL) {
959     inv_debug("INVALIDATE: %x\n",head->vaddr);
960     remove_hash(head->vaddr);
961     next=head->next;
962     free(head);
963     head=next;
964   }
965   head=jump_out[page];
966   jump_out[page]=0;
967   while(head!=NULL) {
968     inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr);
969     void *host_addr=find_extjump_insn(head->addr);
970     #ifdef __arm__
971       mark_clear_cache(host_addr);
972     #endif
973     set_jump_target(host_addr, head->addr);
974     next=head->next;
975     free(head);
976     head=next;
977   }
978 }
979
// Invalidate pages [first,last] surrounding 'block' (bounds computed by
// the caller from the actual extent of dirty blocks), then mark the
// block's page so writes to it are no longer trapped.
static void invalidate_block_range(u_int block, u_int first, u_int last)
{
  u_int page=get_page(block<<12);
  //printf("first=%d last=%d\n",first,last);
  invalidate_page(page);
  assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
  assert(last<page+5);
  // Invalidate the adjacent pages if a block crosses a 4K boundary
  while(first<page) {
    invalidate_page(first);
    first++;
  }
  for(first=page+1;first<last;first++) {
    invalidate_page(first);
  }
  #ifdef __arm__
    do_clear_cache();
  #endif

  // Don't trap writes
  invalid_code[block]=1;

  #ifdef USE_MINI_HT
  memset(mini_ht,-1,sizeof(mini_ht));
  #endif
}
1006
// Invalidate all compiled code overlapping guest 4K page 'block'.
// Scans jump_dirty to find the true page extent of blocks touching the
// page (a block may span pages), then invalidates the whole range.
void invalidate_block(u_int block)
{
  u_int page=get_page(block<<12);
  u_int vpage=get_vpage(block<<12);
  inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
  //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
  u_int first,last;
  first=last=page;
  struct ll_entry *head;
  head=jump_dirty[vpage];
  //printf("page=%d vpage=%d\n",page,vpage);
  while(head!=NULL) {
    if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
      u_char *start, *end;
      // get_bounds() reports the guest memory range the block was
      // compiled from; widen [first,last] if it spills past this page.
      get_bounds(head->addr, &start, &end);
      //printf("start: %p end: %p\n", start, end);
      if (page < 2048 && start >= rdram && end < rdram+RAM_SIZE) {
        if (((start-rdram)>>12) <= page && ((end-1-rdram)>>12) >= page) {
          if ((((start-rdram)>>12)&2047) < first) first = ((start-rdram)>>12)&2047;
          if ((((end-1-rdram)>>12)&2047) > last)  last = ((end-1-rdram)>>12)&2047;
        }
      }
    }
    head=head->next;
  }
  invalidate_block_range(block,first,last);
}
1034
// Fast-path invalidation for a single written address.  For RAM it
// scans the dirty blocks on this page (and the previous one, since
// blocks may span pages) and either invalidates the hit range or
// publishes a "known clean" window in inv_code_start/inv_code_end so
// the caller can skip this work for nearby writes.
void invalidate_addr(u_int addr)
{
  //static int rhits;
  // this check is done by the caller
  //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
  u_int page=get_vpage(addr);
  if(page<2048) { // RAM
    struct ll_entry *head;
    u_int addr_min=~0, addr_max=0;
    u_int mask=RAM_SIZE-1;
    u_int addr_main=0x80000000|(addr&mask);
    int pg1;
    // Tentative clean window: the whole 4K page around the write
    inv_code_start=addr_main&~0xfff;
    inv_code_end=addr_main|0xfff;
    pg1=page;
    if (pg1>0) {
      // must check previous page too because of spans..
      pg1--;
      inv_code_start-=0x1000;
    }
    for(;pg1<=page;pg1++) {
      for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
        u_char *start_h, *end_h;
        u_int start, end;
        get_bounds(head->addr, &start_h, &end_h);
        start = (uintptr_t)start_h - ram_offset;
        end = (uintptr_t)end_h - ram_offset;
        if(start<=addr_main&&addr_main<end) {
          // Write lands inside this block: grow the hit range
          if(start<addr_min) addr_min=start;
          if(end>addr_max) addr_max=end;
        }
        else if(addr_main<start) {
          // Block lies above the write: shrink the clean window from above
          if(start<inv_code_end)
            inv_code_end=start-1;
        }
        else {
          // Block lies below the write: shrink the clean window from below
          if(end>inv_code_start)
            inv_code_start=end;
        }
      }
    }
    if (addr_min!=~0) {
      inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
      inv_code_start=inv_code_end=~0;
      invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
      return;
    }
    else {
      // No block covers the address: record the clean window and leave
      inv_code_start=(addr&~mask)|(inv_code_start&mask);
      inv_code_end=(addr&~mask)|(inv_code_end&mask);
      inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
      return;
    }
  }
  // Not RAM: fall back to whole-page invalidation
  invalidate_block(addr>>12);
}
1091
1092 // This is called when loading a save state.
1093 // Anything could have changed, so invalidate everything.
1094 void invalidate_all_pages()
1095 {
1096   u_int page;
1097   for(page=0;page<4096;page++)
1098     invalidate_page(page);
1099   for(page=0;page<1048576;page++)
1100     if(!invalid_code[page]) {
1101       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1102       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1103     }
1104   #ifdef USE_MINI_HT
1105   memset(mini_ht,-1,sizeof(mini_ht));
1106   #endif
1107 }
1108
1109 // Add an entry to jump_out after making a link
1110 void add_link(u_int vaddr,void *src)
1111 {
1112   u_int page=get_page(vaddr);
1113   inv_debug("add_link: %p -> %x (%d)\n",src,vaddr,page);
1114   int *ptr=(int *)(src+4);
1115   assert((*ptr&0x0fff0000)==0x059f0000);
1116   (void)ptr;
1117   ll_add(jump_out+page,vaddr,src);
1118   //void *ptr=get_pointer(src);
1119   //inv_debug("add_link: Pointer is to %p\n",ptr);
1120 }
1121
// If a code block was found to be unmodified (bit was set in
// restore_candidate) and it remains unmodified (bit is clear
// in invalid_code) then move the entries for that 4K page from
// the dirty list to the clean list.
void clean_blocks(u_int page)
{
  struct ll_entry *head;
  inv_debug("INV: clean_blocks page=%d\n",page);
  head=jump_dirty[page];
  while(head!=NULL) {
    if(!invalid_code[head->vaddr>>12]) {
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr)) {
        if(verify_dirty(head->addr)) {
          u_char *start, *end;
          //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr);
          u_int i;
          u_int inv=0;
          get_bounds(head->addr, &start, &end);
          // Every RAM page the block spans must still hold valid code
          if (start - rdram < RAM_SIZE) {
            for (i = (start-rdram+0x80000000)>>12; i <= (end-1-rdram+0x80000000)>>12; i++) {
              inv|=invalid_code[i];
            }
          }
          else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
            // Source lies outside RAM: never restore
            inv=1;
          }
          if(!inv) {
            void *clean_addr = get_clean_addr(head->addr);
            if (doesnt_expire_soon(clean_addr)) {
              u_int ppage=page;
              inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr);
              //printf("page=%x, addr=%x\n",page,head->vaddr);
              //assert(head->vaddr>>12==(page|0x80000));
              ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
              // Point any hash-table entries for this vaddr at the clean copy
              struct ht_entry *ht_bin = hash_table_get(head->vaddr);
              if (ht_bin->vaddr[0] == head->vaddr)
                ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
              if (ht_bin->vaddr[1] == head->vaddr)
                ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
            }
          }
        }
      }
    }
    head=head->next;
  }
}
1170
1171
1172 void mov_alloc(struct regstat *current,int i)
1173 {
1174   // Note: Don't need to actually alloc the source registers
1175   if((~current->is32>>rs1[i])&1) {
1176     //alloc_reg64(current,i,rs1[i]);
1177     assert(0);
1178   } else {
1179     //alloc_reg(current,i,rs1[i]);
1180     alloc_reg(current,i,rt1[i]);
1181     current->is32|=(1LL<<rt1[i]);
1182   }
1183   clear_const(current,rs1[i]);
1184   clear_const(current,rt1[i]);
1185   dirty_reg(current,rt1[i]);
1186 }
1187
1188 void shiftimm_alloc(struct regstat *current,int i)
1189 {
1190   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1191   {
1192     if(rt1[i]) {
1193       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1194       else lt1[i]=rs1[i];
1195       alloc_reg(current,i,rt1[i]);
1196       current->is32|=1LL<<rt1[i];
1197       dirty_reg(current,rt1[i]);
1198       if(is_const(current,rs1[i])) {
1199         int v=get_const(current,rs1[i]);
1200         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1201         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1202         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1203       }
1204       else clear_const(current,rt1[i]);
1205     }
1206   }
1207   else
1208   {
1209     clear_const(current,rs1[i]);
1210     clear_const(current,rt1[i]);
1211   }
1212
1213   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1214   {
1215     assert(0);
1216   }
1217   if(opcode2[i]==0x3c) // DSLL32
1218   {
1219     assert(0);
1220   }
1221   if(opcode2[i]==0x3e) // DSRL32
1222   {
1223     assert(0);
1224   }
1225   if(opcode2[i]==0x3f) // DSRA32
1226   {
1227     assert(0);
1228   }
1229 }
1230
1231 void shift_alloc(struct regstat *current,int i)
1232 {
1233   if(rt1[i]) {
1234     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1235     {
1236       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1237       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1238       alloc_reg(current,i,rt1[i]);
1239       if(rt1[i]==rs2[i]) {
1240         alloc_reg_temp(current,i,-1);
1241         minimum_free_regs[i]=1;
1242       }
1243       current->is32|=1LL<<rt1[i];
1244     } else { // DSLLV/DSRLV/DSRAV
1245       assert(0);
1246     }
1247     clear_const(current,rs1[i]);
1248     clear_const(current,rs2[i]);
1249     clear_const(current,rt1[i]);
1250     dirty_reg(current,rt1[i]);
1251   }
1252 }
1253
// Register allocation for R-type ALU ops (ADD/SUB/SLT/logical).
void alu_alloc(struct regstat *current,int i)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else {
        // Single-source form; only allocate a source if it's live later
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
    }
    // Runs even when rt1==0; setting bit 0 (r0) in is32 is harmless
    current->is32|=1LL<<rt1[i];
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
      {
        // A 64-bit-wide source forces a 64-bit compare
        alloc_reg64(current,i,rs1[i]);
        alloc_reg64(current,i,rs2[i]);
        alloc_reg(current,i,rt1[i]);
      } else {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
        alloc_reg(current,i,rt1[i]);
      }
    }
    current->is32|=1LL<<rt1[i];
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else
      {
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
      // Result is 64-bit wide iff either source is 64-bit wide
      if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
      {
        if(get_reg(current->regmap,rt1[i]|64)>=0) {
          assert(0);
        }
        current->is32&=~(1LL<<rt1[i]);
      } else {
        current->is32|=1LL<<rt1[i];
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    assert(0);
  }
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  clear_const(current,rt1[i]);
  dirty_reg(current,rt1[i]);
}
1316
// Register allocation for I-type ops with a 16-bit immediate
// (ADDI/ADDIU, SLTI/SLTIU, ANDI/ORI/XORI, LUI).
void imm16_alloc(struct regstat *current,int i)
{
  if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  else lt1[i]=rs1[i];
  if(rt1[i]) alloc_reg(current,i,rt1[i]);
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    assert(0);
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    // A 64-bit-wide source needs a 64-bit compare
    if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
    current->is32|=1LL<<rt1[i];
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    // ORI/XORI on a 64-bit-wide source keeps the result 64-bit wide
    if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
      if(rs1[i]!=rt1[i]) {
        if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
        alloc_reg64(current,i,rt1[i]);
        current->is32&=~(1LL<<rt1[i]);
      }
    }
    else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
    // Constant propagation through the bitwise immediate
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
      if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
      if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
    }
    else clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      set_const(current,rt1[i],v+imm[i]);
    }
    else clear_const(current,rt1[i]);
    current->is32|=1LL<<rt1[i];
  }
  else {
    // LUI always produces a known constant
    set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
    current->is32|=1LL<<rt1[i];
  }
  dirty_reg(current,rt1[i]);
}
1362
// Register allocation for loads (LB/LH/LW/LWL/LWR etc.).
void load_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
  if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  if(rt1[i]&&!((current->u>>rt1[i])&1)) {
    alloc_reg(current,i,rt1[i]);
    assert(get_reg(current->regmap,rt1[i])>=0);
    if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
    {
      // 64-bit result
      current->is32&=~(1LL<<rt1[i]);
      alloc_reg64(current,i,rt1[i]);
    }
    else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      // 64-bit unaligned load: needs everything spilled plus FTEMP
      current->is32&=~(1LL<<rt1[i]);
      alloc_reg64(current,i,rt1[i]);
      alloc_all(current,i);
      alloc_reg64(current,i,FTEMP);
      minimum_free_regs[i]=HOST_REGS;
    }
    else current->is32|=1LL<<rt1[i];
    dirty_reg(current,rt1[i]);
    // LWL/LWR need a temporary register for the old value
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP);
      alloc_reg_temp(current,i,-1);
      minimum_free_regs[i]=1;
    }
  }
  else
  {
    // Load to r0 or unneeded register (dummy load)
    // but we still need a register to calculate the address
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
    }
    alloc_reg_temp(current,i,-1);
    minimum_free_regs[i]=1;
    if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      alloc_all(current,i);
      alloc_reg64(current,i,FTEMP);
      minimum_free_regs[i]=HOST_REGS;
    }
  }
}
1413
// Register allocation for stores (SB/SH/SW/SWL/SWR etc.).
void store_alloc(struct regstat *current,int i)
{
  clear_const(current,rs2[i]);
  if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rs2[i]);
  if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
    alloc_reg64(current,i,rs2[i]);
    if(rs2[i]) alloc_reg(current,i,FTEMP);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else alloc_reg(current,i,INVCP);
  #endif
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
    // Unaligned stores need FTEMP for the merged value
    alloc_reg(current,i,FTEMP);
  }
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1435
// Register allocation for COP1 loads/stores (LWC1/SWC1 etc.).
void c1ls_alloc(struct regstat *current,int i)
{
  //clear_const(current,rs1[i]); // FIXME
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,CSREG); // Status
  alloc_reg(current,i,FTEMP);
  if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
    alloc_reg64(current,i,FTEMP);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  // NOTE(review): unlike c2ls_alloc this never sets minimum_free_regs[i]
  // — confirm whether that is intentional.
}
1454
// Register allocation for COP2 (GTE) loads/stores (LWC2/SWC2).
void c2ls_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,FTEMP); // holds the transferred value
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1469
#ifndef multdiv_alloc
// Register allocation for MULT/MULTU/DIV/DIVU: the results land in
// HI/LO, which therefore must be allocated and marked dirty.
void multdiv_alloc(struct regstat *current,int i)
{
  //  case 0x18: MULT
  //  case 0x19: MULTU
  //  case 0x1A: DIV
  //  case 0x1B: DIVU
  //  case 0x1C: DMULT
  //  case 0x1D: DMULTU
  //  case 0x1E: DDIV
  //  case 0x1F: DDIVU
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  if(rs1[i]&&rs2[i])
  {
    if((opcode2[i]&4)==0) // 32-bit
    {
      // HI/LO are written, so they cannot be treated as unneeded
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      alloc_reg(current,i,HIREG);
      alloc_reg(current,i,LOREG);
      alloc_reg(current,i,rs1[i]);
      alloc_reg(current,i,rs2[i]);
      current->is32|=1LL<<HIREG;
      current->is32|=1LL<<LOREG;
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
    }
    else // 64-bit
    {
      assert(0);
    }
  }
  else
  {
    // Multiply by zero is zero.
    // MIPS does not have a divide by zero exception.
    // The result is undefined, we return zero.
    alloc_reg(current,i,HIREG);
    alloc_reg(current,i,LOREG);
    current->is32|=1LL<<HIREG;
    current->is32|=1LL<<LOREG;
    dirty_reg(current,HIREG);
    dirty_reg(current,LOREG);
  }
}
#endif
1517
// Register allocation for COP0 ops (MFC0/MTC0/RFE); all of them
// require everything spilled (alloc_all).
void cop0_alloc(struct regstat *current,int i)
{
  if(opcode2[i]==0) // MFC0
  {
    if(rt1[i]) {
      clear_const(current,rt1[i]);
      alloc_all(current,i);
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i];
      dirty_reg(current,rt1[i]);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
      alloc_all(current,i);
    }
    else {
      // Source is r0: force it allocatable so a zero can be passed
      alloc_all(current,i); // FIXME: Keep r0
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
  }
  else
  {
    // TLBR/TLBWI/TLBWR/TLBP/ERET
    assert(opcode2[i]==0x10);
    alloc_all(current,i);
  }
  minimum_free_regs[i]=HOST_REGS;
}
1551
// Register allocation for COP1/COP2 move ops (MFC/CFC/MTC/CTC).
static void cop12_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  if(opcode2[i]<3) // MFC1/CFC1
  {
    if(rt1[i]){
      clear_const(current,rt1[i]);
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i];
      dirty_reg(current,rt1[i]);
    }
    alloc_reg_temp(current,i,-1);
  }
  else if(opcode2[i]>3) // MTC1/CTC1
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
    }
    else {
      // Source is r0: make r0 allocatable so zero can be materialized
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
    alloc_reg_temp(current,i,-1);
  }
  minimum_free_regs[i]=1;
}
1579
// Register allocation for a COP2 (GTE) operation: only a scratch
// register is needed for the emitted call.
void c2op_alloc(struct regstat *current,int i)
{
  alloc_reg_temp(current,i,-1);
}
1584
// Register allocation for SYSCALL/BREAK: spill everything and drop
// all constants, keeping the cycle count register up to date.
void syscall_alloc(struct regstat *current,int i)
{
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  alloc_all(current,i);
  minimum_free_regs[i]=HOST_REGS;
  current->isconst=0;
}
1593
// Dispatch register allocation for the instruction in a branch delay
// slot, based on its decoded type.
void delayslot_alloc(struct regstat *current,int i)
{
  switch(itype[i]) {
    case UJUMP:
    case CJUMP:
    case SJUMP:
    case RJUMP:
    case FJUMP:
    case SYSCALL:
    case HLECALL:
    case SPAN:
      // A branch in a delay slot is not supported; give up on
      // speculative precompilation for the rest of this run.
      assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
      SysPrintf("Disabled speculative precompilation\n");
      stop_after_jal=1;
      break;
    case IMM16:
      imm16_alloc(current,i);
      break;
    case LOAD:
    case LOADLR:
      load_alloc(current,i);
      break;
    case STORE:
    case STORELR:
      store_alloc(current,i);
      break;
    case ALU:
      alu_alloc(current,i);
      break;
    case SHIFT:
      shift_alloc(current,i);
      break;
    case MULTDIV:
      multdiv_alloc(current,i);
      break;
    case SHIFTIMM:
      shiftimm_alloc(current,i);
      break;
    case MOV:
      mov_alloc(current,i);
      break;
    case COP0:
      cop0_alloc(current,i);
      break;
    case COP1:
    case COP2:
      cop12_alloc(current,i);
      break;
    case C1LS:
      c1ls_alloc(current,i);
      break;
    case C2LS:
      c2ls_alloc(current,i);
      break;
    case C2OP:
      c2op_alloc(current,i);
      break;
  }
}
1653
// Special case where a branch and delay slot span two pages in virtual memory
static void pagespan_alloc(struct regstat *current,int i)
{
  // Nothing survives across the page boundary: spill everything and
  // forget all constants.
  current->isconst=0;
  current->wasconst=0;
  regs[i].wasconst=0;
  minimum_free_regs[i]=HOST_REGS;
  alloc_all(current,i);
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  if(opcode[i]==3) // JAL
  {
    // Link register
    alloc_reg(current,i,31);
    dirty_reg(current,31);
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    alloc_reg(current,i,rs1[i]);
    if (rt1[i]!=0) {
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(rs2[i]) alloc_reg(current,i,rs2[i]);
    if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
    {
      // 64-bit compare operands no longer occur
      assert(0);
    }
  }
  else
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(!((current->is32>>rs1[i])&1))
    {
      assert(0);
    }
  }
  //else ...
}
1697
1698 static void add_stub(enum stub_type type, void *addr, void *retaddr,
1699   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
1700 {
1701   assert(a < ARRAY_SIZE(stubs));
1702   stubs[stubcount].type = type;
1703   stubs[stubcount].addr = addr;
1704   stubs[stubcount].retaddr = retaddr;
1705   stubs[stubcount].a = a;
1706   stubs[stubcount].b = b;
1707   stubs[stubcount].c = c;
1708   stubs[stubcount].d = d;
1709   stubs[stubcount].e = e;
1710   stubcount++;
1711 }
1712
// Convenience wrapper for add_stub: packs the common per-instruction
// arguments (index, address register, regstat pointer, cycle adjust,
// register list) into add_stub's generic a..e slots.
static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
  int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist)
{
  add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
}
1718
1719 // Write out a single register
1720 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1721 {
1722   int hr;
1723   for(hr=0;hr<HOST_REGS;hr++) {
1724     if(hr!=EXCLUDE_REG) {
1725       if((regmap[hr]&63)==r) {
1726         if((dirty>>hr)&1) {
1727           if(regmap[hr]<64) {
1728             emit_storereg(r,hr);
1729           }else{
1730             emit_storereg(r|64,hr);
1731           }
1732         }
1733       }
1734     }
1735   }
1736 }
1737
1738 void rlist()
1739 {
1740   int i;
1741   printf("TRACE: ");
1742   for(i=0;i<32;i++)
1743     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1744   printf("\n");
1745 }
1746
// Emit host code for R-type ALU ops (ADD/SUB, SLT/SLTU, AND/OR/XOR/NOR).
void alu_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      signed char s1,s2,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      if(t>=0) {
        s1=get_reg(i_regs->regmap,rs1[i]);
        s2=get_reg(i_regs->regmap,rs2[i]);
        if(rs1[i]&&rs2[i]) {
          assert(s1>=0);
          assert(s2>=0);
          if(opcode2[i]&2) emit_sub(s1,s2,t);
          else emit_add(s1,s2,t);
        }
        else if(rs1[i]) {
          // rs2 is r0: plain move (ADD) — SUB with r0 is also a move
          if(s1>=0) emit_mov(s1,t);
          else emit_loadreg(rs1[i],t);
        }
        else if(rs2[i]) {
          // rs1 is r0: move for ADD, negate for SUB
          if(s2>=0) {
            if(opcode2[i]&2) emit_neg(s2,t);
            else emit_mov(s2,t);
          }
          else {
            emit_loadreg(rs2[i],t);
            if(opcode2[i]&2) emit_neg(t,t);
          }
        }
        else emit_zeroreg(t);
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    assert(0);
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      signed char s1l,s1h,s2l,s2h,t;
      // 64-bit-wide compare path (either source not known 32-bit)
      if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
      {
        t=get_reg(i_regs->regmap,rt1[i]);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s1h=get_reg(i_regs->regmap,rs1[i]|64);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          s2h=get_reg(i_regs->regmap,rs2[i]|64);
          if(rs2[i]==0) // rx<r0
          {
            assert(s1h>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_shrimm(s1h,31,t);
            else // SLTU (unsigned can not be less than zero)
              emit_zeroreg(t);
          }
          else if(rs1[i]==0) // r0<rx
          {
            assert(s2h>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_gz64_32(s2h,s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz64_32(s2h,s2l,t);
          }
          else {
            assert(s1l>=0);assert(s1h>=0);
            assert(s2l>=0);assert(s2h>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
            else // SLTU
              emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
          }
        }
      } else {
        // 32-bit compare path
        t=get_reg(i_regs->regmap,rt1[i]);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs2[i]==0) // rx<r0
          {
            assert(s1l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_shrimm(s1l,31,t);
            else // SLTU (unsigned can not be less than zero)
              emit_zeroreg(t);
          }
          else if(rs1[i]==0) // r0<rx
          {
            assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_gz32(s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz32(s2l,t);
          }
          else{
            assert(s1l>=0);assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_if_less32(s1l,s2l,t);
            else // SLTU
              emit_set_if_carry32(s1l,s2l,t);
          }
        }
      }
    }
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      signed char s1l,s1h,s2l,s2h,th,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      // 64-bit-wide path: operate on both halves
      if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
      {
        assert(tl>=0);
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s1h=get_reg(i_regs->regmap,rs1[i]|64);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          s2h=get_reg(i_regs->regmap,rs2[i]|64);
          if(rs1[i]&&rs2[i]) {
            assert(s1l>=0);assert(s1h>=0);
            assert(s2l>=0);assert(s2h>=0);
            if(opcode2[i]==0x24) { // AND
              emit_and(s1l,s2l,tl);
              emit_and(s1h,s2h,th);
            } else
            if(opcode2[i]==0x25) { // OR
              emit_or(s1l,s2l,tl);
              emit_or(s1h,s2h,th);
            } else
            if(opcode2[i]==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
              emit_xor(s1h,s2h,th);
            } else
            if(opcode2[i]==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_or(s1h,s2h,th);
              emit_not(tl,tl);
              emit_not(th,th);
            }
          }
          else
          {
            // One or both sources are r0 (zero)
            if(opcode2[i]==0x24) { // AND
              emit_zeroreg(tl);
              emit_zeroreg(th);
            } else
            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
              if(rs1[i]){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(rs1[i],tl);
                if(s1h>=0) emit_mov(s1h,th);
                else emit_loadreg(rs1[i]|64,th);
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(rs2[i],tl);
                if(s2h>=0) emit_mov(s2h,th);
                else emit_loadreg(rs2[i]|64,th);
              }
              else{
                emit_zeroreg(tl);
                emit_zeroreg(th);
              }
            } else
            if(opcode2[i]==0x27) { // NOR
              if(rs1[i]){
                if(s1l>=0) emit_not(s1l,tl);
                else{
                  emit_loadreg(rs1[i],tl);
                  emit_not(tl,tl);
                }
                if(s1h>=0) emit_not(s1h,th);
                else{
                  emit_loadreg(rs1[i]|64,th);
                  emit_not(th,th);
                }
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_not(s2l,tl);
                else{
                  emit_loadreg(rs2[i],tl);
                  emit_not(tl,tl);
                }
                if(s2h>=0) emit_not(s2h,th);
                else{
                  emit_loadreg(rs2[i]|64,th);
                  emit_not(th,th);
                }
              }
              else {
                emit_movimm(-1,tl);
                emit_movimm(-1,th);
              }
            }
          }
        }
      }
      else
      {
        // 32 bit
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs1[i]&&rs2[i]) {
            assert(s1l>=0);
            assert(s2l>=0);
            if(opcode2[i]==0x24) { // AND
              emit_and(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x25) { // OR
              emit_or(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_not(tl,tl);
            }
          }
          else
          {
            // One or both sources are r0 (zero)
            if(opcode2[i]==0x24) { // AND
              emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
              if(rs1[i]){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
              }
              else emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              if(rs1[i]){
                if(s1l>=0) emit_not(s1l,tl);
                else {
                  emit_loadreg(rs1[i],tl);
                  emit_not(tl,tl);
                }
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_not(s2l,tl);
                else {
                  emit_loadreg(rs2[i],tl);
                  emit_not(tl,tl);
                }
              }
              else emit_movimm(-1,tl);
            }
          }
        }
      }
    }
  }
}
2011
// Assemble an I-type (16-bit immediate) instruction:
// LUI, ADDI/ADDIU, DADDI/DADDIU (64-bit leftover), SLTI/SLTIU,
// ANDI/ORI/XORI.
// i      - instruction index within the current block
// i_regs - register allocation state for this instruction
void imm16_assemble(int i,struct regstat *i_regs)
{
  if (opcode[i]==0x0f) { // LUI
    if(rt1[i]) {
      signed char t;
      t=get_reg(i_regs->regmap,rt1[i]);
      //assert(t>=0);
      if(t>=0) {
        // skip the move if the target already holds this as a known constant
        if(!((i_regs->isconst>>t)&1))
          emit_movimm(imm[i]<<16,t);
      }
    }
  }
  if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      if(rs1[i]) {
        //assert(t>=0);
        //assert(s>=0);
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1)) {
            if(s<0) {
              // source not in a host register: reload it into the target
              // first (unless the target already held it on block entry)
              if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
              emit_addimm(t,imm[i],t);
            }else{
              if(!((i_regs->wasconst>>s)&1))
                emit_addimm(s,imm[i],t);
              else
                // constant-propagate: fold the add at compile time
                emit_movimm(constmap[i][s]+imm[i],t);
            }
          }
        }
      } else {
        // rs is $zero: ADDI becomes a plain load-immediate
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1))
            emit_movimm(imm[i],t);
        }
      }
    }
  }
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    // 64-bit add immediate - n64 leftover, shouldn't occur on R3000A
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]) {
          assert(sh>=0);
          assert(sl>=0);
          if(th>=0) {
            emit_addimm64_32(sh,sl,imm[i],th,tl);
          }
          else {
            emit_addimm(sl,imm[i],tl);
          }
        } else {
          emit_movimm(imm[i],tl);
          // sign-extend the immediate into the high word
          if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
        }
      }
    }
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    if(rt1[i]) {
      //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
      signed char sh,sl,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(rs1[i]>0) {
          // no high half allocated implies the value is known 32-bit
          if(sh<0) assert((i_regs->was32>>rs1[i])&1);
          if(sh<0||((i_regs->was32>>rs1[i])&1)) {
            if(opcode[i]==0x0a) { // SLTI
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_slti32(t,imm[i],t);
              }else{
                emit_slti32(sl,imm[i],t);
              }
            }
            else { // SLTIU
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_sltiu32(t,imm[i],t);
              }else{
                emit_sltiu32(sl,imm[i],t);
              }
            }
          }else{ // 64-bit
            assert(sl>=0);
            if(opcode[i]==0x0a) // SLTI
              emit_slti64_32(sh,sl,imm[i],t);
            else // SLTIU
              emit_sltiu64_32(sh,sl,imm[i],t);
          }
        }else{
          // SLTI(U) with r0 is just stupid,
          // nonetheless examples can be found
          if(opcode[i]==0x0a) // SLTI
            if(0<imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          else // SLTIU
          {
            if(imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          }
        }
      }
    }
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
        if(opcode[i]==0x0c) //ANDI
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
              emit_andimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_andimm(sl,imm[i],tl);
              else
                // constant-propagate the AND at compile time
                emit_movimm(constmap[i][sl]&imm[i],tl);
            }
          }
          else
            emit_zeroreg(tl);
          // immediate is zero-extended, so the high word is cleared
          if(th>=0) emit_zeroreg(th);
        }
        else
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
            }
            if(th>=0) {
              // OR/XOR with a zero-extended imm leaves the high word intact
              if(sh<0) {
                emit_loadreg(rs1[i]|64,th);
              }else{
                emit_mov(sh,th);
              }
            }
            if(opcode[i]==0x0d) { // ORI
              if(sl<0) {
                emit_orimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_orimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]|imm[i],tl);
              }
            }
            if(opcode[i]==0x0e) { // XORI
              if(sl<0) {
                emit_xorimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_xorimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]^imm[i],tl);
              }
            }
          }
          else {
            // rs is $zero: OR/XOR reduces to load-immediate
            emit_movimm(imm[i],tl);
            if(th>=0) emit_zeroreg(th);
          }
        }
      }
    }
  }
}
2196
// Assemble a shift-by-immediate instruction (SLL/SRL/SRA).
// The 64-bit variants (DSLL/DSRL/DSRA and the *32 forms) are n64
// leftovers and assert - they can't occur on the R3000A.
// i      - instruction index within the current block
// i_regs - register allocation state for this instruction
void shiftimm_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      // only emit if the target is allocated and not a known constant
      if(t>=0&&!((i_regs->isconst>>t)&1)){
        if(rs1[i]==0)
        {
          // shifting $zero always yields zero
          emit_zeroreg(t);
        }
        else
        {
          // reload the source into the target if it's not in a register
          if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
          if(imm[i]) {
            if(opcode2[i]==0) // SLL
            {
              emit_shlimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==2) // SRL
            {
              emit_shrimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==3) // SRA
            {
              emit_sarimm(s<0?t:s,imm[i],t);
            }
          }else{
            // Shift by zero
            if(s>=0 && s!=t) emit_mov(s,t);
          }
        }
      }
      //emit_storereg(rt1[i],t); //DEBUG
    }
  }
  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    assert(0);
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    assert(0);
  }
}
2253
#ifndef shift_assemble
// Fallback stub: each target architecture must provide its own
// shift_assemble (variable shifts SLLV/SRLV/SRAV). Reaching this
// at runtime means the port is incomplete, so bail out.
void shift_assemble(int i,struct regstat *i_regs)
{
  printf("Need shift_assemble for this architecture.\n");
  exit(1);
}
#endif
2261
// Assemble a load instruction (LB/LH/LW/LBU/LHU; LWU/LD are 64-bit
// leftovers).  Emits a fast-path RAM read plus a stub for the slow
// path (hardware I/O, invalid address).  Loads to $zero or to an
// unallocated target are still performed because the read may have
// side effects (e.g. hardware FIFOs).
// i      - instruction index within the current block
// i_regs - register allocation state for this instruction
void load_assemble(int i,struct regstat *i_regs)
{
  int s,th,tl,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastload_reg_override=0;
  u_int hr,reglist=0;
  th=get_reg(i_regs->regmap,rt1[i]|64); // high half of rt (64-bit leftover)
  tl=get_reg(i_regs->regmap,rt1[i]);    // destination host register
  s=get_reg(i_regs->regmap,rs1[i]);     // base address host register
  offset=imm[i];
  // build the set of live host registers (for stub save/restore)
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1; // address is a known constant?
    if (c) {
      // does the constant address point into RAM?
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  // FIXME: Even if the load is a NOP, we should check for pagefaults...
  if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
    ||rt1[i]==0) {
      // could be FIFO, must perform the read
      // ||dummy read
      assem_debug("(forced read)\n");
      tl=get_reg(i_regs->regmap,-1); // borrow a temporary register
      assert(tl>=0);
  }
  if(offset||s<0||c) addr=tl;
  else addr=s;
  //if(tl<0) tl=get_reg(i_regs->regmap,-1);
 if(tl>=0) {
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
  reglist&=~(1<<tl);
  if(th>=0) reglist&=~(1<<th);
  if(!c) {
    #ifdef R29_HACK
    // Strmnnrmn's speed hack
    if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
    #endif
    {
      // dynamic address: emit RAM range check, jaddr -> slow-path stub
      jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
    }
  }
  else if(ram_offset&&memtarget) {
    // constant RAM address: pre-apply the RAM base offset
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastload_reg_override=HOST_TEMPREG;
  }
  int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
  if (opcode[i]==0x20) { // LB
    if(!c||memtarget) {
      if(!dummy) {
        {
          int x=0,a=tl;
          if(!c) a=addr;
          if(fastload_reg_override) a=fastload_reg_override;

          emit_movsbl_indexed(x,a,tl);
        }
      }
      if(jaddr)
        add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      // known non-RAM address: call the read handler directly
      inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x21) { // LH
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        emit_movswl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x23) { // LW
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        emit_readword_indexed(0,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x24) { // LBU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastload_reg_override) a=fastload_reg_override;

        emit_movzbl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x25) { // LHU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        emit_movzwl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x27) { // LWU
    // 64-bit leftover - shouldn't occur on R3000A
    assert(th>=0);
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        emit_readword_indexed(0,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else {
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
    }
    emit_zeroreg(th);
  }
  if (opcode[i]==0x37) { // LD
    assert(0);
  }
 }
}
2412
#ifndef loadlr_assemble
// Fallback stub: each target architecture must provide its own
// loadlr_assemble (unaligned loads LWL/LWR). Reaching this at
// runtime means the port is incomplete, so bail out.
void loadlr_assemble(int i,struct regstat *i_regs)
{
  printf("Need loadlr_assemble for this architecture.\n");
  exit(1);
}
#endif
2420
2421 void store_assemble(int i,struct regstat *i_regs)
2422 {
2423   int s,tl;
2424   int addr,temp;
2425   int offset;
2426   void *jaddr=0;
2427   enum stub_type type;
2428   int memtarget=0,c=0;
2429   int agr=AGEN1+(i&1);
2430   int faststore_reg_override=0;
2431   u_int hr,reglist=0;
2432   tl=get_reg(i_regs->regmap,rs2[i]);
2433   s=get_reg(i_regs->regmap,rs1[i]);
2434   temp=get_reg(i_regs->regmap,agr);
2435   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2436   offset=imm[i];
2437   if(s>=0) {
2438     c=(i_regs->wasconst>>s)&1;
2439     if(c) {
2440       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2441     }
2442   }
2443   assert(tl>=0);
2444   assert(temp>=0);
2445   for(hr=0;hr<HOST_REGS;hr++) {
2446     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2447   }
2448   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2449   if(offset||s<0||c) addr=temp;
2450   else addr=s;
2451   if(!c) {
2452     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2453   }
2454   else if(ram_offset&&memtarget) {
2455     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2456     faststore_reg_override=HOST_TEMPREG;
2457   }
2458
2459   if (opcode[i]==0x28) { // SB
2460     if(!c||memtarget) {
2461       int x=0,a=temp;
2462       if(!c) a=addr;
2463       if(faststore_reg_override) a=faststore_reg_override;
2464       emit_writebyte_indexed(tl,x,a);
2465     }
2466     type=STOREB_STUB;
2467   }
2468   if (opcode[i]==0x29) { // SH
2469     if(!c||memtarget) {
2470       int x=0,a=temp;
2471       if(!c) a=addr;
2472       if(faststore_reg_override) a=faststore_reg_override;
2473       emit_writehword_indexed(tl,x,a);
2474     }
2475     type=STOREH_STUB;
2476   }
2477   if (opcode[i]==0x2B) { // SW
2478     if(!c||memtarget) {
2479       int a=addr;
2480       if(faststore_reg_override) a=faststore_reg_override;
2481       emit_writeword_indexed(tl,0,a);
2482     }
2483     type=STOREW_STUB;
2484   }
2485   if (opcode[i]==0x3F) { // SD
2486     assert(0);
2487     type=STORED_STUB;
2488   }
2489   if(jaddr) {
2490     // PCSX store handlers don't check invcode again
2491     reglist|=1<<addr;
2492     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2493     jaddr=0;
2494   }
2495   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2496     if(!c||memtarget) {
2497       #ifdef DESTRUCTIVE_SHIFT
2498       // The x86 shift operation is 'destructive'; it overwrites the
2499       // source register, so we need to make a copy first and use that.
2500       addr=temp;
2501       #endif
2502       #if defined(HOST_IMM8)
2503       int ir=get_reg(i_regs->regmap,INVCP);
2504       assert(ir>=0);
2505       emit_cmpmem_indexedsr12_reg(ir,addr,1);
2506       #else
2507       emit_cmpmem_indexedsr12_imm(invalid_code,addr,1);
2508       #endif
2509       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2510       emit_callne(invalidate_addr_reg[addr]);
2511       #else
2512       void *jaddr2 = out;
2513       emit_jne(0);
2514       add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
2515       #endif
2516     }
2517   }
2518   u_int addr_val=constmap[i][s]+offset;
2519   if(jaddr) {
2520     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2521   } else if(c&&!memtarget) {
2522     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
2523   }
2524   // basic current block modification detection..
2525   // not looking back as that should be in mips cache already
2526   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
2527     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
2528     assert(i_regs->regmap==regs[i].regmap); // not delay slot
2529     if(i_regs->regmap==regs[i].regmap) {
2530       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
2531       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
2532       emit_movimm(start+i*4+4,0);
2533       emit_writeword(0,&pcaddr);
2534       emit_jmp(do_interrupt);
2535     }
2536   }
2537 }
2538
// Assemble an unaligned store (SWL/SWR; SDL/SDR are 64-bit leftovers
// and assert).  Generates a 4-way branch on the low two address bits
// and emits byte/halfword/word writes accordingly, followed by the
// SMC invalidate check.
// i      - instruction index within the current block
// i_regs - register allocation state for this instruction
void storelr_assemble(int i,struct regstat *i_regs)
{
  int s,tl;
  int temp;
  int offset;
  void *jaddr=0;
  void *case1, *case2, *case3;
  void *done0, *done1, *done2;
  int memtarget=0,c=0;
  int agr=AGEN1+(i&1);
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rs2[i]);  // value to store
  s=get_reg(i_regs->regmap,rs1[i]);   // base address register
  temp=get_reg(i_regs->regmap,agr);   // address generation temporary
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    // NOTE(review): uses isconst here while store_assemble uses
    // wasconst - presumably intentional, but worth confirming
    c=(i_regs->isconst>>s)&1;
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  assert(temp>=0);
  if(!c) {
    // dynamic address: range-check against RAM, fall to stub if outside
    emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
    if(!offset&&s!=temp) emit_mov(s,temp);
    jaddr=out;
    emit_jno(0);
  }
  else
  {
    // known non-RAM address (or store through $zero): always go to stub
    if(!memtarget||!rs1[i]) {
      jaddr=out;
      emit_jmp(0);
    }
  }
  emit_addimm_no_flags(ram_offset,temp);

  if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
    assert(0);
  }

  // dispatch on the low two address bits (XOR 3 for byte ordering)
  emit_xorimm(temp,3,temp);
  emit_testimm(temp,2);
  case2=out;
  emit_jne(0);
  emit_testimm(temp,1);
  case1=out;
  emit_jne(0);
  // 0
  if (opcode[i]==0x2A) { // SWL
    emit_writeword_indexed(tl,0,temp);
  }
  if (opcode[i]==0x2E) { // SWR
    emit_writebyte_indexed(tl,3,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  done0=out;
  emit_jmp(0);
  // 1
  set_jump_target(case1, out);
  if (opcode[i]==0x2A) { // SWL
    // Write 3 msb into three least significant bytes
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writebyte_indexed(tl,1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl); // restore tl to its original value
  }
  if (opcode[i]==0x2E) { // SWR
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  done1=out;
  emit_jmp(0);
  // 2
  set_jump_target(case2, out);
  emit_testimm(temp,1);
  case3=out;
  emit_jne(0);
  if (opcode[i]==0x2A) { // SWL
    // Write two msb into two least significant bytes
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writehword_indexed(tl,-2,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
  }
  if (opcode[i]==0x2E) { // SWR
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(rs2[i]) emit_rorimm(tl,24,tl);
  }
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  done2=out;
  emit_jmp(0);
  // 3
  set_jump_target(case3, out);
  if (opcode[i]==0x2A) { // SWL
    // Write msb into least significant byte
    if(rs2[i]) emit_rorimm(tl,24,tl);
    emit_writebyte_indexed(tl,-3,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  if (opcode[i]==0x2E) { // SWR
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  set_jump_target(done0, out);
  set_jump_target(done1, out);
  set_jump_target(done2, out);
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  // slow path for addresses outside RAM
  if(!c||!memtarget)
    add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
    // undo the RAM base offset before the invalid-code lookup
    emit_addimm_no_flags(-ram_offset,temp);
    #if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,temp,1);
    #else
    emit_cmpmem_indexedsr12_imm(invalid_code,temp,1);
    #endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[temp]);
    #else
    void *jaddr2 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
    #endif
  }
}
2702
// COP1 load/store (LWC1/SWC1): the PSX has no FPU, so this just
// raises a coprocessor-unusable exception.
void c1ls_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
2707
// Assemble a GTE (COP2) load/store: LWC2 (0x32) or SWC2 (0x3a).
// The value goes through the FTEMP host register and is moved
// to/from the GTE data register via cop2_get_dreg/cop2_put_dreg.
// i      - instruction index within the current block
// i_regs - register allocation state for this instruction
void c2ls_assemble(int i,struct regstat *i_regs)
{
  int s,tl;
  int ar;
  int offset;
  int memtarget=0,c=0;
  void *jaddr2=NULL;
  enum stub_type type;
  int agr=AGEN1+(i&1);
  int fastio_reg_override=0;
  u_int hr,reglist=0;
  u_int copr=(source[i]>>16)&0x1f;  // GTE data register number (rt field)
  s=get_reg(i_regs->regmap,rs1[i]); // base address host register
  tl=get_reg(i_regs->regmap,FTEMP); // data goes through FTEMP
  offset=imm[i];
  assert(rs1[i]>0);
  assert(tl>=0);

  // build the set of live host registers (for stub save/restore)
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG)
    reglist&=~(1<<HOST_CCREG);

  // get the address
  if (opcode[i]==0x3a) { // SWC2
    ar=get_reg(i_regs->regmap,agr);
    if(ar<0) ar=get_reg(i_regs->regmap,-1);
    reglist|=1<<ar;
  } else { // LWC2
    ar=tl; // address can share the data register for loads
  }
  if(s>=0) c=(i_regs->wasconst>>s)&1;
  // does the (constant) address point into RAM?
  memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
  if (!offset&&!c&&s>=0) ar=s;
  assert(ar>=0);

  if (opcode[i]==0x3a) { // SWC2
    // fetch the GTE register value into FTEMP before storing
    cop2_get_dreg(copr,tl,HOST_TEMPREG);
    type=STOREW_STUB;
  }
  else
    type=LOADW_STUB;

  if(c&&!memtarget) {
    // known non-RAM address: always take the slow path
    jaddr2=out;
    emit_jmp(0); // inline_readstub/inline_writestub?
  }
  else {
    if(!c) {
      // dynamic address: emit RAM range check, jaddr2 -> slow-path stub
      jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
    }
    else if(ram_offset&&memtarget) {
      emit_addimm(ar,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    if (opcode[i]==0x32) { // LWC2
      int a=ar;
      if(fastio_reg_override) a=fastio_reg_override;
      emit_readword_indexed(0,a,tl);
    }
    if (opcode[i]==0x3a) { // SWC2
      #ifdef DESTRUCTIVE_SHIFT
      if(!offset&&!c&&s>=0) emit_mov(s,ar);
      #endif
      int a=ar;
      if(fastio_reg_override) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
  }
  if(jaddr2)
    add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
  if(opcode[i]==0x3a) // SWC2
  // stores need the SMC (self-modifying code) invalidate check
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
#if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,ar,1);
#else
    emit_cmpmem_indexedsr12_imm(invalid_code,ar,1);
#endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[ar]);
    #else
    void *jaddr3 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
    #endif
  }
  if (opcode[i]==0x32) { // LWC2
    // move the loaded value into the GTE data register
    cop2_put_dreg(copr,tl,HOST_TEMPREG);
  }
}
2801
#ifndef multdiv_assemble
// Fallback stub: each target architecture must provide its own
// multdiv_assemble (MULT/MULTU/DIV/DIVU). Reaching this at runtime
// means the port is incomplete, so bail out.
void multdiv_assemble(int i,struct regstat *i_regs)
{
  printf("Need multdiv_assemble for this architecture.\n");
  exit(1);
}
#endif
2809
2810 void mov_assemble(int i,struct regstat *i_regs)
2811 {
2812   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
2813   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
2814   if(rt1[i]) {
2815     signed char sh,sl,th,tl;
2816     th=get_reg(i_regs->regmap,rt1[i]|64);
2817     tl=get_reg(i_regs->regmap,rt1[i]);
2818     //assert(tl>=0);
2819     if(tl>=0) {
2820       sh=get_reg(i_regs->regmap,rs1[i]|64);
2821       sl=get_reg(i_regs->regmap,rs1[i]);
2822       if(sl>=0) emit_mov(sl,tl);
2823       else emit_loadreg(rs1[i],tl);
2824       if(th>=0) {
2825         if(sh>=0) emit_mov(sh,th);
2826         else emit_loadreg(rs1[i]|64,th);
2827       }
2828     }
2829   }
2830 }
2831
// Assemble a SYSCALL: store the PC, update the cycle count and jump to
// the HLE syscall handler.  Requires the cycle count to already be in
// HOST_CCREG.
void syscall_assemble(int i,struct regstat *i_regs)
{
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot); // SYSCALL in a delay slot is not supported
  (void)ccreg;
  emit_movimm(start+i*4,EAX); // Get PC
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
  emit_jmp(jump_syscall_hle); // XXX
}
2842
// Assemble an HLE BIOS call: pass the return PC and the HLE handler
// address to jump_hlecall.  Invalid HLE codes get the psxNULL handler.
void hlecall_assemble(int i,struct regstat *i_regs)
{
  extern void psxNULL();
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  (void)ccreg;
  emit_movimm(start+i*4+4,0); // Get PC
  // low 26 bits of the instruction select the HLE handler
  uint32_t hleCode = source[i] & 0x03ffffff;
  if (hleCode >= ARRAY_SIZE(psxHLEt))
    emit_movimm((uintptr_t)psxNULL,1); // out of range - no-op handler
  else
    emit_movimm((uintptr_t)psxHLEt[hleCode],1);
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
  emit_jmp(jump_hlecall);
}
2859
// Assemble a call into the interpreter: store the PC, update the cycle
// count and jump to jump_intcall (used for instructions the recompiler
// doesn't handle natively).
void intcall_assemble(int i,struct regstat *i_regs)
{
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  (void)ccreg;
  emit_movimm(start+i*4,0); // Get PC
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
  emit_jmp(jump_intcall);
}
2870
// Assemble the instruction in a branch delay slot.  Sets is_delayslot
// around the dispatch so the per-type assemblers can special-case it.
// Branches in a delay slot are not supported and are only reported.
void ds_assemble(int i,struct regstat *i_regs)
{
  speculate_register_values(i);
  is_delayslot=1;
  switch(itype[i]) {
    case ALU:
      alu_assemble(i,i_regs);break;
    case IMM16:
      imm16_assemble(i,i_regs);break;
    case SHIFT:
      shift_assemble(i,i_regs);break;
    case SHIFTIMM:
      shiftimm_assemble(i,i_regs);break;
    case LOAD:
      load_assemble(i,i_regs);break;
    case LOADLR:
      loadlr_assemble(i,i_regs);break;
    case STORE:
      store_assemble(i,i_regs);break;
    case STORELR:
      storelr_assemble(i,i_regs);break;
    case COP0:
      cop0_assemble(i,i_regs);break;
    case COP1:
      cop1_assemble(i,i_regs);break;
    case C1LS:
      c1ls_assemble(i,i_regs);break;
    case COP2:
      cop2_assemble(i,i_regs);break;
    case C2LS:
      c2ls_assemble(i,i_regs);break;
    case C2OP:
      c2op_assemble(i,i_regs);break;
    case MULTDIV:
      multdiv_assemble(i,i_regs);break;
    case MOV:
      mov_assemble(i,i_regs);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
    case FJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  is_delayslot=0;
}
2921
2922 // Is the branch target a valid internal jump?
2923 int internal_branch(uint64_t i_is32,int addr)
2924 {
2925   if(addr&1) return 0; // Indirect (register) jump
2926   if(addr>=start && addr<start+slen*4-4)
2927   {
2928     return 1;
2929   }
2930   return 0;
2931 }
2932
// Reconcile the register mapping from 'pre' to 'entry': write back any
// dirty guest register whose host register is being repurposed (unless
// the guest register is unneeded per 'u'), then move values between
// host registers where the mapping merely relocated them.
// pre    - host->guest mapping before the transition
// entry  - host->guest mapping required at the target
// dirty  - bitmask of host registers holding modified values
// is32   - 32-bit-ness bitmask (unused here)
// u      - bitmask of guest registers that are dead (no writeback needed)
static void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,uint64_t u)
{
  int hr;
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(pre[hr]!=entry[hr]) {
        if(pre[hr]>=0) {
          if((dirty>>hr)&1) {
            // only write back if the guest reg has no home in 'entry'
            if(get_reg(entry,pre[hr])<0) {
              assert(pre[hr]<64);
              if(!((u>>pre[hr])&1))
                emit_storereg(pre[hr],hr);
            }
          }
        }
      }
    }
  }
  // Move from one register to another (no writeback)
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(pre[hr]!=entry[hr]) {
        if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
          int nr;
          if((nr=get_reg(entry,pre[hr]))>=0) {
            emit_mov(hr,nr);
          }
        }
      }
    }
  }
}
2965
// Load the specified registers
// This only loads the registers given as arguments because
// we don't want to load things that will be overwritten
// entry  - host->guest mapping on entry (skip regs already in place)
// regmap - required host->guest mapping
// is32   - bitmask of guest regs known to be 32-bit (sign-extendable)
// rs1/rs2 - the (at most two) guest registers to load
void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
{
  int hr;
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      if(entry[hr]!=regmap[hr]) {
        if(regmap[hr]==rs1||regmap[hr]==rs2)
        {
          if(regmap[hr]==0) {
            // $zero is synthesized, never loaded from memory
            emit_zeroreg(hr);
          }
          else
          {
            emit_loadreg(regmap[hr],hr);
          }
        }
      }
    }
  }
  //Load 64-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      if(entry[hr]!=regmap[hr]) {
        if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
        {
          assert(regmap[hr]!=64);
          if((is32>>(regmap[hr]&63))&1) {
            // value is known 32-bit: derive the high word by sign
            // extension from the low half if it's in a register
            int lr=get_reg(regmap,regmap[hr]-64);
            if(lr>=0)
              emit_sarimm(lr,31,hr);
            else
              emit_loadreg(regmap[hr],hr);
          }
          else
          {
            emit_loadreg(regmap[hr],hr);
          }
        }
      }
    }
  }
}
3012
3013 // Load registers prior to the start of a loop
3014 // so that they are not loaded within the loop
3015 static void loop_preload(signed char pre[],signed char entry[])
3016 {
3017   int hr;
3018   for(hr=0;hr<HOST_REGS;hr++) {
3019     if(hr!=EXCLUDE_REG) {
3020       if(pre[hr]!=entry[hr]) {
3021         if(entry[hr]>=0) {
3022           if(get_reg(pre,entry[hr])<0) {
3023             assem_debug("loop preload:\n");
3024             //printf("loop preload: %d\n",hr);
3025             if(entry[hr]==0) {
3026               emit_zeroreg(hr);
3027             }
3028             else if(entry[hr]<TEMPREG)
3029             {
3030               emit_loadreg(entry[hr],hr);
3031             }
3032             else if(entry[hr]-64<TEMPREG)
3033             {
3034               emit_loadreg(entry[hr],hr);
3035             }
3036           }
3037         }
3038       }
3039     }
3040   }
3041 }
3042
// Generate address for load/store instruction
// goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
// i       - index of the instruction in this block
// i_regs  - register state for this instruction
// entry   - register mapping at entry to this instruction, or NULL
// Also preloads the constant address for the *next* load/store when an
// AGEN register is already allocated for it.
void address_generation(int i,struct regstat *i_regs,signed char entry[])
{
  if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
    int ra=-1;            // host register that receives the generated address
    int agr=AGEN1+(i&1);  // AGEN registers alternate between even/odd slots
    if(itype[i]==LOAD) {
      // Plain loads build the address directly in the destination register
      ra=get_reg(i_regs->regmap,rt1[i]);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
      assert(ra>=0);
    }
    if(itype[i]==LOADLR) {
      ra=get_reg(i_regs->regmap,FTEMP);
    }
    if(itype[i]==STORE||itype[i]==STORELR) {
      ra=get_reg(i_regs->regmap,agr);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
    }
    if(itype[i]==C1LS||itype[i]==C2LS) {
      if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
        ra=get_reg(i_regs->regmap,FTEMP);
      else { // SWC1/SDC1/SWC2/SDC2
        ra=get_reg(i_regs->regmap,agr);
        if(ra<0) ra=get_reg(i_regs->regmap,-1);
      }
    }
    int rs=get_reg(i_regs->regmap,rs1[i]);
    if(ra>=0) {
      int offset=imm[i];
      int c=(i_regs->wasconst>>rs)&1; // base register value known at compile time?
      if(rs1[i]==0) {
        // Using r0 as a base address
        if(!entry||entry[ra]!=agr) {
          // Unaligned loads mask the low bits of the address
          if (opcode[i]==0x22||opcode[i]==0x26) {
            emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
          }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
            emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
          }else{
            emit_movimm(offset,ra);
          }
        } // else did it in the previous cycle
      }
      else if(rs<0) {
        // Base register is not mapped to a host register: load it from memory
        if(!entry||entry[ra]!=rs1[i])
          emit_loadreg(rs1[i],ra);
        //if(!entry||entry[ra]!=rs1[i])
        //  printf("poor load scheduling!\n");
      }
      else if(c) {
        // Base is a known constant: materialize base+offset as an immediate
        if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
          if(!entry||entry[ra]!=agr) {
            if (opcode[i]==0x22||opcode[i]==0x26) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
            }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
            }else{
              emit_movimm(constmap[i][rs]+offset,ra);
              regs[i].loadedconst|=1<<ra;
            }
          } // else did it in the previous cycle
        } // else load_consts already did it
      }
      // Non-constant base with a nonzero offset: add the displacement
      if(offset&&!c&&rs1[i]) {
        if(rs>=0) {
          emit_addimm(rs,offset,ra);
        }else{
          emit_addimm(ra,offset,ra);
        }
      }
    }
  }
  // Preload constants for next instruction
  if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
    int agr,ra;
    // Actual address
    agr=AGEN1+((i+1)&1);
    ra=get_reg(i_regs->regmap,agr);
    if(ra>=0) {
      int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
      int offset=imm[i+1];
      int c=(regs[i+1].wasconst>>rs)&1;
      if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(constmap[i+1][rs]+offset,ra);
          regs[i+1].loadedconst|=1<<ra;
        }
      }
      else if(rs1[i+1]==0) {
        // Using r0 as a base address
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(offset,ra);
        }
      }
    }
  }
}
3148
// Scan forward from instruction i to find the final value the constant in
// host register hr takes while it remains constant in the same register
// (stopping at remaps, non-const points, and branch targets).
// Stores that value in *value.  Returns 1 when the constant should be
// loaded now, 0 when it can be skipped because the register is unneeded
// after this point.
static int get_final_value(int hr, int i, int *value)
{
  int reg=regs[i].regmap[hr];
  // Advance while the same register stays constant and no other block
  // can branch in between (bt = branch target flag)
  while(i<slen-1) {
    if(regs[i+1].regmap[hr]!=reg) break;
    if(!((regs[i+1].isconst>>hr)&1)) break;
    if(bt[i+1]) break;
    i++;
  }
  if(i<slen-1) {
    if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
      *value=constmap[i][hr];
      return 1;
    }
    if(!bt[i+1]) {
      if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
        // Load in delay slot, out-of-order execution
        if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
        {
          // Precompute load address
          *value=constmap[i][hr]+imm[i+2];
          return 1;
        }
      }
      // A load that overwrites its own base: fold the offset in now
      if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
      {
        // Precompute load address
        *value=constmap[i][hr]+imm[i+1];
        //printf("c=%x imm=%lx\n",(long)constmap[i][hr],imm[i+1]);
        return 1;
      }
    }
  }
  *value=constmap[i][hr];
  //printf("c=%lx\n",(long)constmap[i][hr]);
  if(i==slen-1) return 1;
  assert(reg < 64);
  // Skip the load entirely if the register is unneeded afterwards
  return !((unneeded_reg[i+1]>>reg)&1);
}
3188
// Load registers with known constants
// Emits immediate loads for host registers whose values are known at
// compile time, tracking which constants are already loaded
// (regs[i].loadedconst) to avoid redundant emissions across instructions.
void load_consts(signed char pre[],signed char regmap[],int is32,int i)
{
  int hr,hr2;
  // propagate loaded constant flags
  if(i==0||bt[i])
    regs[i].loadedconst=0; // block start / branch target: nothing is loaded yet
  else {
    for(hr=0;hr<HOST_REGS;hr++) {
      // Carry the flag forward only when the same register held the same
      // constant in the previous instruction
      if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
         &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
      {
        regs[i].loadedconst|=1<<hr;
      }
    }
  }
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(!((regs[i].loadedconst>>hr)&1)) {
        if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
          int value,similar=0;
          if(get_final_value(hr,i,&value)) {
            // see if some other register has similar value
            for(hr2=0;hr2<HOST_REGS;hr2++) {
              if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
                if(is_similar_value(value,constmap[i][hr2])) {
                  similar=1;
                  break;
                }
              }
            }
            if(similar) {
              // Derive the constant from the similar one (cheaper than a
              // full immediate load on some hosts)
              int value2;
              if(get_final_value(hr2,i,&value2)) // is this needed?
                emit_movimm_from(value2,hr2,value,hr);
              else
                emit_movimm(value,hr);
            }
            else if(value==0) {
              emit_zeroreg(hr);
            }
            else {
              emit_movimm(value,hr);
            }
          }
          // Mark loaded even if get_final_value said the load was skippable
          regs[i].loadedconst|=1<<hr;
        }
      }
    }
  }
  // Load 64-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
        if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
          if((is32>>(regmap[hr]&63))&1) {
            // 32-bit value: upper half is the sign extension of the lower
            int lr=get_reg(regmap,regmap[hr]-64);
            assert(lr>=0);
            emit_sarimm(lr,31,hr);
          }
          else
          {
            int value;
            if(get_final_value(hr,i,&value)) {
              if(value==0) {
                emit_zeroreg(hr);
              }
              else {
                emit_movimm(value,hr);
              }
            }
          }
        }
      }
    }
  }
}
3269 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3270 {
3271   int hr;
3272   // Load 32-bit regs
3273   for(hr=0;hr<HOST_REGS;hr++) {
3274     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3275       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3276         int value=constmap[i][hr];
3277         if(value==0) {
3278           emit_zeroreg(hr);
3279         }
3280         else {
3281           emit_movimm(value,hr);
3282         }
3283       }
3284     }
3285   }
3286   // Load 64-bit regs
3287   for(hr=0;hr<HOST_REGS;hr++) {
3288     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3289       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3290         if((is32>>(regmap[hr]&63))&1) {
3291           int lr=get_reg(regmap,regmap[hr]-64);
3292           assert(lr>=0);
3293           emit_sarimm(lr,31,hr);
3294         }
3295         else
3296         {
3297           int value=constmap[i][hr];
3298           if(value==0) {
3299             emit_zeroreg(hr);
3300           }
3301           else {
3302             emit_movimm(value,hr);
3303           }
3304         }
3305       }
3306     }
3307   }
3308 }
3309
3310 // Write out all dirty registers (except cycle count)
3311 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3312 {
3313   int hr;
3314   for(hr=0;hr<HOST_REGS;hr++) {
3315     if(hr!=EXCLUDE_REG) {
3316       if(i_regmap[hr]>0) {
3317         if(i_regmap[hr]!=CCREG) {
3318           if((i_dirty>>hr)&1) {
3319             assert(i_regmap[hr]<64);
3320             emit_storereg(i_regmap[hr],hr);
3321           }
3322         }
3323       }
3324     }
3325   }
3326 }
3327 // Write out dirty registers that we need to reload (pair with load_needed_regs)
3328 // This writes the registers not written by store_regs_bt
3329 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
3330 {
3331   int hr;
3332   int t=(addr-start)>>2;
3333   for(hr=0;hr<HOST_REGS;hr++) {
3334     if(hr!=EXCLUDE_REG) {
3335       if(i_regmap[hr]>0) {
3336         if(i_regmap[hr]!=CCREG) {
3337           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32)>>(i_regmap[hr]&63))&1)) {
3338             if((i_dirty>>hr)&1) {
3339               assert(i_regmap[hr]<64);
3340               emit_storereg(i_regmap[hr],hr);
3341             }
3342           }
3343         }
3344       }
3345     }
3346   }
3347 }
3348
3349 // Load all registers (except cycle count)
3350 void load_all_regs(signed char i_regmap[])
3351 {
3352   int hr;
3353   for(hr=0;hr<HOST_REGS;hr++) {
3354     if(hr!=EXCLUDE_REG) {
3355       if(i_regmap[hr]==0) {
3356         emit_zeroreg(hr);
3357       }
3358       else
3359       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3360       {
3361         emit_loadreg(i_regmap[hr],hr);
3362       }
3363     }
3364   }
3365 }
3366
3367 // Load all current registers also needed by next instruction
3368 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
3369 {
3370   int hr;
3371   for(hr=0;hr<HOST_REGS;hr++) {
3372     if(hr!=EXCLUDE_REG) {
3373       if(get_reg(next_regmap,i_regmap[hr])>=0) {
3374         if(i_regmap[hr]==0) {
3375           emit_zeroreg(hr);
3376         }
3377         else
3378         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3379         {
3380           emit_loadreg(i_regmap[hr],hr);
3381         }
3382       }
3383     }
3384   }
3385 }
3386
// Load all regs, storing cycle count if necessary
// Sets up the register state expected at the entry of instruction t.
void load_regs_entry(int t)
{
  int hr;
  // Adjust the cycle counter: delay-slot entries add one cycle, other
  // entries rewind by the cycles already accounted for
  if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
  else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
  // Spill the cycle count before HOST_CCREG gets reused below
  if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
    emit_storereg(CCREG,HOST_CCREG);
  }
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
      if(regs[t].regmap_entry[hr]==0) {
        emit_zeroreg(hr);
      }
      else if(regs[t].regmap_entry[hr]!=CCREG)
      {
        emit_loadreg(regs[t].regmap_entry[hr],hr);
      }
    }
  }
  // Load 64-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
      assert(regs[t].regmap_entry[hr]!=64);
      if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
        // 32-bit value: derive the upper half by sign-extending the lower
        // half when it is resident, otherwise load from memory
        int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
        if(lr<0) {
          emit_loadreg(regs[t].regmap_entry[hr],hr);
        }
        else
        {
          emit_sarimm(lr,31,hr);
        }
      }
      else
      {
        emit_loadreg(regs[t].regmap_entry[hr],hr);
      }
    }
  }
}
3429
3430 // Store dirty registers prior to branch
3431 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
3432 {
3433   if(internal_branch(i_is32,addr))
3434   {
3435     int t=(addr-start)>>2;
3436     int hr;
3437     for(hr=0;hr<HOST_REGS;hr++) {
3438       if(hr!=EXCLUDE_REG) {
3439         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
3440           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32)>>(i_regmap[hr]&63))&1)) {
3441             if((i_dirty>>hr)&1) {
3442               assert(i_regmap[hr]<64);
3443               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
3444                 emit_storereg(i_regmap[hr],hr);
3445             }
3446           }
3447         }
3448       }
3449     }
3450   }
3451   else
3452   {
3453     // Branch out of this block, write out all dirty regs
3454     wb_dirtys(i_regmap,i_is32,i_dirty);
3455   }
3456 }
3457
// Load all needed registers for branch target
// For internal branches only: brings the host registers into the state
// that the target's regmap_entry expects.
void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
{
  //if(addr>=start && addr<(start+slen*4))
  if(internal_branch(i_is32,addr))
  {
    int t=(addr-start)>>2;
    int hr;
    // Store the cycle count before loading something else
    if(i_regmap[HOST_CCREG]!=CCREG) {
      assert(i_regmap[HOST_CCREG]==-1);
    }
    if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
      emit_storereg(CCREG,HOST_CCREG);
    }
    // Load 32-bit regs
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
        if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
          if(regs[t].regmap_entry[hr]==0) {
            emit_zeroreg(hr);
          }
          else if(regs[t].regmap_entry[hr]!=CCREG)
          {
            emit_loadreg(regs[t].regmap_entry[hr],hr);
          }
        }
      }
    }
    //Load 64-bit regs
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
        if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
          assert(regs[t].regmap_entry[hr]!=64);
          if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
            // 32-bit value: sign-extend from the low half when resident
            int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
            if(lr<0) {
              emit_loadreg(regs[t].regmap_entry[hr],hr);
            }
            else
            {
              emit_sarimm(lr,31,hr);
            }
          }
          else
          {
            emit_loadreg(regs[t].regmap_entry[hr],hr);
          }
        }
        // Same register, but the value was 32-bit here: recompute the
        // upper half by sign extension
        else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
          int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
          assert(lr>=0);
          emit_sarimm(lr,31,hr);
        }
      }
    }
  }
}
3516
// Does the current register state (i_regmap/i_is32/i_dirty) match what the
// branch target at addr expects closely enough that we can jump there
// directly, with no reload/writeback stub?  Returns 1 on match, 0 otherwise.
int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
{
  if(addr>=start && addr<start+slen*4-4)
  {
    // Internal target: compare against its recorded entry state
    int t=(addr-start)>>2;
    int hr;
    if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]!=regs[t].regmap_entry[hr])
        {
          // Target expects a real register here that we don't have
          if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
          {
            return 0;
          }
          else
          if((i_dirty>>hr)&1)
          {
            // We'd lose a dirty value the target still needs
            if(i_regmap[hr]<TEMPREG)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
                return 0;
            }
            else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
            {
              assert(0);
            }
          }
        }
        else // Same register but is it 32-bit or dirty?
        if(i_regmap[hr]>=0)
        {
          if(!((regs[t].dirty>>hr)&1))
          {
            // Target considers it clean but we hold a dirty copy that is
            // still needed -> would require a writeback
            if((i_dirty>>hr)&1)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
              {
                //printf("%x: dirty no match\n",addr);
                return 0;
              }
            }
          }
        }
      }
    }
    // Delay slots are not valid branch targets
    //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
    // Delay slots require additional processing, so do not match
    if(is_ds[t]) return 0;
  }
  else
  {
    // External target: only matches if nothing is dirty (apart from the
    // cycle count in its dedicated register)
    int hr;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]>=0)
        {
          if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
          {
            if((i_dirty>>hr)&1)
            {
              return 0;
            }
          }
        }
      }
    }
  }
  return 1;
}
3592
#ifdef DRC_DBG
// Debug aid: emit a call to do_insn_cmp for instruction i, preserving all
// allocated host registers around the call.  Compiles to nothing unless
// DRC_DBG is defined.
static void drc_dbg_emit_do_cmp(int i)
{
  extern void do_insn_cmp();
  extern int cycle;
  u_int hr,reglist=0;

  // Build a mask of the host registers currently in use so they survive
  // the helper call
  for(hr=0;hr<HOST_REGS;hr++)
    if(regs[i].regmap[hr]>=0) reglist|=1<<hr;
  save_regs(reglist);
  // Record the emulated PC of this instruction for the comparison helper
  emit_movimm(start+i*4,0);
  emit_writeword(0,&pcaddr);
  emit_call(do_insn_cmp);
  //emit_readword(&cycle,0);
  //emit_addimm(0,2,0);
  //emit_writeword(0,&cycle);
  restore_regs(reglist);
}
#else
#define drc_dbg_emit_do_cmp(x)
#endif
3614
// Used when a branch jumps into the delay slot of another branch
// Assembles a standalone copy of the delay-slot instruction of branch i's
// target, then jumps to the instruction after that delay slot.
void ds_assemble_entry(int i)
{
  int t=(ba[i]-start)>>2; // index of the delay-slot instruction
  if (!instr_addr[t])
    instr_addr[t] = out;
  assem_debug("Assemble delay slot at %x\n",ba[i]);
  assem_debug("<->\n");
  drc_dbg_emit_do_cmp(t);
  // Flush the cycle count if this slot doesn't keep it in HOST_CCREG
  if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
  load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
  address_generation(t,&regs[t],regs[t].regmap_entry);
  // Stores also need INVCP for invalidation checks
  if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
    load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
  is_delayslot=0;
  // Dispatch to the per-type assembler for the delay-slot instruction
  switch(itype[t]) {
    case ALU:
      alu_assemble(t,&regs[t]);break;
    case IMM16:
      imm16_assemble(t,&regs[t]);break;
    case SHIFT:
      shift_assemble(t,&regs[t]);break;
    case SHIFTIMM:
      shiftimm_assemble(t,&regs[t]);break;
    case LOAD:
      load_assemble(t,&regs[t]);break;
    case LOADLR:
      loadlr_assemble(t,&regs[t]);break;
    case STORE:
      store_assemble(t,&regs[t]);break;
    case STORELR:
      storelr_assemble(t,&regs[t]);break;
    case COP0:
      cop0_assemble(t,&regs[t]);break;
    case COP1:
      cop1_assemble(t,&regs[t]);break;
    case C1LS:
      c1ls_assemble(t,&regs[t]);break;
    case COP2:
      cop2_assemble(t,&regs[t]);break;
    case C2LS:
      c2ls_assemble(t,&regs[t]);break;
    case C2OP:
      c2op_assemble(t,&regs[t]);break;
    case MULTDIV:
      multdiv_assemble(t,&regs[t]);break;
    case MOV:
      mov_assemble(t,&regs[t]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
    case FJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Continue at the instruction following the delay slot
  store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
  load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
  if(internal_branch(regs[t].is32,ba[i]+4))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  assert(internal_branch(regs[t].is32,ba[i]+4));
  add_to_linker(out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
  emit_jmp(0);
}
3685
// Emit the cycle-count check for branch instruction i and register a
// CC_STUB that handles the "cycles expired" case.
// adj    - out: cycle adjustment for the branch target
// addr   - address recorded in the stub
// taken  - stub direction marker (e.g. TAKEN)
// invert - nonzero when the branch sense is inverted by the caller
void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
{
  int count;
  void *jaddr;
  void *idle=NULL;
  int t=0;
  if(itype[i]==RJUMP)
  {
    *adj=0;
  }
  //if(ba[i]>=start && ba[i]<(start+slen*4))
  if(internal_branch(branch_regs[i].is32,ba[i]))
  {
    t=(ba[i]-start)>>2;
    if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
    else *adj=ccadj[t];
  }
  else
  {
    *adj=0;
  }
  count=ccadj[i];
  if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
    // Idle loop (branch to self with a NOP delay slot): burn cycles fast
    if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
    idle=out;
    //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
    emit_andimm(HOST_CCREG,3,HOST_CCREG);
    jaddr=out;
    emit_jmp(0);
  }
  else if(*adj==0||invert) {
    int cycles=CLOCK_ADJUST(count+2);
    // faster loop HACK
    if (t&&*adj) {
      // Short backwards branches get a reduced cycle penalty
      int rel=t-i;
      if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
        cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
    }
    emit_addimm_and_set_flags(cycles,HOST_CCREG);
    jaddr=out;
    emit_jns(0);
  }
  else
  {
    // Compare only; the add is deferred to the target side
    emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
    jaddr=out;
    emit_jns(0);
  }
  add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
}
3737
// Assemble CC_STUB n: out-of-line code run when the cycle count expires at
// branch stubs[n].b.  Writes back dirty registers, resolves the branch
// target into pcaddr, calls cc_interrupt, then reloads registers and
// returns to the compiled code.
static void do_ccstub(int n)
{
  literal_pool(256);
  assem_debug("do_ccstub %x\n",start+stubs[n].b*4);
  set_jump_target(stubs[n].addr, out);
  int i=stubs[n].b; // index of the branch instruction this stub belongs to
  // Write back dirty registers appropriate for the stub direction
  if(stubs[n].d==NULLDS) {
    // Delay slot instruction is nullified ("likely" branch)
    wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
  }
  else if(stubs[n].d!=TAKEN) {
    wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
  }
  else {
    if(internal_branch(branch_regs[i].is32,ba[i]))
      wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  }
  if(stubs[n].c!=-1)
  {
    // Save PC as return address
    emit_movimm(stubs[n].c,EAX);
    emit_writeword(EAX,&pcaddr);
  }
  else
  {
    // Return address depends on which way the branch goes
    if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
    {
      // Locate the comparison operands (low and high halves)
      int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
      int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
      int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
      int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
      if(rs1[i]==0)
      {
        // Comparisons against $zero: fold to a single-operand test
        s1l=s2l;s1h=s2h;
        s2l=s2h=-1;
      }
      else if(rs2[i]==0)
      {
        s2l=s2h=-1;
      }
      if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
        s1h=s2h=-1; // both operands 32-bit: no high-half compare needed
      }
      assert(s1l>=0);
      #ifdef DESTRUCTIVE_WRITEBACK
      // Reload operands that the writeback above may have clobbered
      if(rs1[i]) {
        if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
          emit_loadreg(rs1[i],s1l);
      }
      else {
        if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
          emit_loadreg(rs2[i],s1l);
      }
      if(s2l>=0)
        if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
          emit_loadreg(rs2[i],s2l);
      #endif
      // Pick scratch host registers not holding the comparison operands
      int hr=0;
      int addr=-1,alt=-1,ntaddr=-1;
      while(hr<HOST_REGS)
      {
        if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
           (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
           (branch_regs[i].regmap[hr]&63)!=rs2[i] )
        {
          addr=hr++;break;
        }
        hr++;
      }
      while(hr<HOST_REGS)
      {
        if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
           (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
           (branch_regs[i].regmap[hr]&63)!=rs2[i] )
        {
          alt=hr++;break;
        }
        hr++;
      }
      if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
      {
        while(hr<HOST_REGS)
        {
          if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
             (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
             (branch_regs[i].regmap[hr]&63)!=rs2[i] )
          {
            ntaddr=hr;break;
          }
          hr++;
        }
        assert(hr<HOST_REGS);
      }
      // For each branch type, compute the successor PC into 'addr' using
      // conditional moves between the taken and not-taken addresses
      if((opcode[i]&0x2f)==4) // BEQ
      {
        #ifdef HAVE_CMOV_IMM
        if(s1h<0) {
          if(s2l>=0) emit_cmp(s1l,s2l);
          else emit_test(s1l,s1l);
          emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
        }
        else
        #endif
        {
          emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
          if(s1h>=0) {
            if(s2h>=0) emit_cmp(s1h,s2h);
            else emit_test(s1h,s1h);
            emit_cmovne_reg(alt,addr);
          }
          if(s2l>=0) emit_cmp(s1l,s2l);
          else emit_test(s1l,s1l);
          emit_cmovne_reg(alt,addr);
        }
      }
      if((opcode[i]&0x2f)==5) // BNE
      {
        #ifdef HAVE_CMOV_IMM
        if(s1h<0) {
          if(s2l>=0) emit_cmp(s1l,s2l);
          else emit_test(s1l,s1l);
          emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
        }
        else
        #endif
        {
          emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
          if(s1h>=0) {
            if(s2h>=0) emit_cmp(s1h,s2h);
            else emit_test(s1h,s1h);
            emit_cmovne_reg(alt,addr);
          }
          if(s2l>=0) emit_cmp(s1l,s2l);
          else emit_test(s1l,s1l);
          emit_cmovne_reg(alt,addr);
        }
      }
      if((opcode[i]&0x2f)==6) // BLEZ
      {
        //emit_movimm(ba[i],alt);
        //emit_movimm(start+i*4+8,addr);
        emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
        emit_cmpimm(s1l,1);
        if(s1h>=0) emit_mov(addr,ntaddr);
        emit_cmovl_reg(alt,addr);
        if(s1h>=0) {
          emit_test(s1h,s1h);
          emit_cmovne_reg(ntaddr,addr);
          emit_cmovs_reg(alt,addr);
        }
      }
      if((opcode[i]&0x2f)==7) // BGTZ
      {
        //emit_movimm(ba[i],addr);
        //emit_movimm(start+i*4+8,ntaddr);
        emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
        emit_cmpimm(s1l,1);
        if(s1h>=0) emit_mov(addr,alt);
        emit_cmovl_reg(ntaddr,addr);
        if(s1h>=0) {
          emit_test(s1h,s1h);
          emit_cmovne_reg(alt,addr);
          emit_cmovs_reg(ntaddr,addr);
        }
      }
      if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
      {
        //emit_movimm(ba[i],alt);
        //emit_movimm(start+i*4+8,addr);
        emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
        if(s1h>=0) emit_test(s1h,s1h);
        else emit_test(s1l,s1l);
        emit_cmovs_reg(alt,addr);
      }
      if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
      {
        //emit_movimm(ba[i],addr);
        //emit_movimm(start+i*4+8,alt);
        emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
        if(s1h>=0) emit_test(s1h,s1h);
        else emit_test(s1l,s1l);
        emit_cmovs_reg(alt,addr);
      }
      if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
        if(source[i]&0x10000) // BC1T
        {
          //emit_movimm(ba[i],alt);
          //emit_movimm(start+i*4+8,addr);
          emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
          emit_testimm(s1l,0x800000);
          emit_cmovne_reg(alt,addr);
        }
        else // BC1F
        {
          //emit_movimm(ba[i],addr);
          //emit_movimm(start+i*4+8,alt);
          emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
          emit_testimm(s1l,0x800000);
          emit_cmovne_reg(alt,addr);
        }
      }
      emit_writeword(addr,&pcaddr);
    }
    else
    if(itype[i]==RJUMP)
    {
      // Indirect jump: the target is in the source register (or RTEMP if
      // the delay slot overwrites it)
      int r=get_reg(branch_regs[i].regmap,rs1[i]);
      if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
        r=get_reg(branch_regs[i].regmap,RTEMP);
      }
      emit_writeword(r,&pcaddr);
    }
    else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
  }
  // Update cycle count
  assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
  if(stubs[n].a) emit_addimm(HOST_CCREG,CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
  emit_call(cc_interrupt);
  // Undo the adjustment after the interrupt handler returns
  if(stubs[n].a) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
  // Reload the registers the continuation point expects
  if(stubs[n].d==TAKEN) {
    if(internal_branch(branch_regs[i].is32,ba[i]))
      load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
    else if(itype[i]==RJUMP) {
      if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
        emit_readword(&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
      else
        emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
    }
  }else if(stubs[n].d==NOTTAKEN) {
    if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
    else load_all_regs(branch_regs[i].regmap);
  }else if(stubs[n].d==NULLDS) {
    // Delay slot instruction is nullified ("likely" branch)
    if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
    else load_all_regs(regs[i].regmap);
  }else{
    load_all_regs(branch_regs[i].regmap);
  }
  emit_jmp(stubs[n].retaddr);
}
3979
3980 static void add_to_linker(void *addr, u_int target, int ext)
3981 {
3982   assert(linkcount < ARRAY_SIZE(link_addr));
3983   link_addr[linkcount].addr = addr;
3984   link_addr[linkcount].target = target;
3985   link_addr[linkcount].ext = ext;
3986   linkcount++;
3987 }
3988
static void ujump_assemble_write_ra(int i)
{
  // Write the JAL link value (PC of the instruction after the delay slot,
  // i.e. start+i*4+8) into whichever host register is mapped to MIPS r31
  // for this branch.  Does nothing if r31 has no host register allocated.
  int rt;
  unsigned int return_address;
  rt=get_reg(branch_regs[i].regmap,31);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  //assert(rt>=0);
  return_address=start+i*4+8;
  if(rt>=0) {
    #ifdef USE_MINI_HT
    // If the return address is inside this block and the delay slot does not
    // overwrite r31, insert it into the mini hash table so a later JR $ra can
    // be dispatched quickly.  Requires a delay-slot-safe temp register.
    if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
      int temp=-1; // note: must be ds-safe
      #ifdef HOST_TEMPREG
      temp=HOST_TEMPREG;
      #endif
      if(temp>=0) do_miniht_insert(return_address,rt,temp);
      else emit_movimm(return_address,rt);
    }
    else
    #endif
    {
      #ifdef REG_PREFETCH
      // NOTE(review): 'temp'/'i_regmap' are not defined in this function;
      // this path presumably only compiles in configurations that provide
      // them — confirm before enabling REG_PREFETCH.
      if(temp>=0)
      {
        if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
      }
      #endif
      emit_movimm(return_address,rt); // PC into link register
      #ifdef IMM_PREFETCH
      emit_prefetch(hash_table_get(return_address));
      #endif
    }
  }
}
4023
void ujump_assemble(int i,struct regstat *i_regs)
{
  // Assemble an unconditional jump (J/JAL): emit the delay slot, write the
  // link register for JAL, update the cycle counter, then either fall into
  // an in-block target (ds_assemble_entry) or emit a patchable external jump.
  int ra_done=0;
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  int temp=get_reg(branch_regs[i].regmap,PTEMP);
  if(rt1[i]==31&&temp>=0)
  {
    signed char *i_regmap=i_regs->regmap;
    int return_address=start+i*4+8;
    if(get_reg(branch_regs[i].regmap,31)>0)
    if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  // If the delay slot writes a register the link value depends on conflicting
  // with (r31 read as a source in the delay slot), write ra before the slot.
  if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    ujump_assemble_write_ra(i); // writeback ra for DS
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<rt1[i]);  // r0 and the link target need no writeback
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
  if(!ra_done&&rt1[i]==31)
    ujump_assemble_write_ra(i);
  int cc,adj;
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
  if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
  load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  if(internal_branch(branch_regs[i].is32,ba[i]))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  // Jump into this block's delay-slot entry when possible, otherwise emit a
  // placeholder jmp and let the linker patch it to the real target.
  if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
    ds_assemble_entry(i);
  }
  else {
    add_to_linker(out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
    emit_jmp(0);
  }
}
4072
static void rjump_assemble_write_ra(int i)
{
  // Write the JALR link value (start+i*4+8) into the host register mapped to
  // the jump's destination register rt1[i].  Unlike the ujump variant, a host
  // register is required here (assert), and the delay slot must not clobber it.
  int rt,return_address;
  assert(rt1[i+1]!=rt1[i]);
  assert(rt2[i+1]!=rt1[i]);
  rt=get_reg(branch_regs[i].regmap,rt1[i]);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  assert(rt>=0);
  return_address=start+i*4+8;
  #ifdef REG_PREFETCH
  // NOTE(review): 'temp'/'i_regmap' are not defined in this function; this
  // path presumably only compiles in configurations that provide them.
  if(temp>=0)
  {
    if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  emit_movimm(return_address,rt); // PC into link register
  #ifdef IMM_PREFETCH
  emit_prefetch(hash_table_get(return_address));
  #endif
}
4093
void rjump_assemble(int i,struct regstat *i_regs)
{
  // Assemble a register-indirect jump (JR/JALR): emit the delay slot, write
  // the link register for JALR, update the cycle counter, then dispatch via
  // jump_vaddr (optionally through the mini hash table for JR $ra).
  int temp;
  int rs,cc;
  int ra_done=0;
  rs=get_reg(branch_regs[i].regmap,rs1[i]);
  assert(rs>=0);
  if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
    // Delay slot abuse, make a copy of the branch address register
    temp=get_reg(branch_regs[i].regmap,RTEMP);
    assert(temp>=0);
    assert(regs[i].regmap[temp]==RTEMP);
    emit_mov(rs,temp);
    rs=temp;
  }
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  if(rt1[i]==31)
  {
    if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
      signed char *i_regmap=i_regs->regmap;
      int return_address=start+i*4+8;
      if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
    }
  }
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    // JR $ra: preload the return-address hash so the lookup below is cheap.
    int rh=get_reg(regs[i].regmap,RHASH);
    if(rh>=0) do_preload_rhash(rh);
  }
  #endif
  // Write the link register before the delay slot if the slot reads it.
  if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    rjump_assemble_write_ra(i);
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<rt1[i]);
  bc_unneeded&=~(1LL<<rs1[i]);  // the jump target register is still needed
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
  if(!ra_done&&rt1[i]!=0)
    rjump_assemble_write_ra(i);
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  (void)cc;
  #ifdef USE_MINI_HT
  int rh=get_reg(branch_regs[i].regmap,RHASH);
  int ht=get_reg(branch_regs[i].regmap,RHTBL);
  if(rs1[i]==31) {
    if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
    do_preload_rhtbl(ht);
    do_rhash(rs,rh);
  }
  #endif
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
  #ifdef DESTRUCTIVE_WRITEBACK
  // The writeback above may have destroyed the value in 'rs'; reload it
  // unless it was already copied to RTEMP (delay-slot abuse case).
  if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
    if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
      emit_loadreg(rs1[i],rs);
    }
  }
  #endif
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_load(ht,rh);
  }
  #endif
  //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
  //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
  //assert(adj==0);
  // Add the cycle count and branch to the interrupt stub if it went negative.
  emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  add_stub(CC_STUB,out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
  if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
    // special case for RFE
    emit_jmp(0);
  else
    emit_jns(0);
  //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_jump(rs,rh,ht);
  }
  else
  #endif
  {
    emit_jmp(jump_vaddr_reg[rs]);
  }
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
  #endif
}
4190
void cjump_assemble(int i,struct regstat *i_regs)
{
  // Assemble a conditional branch on register compare (BEQ/BNE/BLEZ/BGTZ and
  // their "likely" forms).  Two main strategies:
  //  - out-of-order (ooo): emit the delay slot first, then the compare/branch;
  //  - in-order: emit the compare first, then the delay slot on each path.
  // 'invert' flips the branch sense so the taken path falls through, used when
  // the target's register mapping doesn't match (match==0).
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  assem_debug("match=%d\n",match);
  // s1l/s1h, s2l/s2h: host registers holding the low/high 32-bit halves of
  // source registers rs1/rs2 (|64 selects the high half); -1 if unallocated.
  int s1h,s1l,s2h,s2l;
  int unconditional=0,nop=0;
  int only32=0;
  int invert=0;
  int internal=internal_branch(branch_regs[i].is32,ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;
  #endif

  if(ooo[i]) {
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
    s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
    s2l=get_reg(branch_regs[i].regmap,rs2[i]);
    s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
    s1h=get_reg(i_regmap,rs1[i]|64);
    s2l=get_reg(i_regmap,rs2[i]);
    s2h=get_reg(i_regmap,rs2[i]|64);
  }
  // Classify the branch: comparing r0 with r0 makes it unconditional (BEQ)
  // or a nop (BNE/"likely" forms, opcode bit 0 set).
  if(rs1[i]==0&&rs2[i]==0)
  {
    if(opcode[i]&1) nop=1;
    else unconditional=1;
    //assert(opcode[i]!=5);
    //assert(opcode[i]!=7);
    //assert(opcode[i]!=0x15);
    //assert(opcode[i]!=0x17);
  }
  else if(rs1[i]==0)
  {
    s1l=s2l;s1h=s2h;
    s2l=s2h=-1;
    only32=(regs[i].was32>>rs2[i])&1;
  }
  else if(rs2[i]==0)
  {
    s2l=s2h=-1;
    only32=(regs[i].was32>>rs1[i])&1;
  }
  else {
    // only32: both operands are known 32-bit, so the high-word compare
    // can be skipped entirely.
    only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
  }

  if(ooo[i]) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    uint64_t bc_unneeded=branch_regs[i].u;
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,bc_unneeded);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    //assem_debug("cycle count (adj)\n");
    if(unconditional) {
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      // Skip the jump entirely for a pure idle loop (branch-to-self with a
      // nop delay slot); the cycle-count stub handles it.
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nop) {
      // Branch never taken: just account for the cycles and check for
      // a pending interrupt.
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      // taken/nottaken/nottaken1 record emitted branch sites whose targets
      // are patched later with set_jump_target.
      void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      if(!only32)
      {
        // 64-bit compare: test the high words first, then fall through to
        // the low-word compare below.
        assert(s1h>=0);
        if(opcode[i]==4) // BEQ
        {
          if(s2h>=0) emit_cmp(s1h,s2h);
          else emit_test(s1h,s1h);
          nottaken1=out;
          emit_jne((void *)1l);
        }
        if(opcode[i]==5) // BNE
        {
          if(s2h>=0) emit_cmp(s1h,s2h);
          else emit_test(s1h,s1h);
          if(invert) taken=out;
          else add_to_linker(out,ba[i],internal);
          emit_jne(0);
        }
        if(opcode[i]==6) // BLEZ
        {
          emit_test(s1h,s1h);
          if(invert) taken=out;
          else add_to_linker(out,ba[i],internal);
          emit_js(0);
          nottaken1=out;
          emit_jne((void *)1l);
        }
        if(opcode[i]==7) // BGTZ
        {
          emit_test(s1h,s1h);
          nottaken1=out;
          emit_js(1);
          if(invert) taken=out;
          else add_to_linker(out,ba[i],internal);
          emit_jne(0);
        }
      } // if(!only32)

      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      assert(s1l>=0);
      if(opcode[i]==4) // BEQ
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        if(invert){
          nottaken=out;
          emit_jne((void *)1l);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jeq(0);
        }
      }
      if(opcode[i]==5) // BNE
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        if(invert){
          nottaken=out;
          emit_jeq(1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jne(0);
        }
      }
      if(opcode[i]==6) // BLEZ
      {
        emit_cmpimm(s1l,1);
        if(invert){
          nottaken=out;
          emit_jge(1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jl(0);
        }
      }
      if(opcode[i]==7) // BGTZ
      {
        emit_cmpimm(s1l,1);
        if(invert){
          nottaken=out;
          emit_jl(1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jge(0);
        }
      }
      if(invert) {
        // Inverted branch: the taken path is emitted here, inline.
        if(taken) set_jump_target(taken, out);
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken, out);
      }

      if(nottaken1) set_jump_target(nottaken1, out);
      if(adj) {
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //if(likely[i]) printf("IOL\n");
    //else
    //printf("IOE\n");
    void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
    if(!unconditional&&!nop) {
      if(!only32)
      {
        assert(s1h>=0);
        if((opcode[i]&0x2f)==4) // BEQ
        {
          if(s2h>=0) emit_cmp(s1h,s2h);
          else emit_test(s1h,s1h);
          nottaken1=out;
          emit_jne((void *)2l);
        }
        if((opcode[i]&0x2f)==5) // BNE
        {
          if(s2h>=0) emit_cmp(s1h,s2h);
          else emit_test(s1h,s1h);
          taken=out;
          emit_jne((void *)1l);
        }
        if((opcode[i]&0x2f)==6) // BLEZ
        {
          emit_test(s1h,s1h);
          taken=out;
          emit_js(1);
          nottaken1=out;
          emit_jne((void *)2l);
        }
        if((opcode[i]&0x2f)==7) // BGTZ
        {
          emit_test(s1h,s1h);
          nottaken1=out;
          emit_js(2);
          taken=out;
          emit_jne((void *)1l);
        }
      } // if(!only32)

      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      assert(s1l>=0);
      if((opcode[i]&0x2f)==4) // BEQ
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        nottaken=out;
        emit_jne((void *)2l);
      }
      if((opcode[i]&0x2f)==5) // BNE
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        nottaken=out;
        emit_jeq(2);
      }
      if((opcode[i]&0x2f)==6) // BLEZ
      {
        emit_cmpimm(s1l,1);
        nottaken=out;
        emit_jge(2);
      }
      if((opcode[i]&0x2f)==7) // BGTZ
      {
        emit_cmpimm(s1l,1);
        nottaken=out;
        emit_jl(2);
      }
    } // if(!unconditional)
    int adj;
    uint64_t ds_unneeded=branch_regs[i].u;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded|=1;
    // branch taken
    if(!nop) {
      if(taken) set_jump_target(taken, out);
      assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,ds_unneeded);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    if(!unconditional) {
      if(nottaken1) set_jump_target(nottaken1, out);
      set_jump_target(nottaken, out);
      assem_debug("2:\n");
      // "Likely" branches nullify the delay slot on the not-taken path,
      // so only assemble it here for normal branches.
      if(!likely[i]) {
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,ds_unneeded);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}
4561
4562 void sjump_assemble(int i,struct regstat *i_regs)
4563 {
4564   signed char *i_regmap=i_regs->regmap;
4565   int cc;
4566   int match;
4567   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4568   assem_debug("smatch=%d\n",match);
4569   int s1h,s1l;
4570   int unconditional=0,nevertaken=0;
4571   int only32=0;
4572   int invert=0;
4573   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4574   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4575   if(!match) invert=1;
4576   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4577   if(i>(ba[i]-start)>>2) invert=1;
4578   #endif
4579
4580   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
4581   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
4582
4583   if(ooo[i]) {
4584     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4585     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4586   }
4587   else {
4588     s1l=get_reg(i_regmap,rs1[i]);
4589     s1h=get_reg(i_regmap,rs1[i]|64);
4590   }
4591   if(rs1[i]==0)
4592   {
4593     if(opcode2[i]&1) unconditional=1;
4594     else nevertaken=1;
4595     // These are never taken (r0 is never less than zero)
4596     //assert(opcode2[i]!=0);
4597     //assert(opcode2[i]!=2);
4598     //assert(opcode2[i]!=0x10);
4599     //assert(opcode2[i]!=0x12);
4600   }
4601   else {
4602     only32=(regs[i].was32>>rs1[i])&1;
4603   }
4604
4605   if(ooo[i]) {
4606     // Out of order execution (delay slot first)
4607     //printf("OOOE\n");
4608     address_generation(i+1,i_regs,regs[i].regmap_entry);
4609     ds_assemble(i+1,i_regs);
4610     int adj;
4611     uint64_t bc_unneeded=branch_regs[i].u;
4612     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
4613     bc_unneeded|=1;
4614     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,bc_unneeded);
4615     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
4616     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4617     if(rt1[i]==31) {
4618       int rt,return_address;
4619       rt=get_reg(branch_regs[i].regmap,31);
4620       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4621       if(rt>=0) {
4622         // Save the PC even if the branch is not taken
4623         return_address=start+i*4+8;
4624         emit_movimm(return_address,rt); // PC into link register
4625         #ifdef IMM_PREFETCH
4626         if(!nevertaken) emit_prefetch(hash_table_get(return_address));
4627         #endif
4628       }
4629     }
4630     cc=get_reg(branch_regs[i].regmap,CCREG);
4631     assert(cc==HOST_CCREG);
4632     if(unconditional)
4633       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4634     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
4635     assem_debug("cycle count (adj)\n");
4636     if(unconditional) {
4637       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4638       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
4639         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4640         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4641         if(internal)
4642           assem_debug("branch: internal\n");
4643         else
4644           assem_debug("branch: external\n");
4645         if(internal&&is_ds[(ba[i]-start)>>2]) {
4646           ds_assemble_entry(i);
4647         }
4648         else {
4649           add_to_linker(out,ba[i],internal);
4650           emit_jmp(0);
4651         }
4652         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4653         if(((u_int)out)&7) emit_addnop(0);
4654         #endif
4655       }
4656     }
4657     else if(nevertaken) {
4658       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
4659       void *jaddr=out;
4660       emit_jns(0);
4661       add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
4662     }
4663     else {
4664       void *nottaken = NULL;
4665       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
4666       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4667       if(!only32)
4668       {
4669         assert(s1h>=0);
4670         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
4671         {
4672           emit_test(s1h,s1h);
4673           if(invert){
4674             nottaken=out;
4675             emit_jns(1);
4676           }else{
4677             add_to_linker(out,ba[i],internal);
4678             emit_js(0);
4679           }
4680         }
4681         if((opcode2[i]&0xf)==1) // BGEZ/BLTZAL
4682         {
4683           emit_test(s1h,s1h);
4684           if(invert){
4685             nottaken=out;
4686             emit_js(1);
4687           }else{
4688             add_to_linker(out,ba[i],internal);
4689             emit_jns(0);
4690           }
4691         }
4692       } // if(!only32)
4693       else
4694       {
4695         assert(s1l>=0);
4696         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
4697         {
4698           emit_test(s1l,s1l);
4699           if(invert){
4700             nottaken=out;
4701             emit_jns(1);
4702           }else{
4703             add_to_linker(out,ba[i],internal);
4704             emit_js(0);
4705           }
4706         }
4707         if((opcode2[i]&0xf)==1) // BGEZ/BLTZAL
4708         {
4709           emit_test(s1l,s1l);
4710           if(invert){
4711             nottaken=out;
4712             emit_js(1);
4713           }else{
4714             add_to_linker(out,ba[i],internal);
4715             emit_jns(0);
4716           }
4717         }
4718       } // if(!only32)
4719
4720       if(invert) {
4721         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4722         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
4723           if(adj) {
4724             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
4725             add_to_linker(out,ba[i],internal);
4726           }else{
4727             emit_addnop(13);
4728             add_to_linker(out,ba[i],internal*2);
4729           }
4730           emit_jmp(0);
4731         }else
4732         #endif
4733         {
4734           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
4735           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4736           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4737           if(internal)
4738             assem_debug("branch: internal\n");
4739           else
4740             assem_debug("branch: external\n");
4741           if(internal&&is_ds[(ba[i]-start)>>2]) {
4742             ds_assemble_entry(i);
4743           }
4744           else {
4745             add_to_linker(out,ba[i],internal);
4746             emit_jmp(0);
4747           }
4748         }
4749         set_jump_target(nottaken, out);
4750       }
4751
4752       if(adj) {
4753         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
4754       }
4755     } // (!unconditional)
4756   } // if(ooo)
4757   else
4758   {
4759     // In-order execution (branch first)
4760     //printf("IOE\n");
4761     void *nottaken = NULL;
4762     if(rt1[i]==31) {
4763       int rt,return_address;
4764       rt=get_reg(branch_regs[i].regmap,31);
4765       if(rt>=0) {
4766         // Save the PC even if the branch is not taken
4767         return_address=start+i*4+8;
4768         emit_movimm(return_address,rt); // PC into link register
4769         #ifdef IMM_PREFETCH
4770         emit_prefetch(hash_table_get(return_address));
4771         #endif
4772       }
4773     }
4774     if(!unconditional) {
4775       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4776       if(!only32)
4777       {
4778         assert(s1h>=0);
4779         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
4780         {
4781           emit_test(s1h,s1h);
4782           nottaken=out;
4783           emit_jns(1);
4784         }
4785         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
4786         {
4787           emit_test(s1h,s1h);
4788           nottaken=out;
4789           emit_js(1);
4790         }
4791       } // if(!only32)
4792       else
4793       {
4794         assert(s1l>=0);
4795         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
4796         {
4797           emit_test(s1l,s1l);
4798           nottaken=out;
4799           emit_jns(1);
4800         }
4801         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
4802         {
4803           emit_test(s1l,s1l);
4804           nottaken=out;
4805           emit_js(1);
4806         }
4807       }
4808     } // if(!unconditional)
4809     int adj;
4810     uint64_t ds_unneeded=branch_regs[i].u;
4811     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
4812     ds_unneeded|=1;
4813     // branch taken
4814     if(!nevertaken) {
4815       //assem_debug("1:\n");
4816       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,ds_unneeded);
4817       // load regs
4818       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
4819       address_generation(i+1,&branch_regs[i],0);
4820       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
4821       ds_assemble(i+1,&branch_regs[i]);
4822       cc=get_reg(branch_regs[i].regmap,CCREG);
4823       if(cc==-1) {
4824         emit_loadreg(CCREG,cc=HOST_CCREG);
4825         // CHECK: Is the following instruction (fall thru) allocated ok?
4826       }
4827       assert(cc==HOST_CCREG);
4828       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4829       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
4830       assem_debug("cycle count (adj)\n");
4831       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4832       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4833       if(internal)
4834         assem_debug("branch: internal\n");
4835       else
4836         assem_debug("branch: external\n");
4837       if(internal&&is_ds[(ba[i]-start)>>2]) {
4838         ds_assemble_entry(i);
4839       }
4840       else {
4841         add_to_linker(out,ba[i],internal);
4842         emit_jmp(0);
4843       }
4844     }
4845     // branch not taken
4846     if(!unconditional) {
4847       set_jump_target(nottaken, out);
4848       assem_debug("1:\n");
4849       if(!likely[i]) {
4850         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,ds_unneeded);
4851         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
4852         address_generation(i+1,&branch_regs[i],0);
4853         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4854         ds_assemble(i+1,&branch_regs[i]);
4855       }
4856       cc=get_reg(branch_regs[i].regmap,CCREG);
4857       if(cc==-1&&!likely[i]) {
4858         // Cycle count isn't in a register, temporarily load it then write it out
4859         emit_loadreg(CCREG,HOST_CCREG);
4860         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4861         void *jaddr=out;
4862         emit_jns(0);
4863         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
4864         emit_storereg(CCREG,HOST_CCREG);
4865       }
4866       else{
4867         cc=get_reg(i_regmap,CCREG);
4868         assert(cc==HOST_CCREG);
4869         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
4870         void *jaddr=out;
4871         emit_jns(0);
4872         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
4873       }
4874     }
4875   }
4876 }
4877
// Assemble a branch/jump instruction whose delay slot falls across a page
// boundary (itype SPAN).  Instead of branching directly, the branch-target
// address is computed into HOST_BTREG and control jumps to the delay-slot
// entry stub; pagespan_ds() later reads the target back out of BTREG /
// branch_target.  i is the instruction index within the current block,
// i_regs the register state at that instruction.
static void pagespan_assemble(int i,struct regstat *i_regs)
{
  // Host registers holding the low/high 32-bit halves of rs1/rs2
  // (-1 if the half is not mapped to a host register).
  int s1l=get_reg(i_regs->regmap,rs1[i]);
  int s1h=get_reg(i_regs->regmap,rs1[i]|64);
  int s2l=get_reg(i_regs->regmap,rs2[i]);
  int s2h=get_reg(i_regs->regmap,rs2[i]|64);
  void *taken = NULL;
  void *nottaken = NULL;
  int unconditional=0;
  if(rs1[i]==0)
  {
    // Comparing against r0: shift the rs2 operand into the rs1 slots so
    // the code below only has to handle a single register + zero.
    s1l=s2l;s1h=s2h;
    s2l=s2h=-1;
  }
  else if(rs2[i]==0)
  {
    s2l=s2h=-1;
  }
  if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
    // Both operands are known 32-bit; the upper halves are not needed.
    s1h=s2h=-1;
  }
  // Pick scratch host registers for the computed target (addr), the
  // alternate/fall-through target (alt) and, for BLEZ/BGTZ, a third
  // temporary (ntaddr).  None of them may alias the operands, CCREG,
  // or (for alt/ntaddr) BTREG.
  int hr=0;
  int addr=-1,alt=-1,ntaddr=-1;
  if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
  else {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        addr=hr++;break;
      }
      hr++;
    }
  }
  while(hr<HOST_REGS)
  {
    if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
       (i_regs->regmap[hr]&63)!=rs1[i] &&
       (i_regs->regmap[hr]&63)!=rs2[i] )
    {
      alt=hr++;break;
    }
    hr++;
  }
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
  {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        ntaddr=hr;break;
      }
      hr++;
    }
  }
  assert(hr<HOST_REGS);
  if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
    load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
  }
  // Charge the cycle count for this instruction plus the delay slot.
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  if(opcode[i]==2) // J
  {
    unconditional=1;
  }
  if(opcode[i]==3) // JAL
  {
    // TODO: mini_ht
    int rt=get_reg(i_regs->regmap,31);
    emit_movimm(start+i*4+8,rt); // link: PC of instruction after the delay slot
    unconditional=1;
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    emit_mov(s1l,addr); // register branch: target comes from rs
    if(opcode2[i]==9) // JALR
    {
      int rt=get_reg(i_regs->regmap,rt1[i]);
      emit_movimm(start+i*4+8,rt);
    }
  }
  // For the conditional branches below the taken/not-taken targets are
  // selected branchlessly (conditional moves) into addr.
  if((opcode[i]&0x3f)==4) // BEQ
  {
    if(rs1[i]==rs2[i])
    {
      unconditional=1; // same register compares equal to itself
    }
    else
    #ifdef HAVE_CMOV_IMM
    if(s1h<0) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
    }
    else
    #endif
    {
      assert(s1l>=0);
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      if(s1h>=0) {
        if(s2h>=0) emit_cmp(s1h,s2h);
        else emit_test(s1h,s1h);
        emit_cmovne_reg(alt,addr);
      }
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==5) // BNE
  {
    #ifdef HAVE_CMOV_IMM
    if(s1h<0) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
    }
    else
    #endif
    {
      assert(s1l>=0);
      emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
      if(s1h>=0) {
        if(s2h>=0) emit_cmp(s1h,s2h);
        else emit_test(s1h,s1h);
        emit_cmovne_reg(alt,addr);
      }
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  // Branch-likely forms use real conditional jumps instead, because the
  // not-taken path must skip the delay slot (handled at the bottom).
  if((opcode[i]&0x3f)==0x14) // BEQL
  {
    if(s1h>=0) {
      if(s2h>=0) emit_cmp(s1h,s2h);
      else emit_test(s1h,s1h);
      nottaken=out;
      emit_jne(0);
    }
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    if(nottaken) set_jump_target(nottaken, out);
    nottaken=out;
    emit_jne(0);
  }
  if((opcode[i]&0x3f)==0x15) // BNEL
  {
    if(s1h>=0) {
      if(s2h>=0) emit_cmp(s1h,s2h);
      else emit_test(s1h,s1h);
      taken=out;
      emit_jne(0);
    }
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    nottaken=out;
    emit_jeq(0);
    if(taken) set_jump_target(taken, out);
  }
  if((opcode[i]&0x3f)==6) // BLEZ
  {
    emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
    emit_cmpimm(s1l,1); // taken if rs < 1, i.e. rs <= 0
    if(s1h>=0) emit_mov(addr,ntaddr);
    emit_cmovl_reg(alt,addr);
    if(s1h>=0) {
      emit_test(s1h,s1h);
      emit_cmovne_reg(ntaddr,addr); // upper half nonzero: not <= 0 ...
      emit_cmovs_reg(alt,addr);     // ... unless negative
    }
  }
  if((opcode[i]&0x3f)==7) // BGTZ
  {
    emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
    emit_cmpimm(s1l,1); // not taken if rs < 1
    if(s1h>=0) emit_mov(addr,alt);
    emit_cmovl_reg(ntaddr,addr);
    if(s1h>=0) {
      emit_test(s1h,s1h);
      emit_cmovne_reg(alt,addr);
      emit_cmovs_reg(ntaddr,addr);
    }
  }
  // BLEZL/BGTZL and BLTZ/BGEZ are never compiled as page-spanning
  // branches; these asserts document that invariant.
  if((opcode[i]&0x3f)==0x16) // BLEZL
  {
    assert((opcode[i]&0x3f)!=0x16);
  }
  if((opcode[i]&0x3f)==0x17) // BGTZL
  {
    assert((opcode[i]&0x3f)!=0x17);
  }
  assert(opcode[i]!=1); // BLTZ/BGEZ

  //FIXME: Check CSREG
  if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
    // COP1 conditional branches test the FP condition bit (0x800000).
    if((source[i]&0x30000)==0) // BC1F
    {
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x10000) // BC1T
    {
      emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x20000) // BC1FL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jne(0);
    }
    if((source[i]&0x30000)==0x30000) // BC1TL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jeq(0);
    }
  }

  assert(i_regs->regmap[HOST_CCREG]==CCREG);
  wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
  // Leave the branch target in HOST_BTREG for the delay-slot entry code.
  if(likely[i]||unconditional)
  {
    emit_movimm(ba[i],HOST_BTREG);
  }
  else if(addr!=HOST_BTREG)
  {
    emit_mov(addr,HOST_BTREG);
  }
  void *branch_addr=out;
  emit_jmp(0);
  // Delay-slot address (start+i*4+4) plus 1: the odd vaddr marks a
  // delay-slot entry point (see pagespan_ds, which registers start+1).
  int target_addr=start+i*4+5;
  void *stub=out;
  void *compiled_target_addr=check_addr(target_addr);
  emit_extjump_ds(branch_addr, target_addr);
  if(compiled_target_addr) {
    // Target already compiled: patch the jump directly and register the
    // stub so it can be re-linked if the target is invalidated.
    set_jump_target(branch_addr, compiled_target_addr);
    add_link(target_addr,stub);
  }
  else set_jump_target(branch_addr, stub);
  if(likely[i]) {
    // Not-taken path of a branch-likely: skip the delay slot and jump to
    // the instruction after it (start+i*4+8) via its own external stub.
    set_jump_target(nottaken, out);
    wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
    void *branch_addr=out;
    emit_jmp(0);
    int target_addr=start+i*4+8;
    void *stub=out;
    void *compiled_target_addr=check_addr(target_addr);
    emit_extjump_ds(branch_addr, target_addr);
    if(compiled_target_addr) {
      set_jump_target(branch_addr, compiled_target_addr);
      add_link(target_addr,stub);
    }
    else set_jump_target(branch_addr, stub);
  }
}
5141
5142 // Assemble the delay slot for the above
// Assemble a block that BEGINS with the delay slot of a page-spanning
// branch compiled by pagespan_assemble() above.  The block is registered
// under the odd virtual address start+1 (the marker pagespan_assemble
// jumps to), the delay-slot instruction (index 0) is assembled, and then
// control transfers to the branch target previously stashed in
// BTREG/branch_target — or falls through to start+4 if the "target" was
// just the next sequential instruction.
static void pagespan_ds()
{
  assem_debug("initial delay slot:\n");
  // Odd address marks a delay-slot entry; matches start+i*4+5 used as the
  // link target in pagespan_assemble().
  u_int vaddr=start+1;
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  ll_add(jump_dirty+vpage,vaddr,(void *)out);
  do_dirty_stub_ds();
  ll_add(jump_in+page,vaddr,(void *)out);
  assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
  if(regs[0].regmap[HOST_BTREG]!=BTREG)
    // Branch target won't survive in a register; spill it to memory so it
    // can be reloaded after the delay slot executes.
    emit_writeword(HOST_BTREG,&branch_target);
  load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
  address_generation(0,&regs[0],regs[0].regmap_entry);
  if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
    load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
  is_delayslot=0;
  // Dispatch to the per-type assembler for the delay-slot instruction.
  switch(itype[0]) {
    case ALU:
      alu_assemble(0,&regs[0]);break;
    case IMM16:
      imm16_assemble(0,&regs[0]);break;
    case SHIFT:
      shift_assemble(0,&regs[0]);break;
    case SHIFTIMM:
      shiftimm_assemble(0,&regs[0]);break;
    case LOAD:
      load_assemble(0,&regs[0]);break;
    case LOADLR:
      loadlr_assemble(0,&regs[0]);break;
    case STORE:
      store_assemble(0,&regs[0]);break;
    case STORELR:
      storelr_assemble(0,&regs[0]);break;
    case COP0:
      cop0_assemble(0,&regs[0]);break;
    case COP1:
      cop1_assemble(0,&regs[0]);break;
    case C1LS:
      c1ls_assemble(0,&regs[0]);break;
    case COP2:
      cop2_assemble(0,&regs[0]);break;
    case C2LS:
      c2ls_assemble(0,&regs[0]);break;
    case C2OP:
      c2op_assemble(0,&regs[0]);break;
    case MULTDIV:
      multdiv_assemble(0,&regs[0]);break;
    case MOV:
      mov_assemble(0,&regs[0]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
    case FJUMP:
      // A control-transfer instruction in a delay slot is not valid code.
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Recover the branch target, from a register if still mapped, otherwise
  // from the branch_target spill slot.
  int btaddr=get_reg(regs[0].regmap,BTREG);
  if(btaddr<0) {
    btaddr=get_reg(regs[0].regmap,-1);
    emit_readword(&branch_target,btaddr);
  }
  assert(btaddr!=HOST_CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
#ifdef HOST_IMM8
  // Host can't encode a 32-bit immediate compare; go through a temp reg.
  emit_movimm(start+4,HOST_TEMPREG);
  emit_cmp(btaddr,HOST_TEMPREG);
#else
  emit_cmpimm(btaddr,start+4);
#endif
  void *branch = out;
  emit_jeq(0);
  // Target differs from the next sequential address: do an indirect
  // dispatch through jump_vaddr on the register holding the target.
  store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
  emit_jmp(jump_vaddr_reg[btaddr]);
  set_jump_target(branch, out);
  // Target == start+4: fall through into the rest of this block.
  store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
  load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
}
5227
5228 // Basic liveness analysis for MIPS registers
// Basic liveness analysis for MIPS registers
//
// Walks instructions [istart..iend] backwards and fills in, for each i,
// unneeded_reg[i] / gte_unneeded[i]: bitmaps where bit N set means MIPS
// register N (resp. GTE register N) is dead at instruction i — its value
// will be overwritten before being read again.  Also fills
// branch_unneeded_reg[] for branches and marks branch targets / return
// addresses in bt[].  r is the recursion depth for backward branches
// (capped at 3 levels to bound compile time).
void unneeded_registers(int istart,int iend,int r)
{
  int i;
  uint64_t u,gte_u,b,gte_b;
  uint64_t temp_u,temp_gte_u=0;
  uint64_t gte_u_unknown=0;
  if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
    gte_u_unknown=~0ll; // hack: treat all GTE regs as dead at block exits
  if(iend==slen-1) {
    // End of block: nothing is known to be needed afterwards except that
    // r0 (bit 0) is always unneeded.
    u=1;
    gte_u=gte_u_unknown;
  }else{
    //u=unneeded_reg[iend+1];
    u=1;
    gte_u=gte_unneeded[iend+1];
  }

  for (i=iend;i>=istart;i--)
  {
    //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
    {
      // If subroutine call, flag return address as a possible branch target
      if(rt1[i]==31 && i<slen-2) bt[i+2]=1;

      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, flush all regs
        u=1;
        gte_u=gte_u_unknown;
        branch_unneeded_reg[i]=u;
        // Merge in delay slot
        u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
        u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
        u|=1;
        gte_u|=gte_rt[i+1];
        gte_u&=~gte_rs[i+1];
        // If branch is "likely" (and conditional)
        // then we skip the delay slot on the fall-thru path
        if(likely[i]) {
          if(i<slen-1) {
            u&=unneeded_reg[i+2];
            gte_u&=gte_unneeded[i+2];
          }
          else
          {
            u=1;
            gte_u=gte_u_unknown;
          }
        }
      }
      else
      {
        // Internal branch, flag target
        bt[(ba[i]-start)>>2]=1;
        if(ba[i]<=start+i*4) {
          // Backward branch: the target's liveness isn't known yet on this
          // backward pass, so compute a provisional result (temp_u) and
          // recurse over the loop body.
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            temp_u=1;
            temp_gte_u=0;
          } else {
            // Conditional branch (not taken case)
            temp_u=unneeded_reg[i+2];
            // NOTE(review): uses &= (unlike temp_u's plain assignment just
            // above), folding in temp_gte_u's previous value.  Since
            // temp_gte_u starts at 0 this is conservative (nothing marked
            // unneeded) — confirm whether '=' was intended.
            temp_gte_u&=gte_unneeded[i+2];
          }
          // Merge in delay slot
          temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
          temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
          temp_u|=1;
          temp_gte_u|=gte_rt[i+1];
          temp_gte_u&=~gte_rs[i+1];
          // If branch is "likely" (and conditional)
          // then we skip the delay slot on the fall-thru path
          if(likely[i]) {
            if(i<slen-1) {
              temp_u&=unneeded_reg[i+2];
              temp_gte_u&=gte_unneeded[i+2];
            }
            else
            {
              temp_u=1;
              temp_gte_u=gte_u_unknown;
            }
          }
          // Merge in the branch instruction itself (writes then reads).
          temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
          temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
          temp_u|=1;
          temp_gte_u|=gte_rt[i];
          temp_gte_u&=~gte_rs[i];
          unneeded_reg[i]=temp_u;
          gte_unneeded[i]=temp_gte_u;
          // Only go three levels deep.  This recursion can take an
          // excessive amount of time if there are a lot of nested loops.
          if(r<2) {
            unneeded_registers((ba[i]-start)>>2,i-1,r+1);
          }else{
            // Too deep: assume everything is needed at the target.
            unneeded_reg[(ba[i]-start)>>2]=1;
            gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
          }
        } /*else*/ if(1) {
          // Forward branch (or backward after the recursion above): the
          // target's unneeded_reg is now available, merge it in.
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            u=unneeded_reg[(ba[i]-start)>>2];
            gte_u=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=u;
            // Merge in delay slot
            u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
            u|=1;
            gte_u|=gte_rt[i+1];
            gte_u&=~gte_rs[i+1];
          } else {
            // Conditional branch
            b=unneeded_reg[(ba[i]-start)>>2];
            gte_b=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=b;
            // Branch delay slot
            b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
            b|=1;
            gte_b|=gte_rt[i+1];
            gte_b&=~gte_rs[i+1];
            // If branch is "likely" then we skip the
            // delay slot on the fall-thru path
            if(likely[i]) {
              u=b;
              gte_u=gte_b;
              if(i<slen-1) {
                u&=unneeded_reg[i+2];
                gte_u&=gte_unneeded[i+2];
              }
            } else {
              // A reg is dead only if dead on both taken and fall-thru paths.
              u&=b;
              gte_u&=gte_b;
            }
            if(i<slen-1) {
              branch_unneeded_reg[i]&=unneeded_reg[i+2];
            } else {
              branch_unneeded_reg[i]=1;
            }
          }
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      u=1;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      u=1;
    }
    //u=1; // DEBUG
    // Written registers are unneeded
    u|=1LL<<rt1[i];
    u|=1LL<<rt2[i];
    gte_u|=gte_rt[i];
    // Accessed registers are needed
    u&=~(1LL<<rs1[i]);
    u&=~(1LL<<rs2[i]);
    gte_u&=~gte_rs[i];
    if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
      gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
    // Source-target dependencies
    // R0 is always unneeded
    u|=1;
    // Save it
    unneeded_reg[i]=u;
    gte_unneeded[i]=gte_u;
    /*
    printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
    printf("U:");
    int r;
    for(r=1;r<=CCREG;r++) {
      if((unneeded_reg[i]>>r)&1) {
        if(r==HIREG) printf(" HI");
        else if(r==LOREG) printf(" LO");
        else printf(" r%d",r);
      }
    }
    printf("\n");
    */
  }
}
5418
5419 // Write back dirty registers as soon as we will no longer modify them,
5420 // so that we don't end up with lots of writes at the branches.
5421 void clean_registers(int istart,int iend,int wr)
5422 {
5423   int i;
5424   int r;
5425   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
5426   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
5427   if(iend==slen-1) {
5428     will_dirty_i=will_dirty_next=0;
5429     wont_dirty_i=wont_dirty_next=0;
5430   }else{
5431     will_dirty_i=will_dirty_next=will_dirty[iend+1];
5432     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
5433   }
5434   for (i=iend;i>=istart;i--)
5435   {
5436     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
5437     {
5438       if(ba[i]<start || ba[i]>=(start+slen*4))
5439       {
5440         // Branch out of this block, flush all regs
5441         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
5442         {
5443           // Unconditional branch
5444           will_dirty_i=0;
5445           wont_dirty_i=0;
5446           // Merge in delay slot (will dirty)
5447           for(r=0;r<HOST_REGS;r++) {
5448             if(r!=EXCLUDE_REG) {
5449               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
5450               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
5451               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
5452               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
5453               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
5454               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
5455               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
5456               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
5457               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
5458               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
5459               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
5460               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
5461               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
5462               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
5463             }
5464           }
5465         }
5466         else
5467         {
5468           // Conditional branch
5469           will_dirty_i=0;
5470           wont_dirty_i=wont_dirty_next;
5471           // Merge in delay slot (will dirty)
5472           for(r=0;r<HOST_REGS;r++) {
5473             if(r!=EXCLUDE_REG) {
5474               if(!likely[i]) {
5475                 // Might not dirty if likely branch is not taken
5476                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
5477                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
5478                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
5479                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
5480                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
5481                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
5482                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
5483                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
5484                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
5485                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
5486                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
5487                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
5488                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
5489                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
5490               }
5491             }
5492           }
5493         }
5494         // Merge in delay slot (wont dirty)
5495         for(r=0;r<HOST_REGS;r++) {
5496           if(r!=EXCLUDE_REG) {
5497             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
5498             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
5499             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
5500             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
5501             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
5502             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
5503             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
5504             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
5505             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
5506             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
5507           }
5508         }
5509         if(wr) {
5510           #ifndef DESTRUCTIVE_WRITEBACK
5511           branch_regs[i].dirty&=wont_dirty_i;
5512           #endif
5513           branch_regs[i].dirty|=will_dirty_i;
5514         }
5515       }
5516       else
5517       {
5518         // Internal branch
5519         if(ba[i]<=start+i*4) {
5520           // Backward branch
5521           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
5522           {
5523             // Unconditional branch
5524             temp_will_dirty=0;
5525             temp_wont_dirty=0;
5526             // Merge in delay slot (will dirty)
5527             for(r=0;r<HOST_REGS;r++) {
5528               if(r!=EXCLUDE_REG) {
5529                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
5530                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
5531                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
5532                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
5533                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
5534                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
5535                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
5536                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
5537                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
5538                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
5539                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
5540                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
5541                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
5542                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
5543               }
5544             }
5545           } else {
5546             // Conditional branch (not taken case)
5547             temp_will_dirty=will_dirty_next;
5548             temp_wont_dirty=wont_dirty_next;
5549             // Merge in delay slot (will dirty)
5550             for(r=0;r<HOST_REGS;r++) {
5551               if(r!=EXCLUDE_REG) {
5552                 if(!likely[i]) {
5553                   // Will not dirty if likely branch is not taken
5554                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
5555                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
5556                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
5557                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
5558                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
5559                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
5560                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
5561                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
5562                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
5563                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
5564                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
5565                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
5566                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
5567                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
5568                 }
5569               }
5570             }
5571           }
5572           // Merge in delay slot (wont dirty)
5573           for(r=0;r<HOST_REGS;r++) {
5574             if(r!=EXCLUDE_REG) {
5575               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
5576               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
5577               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
5578               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
5579               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
5580               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
5581               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
5582               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
5583               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
5584               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
5585             }
5586           }
5587           // Deal with changed mappings
5588           if(i<iend) {
5589             for(r=0;r<HOST_REGS;r++) {
5590               if(r!=EXCLUDE_REG) {
5591                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
5592                   temp_will_dirty&=~(1<<r);
5593                   temp_wont_dirty&=~(1<<r);
5594                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
5595                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
5596                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
5597                   } else {
5598                     temp_will_dirty|=1<<r;
5599                     temp_wont_dirty|=1<<r;
5600                   }
5601                 }
5602               }
5603             }
5604           }
5605           if(wr) {
5606             will_dirty[i]=temp_will_dirty;
5607             wont_dirty[i]=temp_wont_dirty;
5608             clean_registers((ba[i]-start)>>2,i-1,0);
5609           }else{
5610             // Limit recursion.  It can take an excessive amount
5611             // of time if there are a lot of nested loops.
5612             will_dirty[(ba[i]-start)>>2]=0;
5613             wont_dirty[(ba[i]-start)>>2]=-1;
5614           }
5615         }
5616         /*else*/ if(1)
5617         {
5618           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
5619           {
5620             // Unconditional branch
5621             will_dirty_i=0;
5622             wont_dirty_i=0;
5623           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
5624             for(r=0;r<HOST_REGS;r++) {
5625               if(r!=EXCLUDE_REG) {
5626                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
5627                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
5628                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
5629                 }
5630                 if(branch_regs[i].regmap[r]>=0) {
5631                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
5632                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
5633                 }
5634               }
5635             }
5636           //}
5637             // Merge in delay slot
5638             for(r=0;r<HOST_REGS;r++) {
5639               if(r!=EXCLUDE_REG) {
5640                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
5641                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
5642                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
5643                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
5644                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
5645                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
5646                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
5647                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
5648                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
5649                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
5650                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
5651                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
5652                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
5653                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
5654               }
5655             }
5656           } else {
5657             // Conditional branch
5658             will_dirty_i=will_dirty_next;
5659             wont_dirty_i=wont_dirty_next;
5660           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
5661             for(r=0;r<HOST_REGS;r++) {
5662               if(r!=EXCLUDE_REG) {
5663                 signed char target_reg=branch_regs[i].regmap[r];
5664                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
5665                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
5666                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
5667                 }
5668                 else if(target_reg>=0) {
5669                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
5670                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
5671                 }
5672                 // Treat delay slot as part of branch too
5673                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
5674                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
5675                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
5676                 }
5677                 else
5678                 {
5679                   will_dirty[i+1]&=~(1<<r);
5680                 }*/
5681               }
5682             }
5683           //}
5684             // Merge in delay slot
5685             for(r=0;r<HOST_REGS;r++) {
5686               if(r!=EXCLUDE_REG) {
5687                 if(!likely[i]) {
5688                   // Might not dirty if likely branch is not taken
5689                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
5690                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
5691                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
5692                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
5693                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
5694                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
5695                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
5696                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
5697                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
5698                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
5699                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
5700                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
5701                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
5702                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
5703                 }
5704               }
5705             }
5706           }
5707           // Merge in delay slot (won't dirty)
5708           for(r=0;r<HOST_REGS;r++) {
5709             if(r!=EXCLUDE_REG) {
5710               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
5711               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
5712               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
5713               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
5714               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
5715               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
5716               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
5717               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
5718               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
5719               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
5720             }
5721           }
5722           if(wr) {
5723             #ifndef DESTRUCTIVE_WRITEBACK
5724             branch_regs[i].dirty&=wont_dirty_i;
5725             #endif
5726             branch_regs[i].dirty|=will_dirty_i;
5727           }
5728         }
5729       }
5730     }
5731     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
5732     {
5733       // SYSCALL instruction (software interrupt)
5734       will_dirty_i=0;
5735       wont_dirty_i=0;
5736     }
5737     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
5738     {
5739       // ERET instruction (return from interrupt)
5740       will_dirty_i=0;
5741       wont_dirty_i=0;
5742     }
5743     will_dirty_next=will_dirty_i;
5744     wont_dirty_next=wont_dirty_i;
5745     for(r=0;r<HOST_REGS;r++) {
5746       if(r!=EXCLUDE_REG) {
5747         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
5748         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
5749         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
5750         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
5751         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
5752         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
5753         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
5754         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
5755         if(i>istart) {
5756           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
5757           {
5758             // Don't store a register immediately after writing it,
5759             // may prevent dual-issue.
5760             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
5761             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
5762           }
5763         }
5764       }
5765     }
5766     // Save it
5767     will_dirty[i]=will_dirty_i;
5768     wont_dirty[i]=wont_dirty_i;
5769     // Mark registers that won't be dirtied as not dirty
5770     if(wr) {
5771       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
5772       for(r=0;r<HOST_REGS;r++) {
5773         if((will_dirty_i>>r)&1) {
5774           printf(" r%d",r);
5775         }
5776       }
5777       printf("\n");*/
5778
5779       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
5780         regs[i].dirty|=will_dirty_i;
5781         #ifndef DESTRUCTIVE_WRITEBACK
5782         regs[i].dirty&=wont_dirty_i;
5783         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
5784         {
5785           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
5786             for(r=0;r<HOST_REGS;r++) {
5787               if(r!=EXCLUDE_REG) {
5788                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
5789                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
5790                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
5791               }
5792             }
5793           }
5794         }
5795         else
5796         {
5797           if(i<iend) {
5798             for(r=0;r<HOST_REGS;r++) {
5799               if(r!=EXCLUDE_REG) {
5800                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
5801                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
5802                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
5803               }
5804             }
5805           }
5806         }
5807         #endif
5808       //}
5809     }
5810     // Deal with changed mappings
5811     temp_will_dirty=will_dirty_i;
5812     temp_wont_dirty=wont_dirty_i;
5813     for(r=0;r<HOST_REGS;r++) {
5814       if(r!=EXCLUDE_REG) {
5815         int nr;
5816         if(regs[i].regmap[r]==regmap_pre[i][r]) {
5817           if(wr) {
5818             #ifndef DESTRUCTIVE_WRITEBACK
5819             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
5820             #endif
5821             regs[i].wasdirty|=will_dirty_i&(1<<r);
5822           }
5823         }
5824         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
5825           // Register moved to a different register
5826           will_dirty_i&=~(1<<r);
5827           wont_dirty_i&=~(1<<r);
5828           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
5829           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
5830           if(wr) {
5831             #ifndef DESTRUCTIVE_WRITEBACK
5832             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
5833             #endif
5834             regs[i].wasdirty|=will_dirty_i&(1<<r);
5835           }
5836         }
5837         else {
5838           will_dirty_i&=~(1<<r);
5839           wont_dirty_i&=~(1<<r);
5840           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
5841             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
5842             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
5843           } else {
5844             wont_dirty_i|=1<<r;
5845             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
5846           }
5847         }
5848       }
5849     }
5850   }
5851 }
5852
#ifdef DISASM
  /* disassembly */
// Print a one-line human-readable disassembly of decoded instruction i.
// Uses the pass-1 decode results (itype/insn/rs1/rs2/rt1/imm/ba etc.)
// rather than re-decoding source[i]; a leading '*' marks a branch target.
void disassemble_inst(int i)
{
    if (bt[i]) printf("*"); else printf(" "); // '*' = branch target
    switch(itype[i]) {
      case UJUMP:
        printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
      case CJUMP:
        // branch target recomputed from the 16-bit offset in source[i];
        // NOTE(review): the i==0 fallback prints *ba (ba[0]) — looks like
        // a guard for the first slot, confirm against callers
        printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
      case SJUMP:
        printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
      case FJUMP:
        printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
      case RJUMP:
        // JALR with a non-$ra link register prints both registers
        if (opcode[i]==0x9&&rt1[i]!=31)
          printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
        else
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        break;
      case SPAN:
        printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
      case IMM16:
        if(opcode[i]==0xf) //LUI
          printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
        else
          printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case LOAD:
      case LOADLR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case STORE:
      case STORELR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
        break;
      case ALU:
      case SHIFT:
        printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
        break;
      case MULTDIV:
        printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
        break;
      case SHIFTIMM:
        printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case MOV:
        // MFHI/MFLO write rt1; MTHI/MTLO read rs1 (distinguished by opcode2)
        if((opcode2[i]&0x1d)==0x10)
          printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
        else if((opcode2[i]&0x1d)==0x11)
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        else
          printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP0:
        if(opcode2[i]==0)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
        else if(opcode2[i]==4)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP1:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP2:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case C1LS:
        printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case C2LS:
        printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case INTCALL:
        printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
        break;
      default:
        //printf (" %s %8x\n",insn[i],source[i]);
        printf (" %x: %s\n",start+i*4,insn[i]);
    }
}
#else
// DISASM disabled: compile to a no-op so call sites need no #ifdefs.
static void disassemble_inst(int i) {}
#endif // DISASM
5945
5946 #define DRC_TEST_VAL 0x74657374
5947
5948 static int new_dynarec_test(void)
5949 {
5950   int (*testfunc)(void) = (void *)out;
5951   void *beginning;
5952   int ret;
5953
5954   beginning = start_block();
5955   emit_movimm(DRC_TEST_VAL,0); // test
5956   emit_jmpreg(14);
5957   literal_pool(0);
5958   end_block(beginning);
5959   SysPrintf("testing if we can run recompiled code..\n");
5960   ret = testfunc();
5961   if (ret == DRC_TEST_VAL)
5962     SysPrintf("test passed.\n");
5963   else
5964     SysPrintf("test failed: %08x\n", ret);
5965   out = translation_cache;
5966   return ret == DRC_TEST_VAL;
5967 }
5968
5969 // clear the state completely, instead of just marking
5970 // things invalid like invalidate_all_pages() does
5971 void new_dynarec_clear_full()
5972 {
5973   int n;
5974   out = translation_cache;
5975   memset(invalid_code,1,sizeof(invalid_code));
5976   memset(hash_table,0xff,sizeof(hash_table));
5977   memset(mini_ht,-1,sizeof(mini_ht));
5978   memset(restore_candidate,0,sizeof(restore_candidate));
5979   memset(shadow,0,sizeof(shadow));
5980   copy=shadow;
5981   expirep=16384; // Expiry pointer, +2 blocks
5982   pending_exception=0;
5983   literalcount=0;
5984   stop_after_jal=0;
5985   inv_code_start=inv_code_end=~0;
5986   // TLB
5987   for(n=0;n<4096;n++) ll_clear(jump_in+n);
5988   for(n=0;n<4096;n++) ll_clear(jump_out+n);
5989   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
5990 }
5991
// One-time dynarec initialization: obtain an executable translation
// cache by whatever strategy the build selected, reset all state,
// and sanity-check that generated code can run (new_dynarec_test).
void new_dynarec_init()
{
  SysPrintf("Init new dynarec\n");

  // allocate/prepare a buffer for translation cache
  // see assem_arm.h for some explanation
#if   defined(BASE_ADDR_FIXED)
  // cache must land at the exact compile-time address, or code
  // emitted with absolute references would be wrong — abort if not
  if (mmap(translation_cache, 1 << TARGET_SIZE_2,
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_PRIVATE | MAP_ANONYMOUS,
            -1, 0) != translation_cache) {
    SysPrintf("mmap() failed: %s\n", strerror(errno));
    SysPrintf("disable BASE_ADDR_FIXED and recompile\n");
    abort();
  }
#elif defined(BASE_ADDR_DYNAMIC)
  #ifdef VITA
  // Vita has no regular mmap; use the kernel's VM block allocator
  sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
  if (sceBlock < 0)
    SysPrintf("sceKernelAllocMemBlockForVM failed\n");
  int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache);
  if (ret < 0)
    SysPrintf("sceKernelGetMemBlockBase failed\n");
  #else
  // let the OS pick the address
  translation_cache = mmap (NULL, 1 << TARGET_SIZE_2,
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (translation_cache == MAP_FAILED) {
    SysPrintf("mmap() failed: %s\n", strerror(errno));
    abort();
  }
  #endif
#else
  // cache is a static buffer; just make it executable
  #ifndef NO_WRITE_EXEC
  // not all systems allow execute in data segment by default
  if (mprotect(translation_cache, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    SysPrintf("mprotect() failed: %s\n", strerror(errno));
  #endif
#endif
  out = translation_cache;
  cycle_multiplier=200;
  new_dynarec_clear_full();
#ifdef HOST_IMM8
  // Copy this into local area so we don't have to put it in every literal pool
  invc_ptr=invalid_code;
#endif
  arch_init();
  new_dynarec_test();
#ifndef RAM_FIXED
  // offset to translate PSX 0x80000000-based addresses to host RAM
  ram_offset=(uintptr_t)rdram-0x80000000;
#endif
  if (ram_offset!=0)
    SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
}
6046
// Tear down the dynarec: release the translation cache mapping
// (mirroring whichever allocation path new_dynarec_init took) and
// free all per-page block lists.
void new_dynarec_cleanup()
{
  int n;
#if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC)
  #ifdef VITA
  sceKernelFreeMemBlock(sceBlock);
  sceBlock = -1;
  #else
  if (munmap(translation_cache, 1<<TARGET_SIZE_2) < 0)
    SysPrintf("munmap() failed\n");
  #endif
#endif
  for(n=0;n<4096;n++) ll_clear(jump_in+n);
  for(n=0;n<4096;n++) ll_clear(jump_out+n);
  for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
  #ifdef ROM_COPY
  if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
  #endif
}
6066
6067 static u_int *get_source_start(u_int addr, u_int *limit)
6068 {
6069   if (addr < 0x00200000 ||
6070     (0xa0000000 <= addr && addr < 0xa0200000)) {
6071     // used for BIOS calls mostly?
6072     *limit = (addr&0xa0000000)|0x00200000;
6073     return (u_int *)(rdram + (addr&0x1fffff));
6074   }
6075   else if (!Config.HLE && (
6076     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
6077     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
6078     // BIOS
6079     *limit = (addr & 0xfff00000) | 0x80000;
6080     return (u_int *)((u_char *)psxR + (addr&0x7ffff));
6081   }
6082   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
6083     *limit = (addr & 0x80600000) + 0x00200000;
6084     return (u_int *)(rdram + (addr&0x1fffff));
6085   }
6086   return NULL;
6087 }
6088
6089 static u_int scan_for_ret(u_int addr)
6090 {
6091   u_int limit = 0;
6092   u_int *mem;
6093
6094   mem = get_source_start(addr, &limit);
6095   if (mem == NULL)
6096     return addr;
6097
6098   if (limit > addr + 0x1000)
6099     limit = addr + 0x1000;
6100   for (; addr < limit; addr += 4, mem++) {
6101     if (*mem == 0x03e00008) // jr $ra
6102       return addr + 8;
6103   }
6104   return addr;
6105 }
6106
// Per-block record written into savestates so that compiled blocks can
// be precompiled again when the state is loaded (see
// new_dynarec_save_blocks / new_dynarec_load_blocks).
struct savestate_block {
  uint32_t addr;     // virtual start address of the block
  uint32_t regflags; // bitmask of GPRs holding 0x1f80xxxx (scratchpad) values
};
6111
6112 static int addr_cmp(const void *p1_, const void *p2_)
6113 {
6114   const struct savestate_block *p1 = p1_, *p2 = p2_;
6115   return p1->addr - p2->addr;
6116 }
6117
6118 int new_dynarec_save_blocks(void *save, int size)
6119 {
6120   struct savestate_block *blocks = save;
6121   int maxcount = size / sizeof(blocks[0]);
6122   struct savestate_block tmp_blocks[1024];
6123   struct ll_entry *head;
6124   int p, s, d, o, bcnt;
6125   u_int addr;
6126
6127   o = 0;
6128   for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
6129     bcnt = 0;
6130     for (head = jump_in[p]; head != NULL; head = head->next) {
6131       tmp_blocks[bcnt].addr = head->vaddr;
6132       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
6133       bcnt++;
6134     }
6135     if (bcnt < 1)
6136       continue;
6137     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
6138
6139     addr = tmp_blocks[0].addr;
6140     for (s = d = 0; s < bcnt; s++) {
6141       if (tmp_blocks[s].addr < addr)
6142         continue;
6143       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
6144         tmp_blocks[d++] = tmp_blocks[s];
6145       addr = scan_for_ret(tmp_blocks[s].addr);
6146     }
6147
6148     if (o + d > maxcount)
6149       d = maxcount - o;
6150     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
6151     o += d;
6152   }
6153
6154   return o * sizeof(blocks[0]);
6155 }
6156
6157 void new_dynarec_load_blocks(const void *save, int size)
6158 {
6159   const struct savestate_block *blocks = save;
6160   int count = size / sizeof(blocks[0]);
6161   u_int regs_save[32];
6162   uint32_t f;
6163   int i, b;
6164
6165   get_addr(psxRegs.pc);
6166
6167   // change GPRs for speculation to at least partially work..
6168   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
6169   for (i = 1; i < 32; i++)
6170     psxRegs.GPR.r[i] = 0x80000000;
6171
6172   for (b = 0; b < count; b++) {
6173     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6174       if (f & 1)
6175         psxRegs.GPR.r[i] = 0x1f800000;
6176     }
6177
6178     get_addr(blocks[b].addr);
6179
6180     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6181       if (f & 1)
6182         psxRegs.GPR.r[i] = 0x80000000;
6183     }
6184   }
6185
6186   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
6187 }
6188
6189 int new_recompile_block(int addr)
6190 {
6191   u_int pagelimit = 0;
6192   u_int state_rflags = 0;
6193   int i;
6194
6195   assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out);
6196   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
6197   //if(debug)
6198   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
6199
6200   // this is just for speculation
6201   for (i = 1; i < 32; i++) {
6202     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
6203       state_rflags |= 1 << i;
6204   }
6205
6206   start = (u_int)addr&~3;
6207   //assert(((u_int)addr&1)==0);
6208   new_dynarec_did_compile=1;
6209   if (Config.HLE && start == 0x80001000) // hlecall
6210   {
6211     // XXX: is this enough? Maybe check hleSoftCall?
6212     void *beginning=start_block();
6213     u_int page=get_page(start);
6214
6215     invalid_code[start>>12]=0;
6216     emit_movimm(start,0);
6217     emit_writeword(0,&pcaddr);
6218     emit_jmp(new_dyna_leave);
6219     literal_pool(0);
6220     end_block(beginning);
6221     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
6222     return 0;
6223   }
6224
6225   source = get_source_start(start, &pagelimit);
6226   if (source == NULL) {
6227     SysPrintf("Compile at bogus memory address: %08x\n", addr);
6228     exit(1);
6229   }
6230
6231   /* Pass 1: disassemble */
6232   /* Pass 2: register dependencies, branch targets */
6233   /* Pass 3: register allocation */
6234   /* Pass 4: branch dependencies */
6235   /* Pass 5: pre-alloc */
6236   /* Pass 6: optimize clean/dirty state */
6237   /* Pass 7: flag 32-bit registers */
6238   /* Pass 8: assembly */
6239   /* Pass 9: linker */
6240   /* Pass 10: garbage collection / free memory */
6241
6242   int j;
6243   int done=0;
6244   unsigned int type,op,op2;
6245
6246   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
6247
6248   /* Pass 1 disassembly */
6249
6250   for(i=0;!done;i++) {
6251     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
6252     minimum_free_regs[i]=0;
6253     opcode[i]=op=source[i]>>26;
6254     switch(op)
6255     {
6256       case 0x00: strcpy(insn[i],"special"); type=NI;
6257         op2=source[i]&0x3f;
6258         switch(op2)
6259         {
6260           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
6261           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
6262           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
6263           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
6264           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
6265           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
6266           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
6267           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
6268           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
6269           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
6270           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
6271           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
6272           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
6273           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
6274           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
6275           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
6276           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
6277           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
6278           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
6279           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
6280           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
6281           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
6282           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
6283           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
6284           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
6285           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
6286           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
6287           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
6288           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
6289           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
6290           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
6291           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
6292           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
6293           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
6294           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
6295 #if 0
6296           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
6297           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
6298           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
6299           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
6300           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
6301           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
6302           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
6303           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
6304           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
6305           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
6306           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
6307           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
6308           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
6309           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
6310           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
6311           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
6312           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
6313 #endif
6314         }
6315         break;
6316       case 0x01: strcpy(insn[i],"regimm"); type=NI;
6317         op2=(source[i]>>16)&0x1f;
6318         switch(op2)
6319         {
6320           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
6321           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
6322           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
6323           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
6324           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
6325           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
6326           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
6327           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
6328           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
6329           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
6330           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
6331           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
6332           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
6333           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
6334         }
6335         break;
6336       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
6337       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
6338       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
6339       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
6340       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
6341       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
6342       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
6343       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
6344       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
6345       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
6346       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
6347       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
6348       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
6349       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
6350       case 0x10: strcpy(insn[i],"cop0"); type=NI;
6351         op2=(source[i]>>21)&0x1f;
6352         switch(op2)
6353         {
6354           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
6355           case 0x02: strcpy(insn[i],"CFC0"); type=COP0; break;
6356           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
6357           case 0x06: strcpy(insn[i],"CTC0"); type=COP0; break;
6358           case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
6359         }
6360         break;
6361       case 0x11: strcpy(insn[i],"cop1"); type=COP1;
6362         op2=(source[i]>>21)&0x1f;
6363         break;
6364 #if 0
6365       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
6366       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
6367       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
6368       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
6369       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
6370       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
6371       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
6372       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
6373 #endif
6374       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
6375       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
6376       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
6377       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
6378       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
6379       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
6380       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
6381 #if 0
6382       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
6383 #endif
6384       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
6385       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
6386       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
6387       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
6388 #if 0
6389       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
6390       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
6391 #endif
6392       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
6393       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
6394       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
6395       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
6396 #if 0
6397       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
6398       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
6399       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
6400 #endif
6401       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
6402       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
6403 #if 0
6404       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
6405       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
6406       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
6407 #endif
6408       case 0x12: strcpy(insn[i],"COP2"); type=NI;
6409         op2=(source[i]>>21)&0x1f;
6410         //if (op2 & 0x10) {
6411         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
6412           if (gte_handlers[source[i]&0x3f]!=NULL) {
6413             if (gte_regnames[source[i]&0x3f]!=NULL)
6414               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
6415             else
6416               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
6417             type=C2OP;
6418           }
6419         }
6420         else switch(op2)
6421         {
6422           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
6423           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
6424           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
6425           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
6426         }
6427         break;
6428       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
6429       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
6430       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
6431       default: strcpy(insn[i],"???"); type=NI;
6432         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
6433         break;
6434     }
6435     itype[i]=type;
6436     opcode2[i]=op2;
6437     /* Get registers/immediates */
6438     lt1[i]=0;
6439     us1[i]=0;
6440     us2[i]=0;
6441     dep1[i]=0;
6442     dep2[i]=0;
6443     gte_rs[i]=gte_rt[i]=0;
6444     switch(type) {
6445       case LOAD:
6446         rs1[i]=(source[i]>>21)&0x1f;
6447         rs2[i]=0;
6448         rt1[i]=(source[i]>>16)&0x1f;
6449         rt2[i]=0;
6450         imm[i]=(short)source[i];
6451         break;
6452       case STORE:
6453       case STORELR:
6454         rs1[i]=(source[i]>>21)&0x1f;
6455         rs2[i]=(source[i]>>16)&0x1f;
6456         rt1[i]=0;
6457         rt2[i]=0;
6458         imm[i]=(short)source[i];
6459         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
6460         break;
6461       case LOADLR:
6462         // LWL/LWR only load part of the register,
6463         // therefore the target register must be treated as a source too
6464         rs1[i]=(source[i]>>21)&0x1f;
6465         rs2[i]=(source[i]>>16)&0x1f;
6466         rt1[i]=(source[i]>>16)&0x1f;
6467         rt2[i]=0;
6468         imm[i]=(short)source[i];
6469         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
6470         if(op==0x26) dep1[i]=rt1[i]; // LWR
6471         break;
6472       case IMM16:
6473         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
6474         else rs1[i]=(source[i]>>21)&0x1f;
6475         rs2[i]=0;
6476         rt1[i]=(source[i]>>16)&0x1f;
6477         rt2[i]=0;
6478         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
6479           imm[i]=(unsigned short)source[i];
6480         }else{
6481           imm[i]=(short)source[i];
6482         }
6483         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
6484         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
6485         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
6486         break;
6487       case UJUMP:
6488         rs1[i]=0;
6489         rs2[i]=0;
6490         rt1[i]=0;
6491         rt2[i]=0;
6492         // The JAL instruction writes to r31.
6493         if (op&1) {
6494           rt1[i]=31;
6495         }
6496         rs2[i]=CCREG;
6497         break;
6498       case RJUMP:
6499         rs1[i]=(source[i]>>21)&0x1f;
6500         rs2[i]=0;
6501         rt1[i]=0;
6502         rt2[i]=0;
6503         // The JALR instruction writes to rd.
6504         if (op2&1) {
6505           rt1[i]=(source[i]>>11)&0x1f;
6506         }
6507         rs2[i]=CCREG;
6508         break;
6509       case CJUMP:
6510         rs1[i]=(source[i]>>21)&0x1f;
6511         rs2[i]=(source[i]>>16)&0x1f;
6512         rt1[i]=0;
6513         rt2[i]=0;
6514         if(op&2) { // BGTZ/BLEZ
6515           rs2[i]=0;
6516         }
6517         us1[i]=rs1[i];
6518         us2[i]=rs2[i];
6519         likely[i]=op>>4;
6520         break;
6521       case SJUMP:
6522         rs1[i]=(source[i]>>21)&0x1f;
6523         rs2[i]=CCREG;
6524         rt1[i]=0;
6525         rt2[i]=0;
6526         us1[i]=rs1[i];
6527         if(op2&0x10) { // BxxAL
6528           rt1[i]=31;
6529           // NOTE: If the branch is not taken, r31 is still overwritten
6530         }
6531         likely[i]=(op2&2)>>1;
6532         break;
6533       case FJUMP:
6534         rs1[i]=FSREG;
6535         rs2[i]=CSREG;
6536         rt1[i]=0;
6537         rt2[i]=0;
6538         likely[i]=((source[i])>>17)&1;
6539         break;
6540       case ALU:
6541         rs1[i]=(source[i]>>21)&0x1f; // source
6542         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
6543         rt1[i]=(source[i]>>11)&0x1f; // destination
6544         rt2[i]=0;
6545         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
6546           us1[i]=rs1[i];us2[i]=rs2[i];
6547         }
6548         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
6549           dep1[i]=rs1[i];dep2[i]=rs2[i];
6550         }
6551         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
6552           dep1[i]=rs1[i];dep2[i]=rs2[i];
6553         }
6554         break;
6555       case MULTDIV:
6556         rs1[i]=(source[i]>>21)&0x1f; // source
6557         rs2[i]=(source[i]>>16)&0x1f; // divisor
6558         rt1[i]=HIREG;
6559         rt2[i]=LOREG;
6560         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
6561           us1[i]=rs1[i];us2[i]=rs2[i];
6562         }
6563         break;
6564       case MOV:
6565         rs1[i]=0;
6566         rs2[i]=0;
6567         rt1[i]=0;
6568         rt2[i]=0;
6569         if(op2==0x10) rs1[i]=HIREG; // MFHI
6570         if(op2==0x11) rt1[i]=HIREG; // MTHI
6571         if(op2==0x12) rs1[i]=LOREG; // MFLO
6572         if(op2==0x13) rt1[i]=LOREG; // MTLO
6573         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
6574         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
6575         dep1[i]=rs1[i];
6576         break;
6577       case SHIFT:
6578         rs1[i]=(source[i]>>16)&0x1f; // target of shift
6579         rs2[i]=(source[i]>>21)&0x1f; // shift amount
6580         rt1[i]=(source[i]>>11)&0x1f; // destination
6581         rt2[i]=0;
6582         // DSLLV/DSRLV/DSRAV are 64-bit
6583         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
6584         break;
6585       case SHIFTIMM:
6586         rs1[i]=(source[i]>>16)&0x1f;
6587         rs2[i]=0;
6588         rt1[i]=(source[i]>>11)&0x1f;
6589         rt2[i]=0;
6590         imm[i]=(source[i]>>6)&0x1f;
6591         // DSxx32 instructions
6592         if(op2>=0x3c) imm[i]|=0x20;
6593         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
6594         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
6595         break;
6596       case COP0:
6597         rs1[i]=0;
6598         rs2[i]=0;
6599         rt1[i]=0;
6600         rt2[i]=0;
6601         if(op2==0||op2==2) rt1[i]=(source[i]>>16)&0x1F; // MFC0/CFC0
6602         if(op2==4||op2==6) rs1[i]=(source[i]>>16)&0x1F; // MTC0/CTC0
6603         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
6604         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
6605         break;
6606       case COP1:
6607         rs1[i]=0;
6608         rs2[i]=0;
6609         rt1[i]=0;
6610         rt2[i]=0;
6611         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
6612         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
6613         if(op2==5) us1[i]=rs1[i]; // DMTC1
6614         rs2[i]=CSREG;
6615         break;
6616       case COP2:
6617         rs1[i]=0;
6618         rs2[i]=0;
6619         rt1[i]=0;
6620         rt2[i]=0;
6621         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
6622         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
6623         rs2[i]=CSREG;
6624         int gr=(source[i]>>11)&0x1F;
6625         switch(op2)
6626         {
6627           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
6628           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
6629           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
6630           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
6631         }
6632         break;
6633       case C1LS:
6634         rs1[i]=(source[i]>>21)&0x1F;
6635         rs2[i]=CSREG;
6636         rt1[i]=0;
6637         rt2[i]=0;
6638         imm[i]=(short)source[i];
6639         break;
6640       case C2LS:
6641         rs1[i]=(source[i]>>21)&0x1F;
6642         rs2[i]=0;
6643         rt1[i]=0;
6644         rt2[i]=0;
6645         imm[i]=(short)source[i];
6646         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
6647         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
6648         break;
6649       case C2OP:
6650         rs1[i]=0;
6651         rs2[i]=0;
6652         rt1[i]=0;
6653         rt2[i]=0;
6654         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
6655         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
6656         gte_rt[i]|=1ll<<63; // every op changes flags
6657         if((source[i]&0x3f)==GTE_MVMVA) {
6658           int v = (source[i] >> 15) & 3;
6659           gte_rs[i]&=~0xe3fll;
6660           if(v==3) gte_rs[i]|=0xe00ll;
6661           else gte_rs[i]|=3ll<<(v*2);
6662         }
6663         break;
6664       case SYSCALL:
6665       case HLECALL:
6666       case INTCALL:
6667         rs1[i]=CCREG;
6668         rs2[i]=0;
6669         rt1[i]=0;
6670         rt2[i]=0;
6671         break;
6672       default:
6673         rs1[i]=0;
6674         rs2[i]=0;
6675         rt1[i]=0;
6676         rt2[i]=0;
6677     }
6678     /* Calculate branch target addresses */
6679     if(type==UJUMP)
6680       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
6681     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
6682       ba[i]=start+i*4+8; // Ignore never taken branch
6683     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
6684       ba[i]=start+i*4+8; // Ignore never taken branch
6685     else if(type==CJUMP||type==SJUMP||type==FJUMP)
6686       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
6687     else ba[i]=-1;
6688     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
6689       int do_in_intrp=0;
6690       // branch in delay slot?
6691       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6692         // don't handle first branch and call interpreter if it's hit
6693         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
6694         do_in_intrp=1;
6695       }
6696       // basic load delay detection
6697       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
6698         int t=(ba[i-1]-start)/4;
6699         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
6700           // jump target wants DS result - potential load delay effect
6701           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
6702           do_in_intrp=1;
6703           bt[t+1]=1; // expected return from interpreter
6704         }
6705         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
6706               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
6707           // v0 overwrite like this is a sign of trouble, bail out
6708           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
6709           do_in_intrp=1;
6710         }
6711       }
6712       if(do_in_intrp) {
6713         rs1[i-1]=CCREG;
6714         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
6715         ba[i-1]=-1;
6716         itype[i-1]=INTCALL;
6717         done=2;
6718         i--; // don't compile the DS
6719       }
6720     }
6721     /* Is this the end of the block? */
6722     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
6723       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
6724         done=2;
6725       }
6726       else {
6727         if(stop_after_jal) done=1;
6728         // Stop on BREAK
6729         if((source[i+1]&0xfc00003f)==0x0d) done=1;
6730       }
6731       // Don't recompile stuff that's already compiled
6732       if(check_addr(start+i*4+4)) done=1;
6733       // Don't get too close to the limit
6734       if(i>MAXBLOCK/2) done=1;
6735     }
6736     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
6737     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
6738     if(done==2) {
6739       // Does the block continue due to a branch?
6740       for(j=i-1;j>=0;j--)
6741       {
6742         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
6743         if(ba[j]==start+i*4+4) done=j=0;
6744         if(ba[j]==start+i*4+8) done=j=0;
6745       }
6746     }
6747     //assert(i<MAXBLOCK-1);
6748     if(start+i*4==pagelimit-4) done=1;
6749     assert(start+i*4<pagelimit);
6750     if (i==MAXBLOCK-1) done=1;
6751     // Stop if we're compiling junk
6752     if(itype[i]==NI&&opcode[i]==0x11) {
6753       done=stop_after_jal=1;
6754       SysPrintf("Disabled speculative precompilation\n");
6755     }
6756   }
6757   slen=i;
6758   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
6759     if(start+i*4==pagelimit) {
6760       itype[i-1]=SPAN;
6761     }
6762   }
6763   assert(slen>0);
6764
6765   /* Pass 2 - Register dependencies and branch targets */
6766
6767   unneeded_registers(0,slen-1,0);
6768
6769   /* Pass 3 - Register allocation */
6770
6771   struct regstat current; // Current register allocations/status
6772   current.is32=1;
6773   current.dirty=0;
6774   current.u=unneeded_reg[0];
6775   clear_all_regs(current.regmap);
6776   alloc_reg(&current,0,CCREG);
6777   dirty_reg(&current,CCREG);
6778   current.isconst=0;
6779   current.wasconst=0;
6780   current.waswritten=0;
6781   int ds=0;
6782   int cc=0;
6783   int hr=-1;
6784
6785   if((u_int)addr&1) {
6786     // First instruction is delay slot
6787     cc=-1;
6788     bt[1]=1;
6789     ds=1;
6790     unneeded_reg[0]=1;
6791     current.regmap[HOST_BTREG]=BTREG;
6792   }
6793
6794   for(i=0;i<slen;i++)
6795   {
6796     if(bt[i])
6797     {
6798       int hr;
6799       for(hr=0;hr<HOST_REGS;hr++)
6800       {
6801         // Is this really necessary?
6802         if(current.regmap[hr]==0) current.regmap[hr]=-1;
6803       }
6804       current.isconst=0;
6805       current.waswritten=0;
6806     }
6807     if(i>1)
6808     {
6809       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6810       {
6811         if(rs1[i-2]==0||rs2[i-2]==0)
6812         {
6813           if(rs1[i-2]) {
6814             current.is32|=1LL<<rs1[i-2];
6815             int hr=get_reg(current.regmap,rs1[i-2]|64);
6816             if(hr>=0) current.regmap[hr]=-1;
6817           }
6818           if(rs2[i-2]) {
6819             current.is32|=1LL<<rs2[i-2];
6820             int hr=get_reg(current.regmap,rs2[i-2]|64);
6821             if(hr>=0) current.regmap[hr]=-1;
6822           }
6823         }
6824       }
6825     }
6826     current.is32=-1LL;
6827
6828     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
6829     regs[i].wasconst=current.isconst;
6830     regs[i].was32=current.is32;
6831     regs[i].wasdirty=current.dirty;
6832     regs[i].loadedconst=0;
6833     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
6834       if(i+1<slen) {
6835         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
6836         current.u|=1;
6837       } else {
6838         current.u=1;
6839       }
6840     } else {
6841       if(i+1<slen) {
6842         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6843         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6844         current.u|=1;
6845       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
6846     }
6847     is_ds[i]=ds;
6848     if(ds) {
6849       ds=0; // Skip delay slot, already allocated as part of branch
6850       // ...but we need to alloc it in case something jumps here
6851       if(i+1<slen) {
6852         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
6853       }else{
6854         current.u=branch_unneeded_reg[i-1];
6855       }
6856       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6857       current.u|=1;
6858       struct regstat temp;
6859       memcpy(&temp,&current,sizeof(current));
6860       temp.wasdirty=temp.dirty;
6861       temp.was32=temp.is32;
6862       // TODO: Take into account unconditional branches, as below
6863       delayslot_alloc(&temp,i);
6864       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
6865       regs[i].wasdirty=temp.wasdirty;
6866       regs[i].was32=temp.was32;
6867       regs[i].dirty=temp.dirty;
6868       regs[i].is32=temp.is32;
6869       regs[i].isconst=0;
6870       regs[i].wasconst=0;
6871       current.isconst=0;
6872       // Create entry (branch target) regmap
6873       for(hr=0;hr<HOST_REGS;hr++)
6874       {
6875         int r=temp.regmap[hr];
6876         if(r>=0) {
6877           if(r!=regmap_pre[i][hr]) {
6878             regs[i].regmap_entry[hr]=-1;
6879           }
6880           else
6881           {
6882             if(r<64){
6883               if((current.u>>r)&1) {
6884                 regs[i].regmap_entry[hr]=-1;
6885                 regs[i].regmap[hr]=-1;
6886                 //Don't clear regs in the delay slot as the branch might need them
6887                 //current.regmap[hr]=-1;
6888               }else
6889                 regs[i].regmap_entry[hr]=r;
6890             }
6891             else {
6892               assert(0);
6893             }
6894           }
6895         } else {
6896           // First instruction expects CCREG to be allocated
6897           if(i==0&&hr==HOST_CCREG)
6898             regs[i].regmap_entry[hr]=CCREG;
6899           else
6900             regs[i].regmap_entry[hr]=-1;
6901         }
6902       }
6903     }
6904     else { // Not delay slot
6905       switch(itype[i]) {
6906         case UJUMP:
6907           //current.isconst=0; // DEBUG
6908           //current.wasconst=0; // DEBUG
6909           //regs[i].wasconst=0; // DEBUG
6910           clear_const(&current,rt1[i]);
6911           alloc_cc(&current,i);
6912           dirty_reg(&current,CCREG);
6913           if (rt1[i]==31) {
6914             alloc_reg(&current,i,31);
6915             dirty_reg(&current,31);
6916             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
6917             //assert(rt1[i+1]!=rt1[i]);
6918             #ifdef REG_PREFETCH
6919             alloc_reg(&current,i,PTEMP);
6920             #endif
6921             //current.is32|=1LL<<rt1[i];
6922           }
6923           ooo[i]=1;
6924           delayslot_alloc(&current,i+1);
6925           //current.isconst=0; // DEBUG
6926           ds=1;
6927           //printf("i=%d, isconst=%x\n",i,current.isconst);
6928           break;
6929         case RJUMP:
6930           //current.isconst=0;
6931           //current.wasconst=0;
6932           //regs[i].wasconst=0;
6933           clear_const(&current,rs1[i]);
6934           clear_const(&current,rt1[i]);
6935           alloc_cc(&current,i);
6936           dirty_reg(&current,CCREG);
6937           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
6938             alloc_reg(&current,i,rs1[i]);
6939             if (rt1[i]!=0) {
6940               alloc_reg(&current,i,rt1[i]);
6941               dirty_reg(&current,rt1[i]);
6942               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
6943               assert(rt1[i+1]!=rt1[i]);
6944               #ifdef REG_PREFETCH
6945               alloc_reg(&current,i,PTEMP);
6946               #endif
6947             }
6948             #ifdef USE_MINI_HT
6949             if(rs1[i]==31) { // JALR
6950               alloc_reg(&current,i,RHASH);
6951               alloc_reg(&current,i,RHTBL);
6952             }
6953             #endif
6954             delayslot_alloc(&current,i+1);
6955           } else {
6956             // The delay slot overwrites our source register,
6957             // allocate a temporary register to hold the old value.
6958             current.isconst=0;
6959             current.wasconst=0;
6960             regs[i].wasconst=0;
6961             delayslot_alloc(&current,i+1);
6962             current.isconst=0;
6963             alloc_reg(&current,i,RTEMP);
6964           }
6965           //current.isconst=0; // DEBUG
6966           ooo[i]=1;
6967           ds=1;
6968           break;
6969         case CJUMP:
6970           //current.isconst=0;
6971           //current.wasconst=0;
6972           //regs[i].wasconst=0;
6973           clear_const(&current,rs1[i]);
6974           clear_const(&current,rs2[i]);
6975           if((opcode[i]&0x3E)==4) // BEQ/BNE
6976           {
6977             alloc_cc(&current,i);
6978             dirty_reg(&current,CCREG);
6979             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
6980             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
6981             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
6982             {
6983               assert(0);
6984             }
6985             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
6986                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
6987               // The delay slot overwrites one of our conditions.
6988               // Allocate the branch condition registers instead.
6989               current.isconst=0;
6990               current.wasconst=0;
6991               regs[i].wasconst=0;
6992               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
6993               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
6994               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
6995               {
6996                 assert(0);
6997               }
6998             }
6999             else
7000             {
7001               ooo[i]=1;
7002               delayslot_alloc(&current,i+1);
7003             }
7004           }
7005           else
7006           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
7007           {
7008             alloc_cc(&current,i);
7009             dirty_reg(&current,CCREG);
7010             alloc_reg(&current,i,rs1[i]);
7011             if(!(current.is32>>rs1[i]&1))
7012             {
7013               assert(0);
7014             }
7015             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
7016               // The delay slot overwrites one of our conditions.
7017               // Allocate the branch condition registers instead.
7018               current.isconst=0;
7019               current.wasconst=0;
7020               regs[i].wasconst=0;
7021               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7022               if(!((current.is32>>rs1[i])&1))
7023               {
7024                 assert(0);
7025               }
7026             }
7027             else
7028             {
7029               ooo[i]=1;
7030               delayslot_alloc(&current,i+1);
7031             }
7032           }
7033           else
7034           // Don't alloc the delay slot yet because we might not execute it
7035           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
7036           {
7037             current.isconst=0;
7038             current.wasconst=0;
7039             regs[i].wasconst=0;
7040             alloc_cc(&current,i);
7041             dirty_reg(&current,CCREG);
7042             alloc_reg(&current,i,rs1[i]);
7043             alloc_reg(&current,i,rs2[i]);
7044             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
7045             {
7046               assert(0);
7047             }
7048           }
7049           else
7050           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
7051           {
7052             current.isconst=0;
7053             current.wasconst=0;
7054             regs[i].wasconst=0;
7055             alloc_cc(&current,i);
7056             dirty_reg(&current,CCREG);
7057             alloc_reg(&current,i,rs1[i]);
7058             if(!(current.is32>>rs1[i]&1))
7059             {
7060               assert(0);
7061             }
7062           }
7063           ds=1;
7064           //current.isconst=0;
7065           break;
7066         case SJUMP:
7067           //current.isconst=0;
7068           //current.wasconst=0;
7069           //regs[i].wasconst=0;
7070           clear_const(&current,rs1[i]);
7071           clear_const(&current,rt1[i]);
7072           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
7073           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
7074           {
7075             alloc_cc(&current,i);
7076             dirty_reg(&current,CCREG);
7077             alloc_reg(&current,i,rs1[i]);
7078             if(!(current.is32>>rs1[i]&1))
7079             {
7080               assert(0);
7081             }
7082             if (rt1[i]==31) { // BLTZAL/BGEZAL
7083               alloc_reg(&current,i,31);
7084               dirty_reg(&current,31);
7085               //#ifdef REG_PREFETCH
7086               //alloc_reg(&current,i,PTEMP);
7087               //#endif
7088               //current.is32|=1LL<<rt1[i];
7089             }
7090             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
7091                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
7092               // Allocate the branch condition registers instead.
7093               current.isconst=0;
7094               current.wasconst=0;
7095               regs[i].wasconst=0;
7096               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7097               if(!((current.is32>>rs1[i])&1))
7098               {
7099                 assert(0);
7100               }
7101             }
7102             else
7103             {
7104               ooo[i]=1;
7105               delayslot_alloc(&current,i+1);
7106             }
7107           }
7108           else
7109           // Don't alloc the delay slot yet because we might not execute it
7110           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
7111           {
7112             current.isconst=0;
7113             current.wasconst=0;
7114             regs[i].wasconst=0;
7115             alloc_cc(&current,i);
7116             dirty_reg(&current,CCREG);
7117             alloc_reg(&current,i,rs1[i]);
7118             if(!(current.is32>>rs1[i]&1))
7119             {
7120               assert(0);
7121             }
7122           }
7123           ds=1;
7124           //current.isconst=0;
7125           break;
7126         case FJUMP:
7127           assert(0);
7128           break;
7129         case IMM16:
7130           imm16_alloc(&current,i);
7131           break;
7132         case LOAD:
7133         case LOADLR:
7134           load_alloc(&current,i);
7135           break;
7136         case STORE:
7137         case STORELR:
7138           store_alloc(&current,i);
7139           break;
7140         case ALU:
7141           alu_alloc(&current,i);
7142           break;
7143         case SHIFT:
7144           shift_alloc(&current,i);
7145           break;
7146         case MULTDIV:
7147           multdiv_alloc(&current,i);
7148           break;
7149         case SHIFTIMM:
7150           shiftimm_alloc(&current,i);
7151           break;
7152         case MOV:
7153           mov_alloc(&current,i);
7154           break;
7155         case COP0:
7156           cop0_alloc(&current,i);
7157           break;
7158         case COP1:
7159         case COP2:
7160           cop12_alloc(&current,i);
7161           break;
7162         case C1LS:
7163           c1ls_alloc(&current,i);
7164           break;
7165         case C2LS:
7166           c2ls_alloc(&current,i);
7167           break;
7168         case C2OP:
7169           c2op_alloc(&current,i);
7170           break;
7171         case SYSCALL:
7172         case HLECALL:
7173         case INTCALL:
7174           syscall_alloc(&current,i);
7175           break;
7176         case SPAN:
7177           pagespan_alloc(&current,i);
7178           break;
7179       }
7180
7181       // Create entry (branch target) regmap
7182       for(hr=0;hr<HOST_REGS;hr++)
7183       {
7184         int r,or;
7185         r=current.regmap[hr];
7186         if(r>=0) {
7187           if(r!=regmap_pre[i][hr]) {
7188             // TODO: delay slot (?)
7189             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
7190             if(or<0||(r&63)>=TEMPREG){
7191               regs[i].regmap_entry[hr]=-1;
7192             }
7193             else
7194             {
7195               // Just move it to a different register
7196               regs[i].regmap_entry[hr]=r;
7197               // If it was dirty before, it's still dirty
7198               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
7199             }
7200           }
7201           else
7202           {
7203             // Unneeded
7204             if(r==0){
7205               regs[i].regmap_entry[hr]=0;
7206             }
7207             else
7208             if(r<64){
7209               if((current.u>>r)&1) {
7210                 regs[i].regmap_entry[hr]=-1;
7211                 //regs[i].regmap[hr]=-1;
7212                 current.regmap[hr]=-1;
7213               }else
7214                 regs[i].regmap_entry[hr]=r;
7215             }
7216             else {
7217               assert(0);
7218             }
7219           }
7220         } else {
7221           // Branches expect CCREG to be allocated at the target
7222           if(regmap_pre[i][hr]==CCREG)
7223             regs[i].regmap_entry[hr]=CCREG;
7224           else
7225             regs[i].regmap_entry[hr]=-1;
7226         }
7227       }
7228       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
7229     }
7230
7231     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
7232       current.waswritten|=1<<rs1[i-1];
7233     current.waswritten&=~(1<<rt1[i]);
7234     current.waswritten&=~(1<<rt2[i]);
7235     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
7236       current.waswritten&=~(1<<rs1[i]);
7237
7238     /* Branch post-alloc */
7239     if(i>0)
7240     {
7241       current.was32=current.is32;
7242       current.wasdirty=current.dirty;
7243       switch(itype[i-1]) {
7244         case UJUMP:
7245           memcpy(&branch_regs[i-1],&current,sizeof(current));
7246           branch_regs[i-1].isconst=0;
7247           branch_regs[i-1].wasconst=0;
7248           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
7249           alloc_cc(&branch_regs[i-1],i-1);
7250           dirty_reg(&branch_regs[i-1],CCREG);
7251           if(rt1[i-1]==31) { // JAL
7252             alloc_reg(&branch_regs[i-1],i-1,31);
7253             dirty_reg(&branch_regs[i-1],31);
7254             branch_regs[i-1].is32|=1LL<<31;
7255           }
7256           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7257           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
7258           break;
7259         case RJUMP:
7260           memcpy(&branch_regs[i-1],&current,sizeof(current));
7261           branch_regs[i-1].isconst=0;
7262           branch_regs[i-1].wasconst=0;
7263           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
7264           alloc_cc(&branch_regs[i-1],i-1);
7265           dirty_reg(&branch_regs[i-1],CCREG);
7266           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
7267           if(rt1[i-1]!=0) { // JALR
7268             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
7269             dirty_reg(&branch_regs[i-1],rt1[i-1]);
7270             branch_regs[i-1].is32|=1LL<<rt1[i-1];
7271           }
7272           #ifdef USE_MINI_HT
7273           if(rs1[i-1]==31) { // JALR
7274             alloc_reg(&branch_regs[i-1],i-1,RHASH);
7275             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
7276           }
7277           #endif
7278           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7279           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
7280           break;
7281         case CJUMP:
7282           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
7283           {
7284             alloc_cc(&current,i-1);
7285             dirty_reg(&current,CCREG);
7286             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
7287                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
7288               // The delay slot overwrote one of our conditions
7289               // Delay slot goes after the test (in order)
7290               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7291               current.u|=1;
7292               delayslot_alloc(&current,i);
7293               current.isconst=0;
7294             }
7295             else
7296             {
7297               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
7298               // Alloc the branch condition registers
7299               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
7300               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
7301               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
7302               {
7303                 assert(0);
7304               }
7305             }
7306             memcpy(&branch_regs[i-1],&current,sizeof(current));
7307             branch_regs[i-1].isconst=0;
7308             branch_regs[i-1].wasconst=0;
7309             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
7310             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
7311           }
7312           else
7313           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
7314           {
7315             alloc_cc(&current,i-1);
7316             dirty_reg(&current,CCREG);
7317             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
7318               // The delay slot overwrote the branch condition
7319               // Delay slot goes after the test (in order)
7320               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7321               current.u|=1;
7322               delayslot_alloc(&current,i);
7323               current.isconst=0;
7324             }
7325             else
7326             {
7327               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
7328               // Alloc the branch condition register
7329               alloc_reg(&current,i-1,rs1[i-1]);
7330               if(!(current.is32>>rs1[i-1]&1))
7331               {
7332                 assert(0);
7333               }
7334             }
7335             memcpy(&branch_regs[i-1],&current,sizeof(current));
7336             branch_regs[i-1].isconst=0;
7337             branch_regs[i-1].wasconst=0;
7338             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
7339             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
7340           }
7341           else
7342           // Alloc the delay slot in case the branch is taken
7343           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
7344           {
7345             memcpy(&branch_regs[i-1],&current,sizeof(current));
7346             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
7347             alloc_cc(&branch_regs[i-1],i);
7348             dirty_reg(&branch_regs[i-1],CCREG);
7349             delayslot_alloc(&branch_regs[i-1],i);
7350             branch_regs[i-1].isconst=0;
7351             alloc_reg(&current,i,CCREG); // Not taken path
7352             dirty_reg(&current,CCREG);
7353             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7354           }
7355           else
7356           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
7357           {
7358             memcpy(&branch_regs[i-1],&current,sizeof(current));
7359             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
7360             alloc_cc(&branch_regs[i-1],i);
7361             dirty_reg(&branch_regs[i-1],CCREG);
7362             delayslot_alloc(&branch_regs[i-1],i);
7363             branch_regs[i-1].isconst=0;
7364             alloc_reg(&current,i,CCREG); // Not taken path
7365             dirty_reg(&current,CCREG);
7366             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7367           }
7368           break;
7369         case SJUMP:
7370           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
7371           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
7372           {
7373             alloc_cc(&current,i-1);
7374             dirty_reg(&current,CCREG);
7375             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
7376               // The delay slot overwrote the branch condition
7377               // Delay slot goes after the test (in order)
7378               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7379               current.u|=1;
7380               delayslot_alloc(&current,i);
7381               current.isconst=0;
7382             }
7383             else
7384             {
7385               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
7386               // Alloc the branch condition register
7387               alloc_reg(&current,i-1,rs1[i-1]);
7388               if(!(current.is32>>rs1[i-1]&1))
7389               {
7390                 assert(0);
7391               }
7392             }
7393             memcpy(&branch_regs[i-1],&current,sizeof(current));
7394             branch_regs[i-1].isconst=0;
7395             branch_regs[i-1].wasconst=0;
7396             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
7397             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
7398           }
7399           else
7400           // Alloc the delay slot in case the branch is taken
7401           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
7402           {
7403             memcpy(&branch_regs[i-1],&current,sizeof(current));
7404             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
7405             alloc_cc(&branch_regs[i-1],i);
7406             dirty_reg(&branch_regs[i-1],CCREG);
7407             delayslot_alloc(&branch_regs[i-1],i);
7408             branch_regs[i-1].isconst=0;
7409             alloc_reg(&current,i,CCREG); // Not taken path
7410             dirty_reg(&current,CCREG);
7411             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
7412           }
7413           // FIXME: BLTZAL/BGEZAL
7414           if(opcode2[i-1]&0x10) { // BxxZAL
7415             alloc_reg(&branch_regs[i-1],i-1,31);
7416             dirty_reg(&branch_regs[i-1],31);
7417             branch_regs[i-1].is32|=1LL<<31;
7418           }
7419           break;
7420         case FJUMP:
7421           assert(0);
7422           break;
7423       }
7424
7425       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
7426       {
7427         if(rt1[i-1]==31) // JAL/JALR
7428         {
7429           // Subroutine call will return here, don't alloc any registers
7430           current.is32=1;
7431           current.dirty=0;
7432           clear_all_regs(current.regmap);
7433           alloc_reg(&current,i,CCREG);
7434           dirty_reg(&current,CCREG);
7435         }
7436         else if(i+1<slen)
7437         {
7438           // Internal branch will jump here, match registers to caller
7439           current.is32=0x3FFFFFFFFLL;
7440           current.dirty=0;
7441           clear_all_regs(current.regmap);
7442           alloc_reg(&current,i,CCREG);
7443           dirty_reg(&current,CCREG);
7444           for(j=i-1;j>=0;j--)
7445           {
7446             if(ba[j]==start+i*4+4) {
7447               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
7448               current.is32=branch_regs[j].is32;
7449               current.dirty=branch_regs[j].dirty;
7450               break;
7451             }
7452           }
7453           while(j>=0) {
7454             if(ba[j]==start+i*4+4) {
7455               for(hr=0;hr<HOST_REGS;hr++) {
7456                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
7457                   current.regmap[hr]=-1;
7458                 }
7459                 current.is32&=branch_regs[j].is32;
7460                 current.dirty&=branch_regs[j].dirty;
7461               }
7462             }
7463             j--;
7464           }
7465         }
7466       }
7467     }
7468
7469     // Count cycles in between branches
7470     ccadj[i]=cc;
7471     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
7472     {
7473       cc=0;
7474     }
7475 #if !defined(DRC_DBG)
7476     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
7477     {
7478       // GTE runs in parallel until accessed, divide by 2 for a rough guess
7479       cc+=gte_cycletab[source[i]&0x3f]/2;
7480     }
7481     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues
7482     {
7483       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
7484     }
7485     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
7486     {
7487       cc+=4;
7488     }
7489     else if(itype[i]==C2LS)
7490     {
7491       cc+=4;
7492     }
7493 #endif
7494     else
7495     {
7496       cc++;
7497     }
7498
7499     flush_dirty_uppers(&current);
7500     if(!is_ds[i]) {
7501       regs[i].is32=current.is32;
7502       regs[i].dirty=current.dirty;
7503       regs[i].isconst=current.isconst;
7504       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
7505     }
7506     for(hr=0;hr<HOST_REGS;hr++) {
7507       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
7508         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
7509           regs[i].wasconst&=~(1<<hr);
7510         }
7511       }
7512     }
7513     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
7514     regs[i].waswritten=current.waswritten;
7515   }
7516
7517   /* Pass 4 - Cull unused host registers */
7518
7519   uint64_t nr=0;
7520
7521   for (i=slen-1;i>=0;i--)
7522   {
7523     int hr;
7524     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7525     {
7526       if(ba[i]<start || ba[i]>=(start+slen*4))
7527       {
7528         // Branch out of this block, don't need anything
7529         nr=0;
7530       }
7531       else
7532       {
7533         // Internal branch
7534         // Need whatever matches the target
7535         nr=0;
7536         int t=(ba[i]-start)>>2;
7537         for(hr=0;hr<HOST_REGS;hr++)
7538         {
7539           if(regs[i].regmap_entry[hr]>=0) {
7540             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
7541           }
7542         }
7543       }
7544       // Conditional branch may need registers for following instructions
7545       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7546       {
7547         if(i<slen-2) {
7548           nr|=needed_reg[i+2];
7549           for(hr=0;hr<HOST_REGS;hr++)
7550           {
7551             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
7552             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
7553           }
7554         }
7555       }
7556       // Don't need stuff which is overwritten
7557       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
7558       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
7559       // Merge in delay slot
7560       for(hr=0;hr<HOST_REGS;hr++)
7561       {
7562         if(!likely[i]) {
7563           // These are overwritten unless the branch is "likely"
7564           // and the delay slot is nullified if not taken
7565           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
7566           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
7567         }
7568         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
7569         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
7570         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
7571         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
7572         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
7573         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
7574         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
7575         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
7576         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
7577           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
7578           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
7579         }
7580       }
7581     }
7582     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7583     {
7584       // SYSCALL instruction (software interrupt)
7585       nr=0;
7586     }
7587     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7588     {
7589       // ERET instruction (return from interrupt)
7590       nr=0;
7591     }
7592     else // Non-branch
7593     {
7594       if(i<slen-1) {
7595         for(hr=0;hr<HOST_REGS;hr++) {
7596           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
7597           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
7598           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
7599           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
7600         }
7601       }
7602     }
7603     for(hr=0;hr<HOST_REGS;hr++)
7604     {
7605       // Overwritten registers are not needed
7606       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
7607       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
7608       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
7609       // Source registers are needed
7610       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
7611       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
7612       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
7613       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
7614       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
7615       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
7616       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
7617       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
7618       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
7619         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
7620         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
7621       }
7622       // Don't store a register immediately after writing it,
7623       // may prevent dual-issue.
7624       // But do so if this is a branch target, otherwise we
7625       // might have to load the register before the branch.
7626       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
7627         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1))) {
7628           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
7629           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
7630         }
7631         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) {
7632           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
7633           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
7634         }
7635       }
7636     }
7637     // Cycle count is needed at branches.  Assume it is needed at the target too.
7638     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
7639       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
7640       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
7641     }
7642     // Save it
7643     needed_reg[i]=nr;
7644
7645     // Deallocate unneeded registers
7646     for(hr=0;hr<HOST_REGS;hr++)
7647     {
7648       if(!((nr>>hr)&1)) {
7649         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
7650         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
7651            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
7652            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
7653         {
7654           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7655           {
7656             if(likely[i]) {
7657               regs[i].regmap[hr]=-1;
7658               regs[i].isconst&=~(1<<hr);
7659               if(i<slen-2) {
7660                 regmap_pre[i+2][hr]=-1;
7661                 regs[i+2].wasconst&=~(1<<hr);
7662               }
7663             }
7664           }
7665         }
7666         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7667         {
7668           int d1=0,d2=0,map=0,temp=0;
7669           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
7670           {
7671             d1=dep1[i+1];
7672             d2=dep2[i+1];
7673           }
7674           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
7675              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
7676             map=INVCP;
7677           }
7678           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
7679              itype[i+1]==C1LS || itype[i+1]==C2LS)
7680             temp=FTEMP;
7681           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
7682              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
7683              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
7684              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
7685              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
7686              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
7687              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
7688              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
7689              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
7690              regs[i].regmap[hr]!=map )
7691           {
7692             regs[i].regmap[hr]=-1;
7693             regs[i].isconst&=~(1<<hr);
7694             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
7695                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
7696                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
7697                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
7698                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
7699                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
7700                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
7701                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
7702                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
7703                branch_regs[i].regmap[hr]!=map)
7704             {
7705               branch_regs[i].regmap[hr]=-1;
7706               branch_regs[i].regmap_entry[hr]=-1;
7707               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7708               {
7709                 if(!likely[i]&&i<slen-2) {
7710                   regmap_pre[i+2][hr]=-1;
7711                   regs[i+2].wasconst&=~(1<<hr);
7712                 }
7713               }
7714             }
7715           }
7716         }
7717         else
7718         {
7719           // Non-branch
7720           if(i>0)
7721           {
7722             int d1=0,d2=0,map=-1,temp=-1;
7723             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
7724             {
7725               d1=dep1[i];
7726               d2=dep2[i];
7727             }
7728             if(itype[i]==STORE || itype[i]==STORELR ||
7729                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
7730               map=INVCP;
7731             }
7732             if(itype[i]==LOADLR || itype[i]==STORELR ||
7733                itype[i]==C1LS || itype[i]==C2LS)
7734               temp=FTEMP;
7735             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
7736                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
7737                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
7738                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
7739                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
7740                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
7741             {
7742               if(i<slen-1&&!is_ds[i]) {
7743                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
7744                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
7745                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
7746                 {
7747                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
7748                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
7749                 }
7750                 regmap_pre[i+1][hr]=-1;
7751                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
7752                 regs[i+1].wasconst&=~(1<<hr);
7753               }
7754               regs[i].regmap[hr]=-1;
7755               regs[i].isconst&=~(1<<hr);
7756             }
7757           }
7758         }
7759       }
7760     }
7761   }
7762
7763   /* Pass 5 - Pre-allocate registers */
7764
7765   // If a register is allocated during a loop, try to allocate it for the
7766   // entire loop, if possible.  This avoids loading/storing registers
7767   // inside of the loop.
7768
7769   signed char f_regmap[HOST_REGS];
7770   clear_all_regs(f_regmap);
7771   for(i=0;i<slen-1;i++)
7772   {
7773     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7774     {
7775       if(ba[i]>=start && ba[i]<(start+i*4))
7776       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
7777       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
7778       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
7779       ||itype[i+1]==SHIFT||itype[i+1]==COP1
7780       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
7781       {
7782         int t=(ba[i]-start)>>2;
7783         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
7784         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
7785         for(hr=0;hr<HOST_REGS;hr++)
7786         {
7787           if(regs[i].regmap[hr]>64) {
7788             if(!((regs[i].dirty>>hr)&1))
7789               f_regmap[hr]=regs[i].regmap[hr];
7790             else f_regmap[hr]=-1;
7791           }
7792           else if(regs[i].regmap[hr]>=0) {
7793             if(f_regmap[hr]!=regs[i].regmap[hr]) {
7794               // dealloc old register
7795               int n;
7796               for(n=0;n<HOST_REGS;n++)
7797               {
7798                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
7799               }
7800               // and alloc new one
7801               f_regmap[hr]=regs[i].regmap[hr];
7802             }
7803           }
7804           if(branch_regs[i].regmap[hr]>64) {
7805             if(!((branch_regs[i].dirty>>hr)&1))
7806               f_regmap[hr]=branch_regs[i].regmap[hr];
7807             else f_regmap[hr]=-1;
7808           }
7809           else if(branch_regs[i].regmap[hr]>=0) {
7810             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
7811               // dealloc old register
7812               int n;
7813               for(n=0;n<HOST_REGS;n++)
7814               {
7815                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
7816               }
7817               // and alloc new one
7818               f_regmap[hr]=branch_regs[i].regmap[hr];
7819             }
7820           }
7821           if(ooo[i]) {
7822             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
7823               f_regmap[hr]=branch_regs[i].regmap[hr];
7824           }else{
7825             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
7826               f_regmap[hr]=branch_regs[i].regmap[hr];
7827           }
7828           // Avoid dirty->clean transition
7829           #ifdef DESTRUCTIVE_WRITEBACK
7830           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
7831           #endif
7832           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
7833           // case above, however it's always a good idea.  We can't hoist the
7834           // load if the register was already allocated, so there's no point
7835           // wasting time analyzing most of these cases.  It only "succeeds"
7836           // when the mapping was different and the load can be replaced with
7837           // a mov, which is of negligible benefit.  So such cases are
7838           // skipped below.
7839           if(f_regmap[hr]>0) {
7840             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
7841               int r=f_regmap[hr];
7842               for(j=t;j<=i;j++)
7843               {
7844                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
7845                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
7846                 assert(r < 64);
7847                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
7848                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
7849                   int k;
7850                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
7851                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
7852                     if(r>63) {
7853                       if(get_reg(regs[i].regmap,r&63)<0) break;
7854                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
7855                     }
7856                     k=i;
7857                     while(k>1&&regs[k-1].regmap[hr]==-1) {
7858                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
7859                         //printf("no free regs for store %x\n",start+(k-1)*4);
7860                         break;
7861                       }
7862                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
7863                         //printf("no-match due to different register\n");
7864                         break;
7865                       }
7866                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
7867                         //printf("no-match due to branch\n");
7868                         break;
7869                       }
7870                       // call/ret fast path assumes no registers allocated
7871                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
7872                         break;
7873                       }
7874                       if(r>63) {
7875                         // NB This can exclude the case where the upper-half
7876                         // register is lower numbered than the lower-half
7877                         // register.  Not sure if it's worth fixing...
7878                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
7879                         if(regs[k-1].is32&(1LL<<(r&63))) break;
7880                       }
7881                       k--;
7882                     }
7883                     if(i<slen-1) {
7884                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
7885                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
7886                         //printf("bad match after branch\n");
7887                         break;
7888                       }
7889                     }
7890                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
7891                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
7892                       while(k<i) {
7893                         regs[k].regmap_entry[hr]=f_regmap[hr];
7894                         regs[k].regmap[hr]=f_regmap[hr];
7895                         regmap_pre[k+1][hr]=f_regmap[hr];
7896                         regs[k].wasdirty&=~(1<<hr);
7897                         regs[k].dirty&=~(1<<hr);
7898                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
7899                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
7900                         regs[k].wasconst&=~(1<<hr);
7901                         regs[k].isconst&=~(1<<hr);
7902                         k++;
7903                       }
7904                     }
7905                     else {
7906                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
7907                       break;
7908                     }
7909                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
7910                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
7911                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
7912                       regs[i].regmap_entry[hr]=f_regmap[hr];
7913                       regs[i].regmap[hr]=f_regmap[hr];
7914                       regs[i].wasdirty&=~(1<<hr);
7915                       regs[i].dirty&=~(1<<hr);
7916                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
7917                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
7918                       regs[i].wasconst&=~(1<<hr);
7919                       regs[i].isconst&=~(1<<hr);
7920                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
7921                       branch_regs[i].wasdirty&=~(1<<hr);
7922                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
7923                       branch_regs[i].regmap[hr]=f_regmap[hr];
7924                       branch_regs[i].dirty&=~(1<<hr);
7925                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
7926                       branch_regs[i].wasconst&=~(1<<hr);
7927                       branch_regs[i].isconst&=~(1<<hr);
7928                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7929                         regmap_pre[i+2][hr]=f_regmap[hr];
7930                         regs[i+2].wasdirty&=~(1<<hr);
7931                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
7932                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
7933                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
7934                       }
7935                     }
7936                   }
7937                   for(k=t;k<j;k++) {
7938                     // Alloc register clean at beginning of loop,
7939                     // but may dirty it in pass 6
7940                     regs[k].regmap_entry[hr]=f_regmap[hr];
7941                     regs[k].regmap[hr]=f_regmap[hr];
7942                     regs[k].dirty&=~(1<<hr);
7943                     regs[k].wasconst&=~(1<<hr);
7944                     regs[k].isconst&=~(1<<hr);
7945                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
7946                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
7947                       branch_regs[k].regmap[hr]=f_regmap[hr];
7948                       branch_regs[k].dirty&=~(1<<hr);
7949                       branch_regs[k].wasconst&=~(1<<hr);
7950                       branch_regs[k].isconst&=~(1<<hr);
7951                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
7952                         regmap_pre[k+2][hr]=f_regmap[hr];
7953                         regs[k+2].wasdirty&=~(1<<hr);
7954                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
7955                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
7956                       }
7957                     }
7958                     else
7959                     {
7960                       regmap_pre[k+1][hr]=f_regmap[hr];
7961                       regs[k+1].wasdirty&=~(1<<hr);
7962                     }
7963                   }
7964                   if(regs[j].regmap[hr]==f_regmap[hr])
7965                     regs[j].regmap_entry[hr]=f_regmap[hr];
7966                   break;
7967                 }
7968                 if(j==i) break;
7969                 if(regs[j].regmap[hr]>=0)
7970                   break;
7971                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
7972                   //printf("no-match due to different register\n");
7973                   break;
7974                 }
7975                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
7976                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
7977                   break;
7978                 }
7979                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
7980                 {
7981                   // Stop on unconditional branch
7982                   break;
7983                 }
7984                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
7985                 {
7986                   if(ooo[j]) {
7987                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
7988                       break;
7989                   }else{
7990                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
7991                       break;
7992                   }
7993                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
7994                     //printf("no-match due to different register (branch)\n");
7995                     break;
7996                   }
7997                 }
7998                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
7999                   //printf("No free regs for store %x\n",start+j*4);
8000                   break;
8001                 }
8002                 if(f_regmap[hr]>=64) {
8003                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
8004                     break;
8005                   }
8006                   else
8007                   {
8008                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
8009                       break;
8010                     }
8011                   }
8012                 }
8013               }
8014             }
8015           }
8016         }
8017       }
8018     }else{
8019       // Non branch or undetermined branch target
8020       for(hr=0;hr<HOST_REGS;hr++)
8021       {
8022         if(hr!=EXCLUDE_REG) {
8023           if(regs[i].regmap[hr]>64) {
8024             if(!((regs[i].dirty>>hr)&1))
8025               f_regmap[hr]=regs[i].regmap[hr];
8026           }
8027           else if(regs[i].regmap[hr]>=0) {
8028             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8029               // dealloc old register
8030               int n;
8031               for(n=0;n<HOST_REGS;n++)
8032               {
8033                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8034               }
8035               // and alloc new one
8036               f_regmap[hr]=regs[i].regmap[hr];
8037             }
8038           }
8039         }
8040       }
8041       // Try to restore cycle count at branch targets
8042       if(bt[i]) {
8043         for(j=i;j<slen-1;j++) {
8044           if(regs[j].regmap[HOST_CCREG]!=-1) break;
8045           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
8046             //printf("no free regs for store %x\n",start+j*4);
8047             break;
8048           }
8049         }
8050         if(regs[j].regmap[HOST_CCREG]==CCREG) {
8051           int k=i;
8052           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
8053           while(k<j) {
8054             regs[k].regmap_entry[HOST_CCREG]=CCREG;
8055             regs[k].regmap[HOST_CCREG]=CCREG;
8056             regmap_pre[k+1][HOST_CCREG]=CCREG;
8057             regs[k+1].wasdirty|=1<<HOST_CCREG;
8058             regs[k].dirty|=1<<HOST_CCREG;
8059             regs[k].wasconst&=~(1<<HOST_CCREG);
8060             regs[k].isconst&=~(1<<HOST_CCREG);
8061             k++;
8062           }
8063           regs[j].regmap_entry[HOST_CCREG]=CCREG;
8064         }
8065         // Work backwards from the branch target
8066         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
8067         {
8068           //printf("Extend backwards\n");
8069           int k;
8070           k=i;
8071           while(regs[k-1].regmap[HOST_CCREG]==-1) {
8072             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
8073               //printf("no free regs for store %x\n",start+(k-1)*4);
8074               break;
8075             }
8076             k--;
8077           }
8078           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
8079             //printf("Extend CC, %x ->\n",start+k*4);
8080             while(k<=i) {
8081               regs[k].regmap_entry[HOST_CCREG]=CCREG;
8082               regs[k].regmap[HOST_CCREG]=CCREG;
8083               regmap_pre[k+1][HOST_CCREG]=CCREG;
8084               regs[k+1].wasdirty|=1<<HOST_CCREG;
8085               regs[k].dirty|=1<<HOST_CCREG;
8086               regs[k].wasconst&=~(1<<HOST_CCREG);
8087               regs[k].isconst&=~(1<<HOST_CCREG);
8088               k++;
8089             }
8090           }
8091           else {
8092             //printf("Fail Extend CC, %x ->\n",start+k*4);
8093           }
8094         }
8095       }
8096       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
8097          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
8098          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1)
8099       {
8100         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
8101       }
8102     }
8103   }
8104
8105   // This allocates registers (if possible) one instruction prior
8106   // to use, which can avoid a load-use penalty on certain CPUs.
8107   for(i=0;i<slen-1;i++)
8108   {
8109     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
8110     {
8111       if(!bt[i+1])
8112       {
8113         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
8114            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
8115         {
8116           if(rs1[i+1]) {
8117             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
8118             {
8119               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8120               {
8121                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
8122                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
8123                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
8124                 regs[i].isconst&=~(1<<hr);
8125                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8126                 constmap[i][hr]=constmap[i+1][hr];
8127                 regs[i+1].wasdirty&=~(1<<hr);
8128                 regs[i].dirty&=~(1<<hr);
8129               }
8130             }
8131           }
8132           if(rs2[i+1]) {
8133             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
8134             {
8135               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8136               {
8137                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
8138                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
8139                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
8140                 regs[i].isconst&=~(1<<hr);
8141                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8142                 constmap[i][hr]=constmap[i+1][hr];
8143                 regs[i+1].wasdirty&=~(1<<hr);
8144                 regs[i].dirty&=~(1<<hr);
8145               }
8146             }
8147           }
8148           // Preload target address for load instruction (non-constant)
8149           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8150             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
8151             {
8152               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8153               {
8154                 regs[i].regmap[hr]=rs1[i+1];
8155                 regmap_pre[i+1][hr]=rs1[i+1];
8156                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8157                 regs[i].isconst&=~(1<<hr);
8158                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8159                 constmap[i][hr]=constmap[i+1][hr];
8160                 regs[i+1].wasdirty&=~(1<<hr);
8161                 regs[i].dirty&=~(1<<hr);
8162               }
8163             }
8164           }
8165           // Load source into target register
8166           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8167             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
8168             {
8169               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8170               {
8171                 regs[i].regmap[hr]=rs1[i+1];
8172                 regmap_pre[i+1][hr]=rs1[i+1];
8173                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8174                 regs[i].isconst&=~(1<<hr);
8175                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8176                 constmap[i][hr]=constmap[i+1][hr];
8177                 regs[i+1].wasdirty&=~(1<<hr);
8178                 regs[i].dirty&=~(1<<hr);
8179               }
8180             }
8181           }
8182           // Address for store instruction (non-constant)
8183           if(itype[i+1]==STORE||itype[i+1]==STORELR
8184              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
8185             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8186               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
8187               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
8188               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
8189               assert(hr>=0);
8190               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8191               {
8192                 regs[i].regmap[hr]=rs1[i+1];
8193                 regmap_pre[i+1][hr]=rs1[i+1];
8194                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8195                 regs[i].isconst&=~(1<<hr);
8196                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8197                 constmap[i][hr]=constmap[i+1][hr];
8198                 regs[i+1].wasdirty&=~(1<<hr);
8199                 regs[i].dirty&=~(1<<hr);
8200               }
8201             }
8202           }
8203           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
8204             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
8205               int nr;
8206               hr=get_reg(regs[i+1].regmap,FTEMP);
8207               assert(hr>=0);
8208               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
8209               {
8210                 regs[i].regmap[hr]=rs1[i+1];
8211                 regmap_pre[i+1][hr]=rs1[i+1];
8212                 regs[i+1].regmap_entry[hr]=rs1[i+1];
8213                 regs[i].isconst&=~(1<<hr);
8214                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
8215                 constmap[i][hr]=constmap[i+1][hr];
8216                 regs[i+1].wasdirty&=~(1<<hr);
8217                 regs[i].dirty&=~(1<<hr);
8218               }
8219               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
8220               {
8221                 // move it to another register
8222                 regs[i+1].regmap[hr]=-1;
8223                 regmap_pre[i+2][hr]=-1;
8224                 regs[i+1].regmap[nr]=FTEMP;
8225                 regmap_pre[i+2][nr]=FTEMP;
8226                 regs[i].regmap[nr]=rs1[i+1];
8227                 regmap_pre[i+1][nr]=rs1[i+1];
8228                 regs[i+1].regmap_entry[nr]=rs1[i+1];
8229                 regs[i].isconst&=~(1<<nr);
8230                 regs[i+1].isconst&=~(1<<nr);
8231                 regs[i].dirty&=~(1<<nr);
8232                 regs[i+1].wasdirty&=~(1<<nr);
8233                 regs[i+1].dirty&=~(1<<nr);
8234                 regs[i+2].wasdirty&=~(1<<nr);
8235               }
8236             }
8237           }
8238           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
8239             if(itype[i+1]==LOAD)
8240               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
8241             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
8242               hr=get_reg(regs[i+1].regmap,FTEMP);
8243             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
8244               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
8245               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
8246             }
8247             if(hr>=0&&regs[i].regmap[hr]<0) {
8248               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
8249               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
8250                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
8251                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
8252                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
8253                 regs[i].isconst&=~(1<<hr);
8254                 regs[i+1].wasdirty&=~(1<<hr);
8255                 regs[i].dirty&=~(1<<hr);
8256               }
8257             }
8258           }
8259         }
8260       }
8261     }
8262   }
8263
8264   /* Pass 6 - Optimize clean/dirty state */
8265   clean_registers(0,slen-1,1);
8266
8267   /* Pass 7 - Identify 32-bit registers */
8268   for (i=slen-1;i>=0;i--)
8269   {
8270     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8271     {
8272       // Conditional branch
8273       if((source[i]>>16)!=0x1000&&i<slen-2) {
8274         // Mark this address as a branch target since it may be called
8275         // upon return from interrupt
8276         bt[i+2]=1;
8277       }
8278     }
8279   }
8280
8281   if(itype[slen-1]==SPAN) {
8282     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
8283   }
8284
8285 #ifdef DISASM
8286   /* Debug/disassembly */
8287   for(i=0;i<slen;i++)
8288   {
8289     printf("U:");
8290     int r;
8291     for(r=1;r<=CCREG;r++) {
8292       if((unneeded_reg[i]>>r)&1) {
8293         if(r==HIREG) printf(" HI");
8294         else if(r==LOREG) printf(" LO");
8295         else printf(" r%d",r);
8296       }
8297     }
8298     printf("\n");
8299     #if defined(__i386__) || defined(__x86_64__)
8300     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
8301     #endif
8302     #ifdef __arm__
8303     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
8304     #endif
8305     printf("needs: ");
8306     if(needed_reg[i]&1) printf("eax ");
8307     if((needed_reg[i]>>1)&1) printf("ecx ");
8308     if((needed_reg[i]>>2)&1) printf("edx ");
8309     if((needed_reg[i]>>3)&1) printf("ebx ");
8310     if((needed_reg[i]>>5)&1) printf("ebp ");
8311     if((needed_reg[i]>>6)&1) printf("esi ");
8312     if((needed_reg[i]>>7)&1) printf("edi ");
8313     printf("\n");
8314     #if defined(__i386__) || defined(__x86_64__)
8315     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
8316     printf("dirty: ");
8317     if(regs[i].wasdirty&1) printf("eax ");
8318     if((regs[i].wasdirty>>1)&1) printf("ecx ");
8319     if((regs[i].wasdirty>>2)&1) printf("edx ");
8320     if((regs[i].wasdirty>>3)&1) printf("ebx ");
8321     if((regs[i].wasdirty>>5)&1) printf("ebp ");
8322     if((regs[i].wasdirty>>6)&1) printf("esi ");
8323     if((regs[i].wasdirty>>7)&1) printf("edi ");
8324     #endif
8325     #ifdef __arm__
8326     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
8327     printf("dirty: ");
8328     if(regs[i].wasdirty&1) printf("r0 ");
8329     if((regs[i].wasdirty>>1)&1) printf("r1 ");
8330     if((regs[i].wasdirty>>2)&1) printf("r2 ");
8331     if((regs[i].wasdirty>>3)&1) printf("r3 ");
8332     if((regs[i].wasdirty>>4)&1) printf("r4 ");
8333     if((regs[i].wasdirty>>5)&1) printf("r5 ");
8334     if((regs[i].wasdirty>>6)&1) printf("r6 ");
8335     if((regs[i].wasdirty>>7)&1) printf("r7 ");
8336     if((regs[i].wasdirty>>8)&1) printf("r8 ");
8337     if((regs[i].wasdirty>>9)&1) printf("r9 ");
8338     if((regs[i].wasdirty>>10)&1) printf("r10 ");
8339     if((regs[i].wasdirty>>12)&1) printf("r12 ");
8340     #endif
8341     printf("\n");
8342     disassemble_inst(i);
8343     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
8344     #if defined(__i386__) || defined(__x86_64__)
8345     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
8346     if(regs[i].dirty&1) printf("eax ");
8347     if((regs[i].dirty>>1)&1) printf("ecx ");
8348     if((regs[i].dirty>>2)&1) printf("edx ");
8349     if((regs[i].dirty>>3)&1) printf("ebx ");
8350     if((regs[i].dirty>>5)&1) printf("ebp ");
8351     if((regs[i].dirty>>6)&1) printf("esi ");
8352     if((regs[i].dirty>>7)&1) printf("edi ");
8353     #endif
8354     #ifdef __arm__
8355     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
8356     if(regs[i].dirty&1) printf("r0 ");
8357     if((regs[i].dirty>>1)&1) printf("r1 ");
8358     if((regs[i].dirty>>2)&1) printf("r2 ");
8359     if((regs[i].dirty>>3)&1) printf("r3 ");
8360     if((regs[i].dirty>>4)&1) printf("r4 ");
8361     if((regs[i].dirty>>5)&1) printf("r5 ");
8362     if((regs[i].dirty>>6)&1) printf("r6 ");
8363     if((regs[i].dirty>>7)&1) printf("r7 ");
8364     if((regs[i].dirty>>8)&1) printf("r8 ");
8365     if((regs[i].dirty>>9)&1) printf("r9 ");
8366     if((regs[i].dirty>>10)&1) printf("r10 ");
8367     if((regs[i].dirty>>12)&1) printf("r12 ");
8368     #endif
8369     printf("\n");
8370     if(regs[i].isconst) {
8371       printf("constants: ");
8372       #if defined(__i386__) || defined(__x86_64__)
8373       if(regs[i].isconst&1) printf("eax=%x ",(u_int)constmap[i][0]);
8374       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(u_int)constmap[i][1]);
8375       if((regs[i].isconst>>2)&1) printf("edx=%x ",(u_int)constmap[i][2]);
8376       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(u_int)constmap[i][3]);
8377       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(u_int)constmap[i][5]);
8378       if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]);
8379       if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]);
8380       #endif
8381       #ifdef __arm__
8382       int r;
8383       for (r = 0; r < ARRAY_SIZE(constmap[i]); r++)
8384         if ((regs[i].isconst >> r) & 1)
8385           printf(" r%d=%x", r, (u_int)constmap[i][r]);
8386       #endif
8387       printf("\n");
8388     }
8389     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
8390       #if defined(__i386__) || defined(__x86_64__)
8391       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
8392       if(branch_regs[i].dirty&1) printf("eax ");
8393       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
8394       if((branch_regs[i].dirty>>2)&1) printf("edx ");
8395       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
8396       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
8397       if((branch_regs[i].dirty>>6)&1) printf("esi ");
8398       if((branch_regs[i].dirty>>7)&1) printf("edi ");
8399       #endif
8400       #ifdef __arm__
8401       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
8402       if(branch_regs[i].dirty&1) printf("r0 ");
8403       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
8404       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
8405       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
8406       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
8407       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
8408       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
8409       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
8410       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
8411       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
8412       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
8413       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
8414       #endif
8415     }
8416   }
8417 #endif // DISASM
8418
8419   /* Pass 8 - Assembly */
8420   linkcount=0;stubcount=0;
8421   ds=0;is_delayslot=0;
8422   uint64_t is32_pre=0;
8423   u_int dirty_pre=0;
8424   void *beginning=start_block();
8425   if((u_int)addr&1) {
8426     ds=1;
8427     pagespan_ds();
8428   }
8429   void *instr_addr0_override = NULL;
8430
8431   if (start == 0x80030000) {
8432     // nasty hack for fastbios thing
8433     // override block entry to this code
8434     instr_addr0_override = out;
8435     emit_movimm(start,0);
8436     // abuse io address var as a flag that we
8437     // have already returned here once
8438     emit_readword(&address,1);
8439     emit_writeword(0,&pcaddr);
8440     emit_writeword(0,&address);
8441     emit_cmp(0,1);
8442     emit_jne(new_dyna_leave);
8443   }
8444   for(i=0;i<slen;i++)
8445   {
8446     //if(ds) printf("ds: ");
8447     disassemble_inst(i);
8448     if(ds) {
8449       ds=0; // Skip delay slot
8450       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
8451       instr_addr[i] = NULL;
8452     } else {
8453       speculate_register_values(i);
8454       #ifndef DESTRUCTIVE_WRITEBACK
8455       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
8456       {
8457         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
8458               unneeded_reg[i]);
8459       }
8460       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
8461         is32_pre=branch_regs[i].is32;
8462         dirty_pre=branch_regs[i].dirty;
8463       }else{
8464         is32_pre=regs[i].is32;
8465         dirty_pre=regs[i].dirty;
8466       }
8467       #endif
8468       // write back
8469       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
8470       {
8471         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,unneeded_reg[i]);
8472         loop_preload(regmap_pre[i],regs[i].regmap_entry);
8473       }
8474       // branch target entry point
8475       instr_addr[i] = out;
8476       assem_debug("<->\n");
8477       drc_dbg_emit_do_cmp(i);
8478
8479       // load regs
8480       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
8481         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
8482       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
8483       address_generation(i,&regs[i],regs[i].regmap_entry);
8484       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
8485       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8486       {
8487         // Load the delay slot registers if necessary
8488         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
8489           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
8490         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
8491           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
8492         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
8493           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
8494       }
8495       else if(i+1<slen)
8496       {
8497         // Preload registers for following instruction
8498         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
8499           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
8500             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
8501         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
8502           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
8503             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
8504       }
8505       // TODO: if(is_ooo(i)) address_generation(i+1);
8506       if(itype[i]==CJUMP||itype[i]==FJUMP)
8507         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
8508       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
8509         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
8510       // assemble
8511       switch(itype[i]) {
8512         case ALU:
8513           alu_assemble(i,&regs[i]);break;
8514         case IMM16:
8515           imm16_assemble(i,&regs[i]);break;
8516         case SHIFT:
8517           shift_assemble(i,&regs[i]);break;
8518         case SHIFTIMM:
8519           shiftimm_assemble(i,&regs[i]);break;
8520         case LOAD:
8521           load_assemble(i,&regs[i]);break;
8522         case LOADLR:
8523           loadlr_assemble(i,&regs[i]);break;
8524         case STORE:
8525           store_assemble(i,&regs[i]);break;
8526         case STORELR:
8527           storelr_assemble(i,&regs[i]);break;
8528         case COP0:
8529           cop0_assemble(i,&regs[i]);break;
8530         case COP1:
8531           cop1_assemble(i,&regs[i]);break;
8532         case C1LS:
8533           c1ls_assemble(i,&regs[i]);break;
8534         case COP2:
8535           cop2_assemble(i,&regs[i]);break;
8536         case C2LS:
8537           c2ls_assemble(i,&regs[i]);break;
8538         case C2OP:
8539           c2op_assemble(i,&regs[i]);break;
8540         case MULTDIV:
8541           multdiv_assemble(i,&regs[i]);break;
8542         case MOV:
8543           mov_assemble(i,&regs[i]);break;
8544         case SYSCALL:
8545           syscall_assemble(i,&regs[i]);break;
8546         case HLECALL:
8547           hlecall_assemble(i,&regs[i]);break;
8548         case INTCALL:
8549           intcall_assemble(i,&regs[i]);break;
8550         case UJUMP:
8551           ujump_assemble(i,&regs[i]);ds=1;break;
8552         case RJUMP:
8553           rjump_assemble(i,&regs[i]);ds=1;break;
8554         case CJUMP:
8555           cjump_assemble(i,&regs[i]);ds=1;break;
8556         case SJUMP:
8557           sjump_assemble(i,&regs[i]);ds=1;break;
8558         case FJUMP:
8559           assert(0);ds=1;break;
8560         case SPAN:
8561           pagespan_assemble(i,&regs[i]);break;
8562       }
8563       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
8564         literal_pool(1024);  // after an unconditional jump: safe point to flush pending literals
8565       else
8566         literal_pool_jumpover(256);  // mid-block: a flush here must jump over the emitted pool
8567     }
8568   }
8569   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
8570   // If the block did not end with an unconditional branch,
8571   // add a jump to the next instruction.
8572   if(i>1) {
8573     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
8574       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
8575       assert(i==slen);
8576       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {  // last two ops are plain (no branch at i-2)
8577         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);  // write dirty regs back for the fall-through target
8578         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
8579           emit_loadreg(CCREG,HOST_CCREG);
8580         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);  // charge the cycles (ccadj) accumulated up to block end
8581       }
8582       else if(!likely[i-2])  // conditional branch at i-2, not a branch-likely
8583       {
8584         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);  // fall-through state is in branch_regs
8585         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
8586       }
8587       else  // branch-likely: regs[i-2] is used here — presumably because the delay slot is skipped on fall-through (TODO confirm)
8588       {
8589         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
8590         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
8591       }
8592       add_to_linker(out,start+i*4,0);  // queue this fall-through jump for Pass 9 (linker) below
8593       emit_jmp(0);  // placeholder jump, patched by the linker
8594     }
8595   }
8596   else
8597   {
8598     assert(i>0);  // with i<=1 this is a single-instruction block
8599     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
8600     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
8601     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
8602       emit_loadreg(CCREG,HOST_CCREG);
8603     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
8604     add_to_linker(out,start+i*4,0);
8605     emit_jmp(0);
8606   }
8607
8608   // TODO: delay slot stubs?
8609   // Stubs
8610   for(i=0;i<stubcount;i++)  // emit the out-of-line slow paths queued during assembly
8611   {
8612     switch(stubs[i].type)
8613     {
8614       case LOADB_STUB:
8615       case LOADH_STUB:
8616       case LOADW_STUB:
8617       case LOADD_STUB:
8618       case LOADBU_STUB:
8619       case LOADHU_STUB:
8620         do_readstub(i);break;
8621       case STOREB_STUB:
8622       case STOREH_STUB:
8623       case STOREW_STUB:
8624       case STORED_STUB:
8625         do_writestub(i);break;
8626       case CC_STUB:
8627         do_ccstub(i);break;
8628       case INVCODE_STUB:
8629         do_invstub(i);break;
8630       case FP_STUB:
8631         do_cop1stub(i);break;
8632       case STORELR_STUB:
8633         do_unalignedwritestub(i);break;
8634     }
8635   }
8636
8637   if (instr_addr0_override)  // substitute entry point for instruction 0, if one was set earlier
8638     instr_addr[0] = instr_addr0_override;
8639
8640   /* Pass 9 - Linker */
8641   for(i=0;i<linkcount;i++)  // resolve every jump queued via add_to_linker()
8642   {
8643     assem_debug("%p -> %8x\n",link_addr[i].addr,link_addr[i].target);
8644     literal_pool(64);
8645     if (!link_addr[i].ext)  // target is outside this block: link directly or via stub
8646     {
8647       void *stub = out;  // the extjump stub is emitted next, at 'out'
8648       void *addr = check_addr(link_addr[i].target);  // already-compiled code for the target, if any
8649       emit_extjump(link_addr[i].addr, link_addr[i].target);
8650       if (addr) {
8651         set_jump_target(link_addr[i].addr, addr);  // jump straight to the compiled target
8652         add_link(link_addr[i].target,stub);  // keep the stub registered — presumably so the link can be undone later (verify)
8653       }
8654       else
8655         set_jump_target(link_addr[i].addr, stub);  // not compiled yet: go through the stub
8656     }
8657     else
8658     {
8659       // Internal branch
8660       int target=(link_addr[i].target-start)>>2;
8661       assert(target>=0&&target<slen);
8662       assert(instr_addr[target]);
8663       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
8664       //set_jump_target_fillslot(link_addr[i].addr,instr_addr[target],link_addr[i].ext>>1);
8665       //#else
8666       set_jump_target(link_addr[i].addr, instr_addr[target]);
8667       //#endif
8668     }
8669   }
8670   // External Branch Targets (jump_in)
8671   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;  // wrap the shadow buffer if this block's source copy won't fit
8672   for(i=0;i<slen;i++)
8673   {
8674     if(bt[i]||i==0)  // branch targets and the block start become externally reachable entry points
8675     {
8676       if(instr_addr[i]) // TODO - delay slots (=null)
8677       {
8678         u_int vaddr=start+i*4;
8679         u_int page=get_page(vaddr);
8680         u_int vpage=get_vpage(vaddr);
8681         literal_pool(256);
8682         {
8683           assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
8684           assem_debug("jump_in: %x\n",start+i*4);
8685           ll_add(jump_dirty+vpage,vaddr,out);  // the dirty-check stub (emitted next) is registered in jump_dirty
8686           void *entry_point = do_dirty_stub(i);
8687           ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
8688           // If there was an existing entry in the hash table,
8689           // replace it with the new address.
8690           // Don't add new entries.  We'll insert the
8691           // ones that actually get used in check_addr().
8692           struct ht_entry *ht_bin = hash_table_get(vaddr);
8693           if (ht_bin->vaddr[0] == vaddr)
8694             ht_bin->tcaddr[0] = entry_point;
8695           if (ht_bin->vaddr[1] == vaddr)
8696             ht_bin->tcaddr[1] = entry_point;
8697         }
8698       }
8699     }
8700   }
8701   // Write out the literal pool if necessary
8702   literal_pool(0);
8703   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
8704   // Align code
8705   if(((u_int)out)&7) emit_addnop(13);  // pad 'out' to an 8-byte boundary
8706   #endif
8707   assert(out - (u_char *)beginning < MAX_OUTPUT_BLOCK_SIZE);
8708   //printf("shadow buffer: %p-%p\n",copy,(u_char *)copy+slen*4);
8709   memcpy(copy,source,slen*4);  // snapshot the MIPS source — presumably compared against later to detect dirty blocks (verify)
8710   copy+=slen*4;
8711
8712   end_block(beginning);
8713
8714   // If we're within 256K of the end of the buffer,
8715   // start over from the beginning. (Is 256K enough?)
8716   if (out > translation_cache+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE)
8717     out = translation_cache;
8718
8719   // Trap writes to any of the pages we compiled
8720   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
8721     invalid_code[i]=0;
8722   }
8723   inv_code_start=inv_code_end=~0;
8724
8725   // for PCSX we need to mark all mirrors too
8726   if(get_page(start)<(RAM_SIZE>>12))
8727     for(i=start>>12;i<=(start+slen*4)>>12;i++)
8728       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
8729       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
8730       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;  // i&0x1ff: 512 4K pages (2MB RAM), cleared at all three mirror bases
8731
8732   /* Pass 10 - Free memory by expiring oldest blocks */
8733
8734   int end=(((out-translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535;  // 16-bit phase index, offset 1/4 cycle ahead of 'out'
8735   while(expirep!=end)  // advance the expire pointer until it catches up with 'end'
8736   {
8737     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
8738     uintptr_t base=(uintptr_t)translation_cache+((expirep>>13)<<shift); // Base address of this block
8739     inv_debug("EXP: Phase %d\n",expirep);
8740     switch((expirep>>11)&3)  // each region is expired in 4 sub-phases (bits 11-12 of expirep)
8741     {
8742       case 0:
8743         // Clear jump_in and jump_dirty
8744         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
8745         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
8746         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
8747         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
8748         break;
8749       case 1:
8750         // Clear pointers
8751         ll_kill_pointers(jump_out[expirep&2047],base,shift);
8752         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
8753         break;
8754       case 2:
8755         // Clear hash table
8756         for(i=0;i<32;i++) {
8757           struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
8758           if (((uintptr_t)ht_bin->tcaddr[1]>>shift) == (base>>shift) ||
8759              (((uintptr_t)ht_bin->tcaddr[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {  // also matches addrs within MAX_OUTPUT_BLOCK_SIZE past the region
8760             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
8761             ht_bin->vaddr[1] = -1;  // -1 marks the slot empty
8762             ht_bin->tcaddr[1] = NULL;
8763           }
8764           if (((uintptr_t)ht_bin->tcaddr[0]>>shift) == (base>>shift) ||
8765              (((uintptr_t)ht_bin->tcaddr[0]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
8766             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
8767             ht_bin->vaddr[0] = ht_bin->vaddr[1];  // shift slot 1 down into the freed slot 0
8768             ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
8769             ht_bin->vaddr[1] = -1;
8770             ht_bin->tcaddr[1] = NULL;
8771           }
8772         }
8773         break;
8774       case 3:
8775         // Clear jump_out
8776         #ifdef __arm__
8777         if((expirep&2047)==0)  // once per pass through this sub-phase
8778           do_clear_cache();
8779         #endif
8780         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
8781         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
8782         break;
8783     }
8784     expirep=(expirep+1)&65535;
8785   }
8786   return 0;
8787 }
8788
8789 // vim:shiftwidth=2:expandtab