78e53d45e75933b991911503ea598ba7eafc3bd4
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h" //emulator interface
39 #include "emu_if.h" //emulator interface
40
41 #ifndef ARRAY_SIZE
42 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
43 #endif
44
45 //#define DISASM
46 //#define assem_debug printf
47 //#define inv_debug printf
48 #define assem_debug(...)
49 #define inv_debug(...)
50
51 #ifdef __i386__
52 #include "assem_x86.h"
53 #endif
54 #ifdef __x86_64__
55 #include "assem_x64.h"
56 #endif
57 #ifdef __arm__
58 #include "assem_arm.h"
59 #endif
60
61 #define MAXBLOCK 4096
62 #define MAX_OUTPUT_BLOCK_SIZE 262144
63
64 // stubs
65 enum stub_type {
66   CC_STUB = 1,
67   FP_STUB = 2,
68   LOADB_STUB = 3,
69   LOADH_STUB = 4,
70   LOADW_STUB = 5,
71   LOADD_STUB = 6,
72   LOADBU_STUB = 7,
73   LOADHU_STUB = 8,
74   STOREB_STUB = 9,
75   STOREH_STUB = 10,
76   STOREW_STUB = 11,
77   STORED_STUB = 12,
78   STORELR_STUB = 13,
79   INVCODE_STUB = 14,
80 };
81
82 struct regstat
83 {
84   signed char regmap_entry[HOST_REGS];
85   signed char regmap[HOST_REGS];
86   uint64_t was32;
87   uint64_t is32;
88   uint64_t wasdirty;
89   uint64_t dirty;
90   uint64_t u;
91   uint64_t uu;
92   u_int wasconst;
93   u_int isconst;
94   u_int loadedconst;             // host regs that have constants loaded
95   u_int waswritten;              // MIPS regs that were used as store base before
96 };
97
98 // note: asm depends on this layout
99 struct ll_entry
100 {
101   u_int vaddr;
102   u_int reg_sv_flags;
103   void *addr;
104   struct ll_entry *next;
105 };
106
107 struct ht_entry
108 {
109   u_int vaddr[2];
110   void *tcaddr[2];
111 };
112
113 struct code_stub
114 {
115   enum stub_type type;
116   void *addr;
117   void *retaddr;
118   u_int a;
119   uintptr_t b;
120   uintptr_t c;
121   u_int d;
122   u_int e;
123 };
124
125 struct link_entry
126 {
127   void *addr;
128   u_int target;
129   u_int ext;
130 };
131
132   // used by asm:
133   u_char *out;
134   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
135   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
136   struct ll_entry *jump_dirty[4096];
137
138   static struct ll_entry *jump_out[4096];
139   static u_int start;
140   static u_int *source;
141   static char insn[MAXBLOCK][10];
142   static u_char itype[MAXBLOCK];
143   static u_char opcode[MAXBLOCK];
144   static u_char opcode2[MAXBLOCK];
145   static u_char bt[MAXBLOCK];
146   static u_char rs1[MAXBLOCK];
147   static u_char rs2[MAXBLOCK];
148   static u_char rt1[MAXBLOCK];
149   static u_char rt2[MAXBLOCK];
150   static u_char us1[MAXBLOCK];
151   static u_char us2[MAXBLOCK];
152   static u_char dep1[MAXBLOCK];
153   static u_char dep2[MAXBLOCK];
154   static u_char lt1[MAXBLOCK];
155   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
156   static uint64_t gte_rt[MAXBLOCK];
157   static uint64_t gte_unneeded[MAXBLOCK];
158   static u_int smrv[32]; // speculated MIPS register values
159   static u_int smrv_strong; // mask or regs that are likely to have correct values
160   static u_int smrv_weak; // same, but somewhat less likely
161   static u_int smrv_strong_next; // same, but after current insn executes
162   static u_int smrv_weak_next;
163   static int imm[MAXBLOCK];
164   static u_int ba[MAXBLOCK];
165   static char likely[MAXBLOCK];
166   static char is_ds[MAXBLOCK];
167   static char ooo[MAXBLOCK];
168   static uint64_t unneeded_reg[MAXBLOCK];
169   static uint64_t unneeded_reg_upper[MAXBLOCK];
170   static uint64_t branch_unneeded_reg[MAXBLOCK];
171   static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
172   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
173   static uint64_t current_constmap[HOST_REGS];
174   static uint64_t constmap[MAXBLOCK][HOST_REGS];
175   static struct regstat regs[MAXBLOCK];
176   static struct regstat branch_regs[MAXBLOCK];
177   static signed char minimum_free_regs[MAXBLOCK];
178   static u_int needed_reg[MAXBLOCK];
179   static u_int wont_dirty[MAXBLOCK];
180   static u_int will_dirty[MAXBLOCK];
181   static int ccadj[MAXBLOCK];
182   static int slen;
183   static void *instr_addr[MAXBLOCK];
184   static struct link_entry link_addr[MAXBLOCK];
185   static int linkcount;
186   static struct code_stub stubs[MAXBLOCK*3];
187   static int stubcount;
188   static u_int literals[1024][2];
189   static int literalcount;
190   static int is_delayslot;
191   static int cop1_usable;
192   static char shadow[1048576]  __attribute__((aligned(16)));
193   static void *copy;
194   static int expirep;
195   static u_int stop_after_jal;
196 #ifndef RAM_FIXED
197   static u_int ram_offset;
198 #else
199   static const u_int ram_offset=0;
200 #endif
201
202   int new_dynarec_hacks;
203   int new_dynarec_did_compile;
204   extern u_char restore_candidate[512];
205   extern int cycle_count;
206
207   /* registers that may be allocated */
208   /* 1-31 gpr */
209 #define HIREG 32 // hi
210 #define LOREG 33 // lo
211 #define FSREG 34 // FPU status (FCSR)
212 #define CSREG 35 // Coprocessor status
213 #define CCREG 36 // Cycle count
214 #define INVCP 37 // Pointer to invalid_code
215 //#define MMREG 38 // Pointer to memory_map
216 #define ROREG 39 // ram offset (if rdram!=0x80000000)
217 #define TEMPREG 40
218 #define FTEMP 40 // FPU temporary register
219 #define PTEMP 41 // Prefetch temporary register
220 //#define TLREG 42 // TLB mapping offset
221 #define RHASH 43 // Return address hash
222 #define RHTBL 44 // Return address hash table address
223 #define RTEMP 45 // JR/JALR address register
224 #define MAXREG 45
225 #define AGEN1 46 // Address generation temporary register
226 //#define AGEN2 47 // Address generation temporary register
227 //#define MGEN1 48 // Maptable address generation temporary register
228 //#define MGEN2 49 // Maptable address generation temporary register
229 #define BTREG 50 // Branch target temporary register
230
231   /* instruction types */
232 #define NOP 0     // No operation
233 #define LOAD 1    // Load
234 #define STORE 2   // Store
235 #define LOADLR 3  // Unaligned load
236 #define STORELR 4 // Unaligned store
237 #define MOV 5     // Move
238 #define ALU 6     // Arithmetic/logic
239 #define MULTDIV 7 // Multiply/divide
240 #define SHIFT 8   // Shift by register
241 #define SHIFTIMM 9// Shift by immediate
242 #define IMM16 10  // 16-bit immediate
243 #define RJUMP 11  // Unconditional jump to register
244 #define UJUMP 12  // Unconditional jump
245 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
246 #define SJUMP 14  // Conditional branch (regimm format)
247 #define COP0 15   // Coprocessor 0
248 #define COP1 16   // Coprocessor 1
249 #define C1LS 17   // Coprocessor 1 load/store
250 #define FJUMP 18  // Conditional branch (floating point)
251 #define FLOAT 19  // Floating point unit
252 #define FCONV 20  // Convert integer to float
253 #define FCOMP 21  // Floating point compare (sets FSREG)
254 #define SYSCALL 22// SYSCALL
255 #define OTHER 23  // Other
256 #define SPAN 24   // Branch/delay slot spans 2 pages
257 #define NI 25     // Not implemented
258 #define HLECALL 26// PCSX fake opcodes for HLE
259 #define COP2 27   // Coprocessor 2 move
260 #define C2LS 28   // Coprocessor 2 load/store
261 #define C2OP 29   // Coprocessor 2 operation
262 #define INTCALL 30// Call interpreter to handle rare corner cases
263
264   /* branch codes */
265 #define TAKEN 1
266 #define NOTTAKEN 2
267 #define NULLDS 3
268
269 // asm linkage
270 int new_recompile_block(int addr);
271 void *get_addr_ht(u_int vaddr);
272 void invalidate_block(u_int block);
273 void invalidate_addr(u_int addr);
274 void remove_hash(int vaddr);
275 void dyna_linker();
276 void dyna_linker_ds();
277 void verify_code();
278 void verify_code_vm();
279 void verify_code_ds();
280 void cc_interrupt();
281 void fp_exception();
282 void fp_exception_ds();
283 void jump_syscall_hle();
284 void jump_hlecall();
285 void jump_intcall();
286 void new_dyna_leave();
287
288 // Needed by assembler
289 static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
290 static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
291 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
292 static void load_all_regs(signed char i_regmap[]);
293 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
294 static void load_regs_entry(int t);
295 static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
296
297 static int verify_dirty(u_int *ptr);
298 static int get_final_value(int hr, int i, int *value);
299 static void add_stub(enum stub_type type, void *addr, void *retaddr,
300   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
301 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
302   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist);
303 static void add_to_linker(void *addr, u_int target, int ext);
304
// Toggle the translation cache between writable and executable on
// platforms that enforce W^X (NO_WRITE_EXEC). With is_x nonzero the
// range [start,end) becomes read+exec; otherwise read+write.
// No-op when NO_WRITE_EXEC is not defined.
static void mprotect_w_x(void *start, void *end, int is_x)
{
#ifdef NO_WRITE_EXEC
  #if defined(VITA)
  // *Open* enables write on all memory that was
  // allocated by sceKernelAllocMemBlockForVM()?
  if (is_x)
    sceKernelCloseVMDomain();
  else
    sceKernelOpenVMDomain();
  #else
  // mprotect requires a page-aligned start; round down to a 4K boundary.
  u_long mstart = (u_long)start & ~4095ul;
  u_long mend = (u_long)end;
  if (mprotect((void *)mstart, mend - mstart,
               PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
    SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
  #endif
#endif
}
324
// Make the translation cache range [start,end) writable before
// emitting or patching generated code.
static void start_tcache_write(void *start, void *end)
{
  mprotect_w_x(start, end, 0);
}
329
// Finish writing to the translation cache range [start,end):
// flush/invalidate the instruction cache for the range (ARM hosts
// need explicit coherency between D- and I-cache) and flip the pages
// back to executable.
static void end_tcache_write(void *start, void *end)
{
#ifdef __arm__
  size_t len = (char *)end - (char *)start;
  #if   defined(__BLACKBERRY_QNX__)
  msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
  #elif defined(__MACH__)
  sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
  #elif defined(VITA)
  sceKernelSyncVMDomain(sceBlock, start, len);
  #elif defined(_3DS)
  ctr_flush_invalidate_cache();
  #else
  __clear_cache(start, end);
  #endif
  (void)len;  // some branches above don't use len
#endif

  mprotect_w_x(start, end, 1);
}
350
351 static void *start_block(void)
352 {
353   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
354   if (end > translation_cache + (1<<TARGET_SIZE_2))
355     end = translation_cache + (1<<TARGET_SIZE_2);
356   start_tcache_write(out, end);
357   return out;
358 }
359
// Finish emitting a block that began at 'start': flush caches and
// re-protect everything written up to the current output pointer.
static void end_block(void *start)
{
  end_tcache_write(start, out);
}
364
365 //#define DEBUG_CYCLE_COUNT 1
366
367 #define NO_CYCLE_PENALTY_THR 12
368
369 int cycle_multiplier; // 100 for 1.0
370
371 static int CLOCK_ADJUST(int x)
372 {
373   int s=(x>>31)|1;
374   return (x * cycle_multiplier + s * 50) / 100;
375 }
376
377 static u_int get_page(u_int vaddr)
378 {
379   u_int page=vaddr&~0xe0000000;
380   if (page < 0x1000000)
381     page &= ~0x0e00000; // RAM mirrors
382   page>>=12;
383   if(page>2048) page=2048+(page&2047);
384   return page;
385 }
386
// no virtual mem in PCSX
// (on PSX the "virtual" page is identical to the physical one, so
// this just forwards to get_page)
static u_int get_vpage(u_int vaddr)
{
  return get_page(vaddr);
}
392
393 static struct ht_entry *hash_table_get(u_int vaddr)
394 {
395   return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
396 }
397
398 static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
399 {
400   ht_bin->vaddr[1] = ht_bin->vaddr[0];
401   ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
402   ht_bin->vaddr[0] = vaddr;
403   ht_bin->tcaddr[0] = tcaddr;
404 }
405
// some messy ari64's code, seems to rely on unsigned 32bit overflow
// Returns nonzero when tcaddr is far enough ahead of the current
// output pointer that it won't be expired from the translation cache
// soon. The distance is scaled into the top bits so the comparison
// works modulo the cache size (wraparound is intentional).
static int doesnt_expire_soon(void *tcaddr)
{
  u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
  return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
}
412
// Get address from virtual address
// This is called from the recompiled JR/JALR instructions
// Lookup order: clean compiled blocks (jump_in), then dirty blocks
// that can be revalidated (jump_dirty), else compile the block now.
void *get_addr(u_int vaddr)
{
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  struct ll_entry *head;
  //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
  // First: a clean, already-compiled block on this page.
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
  //printf("TRACE: count=%d next=%d (get_addr match %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // Promote the hit into the hash table for fast future lookups.
      hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
      return head->addr;
    }
    head=head->next;
  }
  // Second: a dirty block whose source may still be unmodified.
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr))
      if (verify_dirty(head->addr)) {
        // Source matches the compiled code: mark the page valid again.
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        invalid_code[vaddr>>12]=0;
        inv_code_start=inv_code_end=~0;
        if(vpage<2048) {
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        struct ht_entry *ht_bin = hash_table_get(vaddr);
        if (ht_bin->vaddr[0] == vaddr)
          ht_bin->tcaddr[0] = head->addr; // Replace existing entry
        else
          hash_table_add(ht_bin, vaddr, head->addr);

        return head->addr;
      }
    }
    head=head->next;
  }
  //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
  // Not compiled yet: compile now and retry the lookup.
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault exception
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}
467 // Look up address in hash table first
468 void *get_addr_ht(u_int vaddr)
469 {
470   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
471   const struct ht_entry *ht_bin = hash_table_get(vaddr);
472   if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
473   if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
474   return get_addr(vaddr);
475 }
476
477 void clear_all_regs(signed char regmap[])
478 {
479   int hr;
480   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
481 }
482
483 signed char get_reg(signed char regmap[],int r)
484 {
485   int hr;
486   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
487   return -1;
488 }
489
490 // Find a register that is available for two consecutive cycles
491 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
492 {
493   int hr;
494   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
495   return -1;
496 }
497
498 int count_free_regs(signed char regmap[])
499 {
500   int count=0;
501   int hr;
502   for(hr=0;hr<HOST_REGS;hr++)
503   {
504     if(hr!=EXCLUDE_REG) {
505       if(regmap[hr]<0) count++;
506     }
507   }
508   return count;
509 }
510
511 void dirty_reg(struct regstat *cur,signed char reg)
512 {
513   int hr;
514   if(!reg) return;
515   for (hr=0;hr<HOST_REGS;hr++) {
516     if((cur->regmap[hr]&63)==reg) {
517       cur->dirty|=1<<hr;
518     }
519   }
520 }
521
522 // If we dirty the lower half of a 64 bit register which is now being
523 // sign-extended, we need to dump the upper half.
524 // Note: Do this only after completion of the instruction, because
525 // some instructions may need to read the full 64-bit value even if
526 // overwriting it (eg SLTI, DSRA32).
527 static void flush_dirty_uppers(struct regstat *cur)
528 {
529   int hr,reg;
530   for (hr=0;hr<HOST_REGS;hr++) {
531     if((cur->dirty>>hr)&1) {
532       reg=cur->regmap[hr];
533       if(reg>=64)
534         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
535     }
536   }
537 }
538
539 void set_const(struct regstat *cur,signed char reg,uint64_t value)
540 {
541   int hr;
542   if(!reg) return;
543   for (hr=0;hr<HOST_REGS;hr++) {
544     if(cur->regmap[hr]==reg) {
545       cur->isconst|=1<<hr;
546       current_constmap[hr]=value;
547     }
548     else if((cur->regmap[hr]^64)==reg) {
549       cur->isconst|=1<<hr;
550       current_constmap[hr]=value>>32;
551     }
552   }
553 }
554
555 void clear_const(struct regstat *cur,signed char reg)
556 {
557   int hr;
558   if(!reg) return;
559   for (hr=0;hr<HOST_REGS;hr++) {
560     if((cur->regmap[hr]&63)==reg) {
561       cur->isconst&=~(1<<hr);
562     }
563   }
564 }
565
566 int is_const(struct regstat *cur,signed char reg)
567 {
568   int hr;
569   if(reg<0) return 0;
570   if(!reg) return 1;
571   for (hr=0;hr<HOST_REGS;hr++) {
572     if((cur->regmap[hr]&63)==reg) {
573       return (cur->isconst>>hr)&1;
574     }
575   }
576   return 0;
577 }
578 uint64_t get_const(struct regstat *cur,signed char reg)
579 {
580   int hr;
581   if(!reg) return 0;
582   for (hr=0;hr<HOST_REGS;hr++) {
583     if(cur->regmap[hr]==reg) {
584       return current_constmap[hr];
585     }
586   }
587   SysPrintf("Unknown constant in r%d\n",reg);
588   exit(1);
589 }
590
// Least soon needed registers
// Look at the next ten instructions and see which registers
// will be used.  Try not to reallocate these.
// On return, hsn[reg] holds the distance (in instructions) to the
// next use of each guest register that was seen in the window.
void lsn(u_char hsn[], int i, int *preferred_reg)
{
  int j;
  int b=-1;
  // Find the end of the scan window: up to 9 instructions, clipped at
  // the block end or the first unconditional jump.
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
  }
  // Walk backwards so the nearest use wins (smallest j written last).
  for(;j>=0;j--)
  {
    if(rs1[i+j]) hsn[rs1[i+j]]=j;
    if(rs2[i+j]) hsn[rs2[i+j]]=j;
    if(rt1[i+j]) hsn[rt1[i+j]]=j;
    if(rt2[i+j]) hsn[rt2[i+j]]=j;
    if(itype[i+j]==STORE || itype[i+j]==STORELR) {
      // Stores can allocate zero
      hsn[rs1[i+j]]=j;
      hsn[rs2[i+j]]=j;
    }
    // On some architectures stores need invc_ptr
    #if defined(HOST_IMM8)
    if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
      hsn[INVCP]=j;
    }
    #endif
    // Branches need the cycle counter; remember the nearest branch.
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
    {
      hsn[CCREG]=j;
      b=j;
    }
  }
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        // Uses seen past the branch count as "further away" (j+b+2)
        // and never lower an existing (closer) distance.
        if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
        if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
        //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
        //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
      }
    }
    // TODO: preferred register based on backward branch
  }
  // Delay slot should preferably not overwrite branch conditions or cycle count
  if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
    if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
    if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
    hsn[CCREG]=1;
    // ...or hash tables
    hsn[RHASH]=1;
    hsn[RHTBL]=1;
  }
  // Coprocessor load/store needs FTEMP, even if not declared
  if(itype[i]==C1LS||itype[i]==C2LS) {
    hsn[FTEMP]=0;
  }
  // Load L/R also uses FTEMP as a temporary register
  if(itype[i]==LOADLR) {
    hsn[FTEMP]=0;
  }
  // Also SWL/SWR/SDL/SDR
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
    hsn[FTEMP]=0;
  }
  // Don't remove the miniht registers
  if(itype[i]==UJUMP||itype[i]==RJUMP)
  {
    hsn[RHASH]=0;
    hsn[RHTBL]=0;
  }
}
679
// We only want to allocate registers if we're going to use them again soon
// Returns 1 when guest register r is read within the next few
// instructions (and is not marked unneeded), else 0.
int needed_again(int r, int i)
{
  int j;
  int b=-1;
  int rn=10;  // distance to next use; 10 means "not soon"

  // If the previous instruction unconditionally leaves the block,
  // nothing here is needed again.
  if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
  {
    if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
      return 0; // Don't need any registers if exiting the block
  }
  // Bound the lookahead window: block end, unconditional jump, or an
  // instruction that exits to the interpreter (syscall/HLE/break).
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
    if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
    {
      break;
    }
  }
  // Walk backwards so the closest use is recorded last; a later
  // "unneeded" mark overrides any use found beyond it.
  for(;j>=1;j--)
  {
    if(rs1[i+j]==r) rn=j;
    if(rs2[i+j]==r) rn=j;
    if((unneeded_reg[i+j]>>r)&1) rn=10;
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
    {
      b=j;
    }
  }
  /*
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int o=rn;
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(!((unneeded_reg[t+j]>>r)&1)) {
          if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
          if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
        }
        else rn=o;
      }
    }
  }*/
  if(rn<10) return 1;
  (void)b;
  return 0;
}
742
// Try to match register allocations at the end of a loop with those
// at the beginning
// Returns the host register used for r at a backward-branch target
// within this block, or hr unchanged if no better choice was found.
int loop_reg(int i, int r, int hr)
{
  int j,k;
  // Bound the forward window exactly as in lsn()/needed_again().
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
  }
  k=0;
  if(i>0){
    // If the previous instruction is a branch, also consider it.
    if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
      k--;
  }
  for(;k<j;k++)
  {
    // NOTE(review): for upper-half regs (r>64) the shift by r on a
    // 64-bit value looks out of range — presumably (r&63) semantics
    // are intended; confirm against upstream before changing.
    if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
    if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
    if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
    {
      // Backward branch inside the block: reuse the allocation that
      // the branch target entry expects.
      if(ba[i+k]>=start && ba[i+k]<(start+i*4))
      {
        int t=(ba[i+k]-start)>>2;
        int reg=get_reg(regs[t].regmap_entry,r);
        if(reg>=0) return reg;
        //reg=get_reg(regs[t+1].regmap_entry,r);
        //if(reg>=0) return reg;
      }
    }
  }
  return hr;
}
784
785
786 // Allocate every register, preserving source/target regs
787 void alloc_all(struct regstat *cur,int i)
788 {
789   int hr;
790
791   for(hr=0;hr<HOST_REGS;hr++) {
792     if(hr!=EXCLUDE_REG) {
793       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
794          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
795       {
796         cur->regmap[hr]=-1;
797         cur->dirty&=~(1<<hr);
798       }
799       // Don't need zeros
800       if((cur->regmap[hr]&63)==0)
801       {
802         cur->regmap[hr]=-1;
803         cur->dirty&=~(1<<hr);
804       }
805     }
806   }
807 }
808
809 #ifdef __i386__
810 #include "assem_x86.c"
811 #endif
812 #ifdef __x86_64__
813 #include "assem_x64.c"
814 #endif
815 #ifdef __arm__
816 #include "assem_arm.c"
817 #endif
818
819 // Add virtual address mapping to linked list
820 void ll_add(struct ll_entry **head,int vaddr,void *addr)
821 {
822   struct ll_entry *new_entry;
823   new_entry=malloc(sizeof(struct ll_entry));
824   assert(new_entry!=NULL);
825   new_entry->vaddr=vaddr;
826   new_entry->reg_sv_flags=0;
827   new_entry->addr=addr;
828   new_entry->next=*head;
829   *head=new_entry;
830 }
831
// Like ll_add(), but also tags the freshly inserted entry (which
// ll_add put at *head) with the given register-validity flags.
void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
{
  ll_add(head,vaddr,addr);
  (*head)->reg_sv_flags=reg_sv_flags;
}
837
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
// Checks the hash table first, then the jump_in list; a jump_in hit
// may be inserted into the hash table, but only into an empty slot.
void *check_addr(u_int vaddr)
{
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  size_t i;
  for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
    if (ht_bin->vaddr[i] == vaddr)
      if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
        if (isclean(ht_bin->tcaddr[i]))
          return ht_bin->tcaddr[i];
  }
  u_int page=get_page(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while (head != NULL) {
    if (head->vaddr == vaddr) {
      if (doesnt_expire_soon(head->addr)) {
        // Update existing entry with current address
        if (ht_bin->vaddr[0] == vaddr) {
          ht_bin->tcaddr[0] = head->addr;
          return head->addr;
        }
        if (ht_bin->vaddr[1] == vaddr) {
          ht_bin->tcaddr[1] = head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        if (ht_bin->vaddr[0] == -1) {
          ht_bin->vaddr[0] = vaddr;
          ht_bin->tcaddr[0] = head->addr;
        }
        else if (ht_bin->vaddr[1] == -1) {
          ht_bin->vaddr[1] = vaddr;
          ht_bin->tcaddr[1] = head->addr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  return 0;
}
883
884 void remove_hash(int vaddr)
885 {
886   //printf("remove hash: %x\n",vaddr);
887   struct ht_entry *ht_bin = hash_table_get(vaddr);
888   if (ht_bin->vaddr[1] == vaddr) {
889     ht_bin->vaddr[1] = -1;
890     ht_bin->tcaddr[1] = NULL;
891   }
892   if (ht_bin->vaddr[0] == vaddr) {
893     ht_bin->vaddr[0] = ht_bin->vaddr[1];
894     ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
895     ht_bin->vaddr[1] = -1;
896     ht_bin->tcaddr[1] = NULL;
897   }
898 }
899
900 void ll_remove_matching_addrs(struct ll_entry **head,uintptr_t addr,int shift)
901 {
902   struct ll_entry *next;
903   while(*head) {
904     if(((uintptr_t)((*head)->addr)>>shift)==(addr>>shift) ||
905        ((uintptr_t)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
906     {
907       inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr);
908       remove_hash((*head)->vaddr);
909       next=(*head)->next;
910       free(*head);
911       *head=next;
912     }
913     else
914     {
915       head=&((*head)->next);
916     }
917   }
918 }
919
920 // Remove all entries from linked list
921 void ll_clear(struct ll_entry **head)
922 {
923   struct ll_entry *cur;
924   struct ll_entry *next;
925   if((cur=*head)) {
926     *head=0;
927     while(cur) {
928       next=cur->next;
929       free(cur);
930       cur=next;
931     }
932   }
933 }
934
// Dereference the pointers and remove if it matches
// For each jump_out entry, if the jump currently targets code in the
// (shift-aligned) region of addr, redirect it back to the dynamic
// linker stub so the target gets re-resolved on next execution.
static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift)
{
  while(head) {
    uintptr_t ptr = (uintptr_t)get_pointer(head->addr);
    inv_debug("EXP: Lookup pointer to %lx at %p (%x)\n",(long)ptr,head->addr,head->vaddr);
    if(((ptr>>shift)==(addr>>shift)) ||
       (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
    {
      inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr);
      void *host_addr=find_extjump_insn(head->addr);
      #ifdef __arm__
        // patched code must be flushed from the icache later
        mark_clear_cache(host_addr);
      #endif
      set_jump_target(host_addr, head->addr);
    }
    head=head->next;
  }
}
954
// This is called when we write to a compiled block (see do_invstub)
// Drops all compiled entry points on the page (jump_in) and unlinks
// all external jumps into it (jump_out), freeing both lists.
void invalidate_page(u_int page)
{
  struct ll_entry *head;
  struct ll_entry *next;
  // Remove and free every entry point on this page.
  head=jump_in[page];
  jump_in[page]=0;
  while(head!=NULL) {
    inv_debug("INVALIDATE: %x\n",head->vaddr);
    remove_hash(head->vaddr);
    next=head->next;
    free(head);
    head=next;
  }
  // Redirect every external jump into this page back to the dynamic
  // linker stub, then free the bookkeeping entries.
  head=jump_out[page];
  jump_out[page]=0;
  while(head!=NULL) {
    inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr);
    void *host_addr=find_extjump_insn(head->addr);
    #ifdef __arm__
      mark_clear_cache(host_addr);
    #endif
    set_jump_target(host_addr, head->addr);
    next=head->next;
    free(head);
    head=next;
  }
}
983
// Invalidate a compiled block plus the page range [first,last] that
// its code spans (blocks may cross 4K page boundaries).
static void invalidate_block_range(u_int block, u_int first, u_int last)
{
  u_int page=get_page(block<<12);
  //printf("first=%d last=%d\n",first,last);
  invalidate_page(page);
  assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
  assert(last<page+5);
  // Invalidate the adjacent pages if a block crosses a 4K boundary
  while(first<page) {
    invalidate_page(first);
    first++;
  }
  for(first=page+1;first<last;first++) {
    invalidate_page(first);
  }
  #ifdef __arm__
    // flush all code patched by the invalidations above
    do_clear_cache();
  #endif

  // Don't trap writes
  invalid_code[block]=1;

  #ifdef USE_MINI_HT
  memset(mini_ht,-1,sizeof(mini_ht));
  #endif
}
1010
// Invalidate the compiled code covering the 4K page 'block'
// (block is a page number, i.e. address>>12). Scans jump_dirty to
// compute the full page range the affected code spans, then
// delegates to invalidate_block_range().
void invalidate_block(u_int block)
{
  u_int page=get_page(block<<12);
  u_int vpage=get_vpage(block<<12);
  inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
  //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
  u_int first,last;
  first=last=page;
  struct ll_entry *head;
  head=jump_dirty[vpage];
  //printf("page=%d vpage=%d\n",page,vpage);
  while(head!=NULL) {
    // NOTE: start/end here shadow the file-level globals of the same
    // names; they are the source-address bounds of a compiled block.
    u_int start,end;
    if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
      get_bounds(head->addr,&start,&end);
      //printf("start: %x end: %x\n",start,end);
      if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
        // Widen [first,last] to cover every RAM page this block touches.
        if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
          if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
          if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
        }
      }
    }
    head=head->next;
  }
  invalidate_block_range(block,first,last);
}
1038
// Invalidate compiled code covering a single written address.
// For RAM it also maintains [inv_code_start, inv_code_end], a cached
// range known to contain no compiled code, so the caller can skip
// future invalidation calls that fall inside it.
void invalidate_addr(u_int addr)
{
  //static int rhits;
  // this check is done by the caller
  //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
  u_int page=get_vpage(addr);
  if(page<2048) { // RAM
    struct ll_entry *head;
    u_int addr_min=~0, addr_max=0;
    u_int mask=RAM_SIZE-1;
    u_int addr_main=0x80000000|(addr&mask);
    int pg1;
    // Start with the whole 4K page as the candidate code-free range;
    // it is shrunk below as nearby blocks are found
    inv_code_start=addr_main&~0xfff;
    inv_code_end=addr_main|0xfff;
    pg1=page;
    if (pg1>0) {
      // must check previous page too because of spans..
      pg1--;
      inv_code_start-=0x1000;
    }
    for(;pg1<=page;pg1++) {
      for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
        u_int start,end;
        get_bounds(head->addr,&start,&end);
        if(ram_offset) {
          start-=ram_offset;
          end-=ram_offset;
        }
        if(start<=addr_main&&addr_main<end) {
          // Hit: this block's source range contains the written address
          if(start<addr_min) addr_min=start;
          if(end>addr_max) addr_max=end;
        }
        else if(addr_main<start) {
          // Block lies above the write; clip the code-free range
          if(start<inv_code_end)
            inv_code_end=start-1;
        }
        else {
          // Block lies below the write; clip the code-free range
          if(end>inv_code_start)
            inv_code_start=end;
        }
      }
    }
    if (addr_min!=~0) {
      // At least one block was hit: invalidate the whole hit range
      inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
      inv_code_start=inv_code_end=~0;
      invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
      return;
    }
    else {
      // Miss: publish the code-free range (rebased to the caller's
      // address space) so subsequent writes there can be skipped
      inv_code_start=(addr&~mask)|(inv_code_start&mask);
      inv_code_end=(addr&~mask)|(inv_code_end&mask);
      inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
      return;
    }
  }
  // Non-RAM pages: fall back to whole-block invalidation
  invalidate_block(addr>>12);
}
1096
1097 // This is called when loading a save state.
1098 // Anything could have changed, so invalidate everything.
1099 void invalidate_all_pages()
1100 {
1101   u_int page;
1102   for(page=0;page<4096;page++)
1103     invalidate_page(page);
1104   for(page=0;page<1048576;page++)
1105     if(!invalid_code[page]) {
1106       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1107       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1108     }
1109   #ifdef USE_MINI_HT
1110   memset(mini_ht,-1,sizeof(mini_ht));
1111   #endif
1112 }
1113
// Add an entry to jump_out after making a link
// 'src' points into the translation cache at the branch that was just
// linked to guest address 'vaddr'; recording it here lets
// invalidate_page() unlink the jump later.
void add_link(u_int vaddr,void *src)
{
  u_int page=get_page(vaddr);
  inv_debug("add_link: %p -> %x (%d)\n",src,vaddr,page);
  int *ptr=(int *)(src+4);
  // Sanity check: the word after the branch looks like an ARM
  // pc-relative LDR (encoding 0x.59f....) — ARM-specific assumption
  assert((*ptr&0x0fff0000)==0x059f0000);
  (void)ptr;
  ll_add(jump_out+page,vaddr,src);
  //void *ptr=get_pointer(src);
  //inv_debug("add_link: Pointer is to %p\n",ptr);
}
1126
// If a code block was found to be unmodified (bit was set in
// restore_candidate) and it remains unmodified (bit is clear
// in invalid_code) then move the entries for that 4K page from
// the dirty list to the clean list.
void clean_blocks(u_int page)
{
  struct ll_entry *head;
  inv_debug("INV: clean_blocks page=%d\n",page);
  head=jump_dirty[page];
  while(head!=NULL) {
    if(!invalid_code[head->vaddr>>12]) {
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr)) {
        u_int start,end;
        if(verify_dirty(head->addr)) {
          //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr);
          u_int i;
          u_int inv=0;
          get_bounds(head->addr,&start,&end);
          if(start-(u_int)rdram<RAM_SIZE) {
            // Source lies in RAM: every page the block was compiled
            // from must still be valid or it cannot be restored
            for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
              inv|=invalid_code[i];
            }
          }
          else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
            // Source outside RAM: never restore
            inv=1;
          }
          if(!inv) {
            void *clean_addr = get_clean_addr(head->addr);
            if (doesnt_expire_soon(clean_addr)) {
              u_int ppage=page;
              inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr);
              //printf("page=%x, addr=%x\n",page,head->vaddr);
              //assert(head->vaddr>>12==(page|0x80000));
              // Re-add to the clean list and patch any hash table
              // entries to point at the clean entry point
              ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
              struct ht_entry *ht_bin = hash_table_get(head->vaddr);
              if (ht_bin->vaddr[0] == head->vaddr)
                ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
              if (ht_bin->vaddr[1] == head->vaddr)
                ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
            }
          }
        }
      }
    }
    head=head->next;
  }
}
1175
1176
1177 void mov_alloc(struct regstat *current,int i)
1178 {
1179   // Note: Don't need to actually alloc the source registers
1180   if((~current->is32>>rs1[i])&1) {
1181     //alloc_reg64(current,i,rs1[i]);
1182     alloc_reg64(current,i,rt1[i]);
1183     current->is32&=~(1LL<<rt1[i]);
1184   } else {
1185     //alloc_reg(current,i,rs1[i]);
1186     alloc_reg(current,i,rt1[i]);
1187     current->is32|=(1LL<<rt1[i]);
1188   }
1189   clear_const(current,rs1[i]);
1190   clear_const(current,rt1[i]);
1191   dirty_reg(current,rt1[i]);
1192 }
1193
// Register allocation for shift-by-immediate instructions
// (SLL/SRL/SRA and the 64-bit DSLL.. variants).
void shiftimm_alloc(struct regstat *current,int i)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
      else lt1[i]=rs1[i];
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i];
      dirty_reg(current,rt1[i]);
      // Constant-propagate through the shift when the source value is
      // known at compile time (0x00=SLL, 0x02=SRL, 0x03=SRA)
      if(is_const(current,rs1[i])) {
        int v=get_const(current,rs1[i]);
        if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
        if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
        if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
      }
      else clear_const(current,rt1[i]);
    }
  }
  else
  {
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }

  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    if(rt1[i]) {
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
      alloc_reg64(current,i,rt1[i]);
      current->is32&=~(1LL<<rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    if(rt1[i]) {
      if(rs1[i]) alloc_reg(current,i,rs1[i]);
      alloc_reg64(current,i,rt1[i]);
      current->is32&=~(1LL<<rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    if(rt1[i]) {
      alloc_reg64(current,i,rs1[i]);
      // Shifting right by exactly 32 keeps the (old) upper word, so
      // the result may still need 64 bits; otherwise it fits in 32
      if(imm[i]==32) {
        alloc_reg64(current,i,rt1[i]);
        current->is32&=~(1LL<<rt1[i]);
      } else {
        alloc_reg(current,i,rt1[i]);
        current->is32|=1LL<<rt1[i];
      }
      dirty_reg(current,rt1[i]);
    }
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    if(rt1[i]) {
      alloc_reg64(current,i,rs1[i]);
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i];
      dirty_reg(current,rt1[i]);
    }
  }
}
1261
// Register allocation for shift-by-register instructions
// (SLLV/SRLV/SRAV and 64-bit DSLLV/DSRLV/DSRAV).
void shift_alloc(struct regstat *current,int i)
{
  if(rt1[i]) {
    if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
    {
      if(rs1[i]) alloc_reg(current,i,rs1[i]);
      if(rs2[i]) alloc_reg(current,i,rs2[i]);
      alloc_reg(current,i,rt1[i]);
      // Destination aliases the shift-amount register: need a scratch
      if(rt1[i]==rs2[i]) {
        alloc_reg_temp(current,i,-1);
        minimum_free_regs[i]=1;
      }
      current->is32|=1LL<<rt1[i];
    } else { // DSLLV/DSRLV/DSRAV
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
      if(rs2[i]) alloc_reg(current,i,rs2[i]);
      alloc_reg64(current,i,rt1[i]);
      current->is32&=~(1LL<<rt1[i]);
      if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
      {
        alloc_reg_temp(current,i,-1);
        minimum_free_regs[i]=1;
      }
    }
    clear_const(current,rs1[i]);
    clear_const(current,rs2[i]);
    clear_const(current,rt1[i]);
    dirty_reg(current,rt1[i]);
  }
}
1292
// Register allocation for 3-operand ALU instructions
// (ADD/SUB family, SLT/SLTU, logical ops, and 64-bit DADD/DSUB family).
void alu_alloc(struct regstat *current,int i)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else {
        // One operand is r0: only allocate sources still needed later
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
    }
    // NOTE(review): runs even when rt1[i]==0 (sets bit 0 of is32);
    // presumably harmless since r0 is hardwired — confirm
    current->is32|=1LL<<rt1[i];
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      // 64-bit compare needed unless both sources are known 32-bit
      if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
      {
        alloc_reg64(current,i,rs1[i]);
        alloc_reg64(current,i,rs2[i]);
        alloc_reg(current,i,rt1[i]);
      } else {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
        alloc_reg(current,i,rt1[i]);
      }
    }
    current->is32|=1LL<<rt1[i];
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else
      {
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
      // Result is 64-bit unless both sources are known 32-bit
      if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
      {
        // Allocate the upper half unless it is known to be unneeded
        if(!((current->uu>>rt1[i])&1)) {
          alloc_reg64(current,i,rt1[i]);
        }
        if(get_reg(current->regmap,rt1[i]|64)>=0) {
          if(rs1[i]&&rs2[i]) {
            alloc_reg64(current,i,rs1[i]);
            alloc_reg64(current,i,rs2[i]);
          }
          else
          {
            // Is it really worth it to keep 64-bit values in registers?
            #ifdef NATIVE_64BIT
            if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
            if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
            #endif
          }
        }
        current->is32&=~(1LL<<rt1[i]);
      } else {
        current->is32|=1LL<<rt1[i];
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        // Full 64-bit op if the upper half of the result is needed
        if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
          alloc_reg64(current,i,rs1[i]);
          alloc_reg64(current,i,rs2[i]);
          alloc_reg64(current,i,rt1[i]);
        } else {
          alloc_reg(current,i,rs1[i]);
          alloc_reg(current,i,rs2[i]);
          alloc_reg(current,i,rt1[i]);
        }
      }
      else {
        alloc_reg(current,i,rt1[i]);
        if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
          // DADD used as move, or zeroing
          // If we have a 64-bit source, then make the target 64 bits too
          if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
            if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
            alloc_reg64(current,i,rt1[i]);
          } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
            if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
            alloc_reg64(current,i,rt1[i]);
          }
          if(opcode2[i]>=0x2e&&rs2[i]) {
            // DSUB used as negation - 64-bit result
            // If we have a 32-bit register, extend it to 64 bits
            if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
            alloc_reg64(current,i,rt1[i]);
          }
        }
      }
      // Track 32-bit-ness of the result: a copy of a 32-bit source
      // stays 32-bit, anything else is treated as 64-bit
      if(rs1[i]&&rs2[i]) {
        current->is32&=~(1LL<<rt1[i]);
      } else if(rs1[i]) {
        current->is32&=~(1LL<<rt1[i]);
        if((current->is32>>rs1[i])&1)
          current->is32|=1LL<<rt1[i];
      } else if(rs2[i]) {
        current->is32&=~(1LL<<rt1[i]);
        if((current->is32>>rs2[i])&1)
          current->is32|=1LL<<rt1[i];
      } else {
        current->is32|=1LL<<rt1[i];
      }
    }
  }
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  clear_const(current,rt1[i]);
  dirty_reg(current,rt1[i]);
}
1414
// Register allocation for immediate-operand instructions
// (DADDI(U), SLTI(U), ANDI/ORI/XORI, ADDI(U), LUI), with constant
// propagation where the source value is known.
void imm16_alloc(struct regstat *current,int i)
{
  if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  else lt1[i]=rs1[i];
  if(rt1[i]) alloc_reg(current,i,rt1[i]);
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    current->is32&=~(1LL<<rt1[i]);
    if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
      // TODO: Could preserve the 32-bit flag if the immediate is zero
      alloc_reg64(current,i,rt1[i]);
      alloc_reg64(current,i,rs1[i]);
    }
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    // Compare needs the full 64-bit source if it isn't known 32-bit
    if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
    current->is32|=1LL<<rt1[i];
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    // ORI/XORI on a 64-bit source keep the upper half; ANDI with a
    // 16-bit immediate always clears it
    if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
      if(rs1[i]!=rt1[i]) {
        if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
        alloc_reg64(current,i,rt1[i]);
        current->is32&=~(1LL<<rt1[i]);
      }
    }
    else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
    // Fold the immediate into a known constant when possible
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
      if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
      if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
    }
    else clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      set_const(current,rt1[i],v+imm[i]);
    }
    else clear_const(current,rt1[i]);
    current->is32|=1LL<<rt1[i];
  }
  else {
    // LUI always produces a known constant
    set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
    current->is32|=1LL<<rt1[i];
  }
  dirty_reg(current,rt1[i]);
}
1467
// Register allocation for load instructions (LB..LW, LWL/LWR, LWU/LD,
// LDL/LDR). Handles the dummy-load case where the destination is r0
// or the loaded value is never used.
void load_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
  if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  if(rt1[i]&&!((current->u>>rt1[i])&1)) {
    alloc_reg(current,i,rt1[i]);
    assert(get_reg(current->regmap,rt1[i])>=0);
    if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
    {
      current->is32&=~(1LL<<rt1[i]);
      alloc_reg64(current,i,rt1[i]);
    }
    else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      current->is32&=~(1LL<<rt1[i]);
      alloc_reg64(current,i,rt1[i]);
      // Unaligned 64-bit loads need every register available
      alloc_all(current,i);
      alloc_reg64(current,i,FTEMP);
      minimum_free_regs[i]=HOST_REGS;
    }
    else current->is32|=1LL<<rt1[i];
    dirty_reg(current,rt1[i]);
    // LWL/LWR need a temporary register for the old value
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP);
      alloc_reg_temp(current,i,-1);
      minimum_free_regs[i]=1;
    }
  }
  else
  {
    // Load to r0 or unneeded register (dummy load)
    // but we still need a register to calculate the address
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
    }
    alloc_reg_temp(current,i,-1);
    minimum_free_regs[i]=1;
    if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      alloc_all(current,i);
      alloc_reg64(current,i,FTEMP);
      minimum_free_regs[i]=HOST_REGS;
    }
  }
}
1518
// Register allocation for store instructions (SB..SW, SWL/SWR,
// SDL/SDR/SD).
void store_alloc(struct regstat *current,int i)
{
  clear_const(current,rs2[i]);
  if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rs2[i]);
  if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
    alloc_reg64(current,i,rs2[i]);
    if(rs2[i]) alloc_reg(current,i,FTEMP);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else alloc_reg(current,i,INVCP);
  #endif
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
    alloc_reg(current,i,FTEMP);
  }
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1540
// Register allocation for COP1 loads/stores (LWC1/SWC1/LDC1/SDC1).
void c1ls_alloc(struct regstat *current,int i)
{
  //clear_const(current,rs1[i]); // FIXME
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,CSREG); // Status
  alloc_reg(current,i,FTEMP);
  if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
    alloc_reg64(current,i,FTEMP);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
}
1559
// Register allocation for COP2 (GTE) loads/stores (LWC2/SWC2).
void c2ls_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,FTEMP);
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1574
#ifndef multdiv_alloc
// Register allocation for multiply/divide instructions; results go to
// HI/LO. May be overridden by an arch-specific macro/implementation.
void multdiv_alloc(struct regstat *current,int i)
{
  //  case 0x18: MULT
  //  case 0x19: MULTU
  //  case 0x1A: DIV
  //  case 0x1B: DIVU
  //  case 0x1C: DMULT
  //  case 0x1D: DMULTU
  //  case 0x1E: DDIV
  //  case 0x1F: DDIVU
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  if(rs1[i]&&rs2[i])
  {
    if((opcode2[i]&4)==0) // 32-bit
    {
      // Mark HI/LO as needed, then allocate them plus both sources
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      alloc_reg(current,i,HIREG);
      alloc_reg(current,i,LOREG);
      alloc_reg(current,i,rs1[i]);
      alloc_reg(current,i,rs2[i]);
      current->is32|=1LL<<HIREG;
      current->is32|=1LL<<LOREG;
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
    }
    else // 64-bit
    {
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      current->uu&=~(1LL<<HIREG);
      current->uu&=~(1LL<<LOREG);
      alloc_reg64(current,i,HIREG);
      //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
      alloc_reg64(current,i,rs1[i]);
      alloc_reg64(current,i,rs2[i]);
      // 64-bit mul/div is done out of line and clobbers everything
      alloc_all(current,i);
      current->is32&=~(1LL<<HIREG);
      current->is32&=~(1LL<<LOREG);
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
      minimum_free_regs[i]=HOST_REGS;
    }
  }
  else
  {
    // Multiply by zero is zero.
    // MIPS does not have a divide by zero exception.
    // The result is undefined, we return zero.
    alloc_reg(current,i,HIREG);
    alloc_reg(current,i,LOREG);
    current->is32|=1LL<<HIREG;
    current->is32|=1LL<<LOREG;
    dirty_reg(current,HIREG);
    dirty_reg(current,LOREG);
  }
}
#endif
1635
// Register allocation for COP0 instructions (MFC0/MTC0 and the
// TLB/ERET group). All forms may call out, so everything is allocated.
void cop0_alloc(struct regstat *current,int i)
{
  if(opcode2[i]==0) // MFC0
  {
    if(rt1[i]) {
      clear_const(current,rt1[i]);
      alloc_all(current,i);
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i];
      dirty_reg(current,rt1[i]);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
      alloc_all(current,i);
    }
    else {
      alloc_all(current,i); // FIXME: Keep r0
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
  }
  else
  {
    // TLBR/TLBWI/TLBWR/TLBP/ERET
    assert(opcode2[i]==0x10);
    alloc_all(current,i);
  }
  minimum_free_regs[i]=HOST_REGS;
}
1669
// Register allocation for COP1 move instructions
// (MFC1/DMFC1/CFC1 and MTC1/DMTC1/CTC1); also used for COP2 moves.
void cop1_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  if(opcode2[i]<3) // MFC1/DMFC1/CFC1
  {
    if(rt1[i]){
      clear_const(current,rt1[i]);
      if(opcode2[i]==1) {
        alloc_reg64(current,i,rt1[i]); // DMFC1
        current->is32&=~(1LL<<rt1[i]);
      }else{
        alloc_reg(current,i,rt1[i]); // MFC1/CFC1
        current->is32|=1LL<<rt1[i];
      }
      dirty_reg(current,rt1[i]);
    }
    alloc_reg_temp(current,i,-1);
  }
  else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      if(opcode2[i]==5)
        alloc_reg64(current,i,rs1[i]); // DMTC1
      else
        alloc_reg(current,i,rs1[i]); // MTC1/CTC1
      alloc_reg_temp(current,i,-1);
    }
    else {
      // Source is r0: allow allocating it for the transfer
      current->u&=~1LL;
      alloc_reg(current,i,0);
      alloc_reg_temp(current,i,-1);
    }
  }
  minimum_free_regs[i]=1;
}
// Register allocation for FP conversion ops: just the coprocessor
// status register and a scratch register.
void fconv_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
// Register allocation for FP arithmetic ops: same needs as fconv_alloc
// (status register plus one scratch register).
void float_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
// Register allocation for GTE (COP2) operations: only a scratch
// register is needed.
void c2op_alloc(struct regstat *current,int i)
{
  alloc_reg_temp(current,i,-1);
}
// Register allocation for FP compare ops: status register, the FP
// condition flag register (which the compare writes), and a scratch.
void fcomp_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  alloc_reg(current,i,FSREG); // Load flags
  dirty_reg(current,FSREG); // Flag will be modified
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1730
// Register allocation for SYSCALL/BREAK: the exception path calls out,
// so the cycle counter plus every host register must be available.
void syscall_alloc(struct regstat *current,int i)
{
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  alloc_all(current,i);
  minimum_free_regs[i]=HOST_REGS;
  current->isconst=0;
}
1739
// Register allocation for the instruction in a branch delay slot:
// dispatch on the decoded instruction type to the per-type allocator.
void delayslot_alloc(struct regstat *current,int i)
{
  switch(itype[i]) {
    case UJUMP:
    case CJUMP:
    case SJUMP:
    case RJUMP:
    case FJUMP:
    case SYSCALL:
    case HLECALL:
    case SPAN:
      // A branch in a delay slot is invalid; disable speculative
      // precompilation past calls and continue without allocating
      assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
      SysPrintf("Disabled speculative precompilation\n");
      stop_after_jal=1;
      break;
    case IMM16:
      imm16_alloc(current,i);
      break;
    case LOAD:
    case LOADLR:
      load_alloc(current,i);
      break;
    case STORE:
    case STORELR:
      store_alloc(current,i);
      break;
    case ALU:
      alu_alloc(current,i);
      break;
    case SHIFT:
      shift_alloc(current,i);
      break;
    case MULTDIV:
      multdiv_alloc(current,i);
      break;
    case SHIFTIMM:
      shiftimm_alloc(current,i);
      break;
    case MOV:
      mov_alloc(current,i);
      break;
    case COP0:
      cop0_alloc(current,i);
      break;
    case COP1:
    case COP2:
      cop1_alloc(current,i);
      break;
    case C1LS:
      c1ls_alloc(current,i);
      break;
    case C2LS:
      c2ls_alloc(current,i);
      break;
    case FCONV:
      fconv_alloc(current,i);
      break;
    case FLOAT:
      float_alloc(current,i);
      break;
    case FCOMP:
      fcomp_alloc(current,i);
      break;
    case C2OP:
      c2op_alloc(current,i);
      break;
  }
}
1808
// Special case where a branch and delay slot span two pages in virtual memory
// Everything is allocated since the block ends here; per-branch-type
// source/destination registers are then pinned on top of that.
static void pagespan_alloc(struct regstat *current,int i)
{
  current->isconst=0;
  current->wasconst=0;
  regs[i].wasconst=0;
  minimum_free_regs[i]=HOST_REGS;
  alloc_all(current,i);
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  if(opcode[i]==3) // JAL
  {
    // Link register r31 gets the return address
    alloc_reg(current,i,31);
    dirty_reg(current,31);
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    alloc_reg(current,i,rs1[i]);
    if (rt1[i]!=0) {
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(rs2[i]) alloc_reg(current,i,rs2[i]);
    // 64-bit compare unless both operands are known 32-bit
    if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
    {
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
      if(rs2[i]) alloc_reg64(current,i,rs2[i]);
    }
  }
  else
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(!((current->is32>>rs1[i])&1))
    {
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
    }
  }
  else
  if(opcode[i]==0x11) // BC1
  {
    // FP branch reads the condition flag and coprocessor status
    alloc_reg(current,i,FSREG);
    alloc_reg(current,i,CSREG);
  }
  //else ...
}
1859
1860 static void add_stub(enum stub_type type, void *addr, void *retaddr,
1861   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
1862 {
1863   assert(a < ARRAY_SIZE(stubs));
1864   stubs[stubcount].type = type;
1865   stubs[stubcount].addr = addr;
1866   stubs[stubcount].retaddr = retaddr;
1867   stubs[stubcount].a = a;
1868   stubs[stubcount].b = b;
1869   stubs[stubcount].c = c;
1870   stubs[stubcount].d = d;
1871   stubs[stubcount].e = e;
1872   stubcount++;
1873 }
1874
// Convenience wrapper around add_stub for stubs parameterized by
// (instruction index, address register, register state, cycle
// adjustment, live-register list).
static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
  int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist)
{
  add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
}
1880
1881 // Write out a single register
1882 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1883 {
1884   int hr;
1885   for(hr=0;hr<HOST_REGS;hr++) {
1886     if(hr!=EXCLUDE_REG) {
1887       if((regmap[hr]&63)==r) {
1888         if((dirty>>hr)&1) {
1889           if(regmap[hr]<64) {
1890             emit_storereg(r,hr);
1891           }else{
1892             emit_storereg(r|64,hr);
1893           }
1894         }
1895       }
1896     }
1897   }
1898 }
1899
1900 int mchecksum()
1901 {
1902   int i;
1903   int sum=0;
1904   for(i=0;i<2097152;i++) {
1905     unsigned int temp=sum;
1906     sum<<=1;
1907     sum|=(~temp)>>31;
1908     sum^=((u_int *)rdram)[i];
1909   }
1910   return sum;
1911 }
1912 int rchecksum()
1913 {
1914   int i;
1915   int sum=0;
1916   for(i=0;i<64;i++)
1917     sum^=((u_int *)reg)[i];
1918   return sum;
1919 }
1920 void rlist()
1921 {
1922   int i;
1923   printf("TRACE: ");
1924   for(i=0;i<32;i++)
1925     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1926   printf("\n");
1927 }
1928
/* Emit host code for a MIPS register-register ALU instruction at decoded
 * index i.  opcode2[] holds the SPECIAL function field, rs1[]/rs2[] the
 * guest source register numbers, rt1[] the destination, and i_regs the
 * host register allocation for this instruction.
 *
 * Conventions used throughout:
 *  - guest register 0 is hard-wired zero, so rs*==0 selects a constant
 *    zero operand and rt1==0 means the result is discarded (no code);
 *  - get_reg() returns -1 when the guest register has no host register;
 *  - guest register number |64 addresses the upper 32 bits of a 64-bit
 *    guest register (th/sh "high" halves below). */
void alu_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      signed char s1,s2,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      if(t>=0) {
        s1=get_reg(i_regs->regmap,rs1[i]);
        s2=get_reg(i_regs->regmap,rs2[i]);
        if(rs1[i]&&rs2[i]) {
          // both operands live: bit 1 of the function code selects SUB
          assert(s1>=0);
          assert(s2>=0);
          if(opcode2[i]&2) emit_sub(s1,s2,t);
          else emit_add(s1,s2,t);
        }
        else if(rs1[i]) {
          // rs2 is r0: rt = rs1 (+/- 0), i.e. a plain move
          if(s1>=0) emit_mov(s1,t);
          else emit_loadreg(rs1[i],t);
        }
        else if(rs2[i]) {
          // rs1 is r0: rt = 0 - rs2 for SUB, rt = rs2 for ADD
          if(s2>=0) {
            if(opcode2[i]&2) emit_neg(s2,t);
            else emit_mov(s2,t);
          }
          else {
            emit_loadreg(rs2[i],t);
            if(opcode2[i]&2) emit_neg(t,t);
          }
        }
        else emit_zeroreg(t); // both operands are r0
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    if(rt1[i]) {
      signed char s1l,s2l,s1h,s2h,tl,th;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      if(tl>=0) {
        s1l=get_reg(i_regs->regmap,rs1[i]);
        s2l=get_reg(i_regs->regmap,rs2[i]);
        s1h=get_reg(i_regs->regmap,rs1[i]|64);
        s2h=get_reg(i_regs->regmap,rs2[i]|64);
        if(rs1[i]&&rs2[i]) {
          assert(s1l>=0);
          assert(s2l>=0);
          // low word sets the carry/borrow flag for the high word
          if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
          else emit_adds(s1l,s2l,tl);
          if(th>=0) {
            #ifdef INVERTED_CARRY
            if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
            #else
            if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
            #endif
            // NOTE(review): the add path uses emit_add, which appears to
            // drop the carry produced by emit_adds above (an emit_adc
            // would be expected for a 64-bit add) — confirm; likely
            // unreachable on PSX since the R3000 has no 64-bit ALU ops.
            else emit_add(s1h,s2h,th);
          }
        }
        else if(rs1[i]) {
          // rs2 is r0: 64-bit move of rs1 into rt
          if(s1l>=0) emit_mov(s1l,tl);
          else emit_loadreg(rs1[i],tl);
          if(th>=0) {
            if(s1h>=0) emit_mov(s1h,th);
            else emit_loadreg(rs1[i]|64,th);
          }
        }
        else if(rs2[i]) {
          // rs1 is r0: 64-bit negate (DSUB) or move (DADD) of rs2
          if(s2l>=0) {
            if(opcode2[i]&2) emit_negs(s2l,tl);
            else emit_mov(s2l,tl);
          }
          else {
            emit_loadreg(rs2[i],tl);
            if(opcode2[i]&2) emit_negs(tl,tl);
          }
          if(th>=0) {
            #ifdef INVERTED_CARRY
            if(s2h>=0) emit_mov(s2h,th);
            else emit_loadreg(rs2[i]|64,th);
            if(opcode2[i]&2) {
              emit_adcimm(-1,th); // x86 has inverted carry flag
              emit_not(th,th);
            }
            #else
            if(opcode2[i]&2) {
              // high word of 0 - rs2: reverse-subtract-with-carry from 0
              if(s2h>=0) emit_rscimm(s2h,0,th);
              else {
                emit_loadreg(rs2[i]|64,th);
                emit_rscimm(th,0,th);
              }
            }else{
              if(s2h>=0) emit_mov(s2h,th);
              else emit_loadreg(rs2[i]|64,th);
            }
            #endif
          }
        }
        else {
          // both operands are r0: result is zero
          emit_zeroreg(tl);
          if(th>=0) emit_zeroreg(th);
        }
      }
    }
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      signed char s1l,s1h,s2l,s2h,t;
      // was32 tracks which guest regs are known 32-bit; if either source
      // may be 64-bit, do a full 64-bit compare, else the 32-bit path below
      if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
      {
        t=get_reg(i_regs->regmap,rt1[i]);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s1h=get_reg(i_regs->regmap,rs1[i]|64);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          s2h=get_reg(i_regs->regmap,rs2[i]|64);
          if(rs2[i]==0) // rx<r0
          {
            assert(s1h>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_shrimm(s1h,31,t); // sign bit of the high word
            else // SLTU (unsigned can not be less than zero)
              emit_zeroreg(t);
          }
          else if(rs1[i]==0) // r0<rx
          {
            assert(s2h>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_gz64_32(s2h,s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz64_32(s2h,s2l,t);
          }
          else {
            assert(s1l>=0);assert(s1h>=0);
            assert(s2l>=0);assert(s2h>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
            else // SLTU
              emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
          }
        }
      } else {
        // both sources known 32-bit: single-word compare
        t=get_reg(i_regs->regmap,rt1[i]);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs2[i]==0) // rx<r0
          {
            assert(s1l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_shrimm(s1l,31,t);
            else // SLTU (unsigned can not be less than zero)
              emit_zeroreg(t);
          }
          else if(rs1[i]==0) // r0<rx
          {
            assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_gz32(s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz32(s2l,t);
          }
          else{
            assert(s1l>=0);assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_if_less32(s1l,s2l,t);
            else // SLTU
              emit_set_if_carry32(s1l,s2l,t);
          }
        }
      }
    }
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      signed char s1l,s1h,s2l,s2h,th,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      // 64-bit path only when a source may be 64-bit AND the high half of
      // the destination has a host register
      if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
      {
        assert(tl>=0);
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s1h=get_reg(i_regs->regmap,rs1[i]|64);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          s2h=get_reg(i_regs->regmap,rs2[i]|64);
          if(rs1[i]&&rs2[i]) {
            assert(s1l>=0);assert(s1h>=0);
            assert(s2l>=0);assert(s2h>=0);
            if(opcode2[i]==0x24) { // AND
              emit_and(s1l,s2l,tl);
              emit_and(s1h,s2h,th);
            } else
            if(opcode2[i]==0x25) { // OR
              emit_or(s1l,s2l,tl);
              emit_or(s1h,s2h,th);
            } else
            if(opcode2[i]==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
              emit_xor(s1h,s2h,th);
            } else
            if(opcode2[i]==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_or(s1h,s2h,th);
              emit_not(tl,tl);
              emit_not(th,th);
            }
          }
          else
          {
            // at least one operand is r0: fold the identity
            if(opcode2[i]==0x24) { // AND
              emit_zeroreg(tl);
              emit_zeroreg(th);
            } else
            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
              if(rs1[i]){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(rs1[i],tl);
                if(s1h>=0) emit_mov(s1h,th);
                else emit_loadreg(rs1[i]|64,th);
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(rs2[i],tl);
                if(s2h>=0) emit_mov(s2h,th);
                else emit_loadreg(rs2[i]|64,th);
              }
              else{
                emit_zeroreg(tl);
                emit_zeroreg(th);
              }
            } else
            if(opcode2[i]==0x27) { // NOR
              if(rs1[i]){
                if(s1l>=0) emit_not(s1l,tl);
                else{
                  emit_loadreg(rs1[i],tl);
                  emit_not(tl,tl);
                }
                if(s1h>=0) emit_not(s1h,th);
                else{
                  emit_loadreg(rs1[i]|64,th);
                  emit_not(th,th);
                }
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_not(s2l,tl);
                else{
                  emit_loadreg(rs2[i],tl);
                  emit_not(tl,tl);
                }
                if(s2h>=0) emit_not(s2h,th);
                else{
                  emit_loadreg(rs2[i]|64,th);
                  emit_not(th,th);
                }
              }
              else {
                // NOR of r0,r0 is all ones
                emit_movimm(-1,tl);
                emit_movimm(-1,th);
              }
            }
          }
        }
      }
      else
      {
        // 32 bit
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs1[i]&&rs2[i]) {
            assert(s1l>=0);
            assert(s2l>=0);
            if(opcode2[i]==0x24) { // AND
              emit_and(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x25) { // OR
              emit_or(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_not(tl,tl);
            }
          }
          else
          {
            if(opcode2[i]==0x24) { // AND
              emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
              if(rs1[i]){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
              }
              else emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              if(rs1[i]){
                if(s1l>=0) emit_not(s1l,tl);
                else {
                  emit_loadreg(rs1[i],tl);
                  emit_not(tl,tl);
                }
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_not(s2l,tl);
                else {
                  emit_loadreg(rs2[i],tl);
                  emit_not(tl,tl);
                }
              }
              else emit_movimm(-1,tl);
            }
          }
        }
      }
    }
  }
}
2260
/* Emit host code for MIPS I-type immediate ALU instructions (LUI,
 * ADDI/ADDIU, DADDI/DADDIU, SLTI/SLTIU, ANDI/ORI/XORI) at decoded
 * index i.  imm[] holds the decoded 16-bit immediate; i_regs is the
 * host register allocation.  rt1==0 (writes to r0) emits nothing.
 *
 * Constant propagation hooks:
 *  - isconst bit set on the target host reg means the result is already
 *    a known constant and no code needs to be emitted;
 *  - wasconst bit set on the source host reg means the source value is
 *    known (constmap[i][s]), so the operation is folded at compile time
 *    into a single emit_movimm. */
void imm16_assemble(int i,struct regstat *i_regs)
{
  if (opcode[i]==0x0f) { // LUI
    if(rt1[i]) {
      signed char t;
      t=get_reg(i_regs->regmap,rt1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(!((i_regs->isconst>>t)&1))
          emit_movimm(imm[i]<<16,t);
      }
    }
  }
  if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      if(rs1[i]) {
        //assert(t>=0);
        //assert(s>=0);
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1)) {
            if(s<0) {
              // source not in a host reg: reload into t, then add in place
              if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
              emit_addimm(t,imm[i],t);
            }else{
              if(!((i_regs->wasconst>>s)&1))
                emit_addimm(s,imm[i],t);
              else
                emit_movimm(constmap[i][s]+imm[i],t); // constant-folded
            }
          }
        }
      } else {
        // rs1 is r0: result is just the immediate
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1))
            emit_movimm(imm[i],t);
        }
      }
    }
  }
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]) {
          assert(sh>=0);
          assert(sl>=0);
          if(th>=0) {
            emit_addimm64_32(sh,sl,imm[i],th,tl);
          }
          else {
            emit_addimm(sl,imm[i],tl);
          }
        } else {
          // rs1 is r0: immediate, sign-extended into the high word
          emit_movimm(imm[i],tl);
          if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
        }
      }
    }
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    if(rt1[i]) {
      //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
      signed char sh,sl,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(rs1[i]>0) {
          // 32-bit path when the high half is unallocated or the source
          // is known 32-bit; otherwise do a 64-bit compare
          if(sh<0) assert((i_regs->was32>>rs1[i])&1);
          if(sh<0||((i_regs->was32>>rs1[i])&1)) {
            if(opcode[i]==0x0a) { // SLTI
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_slti32(t,imm[i],t);
              }else{
                emit_slti32(sl,imm[i],t);
              }
            }
            else { // SLTIU
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_sltiu32(t,imm[i],t);
              }else{
                emit_sltiu32(sl,imm[i],t);
              }
            }
          }else{ // 64-bit
            assert(sl>=0);
            if(opcode[i]==0x0a) // SLTI
              emit_slti64_32(sh,sl,imm[i],t);
            else // SLTIU
              emit_sltiu64_32(sh,sl,imm[i],t);
          }
        }else{
          // SLTI(U) with r0 is just stupid,
          // nonetheless examples can be found
          // (fragile brace-less nesting: the trailing else binds to the
          // outer if, which is the intended parse)
          if(opcode[i]==0x0a) // SLTI
            if(0<imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          else // SLTIU
          {
            if(imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          }
        }
      }
    }
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
        if(opcode[i]==0x0c) //ANDI
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
              emit_andimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_andimm(sl,imm[i],tl);
              else
                emit_movimm(constmap[i][sl]&imm[i],tl); // constant-folded
            }
          }
          else
            emit_zeroreg(tl);
          // zero-extended immediate clears the whole high word for ANDI
          if(th>=0) emit_zeroreg(th);
        }
        else
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
            }
            // OR/XOR with a 16-bit immediate leaves the high word as-is
            if(th>=0) {
              if(sh<0) {
                emit_loadreg(rs1[i]|64,th);
              }else{
                emit_mov(sh,th);
              }
            }
            if(opcode[i]==0x0d) { // ORI
              if(sl<0) {
                emit_orimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_orimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]|imm[i],tl); // constant-folded
              }
            }
            if(opcode[i]==0x0e) { // XORI
              if(sl<0) {
                emit_xorimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_xorimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]^imm[i],tl); // constant-folded
              }
            }
          }
          else {
            // rs1 is r0: result is the (zero-extended) immediate
            emit_movimm(imm[i],tl);
            if(th>=0) emit_zeroreg(th);
          }
        }
      }
    }
  }
}
2445
/* Emit host code for MIPS shift-by-immediate instructions at decoded
 * index i: 32-bit SLL/SRL/SRA and the 64-bit DSLL/DSRL/DSRA family,
 * including the DSLL32/DSRL32/DSRA32 variants that shift by imm+32.
 * imm[] holds the 5-bit shift amount; i_regs is the host register
 * allocation.  rt1==0 emits nothing; rs1==0 produces a zero result. */
void shiftimm_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0&&!((i_regs->isconst>>t)&1)){
        if(rs1[i]==0)
        {
          emit_zeroreg(t);
        }
        else
        {
          // reload the source into t if it has no host reg and t does not
          // already hold it from block entry
          if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
          if(imm[i]) {
            if(opcode2[i]==0) // SLL
            {
              emit_shlimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==2) // SRL
            {
              emit_shrimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==3) // SRA
            {
              emit_sarimm(s<0?t:s,imm[i],t);
            }
          }else{
            // Shift by zero
            if(s>=0 && s!=t) emit_mov(s,t);
          }
        }
      }
      //emit_storereg(rt1[i],t); //DEBUG
    }
  }
  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]==0)
        {
          emit_zeroreg(tl);
          if(th>=0) emit_zeroreg(th);
        }
        else
        {
          assert(sl>=0);
          assert(sh>=0);
          if(imm[i]) {
            // double-width shifts: the *dimm emitters combine both
            // source halves to produce one destination half
            if(opcode2[i]==0x38) // DSLL
            {
              if(th>=0) emit_shldimm(sh,sl,imm[i],th);
              emit_shlimm(sl,imm[i],tl);
            }
            if(opcode2[i]==0x3a) // DSRL
            {
              emit_shrdimm(sl,sh,imm[i],tl);
              if(th>=0) emit_shrimm(sh,imm[i],th);
            }
            if(opcode2[i]==0x3b) // DSRA
            {
              emit_shrdimm(sl,sh,imm[i],tl);
              if(th>=0) emit_sarimm(sh,imm[i],th);
            }
          }else{
            // Shift by zero
            if(sl!=tl) emit_mov(sl,tl);
            if(th>=0&&sh!=th) emit_mov(sh,th);
          }
        }
      }
    }
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    if(rt1[i]) {
      signed char sl,tl,th;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(th>=0||tl>=0){
        assert(tl>=0);
        assert(th>=0);
        assert(sl>=0);
        // shift by 32+n: low source word becomes the high result word,
        // low result word is zero, then shift the remainder n bits
        emit_mov(sl,th);
        emit_zeroreg(tl);
        if(imm[i]>32)
        {
          emit_shlimm(th,imm[i]&31,th);
        }
      }
    }
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    if(rt1[i]) {
      signed char sh,tl,th;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      if(tl>=0){
        assert(sh>=0);
        // high source word becomes the low result word, high result zero
        emit_mov(sh,tl);
        if(th>=0) emit_zeroreg(th);
        if(imm[i]>32)
        {
          emit_shrimm(tl,imm[i]&31,tl);
        }
      }
    }
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    if(rt1[i]) {
      signed char sh,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      if(tl>=0){
        assert(sh>=0);
        // arithmetic variant: only the 32-bit low result is produced here
        emit_mov(sh,tl);
        if(imm[i]>32)
        {
          emit_sarimm(tl,imm[i]&31,tl);
        }
      }
    }
  }
}
2583
#ifndef shift_assemble
/* Fallback when no architecture-specific shift_assemble is provided.
 * Emitting variable shifts is mandatory for the dynarec, so reaching
 * this stub is a fatal build/configuration error; report on stderr
 * (not stdout) and abort. */
void shift_assemble(int i,struct regstat *i_regs)
{
  fprintf(stderr,"Need shift_assemble for this architecture.\n");
  exit(1);
}
#endif
2591
/* Emit host code for MIPS load instructions (LB/LH/LW/LBU/LHU/LWU/LD)
 * at decoded index i, using the host register allocation in i_regs.
 *
 * Key locals:
 *  - c/memtarget: when the base register's value is a known constant
 *    (wasconst), c is set and memtarget says whether the effective
 *    address falls inside RAM, allowing the fastpath to be emitted
 *    unconditionally (or the access to be inlined as a read stub);
 *  - jaddr: branch target of the emitted fastpath address check; a
 *    per-type stub (add_stub_r) is registered to handle the slow path;
 *  - dummy: loads to r0 (or to a value not actually needed) still
 *    perform the memory access (could be FIFO/I/O with side effects)
 *    but skip writing the result;
 *  - map: RAM translation register when RAM_OFFSET is in use, -1 for a
 *    direct mapping;
 *  - fastload_reg_override: alternative address register provided by
 *    the fastpath compare or the ram_offset adjustment. */
void load_assemble(int i,struct regstat *i_regs)
{
  int s,th,tl,addr,map=-1;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastload_reg_override=0;
  u_int hr,reglist=0;
  th=get_reg(i_regs->regmap,rt1[i]|64);
  tl=get_reg(i_regs->regmap,rt1[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  offset=imm[i];
  // reglist: host registers live in this instruction (for stub save/restore)
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1;
    if (c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  // FIXME: Even if the load is a NOP, we should check for pagefaults...
  // 0x1f80xxxx is the PSX hardware-register window: such reads must not
  // be skipped even when the result is unused
  if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
    ||rt1[i]==0) {
      // could be FIFO, must perform the read
      // ||dummy read
      assem_debug("(forced read)\n");
      tl=get_reg(i_regs->regmap,-1); // borrow a scratch host register
      assert(tl>=0);
  }
  // address: use the base register directly only when no offset/constant
  // arithmetic is needed; otherwise compute into the target register
  if(offset||s<0||c) addr=tl;
  else addr=s;
  //if(tl<0) tl=get_reg(i_regs->regmap,-1);
 if(tl>=0) {
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
  reglist&=~(1<<tl);
  if(th>=0) reglist&=~(1<<th);
  if(!c) {
    #ifdef RAM_OFFSET
    map=get_reg(i_regs->regmap,ROREG);
    if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
    #endif
    #ifdef R29_HACK
    // Strmnnrmn's speed hack
    if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
    #endif
    {
      // emit the RAM-range check; jaddr jumps to the slow-path stub
      jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
    }
  }
  else if(ram_offset&&memtarget) {
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastload_reg_override=HOST_TEMPREG;
  }
  int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
  if (opcode[i]==0x20) { // LB
    if(!c||memtarget) {
      if(!dummy) {
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
        else
        #endif
        {
          //emit_xorimm(addr,3,tl);
          //emit_movsbl_indexed(rdram-0x80000000,tl,tl);
          // x: compile-time byte-swizzle adjustment for big-endian hosts
          int x=0,a=tl;
#ifdef BIG_ENDIAN_MIPS
          if(!c) emit_xorimm(addr,3,tl);
          else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
#else
          if(!c) a=addr;
#endif
          if(fastload_reg_override) a=fastload_reg_override;

          emit_movsbl_indexed_tlb(x,a,map,tl);
        }
      }
      if(jaddr)
        add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      // known out-of-RAM constant address: call the stub inline
      inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x21) { // LH
    if(!c||memtarget) {
      if(!dummy) {
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
        else
        #endif
        {
          int x=0,a=tl;
#ifdef BIG_ENDIAN_MIPS
          if(!c) emit_xorimm(addr,2,tl);
          else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
#else
          if(!c) a=addr;
#endif
          if(fastload_reg_override) a=fastload_reg_override;
          //#ifdef
          //emit_movswl_indexed_tlb(x,tl,map,tl);
          //else
          if(map>=0) {
            emit_movswl_indexed(x,a,tl);
          }else{
            #if 1 //def RAM_OFFSET
            emit_movswl_indexed(x,a,tl);
            #else
            emit_movswl_indexed(rdram-0x80000000+x,a,tl);
            #endif
          }
        }
      }
      if(jaddr)
        add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x23) { // LW
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        //emit_readword_indexed(rdram-0x80000000,addr,tl);
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_readword_tlb(constmap[i][s]+offset,map,tl);
        else
        #endif
        emit_readword_indexed_tlb(0,a,map,tl);
      }
      if(jaddr)
        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x24) { // LBU
    if(!c||memtarget) {
      if(!dummy) {
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
        else
        #endif
        {
          //emit_xorimm(addr,3,tl);
          //emit_movzbl_indexed(rdram-0x80000000,tl,tl);
          int x=0,a=tl;
#ifdef BIG_ENDIAN_MIPS
          if(!c) emit_xorimm(addr,3,tl);
          else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
#else
          if(!c) a=addr;
#endif
          if(fastload_reg_override) a=fastload_reg_override;

          emit_movzbl_indexed_tlb(x,a,map,tl);
        }
      }
      if(jaddr)
        add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x25) { // LHU
    if(!c||memtarget) {
      if(!dummy) {
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
        else
        #endif
        {
          int x=0,a=tl;
#ifdef BIG_ENDIAN_MIPS
          if(!c) emit_xorimm(addr,2,tl);
          else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
#else
          if(!c) a=addr;
#endif
          if(fastload_reg_override) a=fastload_reg_override;
          //#ifdef
          //emit_movzwl_indexed_tlb(x,tl,map,tl);
          //#else
          if(map>=0) {
            emit_movzwl_indexed(x,a,tl);
          }else{
            #if 1 //def RAM_OFFSET
            emit_movzwl_indexed(x,a,tl);
            #else
            emit_movzwl_indexed(rdram-0x80000000+x,a,tl);
            #endif
          }
        }
      }
      if(jaddr)
        add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x27) { // LWU
    // LWU zero-extends: read the word, then clear the high half
    assert(th>=0);
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        //emit_readword_indexed(rdram-0x80000000,addr,tl);
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_readword_tlb(constmap[i][s]+offset,map,tl);
        else
        #endif
        emit_readword_indexed_tlb(0,a,map,tl);
      }
      if(jaddr)
        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else {
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
    }
    emit_zeroreg(th);
  }
  if (opcode[i]==0x37) { // LD
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        //if(th>=0) emit_readword_indexed(rdram-0x80000000,addr,th);
        //emit_readword_indexed(rdram-0x7FFFFFFC,addr,tl);
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
        else
        #endif
        emit_readdword_indexed_tlb(0,a,map,th,tl);
      }
      if(jaddr)
        add_stub_r(LOADD_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
 }
}
2847
#ifndef loadlr_assemble
/* Fallback when no architecture-specific loadlr_assemble (LWL/LWR etc.)
 * is provided.  Reaching this stub is a fatal build/configuration
 * error; report on stderr (not stdout) and abort. */
void loadlr_assemble(int i,struct regstat *i_regs)
{
  fprintf(stderr,"Need loadlr_assemble for this architecture.\n");
  exit(1);
}
#endif
2855
2856 void store_assemble(int i,struct regstat *i_regs)
2857 {
2858   int s,th,tl,map=-1;
2859   int addr,temp;
2860   int offset;
2861   void *jaddr=0;
2862   enum stub_type type;
2863   int memtarget=0,c=0;
2864   int agr=AGEN1+(i&1);
2865   int faststore_reg_override=0;
2866   u_int hr,reglist=0;
2867   th=get_reg(i_regs->regmap,rs2[i]|64);
2868   tl=get_reg(i_regs->regmap,rs2[i]);
2869   s=get_reg(i_regs->regmap,rs1[i]);
2870   temp=get_reg(i_regs->regmap,agr);
2871   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2872   offset=imm[i];
2873   if(s>=0) {
2874     c=(i_regs->wasconst>>s)&1;
2875     if(c) {
2876       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2877     }
2878   }
2879   assert(tl>=0);
2880   assert(temp>=0);
2881   for(hr=0;hr<HOST_REGS;hr++) {
2882     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2883   }
2884   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2885   if(offset||s<0||c) addr=temp;
2886   else addr=s;
2887   if(!c) {
2888     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2889   }
2890   else if(ram_offset&&memtarget) {
2891     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2892     faststore_reg_override=HOST_TEMPREG;
2893   }
2894
2895   if (opcode[i]==0x28) { // SB
2896     if(!c||memtarget) {
2897       int x=0,a=temp;
2898 #ifdef BIG_ENDIAN_MIPS
2899       if(!c) emit_xorimm(addr,3,temp);
2900       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2901 #else
2902       if(!c) a=addr;
2903 #endif
2904       if(faststore_reg_override) a=faststore_reg_override;
2905       //emit_writebyte_indexed(tl,rdram-0x80000000,temp);
2906       emit_writebyte_indexed_tlb(tl,x,a,map,a);
2907     }
2908     type=STOREB_STUB;
2909   }
2910   if (opcode[i]==0x29) { // SH
2911     if(!c||memtarget) {
2912       int x=0,a=temp;
2913 #ifdef BIG_ENDIAN_MIPS
2914       if(!c) emit_xorimm(addr,2,temp);
2915       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2916 #else
2917       if(!c) a=addr;
2918 #endif
2919       if(faststore_reg_override) a=faststore_reg_override;
2920       //#ifdef
2921       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2922       //#else
2923       if(map>=0) {
2924         emit_writehword_indexed(tl,x,a);
2925       }else
2926         //emit_writehword_indexed(tl,rdram-0x80000000+x,a);
2927         emit_writehword_indexed(tl,x,a);
2928     }
2929     type=STOREH_STUB;
2930   }
2931   if (opcode[i]==0x2B) { // SW
2932     if(!c||memtarget) {
2933       int a=addr;
2934       if(faststore_reg_override) a=faststore_reg_override;
2935       //emit_writeword_indexed(tl,rdram-0x80000000,addr);
2936       emit_writeword_indexed_tlb(tl,0,a,map,temp);
2937     }
2938     type=STOREW_STUB;
2939   }
2940   if (opcode[i]==0x3F) { // SD
2941     if(!c||memtarget) {
2942       int a=addr;
2943       if(faststore_reg_override) a=faststore_reg_override;
2944       if(rs2[i]) {
2945         assert(th>=0);
2946         //emit_writeword_indexed(th,rdram-0x80000000,addr);
2947         //emit_writeword_indexed(tl,rdram-0x7FFFFFFC,addr);
2948         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
2949       }else{
2950         // Store zero
2951         //emit_writeword_indexed(tl,rdram-0x80000000,temp);
2952         //emit_writeword_indexed(tl,rdram-0x7FFFFFFC,temp);
2953         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
2954       }
2955     }
2956     type=STORED_STUB;
2957   }
2958   if(jaddr) {
2959     // PCSX store handlers don't check invcode again
2960     reglist|=1<<addr;
2961     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2962     jaddr=0;
2963   }
2964   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2965     if(!c||memtarget) {
2966       #ifdef DESTRUCTIVE_SHIFT
2967       // The x86 shift operation is 'destructive'; it overwrites the
2968       // source register, so we need to make a copy first and use that.
2969       addr=temp;
2970       #endif
2971       #if defined(HOST_IMM8)
2972       int ir=get_reg(i_regs->regmap,INVCP);
2973       assert(ir>=0);
2974       emit_cmpmem_indexedsr12_reg(ir,addr,1);
2975       #else
2976       emit_cmpmem_indexedsr12_imm(invalid_code,addr,1);
2977       #endif
2978       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2979       emit_callne(invalidate_addr_reg[addr]);
2980       #else
2981       void *jaddr2 = out;
2982       emit_jne(0);
2983       add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
2984       #endif
2985     }
2986   }
2987   u_int addr_val=constmap[i][s]+offset;
2988   if(jaddr) {
2989     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2990   } else if(c&&!memtarget) {
2991     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
2992   }
2993   // basic current block modification detection..
2994   // not looking back as that should be in mips cache already
2995   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
2996     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
2997     assert(i_regs->regmap==regs[i].regmap); // not delay slot
2998     if(i_regs->regmap==regs[i].regmap) {
2999       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
3000       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
3001       emit_movimm(start+i*4+4,0);
3002       emit_writeword(0,&pcaddr);
3003       emit_jmp(do_interrupt);
3004     }
3005   }
3006 }
3007
// Assemble an unaligned store: SWL (0x2A) / SWR (0x2E), plus the 64-bit
// SDL (0x2C) / SDR (0x2D) forms.  The generated code tests the low two
// bits of the effective address and branches to one of four cases, each
// emitting the byte/halfword/word stores needed for that alignment.
void storelr_assemble(int i,struct regstat *i_regs)
{
  int s,th,tl;          // host regs: base (s), store value hi/lo (th/tl)
  int temp;             // host reg holding the generated address
  int temp2=-1;         // scratch for the second word of SDL/SDR
  int offset;
  void *jaddr=0;        // slow-path jump to patch via STORELR_STUB
  void *case1, *case2, *case3;   // alignment-case branch sites
  void *done0, *done1, *done2;   // per-case exit jumps
  int memtarget=0,c=0;  // c: address is a known constant; memtarget: it hits RAM
  int agr=AGEN1+(i&1);  // address-generation reg alternates per slot
  u_int hr,reglist=0;
  th=get_reg(i_regs->regmap,rs2[i]|64);
  tl=get_reg(i_regs->regmap,rs2[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    c=(i_regs->isconst>>s)&1;
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  assert(temp>=0);
  if(!c) {
    // Dynamic address: range-check against RAM and fall to the stub on miss.
    emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
    if(!offset&&s!=temp) emit_mov(s,temp);
    jaddr=out;
    emit_jno(0);
  }
  else
  {
    // Constant address known to miss RAM (or base is $zero): always slow path.
    if(!memtarget||!rs1[i]) {
      jaddr=out;
      emit_jmp(0);
    }
  }
  #ifdef RAM_OFFSET
  int map=get_reg(i_regs->regmap,ROREG);
  if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
  #else
  // Translate PSX address to host pointer when rdram is not at 0x80000000.
  if((u_int)rdram!=0x80000000)
    emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
  #endif

  if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
    temp2=get_reg(i_regs->regmap,FTEMP);
    if(!rs2[i]) temp2=th=tl;  // storing zero: reuse the low reg for everything
  }

#ifndef BIG_ENDIAN_MIPS
    // Little-endian host: flip the byte lane selected by the low bits.
    emit_xorimm(temp,3,temp);
#endif
  emit_testimm(temp,2);
  case2=out;
  emit_jne(0);
  emit_testimm(temp,1);
  case1=out;
  emit_jne(0);
  // 0
  if (opcode[i]==0x2A) { // SWL
    emit_writeword_indexed(tl,0,temp);
  }
  if (opcode[i]==0x2E) { // SWR
    emit_writebyte_indexed(tl,3,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    emit_writeword_indexed(th,0,temp);
    if(rs2[i]) emit_mov(tl,temp2);
  }
  if (opcode[i]==0x2D) { // SDR
    emit_writebyte_indexed(tl,3,temp);
    if(rs2[i]) emit_shldimm(th,tl,24,temp2);
  }
  done0=out;
  emit_jmp(0);
  // 1
  set_jump_target(case1, out);
  if (opcode[i]==0x2A) { // SWL
    // Write 3 msb into three least significant bytes
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writebyte_indexed(tl,1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);   // rotations restore tl afterwards
  }
  if (opcode[i]==0x2E) { // SWR
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
    // Write 3 msb into three least significant bytes
    if(rs2[i]) emit_rorimm(th,8,th);
    emit_writehword_indexed(th,-1,temp);
    if(rs2[i]) emit_rorimm(th,16,th);
    emit_writebyte_indexed(th,1,temp);
    if(rs2[i]) emit_rorimm(th,8,th);
  }
  if (opcode[i]==0x2D) { // SDR
    if(rs2[i]) emit_shldimm(th,tl,16,temp2);
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  done1=out;
  emit_jmp(0);
  // 2
  set_jump_target(case2, out);
  emit_testimm(temp,1);
  case3=out;
  emit_jne(0);
  if (opcode[i]==0x2A) { // SWL
    // Write two msb into two least significant bytes
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writehword_indexed(tl,-2,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
  }
  if (opcode[i]==0x2E) { // SWR
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(rs2[i]) emit_rorimm(tl,24,tl);
  }
  if (opcode[i]==0x2C) { // SDL
    if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
    // Write two msb into two least significant bytes
    if(rs2[i]) emit_rorimm(th,16,th);
    emit_writehword_indexed(th,-2,temp);
    if(rs2[i]) emit_rorimm(th,16,th);
  }
  if (opcode[i]==0x2D) { // SDR
    if(rs2[i]) emit_shldimm(th,tl,8,temp2);
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(rs2[i]) emit_rorimm(tl,24,tl);
  }
  done2=out;
  emit_jmp(0);
  // 3
  set_jump_target(case3, out);
  if (opcode[i]==0x2A) { // SWL
    // Write msb into least significant byte
    if(rs2[i]) emit_rorimm(tl,24,tl);
    emit_writebyte_indexed(tl,-3,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  if (opcode[i]==0x2E) { // SWR
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
    // Write msb into least significant byte
    if(rs2[i]) emit_rorimm(th,24,th);
    emit_writebyte_indexed(th,-3,temp);
    if(rs2[i]) emit_rorimm(th,8,th);
  }
  if (opcode[i]==0x2D) { // SDR
    if(rs2[i]) emit_mov(th,temp2);
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  // All four cases converge here.
  set_jump_target(done0, out);
  set_jump_target(done1, out);
  set_jump_target(done2, out);
  // SDL/SDR also touch the neighboring aligned word (temp2 was prepared above).
  if (opcode[i]==0x2C) { // SDL
    emit_testimm(temp,4);
    done0=out;
    emit_jne(0);
    emit_andimm(temp,~3,temp);
    emit_writeword_indexed(temp2,4,temp);
    set_jump_target(done0, out);
  }
  if (opcode[i]==0x2D) { // SDR
    emit_testimm(temp,4);
    done0=out;
    emit_jeq(0);
    emit_andimm(temp,~3,temp);
    emit_writeword_indexed(temp2,-4,temp);
    set_jump_target(done0, out);
  }
  if(!c||!memtarget)
    add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
  // Self-modifying-code check: compare against invalid_code and invalidate
  // the target block if this store hit compiled code.
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
    #ifdef RAM_OFFSET
    int map=get_reg(i_regs->regmap,ROREG);
    if(map<0) map=HOST_TEMPREG;
    gen_orig_addr_w(temp,map);
    #else
    // Undo the earlier rdram relocation to recover the PSX address.
    emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
    #endif
    #if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,temp,1);
    #else
    emit_cmpmem_indexedsr12_imm(invalid_code,temp,1);
    #endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[temp]);
    #else
    void *jaddr2 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
    #endif
  }
}
3223
// COP1 (FPU) load/store: the PSX CPU has no usable COP1 here, so this
// simply emits the coprocessor-unusable exception path.
void c1ls_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3228
// Assemble a GTE (COP2) memory access: LWC2 (0x32) loads a word into a
// COP2 data register, SWC2 (0x3a) stores one.  Data moves through the
// FTEMP host register via cop2_get_dreg/cop2_put_dreg.
void c2ls_assemble(int i,struct regstat *i_regs)
{
  int s,tl;
  int ar;                 // host reg holding the effective address
  int offset;
  int memtarget=0,c=0;    // c: constant address; memtarget: it hits RAM
  void *jaddr2=NULL;      // slow-path branch to patch via stub
  enum stub_type type;
  int agr=AGEN1+(i&1);
  int fastio_reg_override=0;
  u_int hr,reglist=0;
  u_int copr=(source[i]>>16)&0x1f;  // COP2 data register number (rt field)
  s=get_reg(i_regs->regmap,rs1[i]);
  tl=get_reg(i_regs->regmap,FTEMP);
  offset=imm[i];
  assert(rs1[i]>0);
  assert(tl>=0);

  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG)
    reglist&=~(1<<HOST_CCREG);

  // get the address
  if (opcode[i]==0x3a) { // SWC2
    ar=get_reg(i_regs->regmap,agr);
    if(ar<0) ar=get_reg(i_regs->regmap,-1);
    reglist|=1<<ar;
  } else { // LWC2
    ar=tl;  // loads can compute the address in the destination reg
  }
  if(s>=0) c=(i_regs->wasconst>>s)&1;
  memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
  if (!offset&&!c&&s>=0) ar=s;  // address is just the base register
  assert(ar>=0);

  if (opcode[i]==0x3a) { // SWC2
    cop2_get_dreg(copr,tl,HOST_TEMPREG);  // fetch GTE reg value into tl
    type=STOREW_STUB;
  }
  else
    type=LOADW_STUB;

  if(c&&!memtarget) {
    // Constant address outside RAM: always take the slow path.
    jaddr2=out;
    emit_jmp(0); // inline_readstub/inline_writestub?
  }
  else {
    if(!c) {
      jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
    }
    else if(ram_offset&&memtarget) {
      emit_addimm(ar,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    if (opcode[i]==0x32) { // LWC2
      #ifdef HOST_IMM_ADDR32
      if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
      else
      #endif
      // NOTE(review): with HOST_IMM_ADDR32 defined the `else` above would
      // attach to this declaration, which is not valid C - confirm that
      // configuration is never built.
      int a=ar;
      if(fastio_reg_override) a=fastio_reg_override;
      emit_readword_indexed(0,a,tl);
    }
    if (opcode[i]==0x3a) { // SWC2
      #ifdef DESTRUCTIVE_SHIFT
      if(!offset&&!c&&s>=0) emit_mov(s,ar);
      #endif
      int a=ar;
      if(fastio_reg_override) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
  }
  if(jaddr2)
    add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
  // Self-modifying-code check for stores (see store_assemble).
  if(opcode[i]==0x3a) // SWC2
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
#if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,ar,1);
#else
    emit_cmpmem_indexedsr12_imm(invalid_code,ar,1);
#endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[ar]);
    #else
    void *jaddr3 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
    #endif
  }
  if (opcode[i]==0x32) { // LWC2
    cop2_put_dreg(copr,tl,HOST_TEMPREG);  // commit loaded word to GTE reg
  }
}
3326
3327 #ifndef multdiv_assemble
// Generic fallback for ports that do not supply their own MULT/DIV
// assembler (guarded by the surrounding #ifndef).  Reaching this at
// runtime means the build is misconfigured, so fail loudly.
void multdiv_assemble(int i,struct regstat *i_regs)
{
  (void)i;       // unused in the generic stub; silences -Wunused-parameter
  (void)i_regs;  // unused in the generic stub
  printf("Need multdiv_assemble for this architecture.\n");
  exit(1);
}
3333 #endif
3334
3335 void mov_assemble(int i,struct regstat *i_regs)
3336 {
3337   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3338   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3339   if(rt1[i]) {
3340     signed char sh,sl,th,tl;
3341     th=get_reg(i_regs->regmap,rt1[i]|64);
3342     tl=get_reg(i_regs->regmap,rt1[i]);
3343     //assert(tl>=0);
3344     if(tl>=0) {
3345       sh=get_reg(i_regs->regmap,rs1[i]|64);
3346       sl=get_reg(i_regs->regmap,rs1[i]);
3347       if(sl>=0) emit_mov(sl,tl);
3348       else emit_loadreg(rs1[i],tl);
3349       if(th>=0) {
3350         if(sh>=0) emit_mov(sh,th);
3351         else emit_loadreg(rs1[i]|64,th);
3352       }
3353     }
3354   }
3355 }
3356
3357 #ifndef fconv_assemble
// Generic fallback for ports that do not supply their own FP-conversion
// assembler (guarded by the surrounding #ifndef).  Reaching this at
// runtime means the build is misconfigured, so fail loudly.
void fconv_assemble(int i,struct regstat *i_regs)
{
  (void)i;       // unused in the generic stub; silences -Wunused-parameter
  (void)i_regs;  // unused in the generic stub
  printf("Need fconv_assemble for this architecture.\n");
  exit(1);
}
3363 #endif
3364
3365 #if 0
// Generic FP-arithmetic fallback; currently compiled out by the
// surrounding #if 0.  A port would provide its own implementation.
void float_assemble(int i,struct regstat *i_regs)
{
  printf("Need float_assemble for this architecture.\n");
  exit(1);
}
3371 #endif
3372
3373 void syscall_assemble(int i,struct regstat *i_regs)
3374 {
3375   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3376   assert(ccreg==HOST_CCREG);
3377   assert(!is_delayslot);
3378   (void)ccreg;
3379   emit_movimm(start+i*4,EAX); // Get PC
3380   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3381   emit_jmp(jump_syscall_hle); // XXX
3382 }
3383
3384 void hlecall_assemble(int i,struct regstat *i_regs)
3385 {
3386   extern void psxNULL();
3387   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3388   assert(ccreg==HOST_CCREG);
3389   assert(!is_delayslot);
3390   (void)ccreg;
3391   emit_movimm(start+i*4+4,0); // Get PC
3392   uint32_t hleCode = source[i] & 0x03ffffff;
3393   if (hleCode >= ARRAY_SIZE(psxHLEt))
3394     emit_movimm((uintptr_t)psxNULL,1);
3395   else
3396     emit_movimm((uintptr_t)psxHLEt[hleCode],1);
3397   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3398   emit_jmp(jump_hlecall);
3399 }
3400
3401 void intcall_assemble(int i,struct regstat *i_regs)
3402 {
3403   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3404   assert(ccreg==HOST_CCREG);
3405   assert(!is_delayslot);
3406   (void)ccreg;
3407   emit_movimm(start+i*4,0); // Get PC
3408   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3409   emit_jmp(jump_intcall);
3410 }
3411
// Assemble the instruction in a branch delay slot.  Sets is_delayslot
// around the dispatch so the per-type assemblers can special-case it;
// jump types are invalid in a delay slot and only produce a warning.
void ds_assemble(int i,struct regstat *i_regs)
{
  speculate_register_values(i);
  is_delayslot=1;
  // Dispatch on the decoded instruction type.
  switch(itype[i]) {
    case ALU:
      alu_assemble(i,i_regs);break;
    case IMM16:
      imm16_assemble(i,i_regs);break;
    case SHIFT:
      shift_assemble(i,i_regs);break;
    case SHIFTIMM:
      shiftimm_assemble(i,i_regs);break;
    case LOAD:
      load_assemble(i,i_regs);break;
    case LOADLR:
      loadlr_assemble(i,i_regs);break;
    case STORE:
      store_assemble(i,i_regs);break;
    case STORELR:
      storelr_assemble(i,i_regs);break;
    case COP0:
      cop0_assemble(i,i_regs);break;
    case COP1:
      cop1_assemble(i,i_regs);break;
    case C1LS:
      c1ls_assemble(i,i_regs);break;
    case COP2:
      cop2_assemble(i,i_regs);break;
    case C2LS:
      c2ls_assemble(i,i_regs);break;
    case C2OP:
      c2op_assemble(i,i_regs);break;
    case FCONV:
      fconv_assemble(i,i_regs);break;
    case FLOAT:
      float_assemble(i,i_regs);break;
    case FCOMP:
      fcomp_assemble(i,i_regs);break;
    case MULTDIV:
      multdiv_assemble(i,i_regs);break;
    case MOV:
      mov_assemble(i,i_regs);break;
    // The following types must not appear in a delay slot; deliberate
    // fallthrough to a single warning.
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
    case FJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  is_delayslot=0;
}
3468
3469 // Is the branch target a valid internal jump?
3470 int internal_branch(uint64_t i_is32,int addr)
3471 {
3472   if(addr&1) return 0; // Indirect (register) jump
3473   if(addr>=start && addr<start+slen*4-4)
3474   {
3475     //int t=(addr-start)>>2;
3476     // Delay slots are not valid branch targets
3477     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3478     // 64 -> 32 bit transition requires a recompile
3479     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3480     {
3481       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3482       else printf("optimizable: yes\n");
3483     }*/
3484     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3485     return 1;
3486   }
3487   return 0;
3488 }
3489
3490 #ifndef wb_invalidate
// Reconcile the register mapping 'pre' with the target mapping 'entry':
// write back to memory any dirty value whose mapping is lost, then move
// values that survive but land in a different host register.
// u/uu are the unneeded-register bitmaps (low/upper halves); is32 marks
// values known to fit in 32 bits.
void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
  uint64_t u,uint64_t uu)
{
  int hr;
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(pre[hr]!=entry[hr]) {
        if(pre[hr]>=0) {
          if((dirty>>hr)&1) {
            // Value is dirty and not kept anywhere in the new mapping.
            if(get_reg(entry,pre[hr])<0) {
              if(pre[hr]<64) {
                // Low half: store unless the register is unneeded.
                if(!((u>>pre[hr])&1)) {
                  emit_storereg(pre[hr],hr);
                  // 32-bit value whose upper half is still needed:
                  // materialize it by sign extension and store it too.
                  if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
                    emit_sarimm(hr,31,hr);
                    emit_storereg(pre[hr]|64,hr);
                  }
                }
              }else{
                // Upper half: store only for true 64-bit, needed values.
                if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
                  emit_storereg(pre[hr],hr);
                }
              }
            }
          }
        }
      }
    }
  }
  // Move from one register to another (no writeback)
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(pre[hr]!=entry[hr]) {
        if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
          int nr;
          if((nr=get_reg(entry,pre[hr]))>=0) {
            emit_mov(hr,nr);
          }
        }
      }
    }
  }
}
3534 #endif
3535
3536 // Load the specified registers
3537 // This only loads the registers given as arguments because
3538 // we don't want to load things that will be overwritten
3539 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3540 {
3541   int hr;
3542   // Load 32-bit regs
3543   for(hr=0;hr<HOST_REGS;hr++) {
3544     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3545       if(entry[hr]!=regmap[hr]) {
3546         if(regmap[hr]==rs1||regmap[hr]==rs2)
3547         {
3548           if(regmap[hr]==0) {
3549             emit_zeroreg(hr);
3550           }
3551           else
3552           {
3553             emit_loadreg(regmap[hr],hr);
3554           }
3555         }
3556       }
3557     }
3558   }
3559   //Load 64-bit regs
3560   for(hr=0;hr<HOST_REGS;hr++) {
3561     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3562       if(entry[hr]!=regmap[hr]) {
3563         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3564         {
3565           assert(regmap[hr]!=64);
3566           if((is32>>(regmap[hr]&63))&1) {
3567             int lr=get_reg(regmap,regmap[hr]-64);
3568             if(lr>=0)
3569               emit_sarimm(lr,31,hr);
3570             else
3571               emit_loadreg(regmap[hr],hr);
3572           }
3573           else
3574           {
3575             emit_loadreg(regmap[hr],hr);
3576           }
3577         }
3578       }
3579     }
3580   }
3581 }
3582
3583 // Load registers prior to the start of a loop
3584 // so that they are not loaded within the loop
3585 static void loop_preload(signed char pre[],signed char entry[])
3586 {
3587   int hr;
3588   for(hr=0;hr<HOST_REGS;hr++) {
3589     if(hr!=EXCLUDE_REG) {
3590       if(pre[hr]!=entry[hr]) {
3591         if(entry[hr]>=0) {
3592           if(get_reg(pre,entry[hr])<0) {
3593             assem_debug("loop preload:\n");
3594             //printf("loop preload: %d\n",hr);
3595             if(entry[hr]==0) {
3596               emit_zeroreg(hr);
3597             }
3598             else if(entry[hr]<TEMPREG)
3599             {
3600               emit_loadreg(entry[hr],hr);
3601             }
3602             else if(entry[hr]-64<TEMPREG)
3603             {
3604               emit_loadreg(entry[hr],hr);
3605             }
3606           }
3607         }
3608       }
3609     }
3610   }
3611 }
3612
// Generate address for load/store instruction
// goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
// Also preloads constant addresses for the *next* instruction so the
// computation overlaps with the current one.
void address_generation(int i,struct regstat *i_regs,signed char entry[])
{
  if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
    int ra=-1;                 // host reg that will receive the address
    int agr=AGEN1+(i&1);       // address-generation reg alternates per slot
    if(itype[i]==LOAD) {
      // Plain loads compute the address directly in the destination reg.
      ra=get_reg(i_regs->regmap,rt1[i]);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
      assert(ra>=0);
    }
    if(itype[i]==LOADLR) {
      ra=get_reg(i_regs->regmap,FTEMP);
    }
    if(itype[i]==STORE||itype[i]==STORELR) {
      ra=get_reg(i_regs->regmap,agr);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
    }
    if(itype[i]==C1LS||itype[i]==C2LS) {
      if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
        ra=get_reg(i_regs->regmap,FTEMP);
      else { // SWC1/SDC1/SWC2/SDC2
        ra=get_reg(i_regs->regmap,agr);
        if(ra<0) ra=get_reg(i_regs->regmap,-1);
      }
    }
    int rs=get_reg(i_regs->regmap,rs1[i]);
    if(ra>=0) {
      int offset=imm[i];
      int c=(i_regs->wasconst>>rs)&1;  // base register holds a known constant
      if(rs1[i]==0) {
        // Using r0 as a base address
        if(!entry||entry[ra]!=agr) {
          // LWL/LWR and LDL/LDR addresses are word/doubleword aligned.
          if (opcode[i]==0x22||opcode[i]==0x26) {
            emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
          }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
            emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
          }else{
            emit_movimm(offset,ra);
          }
        } // else did it in the previous cycle
      }
      else if(rs<0) {
        // Base register not in a host register: reload it first.
        if(!entry||entry[ra]!=rs1[i])
          emit_loadreg(rs1[i],ra);
        //if(!entry||entry[ra]!=rs1[i])
        //  printf("poor load scheduling!\n");
      }
      else if(c) {
        // Constant base: fold base+offset into a single immediate.
        if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
          if(!entry||entry[ra]!=agr) {
            if (opcode[i]==0x22||opcode[i]==0x26) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
            }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
            }else{
              #ifdef HOST_IMM_ADDR32
              if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
              #endif
              emit_movimm(constmap[i][rs]+offset,ra);
              regs[i].loadedconst|=1<<ra;
            }
          } // else did it in the previous cycle
        } // else load_consts already did it
      }
      // Non-constant base with an offset: add it in.
      if(offset&&!c&&rs1[i]) {
        if(rs>=0) {
          emit_addimm(rs,offset,ra);
        }else{
          emit_addimm(ra,offset,ra);
        }
      }
    }
  }
  // Preload constants for next instruction
  if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
    int agr,ra;
    // Actual address
    agr=AGEN1+((i+1)&1);
    ra=get_reg(i_regs->regmap,agr);
    if(ra>=0) {
      int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
      int offset=imm[i+1];
      int c=(regs[i+1].wasconst>>rs)&1;
      if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          #ifdef HOST_IMM_ADDR32
          if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
          #endif
          emit_movimm(constmap[i+1][rs]+offset,ra);
          regs[i+1].loadedconst|=1<<ra;
        }
      }
      else if(rs1[i+1]==0) {
        // Using r0 as a base address
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(offset,ra);
        }
      }
    }
  }
}
3724
// Look ahead from instruction i to find the last constant value that host
// register hr will hold while still mapped to the same guest register.
// Writes that value to *value; returns nonzero if the constant should
// actually be materialized (i.e. it is still needed downstream).
static int get_final_value(int hr, int i, int *value)
{
  int reg=regs[i].regmap[hr];
  // Advance while the same guest reg stays const in hr and no branch
  // target interrupts the run.
  while(i<slen-1) {
    if(regs[i+1].regmap[hr]!=reg) break;
    if(!((regs[i+1].isconst>>hr)&1)) break;
    if(bt[i+1]) break;
    i++;
  }
  if(i<slen-1) {
    if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
      *value=constmap[i][hr];
      return 1;
    }
    if(!bt[i+1]) {
      if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
        // Load in delay slot, out-of-order execution
        if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
        {
          // Precompute load address
          *value=constmap[i][hr]+imm[i+2];
          return 1;
        }
      }
      if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
      {
        // Precompute load address
        *value=constmap[i][hr]+imm[i+1];
        //printf("c=%x imm=%lx\n",(long)constmap[i][hr],imm[i+1]);
        return 1;
      }
    }
  }
  *value=constmap[i][hr];
  //printf("c=%lx\n",(long)constmap[i][hr]);
  if(i==slen-1) return 1;
  // Only worth loading if the register is still needed afterwards.
  if(reg<64) {
    return !((unneeded_reg[i+1]>>reg)&1);
  }else{
    return !((unneeded_reg_upper[i+1]>>reg)&1);
  }
}
3767
// Load registers with known constants
// Skips registers whose constant was already materialized earlier in the
// block (tracked via regs[].loadedconst) and reuses similar values held
// in other host registers where possible.
void load_consts(signed char pre[],signed char regmap[],int is32,int i)
{
  int hr,hr2;
  // propagate loaded constant flags
  if(i==0||bt[i])
    regs[i].loadedconst=0;  // block entry / branch target: nothing is loaded yet
  else {
    for(hr=0;hr<HOST_REGS;hr++) {
      // Carry the flag forward only if the same guest reg stayed const in
      // the same host reg across the instruction boundary.
      if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
         &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
      {
        regs[i].loadedconst|=1<<hr;
      }
    }
  }
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(!((regs[i].loadedconst>>hr)&1)) {
        if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
          int value,similar=0;
          if(get_final_value(hr,i,&value)) {
            // see if some other register has similar value
            for(hr2=0;hr2<HOST_REGS;hr2++) {
              if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
                if(is_similar_value(value,constmap[i][hr2])) {
                  similar=1;
                  break;
                }
              }
            }
            if(similar) {
              // Derive the constant from the similar one (cheaper encoding).
              int value2;
              if(get_final_value(hr2,i,&value2)) // is this needed?
                emit_movimm_from(value2,hr2,value,hr);
              else
                emit_movimm(value,hr);
            }
            else if(value==0) {
              emit_zeroreg(hr);
            }
            else {
              emit_movimm(value,hr);
            }
          }
          regs[i].loadedconst|=1<<hr;
        }
      }
    }
  }
  // Load 64-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
        if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
          if((is32>>(regmap[hr]&63))&1) {
            // 32-bit value: the upper half is the sign of the lower half.
            int lr=get_reg(regmap,regmap[hr]-64);
            assert(lr>=0);
            emit_sarimm(lr,31,hr);
          }
          else
          {
            int value;
            if(get_final_value(hr,i,&value)) {
              if(value==0) {
                emit_zeroreg(hr);
              }
              else {
                emit_movimm(value,hr);
              }
            }
          }
        }
      }
    }
  }
}
3848 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3849 {
3850   int hr;
3851   // Load 32-bit regs
3852   for(hr=0;hr<HOST_REGS;hr++) {
3853     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3854       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3855         int value=constmap[i][hr];
3856         if(value==0) {
3857           emit_zeroreg(hr);
3858         }
3859         else {
3860           emit_movimm(value,hr);
3861         }
3862       }
3863     }
3864   }
3865   // Load 64-bit regs
3866   for(hr=0;hr<HOST_REGS;hr++) {
3867     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3868       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3869         if((is32>>(regmap[hr]&63))&1) {
3870           int lr=get_reg(regmap,regmap[hr]-64);
3871           assert(lr>=0);
3872           emit_sarimm(lr,31,hr);
3873         }
3874         else
3875         {
3876           int value=constmap[i][hr];
3877           if(value==0) {
3878             emit_zeroreg(hr);
3879           }
3880           else {
3881             emit_movimm(value,hr);
3882           }
3883         }
3884       }
3885     }
3886   }
3887 }
3888
3889 // Write out all dirty registers (except cycle count)
3890 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3891 {
3892   int hr;
3893   for(hr=0;hr<HOST_REGS;hr++) {
3894     if(hr!=EXCLUDE_REG) {
3895       if(i_regmap[hr]>0) {
3896         if(i_regmap[hr]!=CCREG) {
3897           if((i_dirty>>hr)&1) {
3898             if(i_regmap[hr]<64) {
3899               emit_storereg(i_regmap[hr],hr);
3900             }else{
3901               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3902                 emit_storereg(i_regmap[hr],hr);
3903               }
3904             }
3905           }
3906         }
3907       }
3908     }
3909   }
3910 }
3911 // Write out dirty registers that we need to reload (pair with load_needed_regs)
3912 // This writes the registers not written by store_regs_bt
3913 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
3914 {
3915   int hr;
3916   int t=(addr-start)>>2;
3917   for(hr=0;hr<HOST_REGS;hr++) {
3918     if(hr!=EXCLUDE_REG) {
3919       if(i_regmap[hr]>0) {
3920         if(i_regmap[hr]!=CCREG) {
3921           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
3922             if((i_dirty>>hr)&1) {
3923               if(i_regmap[hr]<64) {
3924                 emit_storereg(i_regmap[hr],hr);
3925               }else{
3926                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3927                   emit_storereg(i_regmap[hr],hr);
3928                 }
3929               }
3930             }
3931           }
3932         }
3933       }
3934     }
3935   }
3936 }
3937
3938 // Load all registers (except cycle count)
3939 void load_all_regs(signed char i_regmap[])
3940 {
3941   int hr;
3942   for(hr=0;hr<HOST_REGS;hr++) {
3943     if(hr!=EXCLUDE_REG) {
3944       if(i_regmap[hr]==0) {
3945         emit_zeroreg(hr);
3946       }
3947       else
3948       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3949       {
3950         emit_loadreg(i_regmap[hr],hr);
3951       }
3952     }
3953   }
3954 }
3955
3956 // Load all current registers also needed by next instruction
3957 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
3958 {
3959   int hr;
3960   for(hr=0;hr<HOST_REGS;hr++) {
3961     if(hr!=EXCLUDE_REG) {
3962       if(get_reg(next_regmap,i_regmap[hr])>=0) {
3963         if(i_regmap[hr]==0) {
3964           emit_zeroreg(hr);
3965         }
3966         else
3967         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3968         {
3969           emit_loadreg(i_regmap[hr],hr);
3970         }
3971       }
3972     }
3973   }
3974 }
3975
3976 // Load all regs, storing cycle count if necessary
3977 void load_regs_entry(int t)
3978 {
3979   int hr;
3980   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
3981   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
3982   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
3983     emit_storereg(CCREG,HOST_CCREG);
3984   }
3985   // Load 32-bit regs
3986   for(hr=0;hr<HOST_REGS;hr++) {
3987     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
3988       if(regs[t].regmap_entry[hr]==0) {
3989         emit_zeroreg(hr);
3990       }
3991       else if(regs[t].regmap_entry[hr]!=CCREG)
3992       {
3993         emit_loadreg(regs[t].regmap_entry[hr],hr);
3994       }
3995     }
3996   }
3997   // Load 64-bit regs
3998   for(hr=0;hr<HOST_REGS;hr++) {
3999     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4000       assert(regs[t].regmap_entry[hr]!=64);
4001       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4002         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4003         if(lr<0) {
4004           emit_loadreg(regs[t].regmap_entry[hr],hr);
4005         }
4006         else
4007         {
4008           emit_sarimm(lr,31,hr);
4009         }
4010       }
4011       else
4012       {
4013         emit_loadreg(regs[t].regmap_entry[hr],hr);
4014       }
4015     }
4016   }
4017 }
4018
// Store dirty registers prior to branch
// For an internal branch, only stores registers the target block entry does
// not keep allocated (the rest are handled by wb_needed_dirtys); for an
// external branch, everything dirty is written back.
void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
{
  if(internal_branch(i_is32,addr))
  {
    int t=(addr-start)>>2;
    int hr;
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG) {
        if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
          // Store when the mapping changes at the target, the target doesn't
          // track it as dirty, or its 32-bitness changes and the upper half
          // is still needed there.
          if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
            if((i_dirty>>hr)&1) {
              if(i_regmap[hr]<64) {
                // Lower half: skip if the target no longer needs this reg
                if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
                  emit_storereg(i_regmap[hr],hr);
                  // Known 32-bit value whose upper half is still live at
                  // the target: materialize it by sign-extension and store.
                  if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
                    #ifdef DESTRUCTIVE_WRITEBACK
                    // OK to clobber hr; the value is being discarded anyway
                    emit_sarimm(hr,31,hr);
                    emit_storereg(i_regmap[hr]|64,hr);
                    #else
                    emit_sarimm(hr,31,HOST_TEMPREG);
                    emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
                    #endif
                  }
                }
              }else{
                // Upper half (reg|64): store only if not known 32-bit and
                // still needed at the target
                if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
                  emit_storereg(i_regmap[hr],hr);
                }
              }
            }
          }
        }
      }
    }
  }
  else
  {
    // Branch out of this block, write out all dirty regs
    wb_dirtys(i_regmap,i_is32,i_dirty);
  }
}
4061
// Load all needed registers for branch target
// Only does anything for internal branches: brings the host registers into
// the state expected at the target's entry point (regs[t].regmap_entry).
void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
{
  //if(addr>=start && addr<(start+slen*4))
  if(internal_branch(i_is32,addr))
  {
    int t=(addr-start)>>2;
    int hr;
    // Store the cycle count before loading something else
    if(i_regmap[HOST_CCREG]!=CCREG) {
      assert(i_regmap[HOST_CCREG]==-1);
    }
    if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
      emit_storereg(CCREG,HOST_CCREG);
    }
    // Load 32-bit regs
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
        #ifdef DESTRUCTIVE_WRITEBACK
        // With destructive writeback the store above may have trashed the
        // host register, so also reload when the dirty/32-bit state forced
        // a destructive sign-extension.
        if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
        #else
        if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
        #endif
          if(regs[t].regmap_entry[hr]==0) {
            emit_zeroreg(hr);
          }
          else if(regs[t].regmap_entry[hr]!=CCREG)
          {
            emit_loadreg(regs[t].regmap_entry[hr],hr);
          }
        }
      }
    }
    //Load 64-bit regs
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
        if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
          assert(regs[t].regmap_entry[hr]!=64);
          if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
            // 32-bit value: sign-extend from the lower half if available,
            // otherwise reload the stored upper half
            int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
            if(lr<0) {
              emit_loadreg(regs[t].regmap_entry[hr],hr);
            }
            else
            {
              emit_sarimm(lr,31,hr);
            }
          }
          else
          {
            emit_loadreg(regs[t].regmap_entry[hr],hr);
          }
        }
        else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
          // Same mapping, but the value became 32-bit: refresh the upper
          // half from the (allocated) lower half
          int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
          assert(lr>=0);
          emit_sarimm(lr,31,hr);
        }
      }
    }
  }
}
4124
4125 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4126 {
4127   if(addr>=start && addr<start+slen*4-4)
4128   {
4129     int t=(addr-start)>>2;
4130     int hr;
4131     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4132     for(hr=0;hr<HOST_REGS;hr++)
4133     {
4134       if(hr!=EXCLUDE_REG)
4135       {
4136         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4137         {
4138           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4139           {
4140             return 0;
4141           }
4142           else
4143           if((i_dirty>>hr)&1)
4144           {
4145             if(i_regmap[hr]<TEMPREG)
4146             {
4147               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4148                 return 0;
4149             }
4150             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4151             {
4152               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4153                 return 0;
4154             }
4155           }
4156         }
4157         else // Same register but is it 32-bit or dirty?
4158         if(i_regmap[hr]>=0)
4159         {
4160           if(!((regs[t].dirty>>hr)&1))
4161           {
4162             if((i_dirty>>hr)&1)
4163             {
4164               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4165               {
4166                 //printf("%x: dirty no match\n",addr);
4167                 return 0;
4168               }
4169             }
4170           }
4171           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4172           {
4173             //printf("%x: is32 no match\n",addr);
4174             return 0;
4175           }
4176         }
4177       }
4178     }
4179     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4180     // Delay slots are not valid branch targets
4181     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4182     // Delay slots require additional processing, so do not match
4183     if(is_ds[t]) return 0;
4184   }
4185   else
4186   {
4187     int hr;
4188     for(hr=0;hr<HOST_REGS;hr++)
4189     {
4190       if(hr!=EXCLUDE_REG)
4191       {
4192         if(i_regmap[hr]>=0)
4193         {
4194           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4195           {
4196             if((i_dirty>>hr)&1)
4197             {
4198               return 0;
4199             }
4200           }
4201         }
4202       }
4203     }
4204   }
4205   return 1;
4206 }
4207
#ifdef DRC_DBG
// Debug hook: emit a call to do_insn_cmp before instruction i so that a
// reference interpreter run can be compared against the recompiled code.
static void drc_dbg_emit_do_cmp(int i)
{
  extern void do_insn_cmp();
  extern int cycle;
  u_int hr,reglist=0;

  // Save every allocated host register around the C call
  for(hr=0;hr<HOST_REGS;hr++)
    if(regs[i].regmap[hr]>=0) reglist|=1<<hr;
  save_regs(reglist);
  // Pass the guest PC of this instruction through pcaddr
  emit_movimm(start+i*4,0);
  emit_writeword(0,&pcaddr);
  emit_call(do_insn_cmp);
  //emit_readword(&cycle,0);
  //emit_addimm(0,2,0);
  //emit_writeword(0,&cycle);
  restore_regs(reglist);
}
#else
// Compiles to nothing when DRC debugging is disabled
#define drc_dbg_emit_do_cmp(x)
#endif
4229
// Used when a branch jumps into the delay slot of another branch
// Assembles a standalone entry point for the delay slot instruction at
// ba[i], then branches on to the instruction after it (ba[i]+4).
void ds_assemble_entry(int i)
{
  int t=(ba[i]-start)>>2;  // index of the delay slot instruction
  if (!instr_addr[t])
    instr_addr[t] = out;
  assem_debug("Assemble delay slot at %x\n",ba[i]);
  assem_debug("<->\n");
  drc_dbg_emit_do_cmp(t);
  // Spill the cycle count if this instruction doesn't keep it allocated
  if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
  load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
  address_generation(t,&regs[t],regs[t].regmap_entry);
  // Stores also need INVCP for invalidating self-modifying code
  if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
    load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
  cop1_usable=0;
  is_delayslot=0;
  // Dispatch to the per-type assembler for the delay slot instruction
  switch(itype[t]) {
    case ALU:
      alu_assemble(t,&regs[t]);break;
    case IMM16:
      imm16_assemble(t,&regs[t]);break;
    case SHIFT:
      shift_assemble(t,&regs[t]);break;
    case SHIFTIMM:
      shiftimm_assemble(t,&regs[t]);break;
    case LOAD:
      load_assemble(t,&regs[t]);break;
    case LOADLR:
      loadlr_assemble(t,&regs[t]);break;
    case STORE:
      store_assemble(t,&regs[t]);break;
    case STORELR:
      storelr_assemble(t,&regs[t]);break;
    case COP0:
      cop0_assemble(t,&regs[t]);break;
    case COP1:
      cop1_assemble(t,&regs[t]);break;
    case C1LS:
      c1ls_assemble(t,&regs[t]);break;
    case COP2:
      cop2_assemble(t,&regs[t]);break;
    case C2LS:
      c2ls_assemble(t,&regs[t]);break;
    case C2OP:
      c2op_assemble(t,&regs[t]);break;
    case FCONV:
      fconv_assemble(t,&regs[t]);break;
    case FLOAT:
      float_assemble(t,&regs[t]);break;
    case FCOMP:
      fcomp_assemble(t,&regs[t]);break;
    case MULTDIV:
      multdiv_assemble(t,&regs[t]);break;
    case MOV:
      mov_assemble(t,&regs[t]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
    case FJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Continue to the instruction after the delay slot; this must be an
  // internal branch (asserted below)
  store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
  load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
  if(internal_branch(regs[t].is32,ba[i]+4))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  assert(internal_branch(regs[t].is32,ba[i]+4));
  add_to_linker(out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
  emit_jmp(0);
}
4307
// Emit the cycle-count check for branch i: update HOST_CCREG and emit a
// conditional jump to a CC_STUB that handles interrupts/events when the
// count runs out.  *adj receives the cycle adjustment already applied at
// the branch target (so callers can compensate), addr/taken describe the
// branch destination for the stub, invert selects the inverted-branch form.
void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
{
  int count;
  void *jaddr;
  void *idle=NULL;
  int t=0;
  if(itype[i]==RJUMP)
  {
    *adj=0;
  }
  //if(ba[i]>=start && ba[i]<(start+slen*4))
  if(internal_branch(branch_regs[i].is32,ba[i]))
  {
    t=(ba[i]-start)>>2;
    if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
    else *adj=ccadj[t];
  }
  else
  {
    *adj=0;
  }
  count=ccadj[i];
  if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
    // Idle loop (branch-to-self with a nop delay slot): burn cycles in
    // larger steps instead of re-executing the loop
    if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
    idle=out;
    //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
    emit_andimm(HOST_CCREG,3,HOST_CCREG);
    jaddr=out;
    emit_jmp(0);
  }
  else if(*adj==0||invert) {
    int cycles=CLOCK_ADJUST(count+2);
    // faster loop HACK
    if (t&&*adj) {
      int rel=t-i;
      if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
        cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
    }
    // Add cycles up-front and branch to the stub on non-negative count
    emit_addimm_and_set_flags(cycles,HOST_CCREG);
    jaddr=out;
    emit_jns(0);
  }
  else
  {
    // Compare only; the actual addition happens at the branch target
    emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
    jaddr=out;
    emit_jns(0);
  }
  add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
}
4359
// Emit the out-of-line cycle-count stub n: write back state, determine the
// correct guest PC (re-evaluating the branch condition if needed), call
// cc_interrupt, then restore registers and return to the compiled code.
static void do_ccstub(int n)
{
  literal_pool(256);
  assem_debug("do_ccstub %x\n",start+stubs[n].b*4);
  set_jump_target(stubs[n].addr, out);
  int i=stubs[n].b;  // index of the branch instruction this stub belongs to
  if(stubs[n].d==NULLDS) {
    // Delay slot instruction is nullified ("likely" branch)
    wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
  }
  else if(stubs[n].d!=TAKEN) {
    wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
  }
  else {
    // Taken internal branch: only write what store_regs_bt didn't
    if(internal_branch(branch_regs[i].is32,ba[i]))
      wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  }
  if(stubs[n].c!=-1)
  {
    // Save PC as return address
    emit_movimm(stubs[n].c,EAX);
    emit_writeword(EAX,&pcaddr);
  }
  else
  {
    // Return address depends on which way the branch goes
    if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
    {
      // Locate the branch operands (lower/upper halves) in host registers
      int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
      int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
      int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
      int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
      if(rs1[i]==0)
      {
        // Comparing against r0: swap so s1 is the real operand
        s1l=s2l;s1h=s2h;
        s2l=s2h=-1;
      }
      else if(rs2[i]==0)
      {
        s2l=s2h=-1;
      }
      if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
        s1h=s2h=-1;  // both operands 32-bit, upper halves irrelevant
      }
      assert(s1l>=0);
      #ifdef DESTRUCTIVE_WRITEBACK
      // Writeback may have sign-extended in place; reload operands if so
      if(rs1[i]) {
        if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
          emit_loadreg(rs1[i],s1l);
      }
      else {
        if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
          emit_loadreg(rs2[i],s1l);
      }
      if(s2l>=0)
        if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
          emit_loadreg(rs2[i],s2l);
      #endif
      // Find scratch host registers not holding a branch operand
      int hr=0;
      int addr=-1,alt=-1,ntaddr=-1;
      while(hr<HOST_REGS)
      {
        if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
           (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
           (branch_regs[i].regmap[hr]&63)!=rs2[i] )
        {
          addr=hr++;break;
        }
        hr++;
      }
      while(hr<HOST_REGS)
      {
        if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
           (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
           (branch_regs[i].regmap[hr]&63)!=rs2[i] )
        {
          alt=hr++;break;
        }
        hr++;
      }
      if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
      {
        while(hr<HOST_REGS)
        {
          if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
             (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
             (branch_regs[i].regmap[hr]&63)!=rs2[i] )
          {
            ntaddr=hr;break;
          }
          hr++;
        }
        assert(hr<HOST_REGS);
      }
      // Re-evaluate the branch condition with branchless cmov sequences,
      // leaving the chosen guest PC in 'addr'
      if((opcode[i]&0x2f)==4) // BEQ
      {
        #ifdef HAVE_CMOV_IMM
        if(s1h<0) {
          if(s2l>=0) emit_cmp(s1l,s2l);
          else emit_test(s1l,s1l);
          emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
        }
        else
        #endif
        {
          emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
          if(s1h>=0) {
            if(s2h>=0) emit_cmp(s1h,s2h);
            else emit_test(s1h,s1h);
            emit_cmovne_reg(alt,addr);
          }
          if(s2l>=0) emit_cmp(s1l,s2l);
          else emit_test(s1l,s1l);
          emit_cmovne_reg(alt,addr);
        }
      }
      if((opcode[i]&0x2f)==5) // BNE
      {
        #ifdef HAVE_CMOV_IMM
        if(s1h<0) {
          if(s2l>=0) emit_cmp(s1l,s2l);
          else emit_test(s1l,s1l);
          emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
        }
        else
        #endif
        {
          emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
          if(s1h>=0) {
            if(s2h>=0) emit_cmp(s1h,s2h);
            else emit_test(s1h,s1h);
            emit_cmovne_reg(alt,addr);
          }
          if(s2l>=0) emit_cmp(s1l,s2l);
          else emit_test(s1l,s1l);
          emit_cmovne_reg(alt,addr);
        }
      }
      if((opcode[i]&0x2f)==6) // BLEZ
      {
        //emit_movimm(ba[i],alt);
        //emit_movimm(start+i*4+8,addr);
        emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
        emit_cmpimm(s1l,1);
        if(s1h>=0) emit_mov(addr,ntaddr);
        emit_cmovl_reg(alt,addr);
        if(s1h>=0) {
          emit_test(s1h,s1h);
          emit_cmovne_reg(ntaddr,addr);
          emit_cmovs_reg(alt,addr);
        }
      }
      if((opcode[i]&0x2f)==7) // BGTZ
      {
        //emit_movimm(ba[i],addr);
        //emit_movimm(start+i*4+8,ntaddr);
        emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
        emit_cmpimm(s1l,1);
        if(s1h>=0) emit_mov(addr,alt);
        emit_cmovl_reg(ntaddr,addr);
        if(s1h>=0) {
          emit_test(s1h,s1h);
          emit_cmovne_reg(alt,addr);
          emit_cmovs_reg(ntaddr,addr);
        }
      }
      if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
      {
        //emit_movimm(ba[i],alt);
        //emit_movimm(start+i*4+8,addr);
        emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
        if(s1h>=0) emit_test(s1h,s1h);
        else emit_test(s1l,s1l);
        emit_cmovs_reg(alt,addr);
      }
      if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
      {
        //emit_movimm(ba[i],addr);
        //emit_movimm(start+i*4+8,alt);
        emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
        if(s1h>=0) emit_test(s1h,s1h);
        else emit_test(s1l,s1l);
        emit_cmovs_reg(alt,addr);
      }
      if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
        if(source[i]&0x10000) // BC1T
        {
          //emit_movimm(ba[i],alt);
          //emit_movimm(start+i*4+8,addr);
          emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
          emit_testimm(s1l,0x800000);
          emit_cmovne_reg(alt,addr);
        }
        else // BC1F
        {
          //emit_movimm(ba[i],addr);
          //emit_movimm(start+i*4+8,alt);
          emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
          emit_testimm(s1l,0x800000);
          emit_cmovne_reg(alt,addr);
        }
      }
      emit_writeword(addr,&pcaddr);
    }
    else
    if(itype[i]==RJUMP)
    {
      // Register jump: the target register (or its RTEMP copy, if the delay
      // slot clobbered it) already holds the guest PC
      int r=get_reg(branch_regs[i].regmap,rs1[i]);
      if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
        r=get_reg(branch_regs[i].regmap,RTEMP);
      }
      emit_writeword(r,&pcaddr);
    }
    else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
  }
  // Update cycle count
  assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
  if(stubs[n].a) emit_addimm(HOST_CCREG,CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
  emit_call(cc_interrupt);
  // Undo the adjustment: the caller's code expects the pre-stub count
  if(stubs[n].a) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
  // Reload whatever register state the resume point expects
  if(stubs[n].d==TAKEN) {
    if(internal_branch(branch_regs[i].is32,ba[i]))
      load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
    else if(itype[i]==RJUMP) {
      if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
        emit_readword(&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
      else
        emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
    }
  }else if(stubs[n].d==NOTTAKEN) {
    if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
    else load_all_regs(branch_regs[i].regmap);
  }else if(stubs[n].d==NULLDS) {
    // Delay slot instruction is nullified ("likely" branch)
    if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
    else load_all_regs(regs[i].regmap);
  }else{
    load_all_regs(branch_regs[i].regmap);
  }
  emit_jmp(stubs[n].retaddr);
}
4601
4602 static void add_to_linker(void *addr, u_int target, int ext)
4603 {
4604   assert(linkcount < ARRAY_SIZE(link_addr));
4605   link_addr[linkcount].addr = addr;
4606   link_addr[linkcount].target = target;
4607   link_addr[linkcount].ext = ext;
4608   linkcount++;
4609 }
4610
// Write the return address (PC of the instruction after the delay slot)
// into the link register ($31) for a JAL at instruction i, if $31 is
// currently allocated to a host register.
static void ujump_assemble_write_ra(int i)
{
  int rt;
  unsigned int return_address;
  rt=get_reg(branch_regs[i].regmap,31);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  //assert(rt>=0);
  return_address=start+i*4+8;
  if(rt>=0) {
    #ifdef USE_MINI_HT
    // Internal return target: also insert it into the mini hash table
    // (unless the delay slot overwrites $31)
    if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
      int temp=-1; // note: must be ds-safe
      #ifdef HOST_TEMPREG
      temp=HOST_TEMPREG;
      #endif
      if(temp>=0) do_miniht_insert(return_address,rt,temp);
      else emit_movimm(return_address,rt);
    }
    else
    #endif
    {
      #ifdef REG_PREFETCH
      // NOTE(review): 'temp'/'i_regmap' are not defined in this scope;
      // this path looks bit-rotted — verify before enabling REG_PREFETCH
      if(temp>=0)
      {
        if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
      }
      #endif
      emit_movimm(return_address,rt); // PC into link register
      #ifdef IMM_PREFETCH
      emit_prefetch(hash_table_get(return_address));
      #endif
    }
  }
}
4645
// Assemble an unconditional jump (J/JAL) at instruction i, including its
// delay slot, register writeback, cycle-count check and the jump itself.
void ujump_assemble(int i,struct regstat *i_regs)
{
  int ra_done=0;
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  int temp=get_reg(branch_regs[i].regmap,PTEMP);
  if(rt1[i]==31&&temp>=0)
  {
    signed char *i_regmap=i_regs->regmap;
    int return_address=start+i*4+8;
    if(get_reg(branch_regs[i].regmap,31)>0)
    if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  // If the delay slot reads $31, the return address must be written first
  if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    ujump_assemble_write_ra(i); // writeback ra for DS
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  // Registers dead after the branch (plus the link register) need no
  // writeback; mark them unneeded for wb_invalidate
  uint64_t bc_unneeded=branch_regs[i].u;
  uint64_t bc_unneeded_upper=branch_regs[i].uu;
  bc_unneeded|=1|(1LL<<rt1[i]);
  bc_unneeded_upper|=1|(1LL<<rt1[i]);
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                bc_unneeded,bc_unneeded_upper);
  load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
  if(!ra_done&&rt1[i]==31)
    ujump_assemble_write_ra(i);
  int cc,adj;
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
  // Compensate for cycles already accounted at the branch target
  if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
  load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  if(internal_branch(branch_regs[i].is32,ba[i]))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  // Jumping into another branch's delay slot needs a special entry stub
  if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
    ds_assemble_entry(i);
  }
  else {
    add_to_linker(out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
    emit_jmp(0);
  }
}
4697
// Write the return address into the link register for a JALR at
// instruction i.  Unlike the JAL case, the delay slot must not write the
// link register (asserted below).
static void rjump_assemble_write_ra(int i)
{
  int rt,return_address;
  assert(rt1[i+1]!=rt1[i]);
  assert(rt2[i+1]!=rt1[i]);
  rt=get_reg(branch_regs[i].regmap,rt1[i]);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  assert(rt>=0);
  return_address=start+i*4+8;
  #ifdef REG_PREFETCH
  // NOTE(review): 'temp'/'i_regmap' are not defined in this scope; this
  // path looks bit-rotted — verify before enabling REG_PREFETCH
  if(temp>=0)
  {
    if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  emit_movimm(return_address,rt); // PC into link register
  #ifdef IMM_PREFETCH
  emit_prefetch(hash_table_get(return_address));
  #endif
}
4718
// Assemble a register jump (JR/JALR) at instruction i, including its delay
// slot, writeback, cycle check and the indirect jump via jump_vaddr_reg.
void rjump_assemble(int i,struct regstat *i_regs)
{
  int temp;
  int rs,cc;
  int ra_done=0;
  rs=get_reg(branch_regs[i].regmap,rs1[i]);
  assert(rs>=0);
  if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
    // Delay slot abuse, make a copy of the branch address register
    temp=get_reg(branch_regs[i].regmap,RTEMP);
    assert(temp>=0);
    assert(regs[i].regmap[temp]==RTEMP);
    emit_mov(rs,temp);
    rs=temp;
  }
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  if(rt1[i]==31)
  {
    if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
      signed char *i_regmap=i_regs->regmap;
      int return_address=start+i*4+8;
      if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
    }
  }
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    // Returns via $ra go through the mini hash table for fast lookup
    int rh=get_reg(regs[i].regmap,RHASH);
    if(rh>=0) do_preload_rhash(rh);
  }
  #endif
  // If the delay slot reads the link register, write it before the slot
  if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    rjump_assemble_write_ra(i);
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  // Mark dead registers (plus the link register, minus the jump target
  // register) as unneeded for writeback
  uint64_t bc_unneeded=branch_regs[i].u;
  uint64_t bc_unneeded_upper=branch_regs[i].uu;
  bc_unneeded|=1|(1LL<<rt1[i]);
  bc_unneeded_upper|=1|(1LL<<rt1[i]);
  bc_unneeded&=~(1LL<<rs1[i]);
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                bc_unneeded,bc_unneeded_upper);
  load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
  if(!ra_done&&rt1[i]!=0)
    rjump_assemble_write_ra(i);
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  (void)cc;
  #ifdef USE_MINI_HT
  int rh=get_reg(branch_regs[i].regmap,RHASH);
  int ht=get_reg(branch_regs[i].regmap,RHTBL);
  if(rs1[i]==31) {
    if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
    do_preload_rhtbl(ht);
    do_rhash(rs,rh);
  }
  #endif
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
  #ifdef DESTRUCTIVE_WRITEBACK
  // Writeback may have sign-extended rs in place; reload the target address
  if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
    if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
      emit_loadreg(rs1[i],rs);
    }
  }
  #endif
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_load(ht,rh);
  }
  #endif
  //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
  //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
  //assert(adj==0);
  emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  add_stub(CC_STUB,out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
  if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
    // special case for RFE
    emit_jmp(0);
  else
    emit_jns(0);
  //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_jump(rs,rh,ht);
  }
  else
  #endif
  {
    emit_jmp(jump_vaddr_reg[rs]);
  }
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  // Align the next sequence to avoid a branch predictor quirk on Cortex-A8
  if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
  #endif
}
4818
/*
 * Assemble a MIPS conditional branch that compares two registers
 * (BEQ/BNE/BLEZ/BGTZ, opcode[i]==4..7, including the "likely" variants
 * selected via likely[i]) together with its delay slot.
 *
 * Two emission orders are used:
 *  - ooo[i] set: out-of-order — the delay slot is assembled first, then
 *    the compare/branch, so the taken path can jump straight to the
 *    target block.
 *  - ooo[i] clear: in-order — the compare is emitted first and the delay
 *    slot is assembled separately on the taken and not-taken paths
 *    (required for "likely" branches, whose delay slot only executes
 *    when the branch is taken).
 *
 * 'invert' reverses the branch sense so the not-taken case falls through
 * when the target block's register mapping does not match (match==0),
 * avoiding a jump on the fall-through path.
 *
 * i      - index of the branch instruction within the block being compiled
 * i_regs - register allocation state at instruction i
 */
void cjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  assem_debug("match=%d\n",match);
  int s1h,s1l,s2h,s2l;
  int prev_cop1_usable=cop1_usable;
  int unconditional=0,nop=0;
  int only32=0;
  int invert=0;
  int internal=internal_branch(branch_regs[i].is32,ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;
  #endif

  // Host registers holding the low (s?l) and high (s?h) 32-bit halves of
  // the two source registers; high halves are only needed for 64-bit
  // comparisons (see only32 below).
  if(ooo[i]) {
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
    s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
    s2l=get_reg(branch_regs[i].regmap,rs2[i]);
    s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
    s1h=get_reg(i_regmap,rs1[i]|64);
    s2l=get_reg(i_regmap,rs2[i]);
    s2h=get_reg(i_regmap,rs2[i]|64);
  }
  // r0 is hardwired to zero, so compares involving it can be simplified.
  if(rs1[i]==0&&rs2[i]==0)
  {
    // r0 vs r0: BEQ-type is always taken, BNE-type never taken.
    if(opcode[i]&1) nop=1;
    else unconditional=1;
    //assert(opcode[i]!=5);
    //assert(opcode[i]!=7);
    //assert(opcode[i]!=0x15);
    //assert(opcode[i]!=0x17);
  }
  else if(rs1[i]==0)
  {
    // Compare rs2 against zero instead; fold the operand into s1*.
    s1l=s2l;s1h=s2h;
    s2l=s2h=-1;
    only32=(regs[i].was32>>rs2[i])&1;
  }
  else if(rs2[i]==0)
  {
    s2l=s2h=-1;
    only32=(regs[i].was32>>rs1[i])&1;
  }
  else {
    // only32: both operands are known 32-bit, so the high-half compare
    // can be skipped entirely.
    only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
  }

  if(ooo[i]) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    uint64_t bc_unneeded=branch_regs[i].u;
    uint64_t bc_unneeded_upper=branch_regs[i].uu;
    // The compare still needs its source registers after the delay slot,
    // so clear their "unneeded" bits before writeback/invalidation.
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
    bc_unneeded|=1;
    bc_unneeded_upper|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                  bc_unneeded,bc_unneeded_upper);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    //assem_debug("cycle count (adj)\n");
    if(unconditional) {
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      // Skip emitting the jump for a branch-to-self with a nop delay slot
      // (idle loop); the cycle-count check above handles re-entry.
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nop) {
      // Branch is never taken: just update the cycle count and add a
      // stub to handle an expired count.
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      if(!only32)
      {
        // 64-bit compare: test the high halves first, fall through to
        // the low-half compare below.
        assert(s1h>=0);
        if(opcode[i]==4) // BEQ
        {
          if(s2h>=0) emit_cmp(s1h,s2h);
          else emit_test(s1h,s1h);
          nottaken1=out;
          emit_jne((void *)1l);
        }
        if(opcode[i]==5) // BNE
        {
          if(s2h>=0) emit_cmp(s1h,s2h);
          else emit_test(s1h,s1h);
          if(invert) taken=out;
          else add_to_linker(out,ba[i],internal);
          emit_jne(0);
        }
        if(opcode[i]==6) // BLEZ
        {
          emit_test(s1h,s1h);
          if(invert) taken=out;
          else add_to_linker(out,ba[i],internal);
          emit_js(0);
          nottaken1=out;
          emit_jne((void *)1l);
        }
        if(opcode[i]==7) // BGTZ
        {
          emit_test(s1h,s1h);
          nottaken1=out;
          emit_js(1);
          if(invert) taken=out;
          else add_to_linker(out,ba[i],internal);
          emit_jne(0);
        }
      } // if(!only32)

      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      assert(s1l>=0);
      if(opcode[i]==4) // BEQ
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        if(invert){
          nottaken=out;
          emit_jne((void *)1l);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jeq(0);
        }
      }
      if(opcode[i]==5) // BNE
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        if(invert){
          nottaken=out;
          emit_jeq(1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jne(0);
        }
      }
      if(opcode[i]==6) // BLEZ
      {
        // s1 <= 0  <=>  s1 < 1 (signed)
        emit_cmpimm(s1l,1);
        if(invert){
          nottaken=out;
          emit_jge(1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jl(0);
        }
      }
      if(opcode[i]==7) // BGTZ
      {
        // s1 > 0  <=>  s1 >= 1 (signed)
        emit_cmpimm(s1l,1);
        if(invert){
          nottaken=out;
          emit_jl(1);
        }else{
          add_to_linker(out,ba[i],internal);
          emit_jge(0);
        }
      }
      if(invert) {
        // Inverted sense: the taken path is emitted here inline and the
        // not-taken path falls through to the next instruction.
        if(taken) set_jump_target(taken, out);
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken, out);
      }

      if(nottaken1) set_jump_target(nottaken1, out);
      if(adj) {
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //if(likely[i]) printf("IOL\n");
    //else
    //printf("IOE\n");
    void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
    if(!unconditional&&!nop) {
      if(!only32)
      {
        assert(s1h>=0);
        if((opcode[i]&0x2f)==4) // BEQ
        {
          if(s2h>=0) emit_cmp(s1h,s2h);
          else emit_test(s1h,s1h);
          nottaken1=out;
          emit_jne((void *)2l);
        }
        if((opcode[i]&0x2f)==5) // BNE
        {
          if(s2h>=0) emit_cmp(s1h,s2h);
          else emit_test(s1h,s1h);
          taken=out;
          emit_jne((void *)1l);
        }
        if((opcode[i]&0x2f)==6) // BLEZ
        {
          emit_test(s1h,s1h);
          taken=out;
          emit_js(1);
          nottaken1=out;
          emit_jne((void *)2l);
        }
        if((opcode[i]&0x2f)==7) // BGTZ
        {
          emit_test(s1h,s1h);
          nottaken1=out;
          emit_js(2);
          taken=out;
          emit_jne((void *)1l);
        }
      } // if(!only32)

      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      assert(s1l>=0);
      if((opcode[i]&0x2f)==4) // BEQ
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        nottaken=out;
        emit_jne((void *)2l);
      }
      if((opcode[i]&0x2f)==5) // BNE
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        nottaken=out;
        emit_jeq(2);
      }
      if((opcode[i]&0x2f)==6) // BLEZ
      {
        emit_cmpimm(s1l,1);
        nottaken=out;
        emit_jge(2);
      }
      if((opcode[i]&0x2f)==7) // BGTZ
      {
        emit_cmpimm(s1l,1);
        nottaken=out;
        emit_jl(2);
      }
    } // if(!unconditional)
    int adj;
    uint64_t ds_unneeded=branch_regs[i].u;
    uint64_t ds_unneeded_upper=branch_regs[i].uu;
    // Keep the delay-slot instruction's sources live across the writeback.
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
    if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
    ds_unneeded|=1;
    ds_unneeded_upper|=1;
    // branch taken
    if(!nop) {
      if(taken) set_jump_target(taken, out);
      assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                    ds_unneeded,ds_unneeded_upper);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    cop1_usable=prev_cop1_usable;
    if(!unconditional) {
      if(nottaken1) set_jump_target(nottaken1, out);
      set_jump_target(nottaken, out);
      assem_debug("2:\n");
      if(!likely[i]) {
        // For "likely" branches the delay slot is nullified when not
        // taken, so it is only assembled on the non-likely path here.
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                      ds_unneeded,ds_unneeded_upper);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        // NULLDS: re-enter after the (nullified) delay slot for likely
        // branches; NOTTAKEN otherwise.
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}
5201
/*
 * Assemble a MIPS REGIMM conditional branch on the sign of one register
 * (BLTZ/BGEZ/BLTZAL/BGEZAL and their "likely" variants, selected via
 * opcode2[i]) together with its delay slot.  The "AL" (and-link) forms
 * also write the return address (PC+8) into r31, even when the branch
 * is not taken.
 *
 * Layout mirrors cjump_assemble: when ooo[i] is set the delay slot is
 * assembled before the compare (out-of-order), otherwise the compare is
 * emitted first and the delay slot is assembled separately on the taken
 * and not-taken paths.  'invert' flips the branch sense so that the
 * not-taken case falls through when the target's register mapping does
 * not match (match==0).
 *
 * i      - index of the branch instruction within the block being compiled
 * i_regs - register allocation state at instruction i
 */
void sjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  assem_debug("smatch=%d\n",match);
  int s1h,s1l;
  int prev_cop1_usable=cop1_usable;
  int unconditional=0,nevertaken=0;
  int only32=0;
  int invert=0;
  int internal=internal_branch(branch_regs[i].is32,ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;
  #endif

  //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
  //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)

  // Host registers holding the low/high 32-bit halves of the source
  // register; the high half is used when the value may be 64-bit.
  if(ooo[i]) {
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
    s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
    s1h=get_reg(i_regmap,rs1[i]|64);
  }
  if(rs1[i]==0)
  {
    // r0 is hardwired zero: BGEZ-type (opcode2 odd) is always taken,
    // BLTZ-type is never taken.
    if(opcode2[i]&1) unconditional=1;
    else nevertaken=1;
    // These are never taken (r0 is never less than zero)
    //assert(opcode2[i]!=0);
    //assert(opcode2[i]!=2);
    //assert(opcode2[i]!=0x10);
    //assert(opcode2[i]!=0x12);
  }
  else {
    // only32: the source is known 32-bit, so test the low half only.
    only32=(regs[i].was32>>rs1[i])&1;
  }

  if(ooo[i]) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    uint64_t bc_unneeded=branch_regs[i].u;
    uint64_t bc_unneeded_upper=branch_regs[i].uu;
    // The sign test still needs its source register after the delay slot.
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
    bc_unneeded|=1;
    bc_unneeded_upper|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                  bc_unneeded,bc_unneeded_upper);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
    if(rt1[i]==31) {
      // BxxZAL: link register update happens unconditionally.
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        if(!nevertaken) emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    assem_debug("cycle count (adj)\n");
    if(unconditional) {
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      // Skip emitting the jump for a branch-to-self with a nop delay slot
      // (idle loop); the cycle-count check handles re-entry.
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nevertaken) {
      // Branch never taken: just update the cycle count and add a stub
      // to handle an expired count.
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      void *nottaken = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      if(!only32)
      {
        // 64-bit value: the sign lives in the high half.
        assert(s1h>=0);
        if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
        {
          emit_test(s1h,s1h);
          if(invert){
            nottaken=out;
            emit_jns(1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_js(0);
          }
        }
        if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
        {
          emit_test(s1h,s1h);
          if(invert){
            nottaken=out;
            emit_js(1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_jns(0);
          }
        }
      } // if(!only32)
      else
      {
        assert(s1l>=0);
        if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_jns(1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_js(0);
          }
        }
        if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_js(1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_jns(0);
          }
        }
      } // if(!only32)

      if(invert) {
        // Inverted sense: emit the taken path inline, not-taken falls
        // through to the next instruction.
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken, out);
      }

      if(adj) {
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //printf("IOE\n");
    void *nottaken = NULL;
    if(rt1[i]==31) {
      // BxxZAL: link register update happens unconditionally.
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    if(!unconditional) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      if(!only32)
      {
        assert(s1h>=0);
        if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
        {
          emit_test(s1h,s1h);
          nottaken=out;
          emit_jns(1);
        }
        if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
        {
          emit_test(s1h,s1h);
          nottaken=out;
          emit_js(1);
        }
      } // if(!only32)
      else
      {
        assert(s1l>=0);
        if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_jns(1);
        }
        if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_js(1);
        }
      }
    } // if(!unconditional)
    int adj;
    uint64_t ds_unneeded=branch_regs[i].u;
    uint64_t ds_unneeded_upper=branch_regs[i].uu;
    // Keep the delay-slot instruction's sources live across the writeback.
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
    if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
    ds_unneeded|=1;
    ds_unneeded_upper|=1;
    // branch taken
    if(!nevertaken) {
      //assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                    ds_unneeded,ds_unneeded_upper);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    cop1_usable=prev_cop1_usable;
    if(!unconditional) {
      set_jump_target(nottaken, out);
      // NOTE(review): debug label "1:" below; cjump_assemble uses "2:"
      // for its not-taken path — debug output only, behavior unaffected.
      assem_debug("1:\n");
      if(!likely[i]) {
        // For "likely" branches the delay slot is nullified when not
        // taken, so it is only assembled on the non-likely path here.
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                      ds_unneeded,ds_unneeded_upper);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        // NULLDS: re-enter after the (nullified) delay slot for likely
        // branches; NOTTAKEN otherwise.
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}
5529
5530 void fjump_assemble(int i,struct regstat *i_regs)
5531 {
5532   signed char *i_regmap=i_regs->regmap;
5533   int cc;
5534   int match;
5535   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5536   assem_debug("fmatch=%d\n",match);
5537   int fs,cs;
5538   void *eaddr;
5539   int invert=0;
5540   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5541   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5542   if(!match) invert=1;
5543   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5544   if(i>(ba[i]-start)>>2) invert=1;
5545   #endif
5546
5547   if(ooo[i]) {
5548     fs=get_reg(branch_regs[i].regmap,FSREG);
5549     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5550   }
5551   else {
5552     fs=get_reg(i_regmap,FSREG);
5553   }
5554
5555   // Check cop1 unusable
5556   if(!cop1_usable) {
5557     cs=get_reg(i_regmap,CSREG);
5558     assert(cs>=0);
5559     emit_testimm(cs,0x20000000);
5560     eaddr=out;
5561     emit_jeq(0);
5562     add_stub_r(FP_STUB,eaddr,out,i,cs,i_regs,0,0);
5563     cop1_usable=1;
5564   }
5565
5566   if(ooo[i]) {
5567     // Out of order execution (delay slot first)
5568     //printf("OOOE\n");
5569     ds_assemble(i+1,i_regs);
5570     int adj;
5571     uint64_t bc_unneeded=branch_regs[i].u;
5572     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5573     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5574     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5575     bc_unneeded|=1;
5576     bc_unneeded_upper|=1;
5577     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5578                   bc_unneeded,bc_unneeded_upper);
5579     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5580     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5581     cc=get_reg(branch_regs[i].regmap,CCREG);
5582     assert(cc==HOST_CCREG);
5583     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5584     assem_debug("cycle count (adj)\n");
5585     if(1) {
5586       void *nottaken = NULL;
5587       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5588       if(1) {
5589         assert(fs>=0);
5590         emit_testimm(fs,0x800000);
5591         if(source[i]&0x10000) // BC1T
5592         {
5593           if(invert){
5594             nottaken=out;
5595             emit_jeq(1);
5596           }else{
5597             add_to_linker(out,ba[i],internal);
5598             emit_jne(0);
5599           }
5600         }
5601         else // BC1F
5602           if(invert){
5603             nottaken=out;
5604             emit_jne((void *)1l);
5605           }else{
5606             add_to_linker(out,ba[i],internal);
5607             emit_jeq(0);
5608           }
5609         {
5610         }
5611       } // if(!only32)
5612
5613       if(invert) {
5614         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5615         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5616         else if(match) emit_addnop(13);
5617         #endif
5618         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5619         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5620         if(internal)
5621           assem_debug("branch: internal\n");
5622         else
5623           assem_debug("branch: external\n");
5624         if(internal&&is_ds[(ba[i]-start)>>2]) {
5625           ds_assemble_entry(i);
5626         }
5627         else {
5628           add_to_linker(out,ba[i],internal);
5629           emit_jmp(0);
5630         }
5631         set_jump_target(nottaken, out);
5632       }
5633
5634       if(adj) {
5635         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5636       }
5637     } // (!unconditional)
5638   } // if(ooo)
5639   else
5640   {
5641     // In-order execution (branch first)
5642     //printf("IOE\n");
5643     void *nottaken = NULL;
5644     if(1) {
5645       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5646       if(1) {
5647         assert(fs>=0);
5648         emit_testimm(fs,0x800000);
5649         if(source[i]&0x10000) // BC1T
5650         {
5651           nottaken=out;
5652           emit_jeq(1);
5653         }
5654         else // BC1F
5655         {
5656           nottaken=out;
5657           emit_jne((void *)1l);
5658         }
5659       }
5660     } // if(!unconditional)
5661     int adj;
5662     uint64_t ds_unneeded=branch_regs[i].u;
5663     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5664     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5665     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5666     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5667     ds_unneeded|=1;
5668     ds_unneeded_upper|=1;
5669     // branch taken
5670     //assem_debug("1:\n");
5671     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5672                   ds_unneeded,ds_unneeded_upper);
5673     // load regs
5674     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5675     address_generation(i+1,&branch_regs[i],0);
5676     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5677     ds_assemble(i+1,&branch_regs[i]);
5678     cc=get_reg(branch_regs[i].regmap,CCREG);
5679     if(cc==-1) {
5680       emit_loadreg(CCREG,cc=HOST_CCREG);
5681       // CHECK: Is the following instruction (fall thru) allocated ok?
5682     }
5683     assert(cc==HOST_CCREG);
5684     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5685     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5686     assem_debug("cycle count (adj)\n");
5687     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5688     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5689     if(internal)
5690       assem_debug("branch: internal\n");
5691     else
5692       assem_debug("branch: external\n");
5693     if(internal&&is_ds[(ba[i]-start)>>2]) {
5694       ds_assemble_entry(i);
5695     }
5696     else {
5697       add_to_linker(out,ba[i],internal);
5698       emit_jmp(0);
5699     }
5700
5701     // branch not taken
5702     if(1) { // <- FIXME (don't need this)
5703       set_jump_target(nottaken, out);
5704       assem_debug("1:\n");
5705       if(!likely[i]) {
5706         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5707                       ds_unneeded,ds_unneeded_upper);
5708         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5709         address_generation(i+1,&branch_regs[i],0);
5710         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5711         ds_assemble(i+1,&branch_regs[i]);
5712       }
5713       cc=get_reg(branch_regs[i].regmap,CCREG);
5714       if(cc==-1&&!likely[i]) {
5715         // Cycle count isn't in a register, temporarily load it then write it out
5716         emit_loadreg(CCREG,HOST_CCREG);
5717         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5718         void *jaddr=out;
5719         emit_jns(0);
5720         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5721         emit_storereg(CCREG,HOST_CCREG);
5722       }
5723       else{
5724         cc=get_reg(i_regmap,CCREG);
5725         assert(cc==HOST_CCREG);
5726         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5727         void *jaddr=out;
5728         emit_jns(0);
5729         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5730       }
5731     }
5732   }
5733 }
5734
// Assemble a branch instruction that is the last word before a page
// boundary, so its delay slot falls in the next page.  Instead of
// assembling the delay slot here, the branch target is computed into
// HOST_BTREG and control transfers to a stub for address start+i*4+5 --
// the odd address marks a delay-slot entry point (see pagespan_ds, which
// is entered with vaddr=start+1).
static void pagespan_assemble(int i,struct regstat *i_regs)
{
  // Host registers holding the low (l) and high (h, reg|64) halves of
  // the two source operands; -1 means not allocated.
  int s1l=get_reg(i_regs->regmap,rs1[i]);
  int s1h=get_reg(i_regs->regmap,rs1[i]|64);
  int s2l=get_reg(i_regs->regmap,rs2[i]);
  int s2h=get_reg(i_regs->regmap,rs2[i]|64);
  void *taken = NULL;
  void *nottaken = NULL;
  int unconditional=0;
  if(rs1[i]==0)
  {
    // rs1 is $zero: swap operands so the (possibly) non-zero one is first
    s1l=s2l;s1h=s2h;
    s2l=s2h=-1;
  }
  else if(rs2[i]==0)
  {
    // rs2 is $zero: compare against zero (emit_test paths below)
    s2l=s2h=-1;
  }
  if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
    // Both operands known 32-bit: skip the upper-half comparison
    s1h=s2h=-1;
  }
  // Pick scratch host registers (addr, alt, and ntaddr for BLEZ/BGTZ)
  // that avoid EXCLUDE_REG, the cycle counter, the branch-target
  // register, and both source operands.
  int hr=0;
  int addr=-1,alt=-1,ntaddr=-1;
  if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
  else {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        addr=hr++;break;
      }
      hr++;
    }
  }
  while(hr<HOST_REGS)
  {
    if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
       (i_regs->regmap[hr]&63)!=rs1[i] &&
       (i_regs->regmap[hr]&63)!=rs2[i] )
    {
      alt=hr++;break;
    }
    hr++;
  }
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
  {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        ntaddr=hr;break;
      }
      hr++;
    }
  }
  assert(hr<HOST_REGS);
  if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
    load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
  }
  // Charge the cycle count for this instruction and its delay slot
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  if(opcode[i]==2) // J
  {
    unconditional=1;
  }
  if(opcode[i]==3) // JAL
  {
    // TODO: mini_ht
    int rt=get_reg(i_regs->regmap,31);
    emit_movimm(start+i*4+8,rt);  // return address = after the delay slot
    unconditional=1;
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    emit_mov(s1l,addr);  // target comes from the source register
    if(opcode2[i]==9) // JALR
    {
      int rt=get_reg(i_regs->regmap,rt1[i]);
      emit_movimm(start+i*4+8,rt);
    }
  }
  // For the conditional branches below, the scratch registers are first
  // loaded with both possible targets (taken=ba[i], not-taken=start+i*4+8)
  // and conditional moves select the final target into addr.
  if((opcode[i]&0x3f)==4) // BEQ
  {
    if(rs1[i]==rs2[i])
    {
      // BEQ with identical registers is always taken
      unconditional=1;
    }
    else
    #ifdef HAVE_CMOV_IMM
    if(s1h<0) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
    }
    else
    #endif
    {
      assert(s1l>=0);
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      if(s1h>=0) {
        // Compare upper halves first; any mismatch forces not-taken
        if(s2h>=0) emit_cmp(s1h,s2h);
        else emit_test(s1h,s1h);
        emit_cmovne_reg(alt,addr);
      }
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==5) // BNE
  {
    #ifdef HAVE_CMOV_IMM
    if(s1h<0) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
    }
    else
    #endif
    {
      assert(s1l>=0);
      // Same as BEQ but with taken/not-taken targets swapped
      emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
      if(s1h>=0) {
        if(s2h>=0) emit_cmp(s1h,s2h);
        else emit_test(s1h,s1h);
        emit_cmovne_reg(alt,addr);
      }
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==0x14) // BEQL
  {
    // Branch-likely: use conditional jumps (nottaken patched later)
    // since the delay slot is skipped on the not-taken path
    if(s1h>=0) {
      if(s2h>=0) emit_cmp(s1h,s2h);
      else emit_test(s1h,s1h);
      nottaken=out;
      emit_jne(0);
    }
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    if(nottaken) set_jump_target(nottaken, out);
    nottaken=out;
    emit_jne(0);
  }
  if((opcode[i]&0x3f)==0x15) // BNEL
  {
    if(s1h>=0) {
      if(s2h>=0) emit_cmp(s1h,s2h);
      else emit_test(s1h,s1h);
      taken=out;
      emit_jne(0);
    }
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    nottaken=out;
    emit_jeq(0);
    if(taken) set_jump_target(taken, out);
  }
  if((opcode[i]&0x3f)==6) // BLEZ
  {
    // Taken when rs <= 0: cmpimm(s1l,1) makes "less than" mean s1l<=0
    emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
    emit_cmpimm(s1l,1);
    if(s1h>=0) emit_mov(addr,ntaddr);
    emit_cmovl_reg(alt,addr);
    if(s1h>=0) {
      // 64-bit case: upper half decides sign / non-zero override
      emit_test(s1h,s1h);
      emit_cmovne_reg(ntaddr,addr);
      emit_cmovs_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==7) // BGTZ
  {
    emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
    emit_cmpimm(s1l,1);
    if(s1h>=0) emit_mov(addr,alt);
    emit_cmovl_reg(ntaddr,addr);
    if(s1h>=0) {
      emit_test(s1h,s1h);
      emit_cmovne_reg(alt,addr);
      emit_cmovs_reg(ntaddr,addr);
    }
  }
  // Branch-likely BLEZL/BGTZL are not expected to span a page here
  if((opcode[i]&0x3f)==0x16) // BLEZL
  {
    assert((opcode[i]&0x3f)!=0x16);
  }
  if((opcode[i]&0x3f)==0x17) // BGTZL
  {
    assert((opcode[i]&0x3f)!=0x17);
  }
  assert(opcode[i]!=1); // BLTZ/BGEZ

  //FIXME: Check CSREG
  if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
    // COP1 branches: test the FP condition bit (0x800000) in s1l
    if((source[i]&0x30000)==0) // BC1F
    {
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x10000) // BC1T
    {
      emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x20000) // BC1FL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jne(0);
    }
    if((source[i]&0x30000)==0x30000) // BC1TL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jeq(0);
    }
  }

  assert(i_regs->regmap[HOST_CCREG]==CCREG);
  // Write back dirty registers, then put the selected target in BTREG
  wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
  if(likely[i]||unconditional)
  {
    // Likely/unconditional: only the taken target is reached here
    emit_movimm(ba[i],HOST_BTREG);
  }
  else if(addr!=HOST_BTREG)
  {
    emit_mov(addr,HOST_BTREG);
  }
  // Jump to the delay-slot entry stub; +5 = delay slot address (start+
  // i*4+4) plus 1, matching the start+1 entry used by pagespan_ds
  void *branch_addr=out;
  emit_jmp(0);
  int target_addr=start+i*4+5;
  void *stub=out;
  void *compiled_target_addr=check_addr(target_addr);
  emit_extjump_ds(branch_addr, target_addr);
  if(compiled_target_addr) {
    // Target already compiled: link directly and register the link
    set_jump_target(branch_addr, compiled_target_addr);
    add_link(target_addr,stub);
  }
  else set_jump_target(branch_addr, stub);
  if(likely[i]) {
    // Not-taken path of a branch-likely: skip the delay slot and go
    // straight to the instruction after it (start+i*4+8)
    set_jump_target(nottaken, out);
    wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
    void *branch_addr=out;
    emit_jmp(0);
    int target_addr=start+i*4+8;
    void *stub=out;
    void *compiled_target_addr=check_addr(target_addr);
    emit_extjump_ds(branch_addr, target_addr);
    if(compiled_target_addr) {
      set_jump_target(branch_addr, compiled_target_addr);
      add_link(target_addr,stub);
    }
    else set_jump_target(branch_addr, stub);
  }
}
5998
// Assemble the delay slot for the above (pagespan_assemble).
// This block is entered via the odd address start+1: it assembles
// instruction 0 of the block as a delay slot, then dispatches to the
// branch target held in BTREG (or falls through to start+4 when the
// target is simply the next instruction).
static void pagespan_ds()
{
  assem_debug("initial delay slot:\n");
  u_int vaddr=start+1;  // odd address marks a delay-slot entry point
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  // Register this entry in the dirty and jump-in lookup lists
  ll_add(jump_dirty+vpage,vaddr,(void *)out);
  do_dirty_stub_ds();
  ll_add(jump_in+page,vaddr,(void *)out);
  assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
  if(regs[0].regmap[HOST_BTREG]!=BTREG)
    // Branch target won't stay in HOST_BTREG: spill it to memory
    emit_writeword(HOST_BTREG,&branch_target);
  load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
  address_generation(0,&regs[0],regs[0].regmap_entry);
  // Stores (and SWC1/SWC2-style opcodes) need the invalid-code pointer
  if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
    load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
  cop1_usable=0;
  is_delayslot=0;
  // Assemble instruction 0 according to its decoded type
  switch(itype[0]) {
    case ALU:
      alu_assemble(0,&regs[0]);break;
    case IMM16:
      imm16_assemble(0,&regs[0]);break;
    case SHIFT:
      shift_assemble(0,&regs[0]);break;
    case SHIFTIMM:
      shiftimm_assemble(0,&regs[0]);break;
    case LOAD:
      load_assemble(0,&regs[0]);break;
    case LOADLR:
      loadlr_assemble(0,&regs[0]);break;
    case STORE:
      store_assemble(0,&regs[0]);break;
    case STORELR:
      storelr_assemble(0,&regs[0]);break;
    case COP0:
      cop0_assemble(0,&regs[0]);break;
    case COP1:
      cop1_assemble(0,&regs[0]);break;
    case C1LS:
      c1ls_assemble(0,&regs[0]);break;
    case COP2:
      cop2_assemble(0,&regs[0]);break;
    case C2LS:
      c2ls_assemble(0,&regs[0]);break;
    case C2OP:
      c2op_assemble(0,&regs[0]);break;
    case FCONV:
      fconv_assemble(0,&regs[0]);break;
    case FLOAT:
      float_assemble(0,&regs[0]);break;
    case FCOMP:
      fcomp_assemble(0,&regs[0]);break;
    case MULTDIV:
      multdiv_assemble(0,&regs[0]);break;
    case MOV:
      mov_assemble(0,&regs[0]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
    case FJUMP:
      // A branch in a delay slot is architecturally undefined on MIPS
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Fetch the branch target: from its host register if mapped, else
  // reload it from the spilled branch_target word
  int btaddr=get_reg(regs[0].regmap,BTREG);
  if(btaddr<0) {
    btaddr=get_reg(regs[0].regmap,-1);  // a register with no mapped value
    emit_readword(&branch_target,btaddr);
  }
  assert(btaddr!=HOST_CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
#ifdef HOST_IMM8
  // Host immediates limited to 8 bits: stage the constant in a temp
  emit_movimm(start+4,HOST_TEMPREG);
  emit_cmp(btaddr,HOST_TEMPREG);
#else
  emit_cmpimm(btaddr,start+4);
#endif
  void *branch = out;
  emit_jeq(0);
  // Target != start+4: flush registers and jump indirectly via the
  // per-register jump_vaddr dispatcher
  store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
  emit_jmp(jump_vaddr_reg[btaddr]);
  set_jump_target(branch, out);
  // Target == start+4: fall through into the next instruction
  store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
  load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
}
6091
// Basic liveness analysis for MIPS registers.
// Walks the instructions backwards from iend to istart, computing for
// each instruction the set of registers whose current value is dead
// ("unneeded") at that point:
//   u     - low 32 bits of each register (bit N = register N)
//   uu    - upper 32-bit halves (64-bit tracking)
//   gte_u - GTE/COP2 registers (gte_rt/gte_rs bitmasks)
// Results are stored in unneeded_reg[], unneeded_reg_upper[],
// gte_unneeded[] and, at branches, branch_unneeded_reg[] /
// branch_unneeded_reg_upper[].  Bit 0 (r0) is always unneeded.
// r is the recursion depth, used to bound re-analysis of backward
// branches (loops) to three levels.
void unneeded_registers(int istart,int iend,int r)
{
  int i;
  uint64_t u,uu,gte_u,b,bu,gte_bu;
  uint64_t temp_u,temp_uu,temp_gte_u=0;
  uint64_t tdep;
  uint64_t gte_u_unknown=0;
  if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
    gte_u_unknown=~0ll;  // hack: treat all GTE regs as unneeded at block exits
  if(iend==slen-1) {
    // End of block: conservatively only r0 is unneeded
    u=1;uu=1;
    gte_u=gte_u_unknown;
  }else{
    u=unneeded_reg[iend+1];
    uu=unneeded_reg_upper[iend+1];
    // NOTE(review): this overrides the values just read above, so only
    // r0 (bit 0) is treated as unneeded here -- the propagated sets are
    // discarded (conservative); only gte_u keeps the propagated value
    u=1;uu=1;
    gte_u=gte_unneeded[iend+1];
  }

  for (i=iend;i>=istart;i--)
  {
    //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
    {
      // If subroutine call, flag return address as a possible branch target
      if(rt1[i]==31 && i<slen-2) bt[i+2]=1;

      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, flush all regs
        u=1;
        uu=1;
        gte_u=gte_u_unknown;
        /* Hexagon hack
        if(itype[i]==UJUMP&&rt1[i]==31)
        {
          uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
        }
        if(itype[i]==RJUMP&&rs1[i]==31)
        {
          uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
        }
        if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
          if(itype[i]==UJUMP&&rt1[i]==31)
          {
            //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
            uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
          }
          if(itype[i]==RJUMP&&rs1[i]==31)
          {
            //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
            uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
          }
        }*/
        branch_unneeded_reg[i]=u;
        branch_unneeded_reg_upper[i]=uu;
        // Merge in delay slot: its destinations become unneeded before
        // it runs, its sources become needed
        tdep=(~uu>>rt1[i+1])&1;
        u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
        uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
        u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
        uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
        uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
        u|=1;uu|=1;
        gte_u|=gte_rt[i+1];
        gte_u&=~gte_rs[i+1];
        // If branch is "likely" (and conditional)
        // then we skip the delay slot on the fall-thru path
        if(likely[i]) {
          if(i<slen-1) {
            u&=unneeded_reg[i+2];
            uu&=unneeded_reg_upper[i+2];
            gte_u&=gte_unneeded[i+2];
          }
          else
          {
            u=1;
            uu=1;
            gte_u=gte_u_unknown;
          }
        }
      }
      else
      {
        // Internal branch, flag target
        bt[(ba[i]-start)>>2]=1;
        if(ba[i]<=start+i*4) {
          // Backward branch: compute liveness along the loop body in
          // temp_* without disturbing the fall-through analysis
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            temp_u=1;temp_uu=1;
            temp_gte_u=0;
          } else {
            // Conditional branch (not taken case)
            temp_u=unneeded_reg[i+2];
            temp_uu=unneeded_reg_upper[i+2];
            temp_gte_u&=gte_unneeded[i+2];
          }
          // Merge in delay slot
          tdep=(~temp_uu>>rt1[i+1])&1;
          temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
          temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
          temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
          temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
          temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
          temp_u|=1;temp_uu|=1;
          temp_gte_u|=gte_rt[i+1];
          temp_gte_u&=~gte_rs[i+1];
          // If branch is "likely" (and conditional)
          // then we skip the delay slot on the fall-thru path
          if(likely[i]) {
            if(i<slen-1) {
              temp_u&=unneeded_reg[i+2];
              temp_uu&=unneeded_reg_upper[i+2];
              temp_gte_u&=gte_unneeded[i+2];
            }
            else
            {
              temp_u=1;
              temp_uu=1;
              temp_gte_u=gte_u_unknown;
            }
          }
          // Merge in the branch instruction itself
          tdep=(~temp_uu>>rt1[i])&1;
          temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
          temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
          temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
          temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
          temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
          temp_u|=1;temp_uu|=1;
          temp_gte_u|=gte_rt[i];
          temp_gte_u&=~gte_rs[i];
          unneeded_reg[i]=temp_u;
          unneeded_reg_upper[i]=temp_uu;
          gte_unneeded[i]=temp_gte_u;
          // Only go three levels deep.  This recursion can take an
          // excessive amount of time if there are a lot of nested loops.
          if(r<2) {
            unneeded_registers((ba[i]-start)>>2,i-1,r+1);
          }else{
            // Recursion limit reached: assume nothing is unneeded at
            // the loop head except r0
            unneeded_reg[(ba[i]-start)>>2]=1;
            unneeded_reg_upper[(ba[i]-start)>>2]=1;
            gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
          }
        } /*else*/ if(1) {
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            u=unneeded_reg[(ba[i]-start)>>2];
            uu=unneeded_reg_upper[(ba[i]-start)>>2];
            gte_u=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=u;
            branch_unneeded_reg_upper[i]=uu;
        //u=1;
        //uu=1;
        //branch_unneeded_reg[i]=u;
        //branch_unneeded_reg_upper[i]=uu;
            // Merge in delay slot
            tdep=(~uu>>rt1[i+1])&1;
            u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
            uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
            uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
            u|=1;uu|=1;
            gte_u|=gte_rt[i+1];
            gte_u&=~gte_rs[i+1];
          } else {
            // Conditional branch: b/bu hold the taken-path sets
            b=unneeded_reg[(ba[i]-start)>>2];
            bu=unneeded_reg_upper[(ba[i]-start)>>2];
            gte_bu=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=b;
            branch_unneeded_reg_upper[i]=bu;
        //b=1;
        //bu=1;
        //branch_unneeded_reg[i]=b;
        //branch_unneeded_reg_upper[i]=bu;
            // Branch delay slot
            tdep=(~uu>>rt1[i+1])&1;
            b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
            bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
            bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
            b|=1;bu|=1;
            gte_bu|=gte_rt[i+1];
            gte_bu&=~gte_rs[i+1];
            // If branch is "likely" then we skip the
            // delay slot on the fall-thru path
            if(likely[i]) {
              u=b;
              uu=bu;
              gte_u=gte_bu;
              if(i<slen-1) {
                u&=unneeded_reg[i+2];
                uu&=unneeded_reg_upper[i+2];
                gte_u&=gte_unneeded[i+2];
        //u=1;
        //uu=1;
              }
            } else {
              // A register is unneeded only if dead on BOTH paths
              u&=b;
              uu&=bu;
              gte_u&=gte_bu;
        //u=1;
        //uu=1;
            }
            if(i<slen-1) {
              branch_unneeded_reg[i]&=unneeded_reg[i+2];
              branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
        //branch_unneeded_reg[i]=1;
        //branch_unneeded_reg_upper[i]=1;
            } else {
              branch_unneeded_reg[i]=1;
              branch_unneeded_reg_upper[i]=1;
            }
          }
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      u=1;
      uu=1;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      u=1;
      uu=1;
    }
    //u=uu=1; // DEBUG
    tdep=(~uu>>rt1[i])&1;
    // Written registers are unneeded
    u|=1LL<<rt1[i];
    u|=1LL<<rt2[i];
    uu|=1LL<<rt1[i];
    uu|=1LL<<rt2[i];
    gte_u|=gte_rt[i];
    // Accessed registers are needed
    u&=~(1LL<<rs1[i]);
    u&=~(1LL<<rs2[i]);
    uu&=~(1LL<<us1[i]);
    uu&=~(1LL<<us2[i]);
    gte_u&=~gte_rs[i];
    if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
      gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
    // Source-target dependencies
    uu&=~(tdep<<dep1[i]);
    uu&=~(tdep<<dep2[i]);
    // R0 is always unneeded
    u|=1;uu|=1;
    // Save it
    unneeded_reg[i]=u;
    unneeded_reg_upper[i]=uu;
    gte_unneeded[i]=gte_u;
    /*
    printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
    printf("U:");
    int r;
    for(r=1;r<=CCREG;r++) {
      if((unneeded_reg[i]>>r)&1) {
        if(r==HIREG) printf(" HI");
        else if(r==LOREG) printf(" LO");
        else printf(" r%d",r);
      }
    }
    printf(" UU:");
    for(r=1;r<=CCREG;r++) {
      if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
        if(r==HIREG) printf(" HI");
        else if(r==LOREG) printf(" LO");
        else printf(" r%d",r);
      }
    }
    printf("\n");*/
  }
  // Force every upper-half set to all-ones (-1LL = every bit set),
  // i.e. no 64-bit upper halves are ever considered live.
  // NOTE(review): presumably because the PSX R3000A is a 32-bit CPU,
  // making upper-half tracking irrelevant -- confirm before reusing
  // this analysis for a 64-bit target.
  for (i=iend;i>=istart;i--)
  {
    unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
  }
}
6378
6379 // Write back dirty registers as soon as we will no longer modify them,
6380 // so that we don't end up with lots of writes at the branches.
6381 void clean_registers(int istart,int iend,int wr)
6382 {
6383   int i;
6384   int r;
6385   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6386   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6387   if(iend==slen-1) {
6388     will_dirty_i=will_dirty_next=0;
6389     wont_dirty_i=wont_dirty_next=0;
6390   }else{
6391     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6392     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6393   }
6394   for (i=iend;i>=istart;i--)
6395   {
6396     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6397     {
6398       if(ba[i]<start || ba[i]>=(start+slen*4))
6399       {
6400         // Branch out of this block, flush all regs
6401         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6402         {
6403           // Unconditional branch
6404           will_dirty_i=0;
6405           wont_dirty_i=0;
6406           // Merge in delay slot (will dirty)
6407           for(r=0;r<HOST_REGS;r++) {
6408             if(r!=EXCLUDE_REG) {
6409               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6410               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6411               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6412               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6413               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6414               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6415               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6416               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6417               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6418               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6419               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6420               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6421               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6422               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6423             }
6424           }
6425         }
6426         else
6427         {
6428           // Conditional branch
6429           will_dirty_i=0;
6430           wont_dirty_i=wont_dirty_next;
6431           // Merge in delay slot (will dirty)
6432           for(r=0;r<HOST_REGS;r++) {
6433             if(r!=EXCLUDE_REG) {
6434               if(!likely[i]) {
6435                 // Might not dirty if likely branch is not taken
6436                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6437                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6438                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6439                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6440                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6441                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6442                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6443                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6444                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6445                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6446                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6447                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6448                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6449                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6450               }
6451             }
6452           }
6453         }
6454         // Merge in delay slot (wont dirty)
6455         for(r=0;r<HOST_REGS;r++) {
6456           if(r!=EXCLUDE_REG) {
6457             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6458             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6459             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6460             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6461             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6462             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6463             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6464             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6465             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6466             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6467           }
6468         }
6469         if(wr) {
6470           #ifndef DESTRUCTIVE_WRITEBACK
6471           branch_regs[i].dirty&=wont_dirty_i;
6472           #endif
6473           branch_regs[i].dirty|=will_dirty_i;
6474         }
6475       }
6476       else
6477       {
6478         // Internal branch
6479         if(ba[i]<=start+i*4) {
6480           // Backward branch
6481           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6482           {
6483             // Unconditional branch
6484             temp_will_dirty=0;
6485             temp_wont_dirty=0;
6486             // Merge in delay slot (will dirty)
6487             for(r=0;r<HOST_REGS;r++) {
6488               if(r!=EXCLUDE_REG) {
6489                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6490                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6491                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6492                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6493                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6494                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6495                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6496                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6497                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6498                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6499                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6500                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6501                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6502                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6503               }
6504             }
6505           } else {
6506             // Conditional branch (not taken case)
6507             temp_will_dirty=will_dirty_next;
6508             temp_wont_dirty=wont_dirty_next;
6509             // Merge in delay slot (will dirty)
6510             for(r=0;r<HOST_REGS;r++) {
6511               if(r!=EXCLUDE_REG) {
6512                 if(!likely[i]) {
6513                   // Will not dirty if likely branch is not taken
6514                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6515                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6516                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6517                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6518                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6519                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6520                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6521                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6522                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6523                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6524                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6525                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6526                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6527                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6528                 }
6529               }
6530             }
6531           }
6532           // Merge in delay slot (wont dirty)
6533           for(r=0;r<HOST_REGS;r++) {
6534             if(r!=EXCLUDE_REG) {
6535               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6536               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6537               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6538               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6539               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6540               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6541               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6542               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6543               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6544               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6545             }
6546           }
6547           // Deal with changed mappings
6548           if(i<iend) {
6549             for(r=0;r<HOST_REGS;r++) {
6550               if(r!=EXCLUDE_REG) {
6551                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6552                   temp_will_dirty&=~(1<<r);
6553                   temp_wont_dirty&=~(1<<r);
6554                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6555                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6556                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6557                   } else {
6558                     temp_will_dirty|=1<<r;
6559                     temp_wont_dirty|=1<<r;
6560                   }
6561                 }
6562               }
6563             }
6564           }
6565           if(wr) {
6566             will_dirty[i]=temp_will_dirty;
6567             wont_dirty[i]=temp_wont_dirty;
6568             clean_registers((ba[i]-start)>>2,i-1,0);
6569           }else{
6570             // Limit recursion.  It can take an excessive amount
6571             // of time if there are a lot of nested loops.
6572             will_dirty[(ba[i]-start)>>2]=0;
6573             wont_dirty[(ba[i]-start)>>2]=-1;
6574           }
6575         }
6576         /*else*/ if(1)
6577         {
6578           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6579           {
6580             // Unconditional branch
6581             will_dirty_i=0;
6582             wont_dirty_i=0;
6583           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6584             for(r=0;r<HOST_REGS;r++) {
6585               if(r!=EXCLUDE_REG) {
6586                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6587                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6588                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6589                 }
6590                 if(branch_regs[i].regmap[r]>=0) {
6591                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6592                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6593                 }
6594               }
6595             }
6596           //}
6597             // Merge in delay slot
6598             for(r=0;r<HOST_REGS;r++) {
6599               if(r!=EXCLUDE_REG) {
6600                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6601                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6602                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6603                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6604                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6605                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6606                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6607                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6608                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6609                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6610                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6611                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6612                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6613                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6614               }
6615             }
6616           } else {
6617             // Conditional branch
6618             will_dirty_i=will_dirty_next;
6619             wont_dirty_i=wont_dirty_next;
6620           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6621             for(r=0;r<HOST_REGS;r++) {
6622               if(r!=EXCLUDE_REG) {
6623                 signed char target_reg=branch_regs[i].regmap[r];
6624                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6625                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6626                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6627                 }
6628                 else if(target_reg>=0) {
6629                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6630                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6631                 }
6632                 // Treat delay slot as part of branch too
6633                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6634                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6635                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6636                 }
6637                 else
6638                 {
6639                   will_dirty[i+1]&=~(1<<r);
6640                 }*/
6641               }
6642             }
6643           //}
6644             // Merge in delay slot
6645             for(r=0;r<HOST_REGS;r++) {
6646               if(r!=EXCLUDE_REG) {
6647                 if(!likely[i]) {
6648                   // Might not dirty if likely branch is not taken
6649                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6650                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6651                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6652                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6653                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6654                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6655                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6656                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6657                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6658                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6659                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6660                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6661                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6662                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6663                 }
6664               }
6665             }
6666           }
6667           // Merge in delay slot (won't dirty)
6668           for(r=0;r<HOST_REGS;r++) {
6669             if(r!=EXCLUDE_REG) {
6670               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6671               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6672               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6673               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6674               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6675               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6676               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6677               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6678               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6679               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6680             }
6681           }
6682           if(wr) {
6683             #ifndef DESTRUCTIVE_WRITEBACK
6684             branch_regs[i].dirty&=wont_dirty_i;
6685             #endif
6686             branch_regs[i].dirty|=will_dirty_i;
6687           }
6688         }
6689       }
6690     }
6691     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6692     {
6693       // SYSCALL instruction (software interrupt)
6694       will_dirty_i=0;
6695       wont_dirty_i=0;
6696     }
6697     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6698     {
6699       // ERET instruction (return from interrupt)
6700       will_dirty_i=0;
6701       wont_dirty_i=0;
6702     }
6703     will_dirty_next=will_dirty_i;
6704     wont_dirty_next=wont_dirty_i;
6705     for(r=0;r<HOST_REGS;r++) {
6706       if(r!=EXCLUDE_REG) {
6707         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6708         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6709         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6710         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6711         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6712         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6713         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6714         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6715         if(i>istart) {
6716           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
6717           {
6718             // Don't store a register immediately after writing it,
6719             // may prevent dual-issue.
6720             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6721             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6722           }
6723         }
6724       }
6725     }
6726     // Save it
6727     will_dirty[i]=will_dirty_i;
6728     wont_dirty[i]=wont_dirty_i;
6729     // Mark registers that won't be dirtied as not dirty
6730     if(wr) {
6731       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6732       for(r=0;r<HOST_REGS;r++) {
6733         if((will_dirty_i>>r)&1) {
6734           printf(" r%d",r);
6735         }
6736       }
6737       printf("\n");*/
6738
6739       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
6740         regs[i].dirty|=will_dirty_i;
6741         #ifndef DESTRUCTIVE_WRITEBACK
6742         regs[i].dirty&=wont_dirty_i;
6743         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6744         {
6745           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
6746             for(r=0;r<HOST_REGS;r++) {
6747               if(r!=EXCLUDE_REG) {
6748                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6749                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6750                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6751               }
6752             }
6753           }
6754         }
6755         else
6756         {
6757           if(i<iend) {
6758             for(r=0;r<HOST_REGS;r++) {
6759               if(r!=EXCLUDE_REG) {
6760                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6761                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6762                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6763               }
6764             }
6765           }
6766         }
6767         #endif
6768       //}
6769     }
6770     // Deal with changed mappings
6771     temp_will_dirty=will_dirty_i;
6772     temp_wont_dirty=wont_dirty_i;
6773     for(r=0;r<HOST_REGS;r++) {
6774       if(r!=EXCLUDE_REG) {
6775         int nr;
6776         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6777           if(wr) {
6778             #ifndef DESTRUCTIVE_WRITEBACK
6779             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6780             #endif
6781             regs[i].wasdirty|=will_dirty_i&(1<<r);
6782           }
6783         }
6784         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6785           // Register moved to a different register
6786           will_dirty_i&=~(1<<r);
6787           wont_dirty_i&=~(1<<r);
6788           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6789           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6790           if(wr) {
6791             #ifndef DESTRUCTIVE_WRITEBACK
6792             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6793             #endif
6794             regs[i].wasdirty|=will_dirty_i&(1<<r);
6795           }
6796         }
6797         else {
6798           will_dirty_i&=~(1<<r);
6799           wont_dirty_i&=~(1<<r);
6800           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6801             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6802             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6803           } else {
6804             wont_dirty_i|=1<<r;
6805             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
6806           }
6807         }
6808       }
6809     }
6810   }
6811 }
6812
#ifdef DISASM
  /* disassembly */
// Print a one-line human-readable disassembly of decoded instruction
// slot i (index into the pass-1 decode arrays: itype[], insn[], rs1[],
// rs2[], rt1[], imm[], opcode[], ...).  A leading '*' marks a branch
// target (bt[i]); output goes to stdout.  Debug builds only (DISASM).
void disassemble_inst(int i)
{
    if (bt[i]) printf("*"); else printf(" ");
    switch(itype[i]) {
      case UJUMP:
        printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
      case CJUMP:
        // branch target recomputed from the 16-bit offset in source[i]
        printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
      case SJUMP:
        printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
      case FJUMP:
        printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
      case RJUMP:
        // JALR with a non-$ra link register shows both registers
        if (opcode[i]==0x9&&rt1[i]!=31)
          printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
        else
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        break;
      case SPAN:
        printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
      case IMM16:
        if(opcode[i]==0xf) //LUI
          printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
        else
          printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case LOAD:
      case LOADLR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case STORE:
      case STORELR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
        break;
      case ALU:
      case SHIFT:
        printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
        break;
      case MULTDIV:
        printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
        break;
      case SHIFTIMM:
        printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case MOV:
        if((opcode2[i]&0x1d)==0x10)
          printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
        else if((opcode2[i]&0x1d)==0x11)
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        else
          printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP0:
        if(opcode2[i]==0)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
        else if(opcode2[i]==4)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP1:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP2:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case C1LS:
        printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case C2LS:
        printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case INTCALL:
        printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
        break;
      default:
        //printf (" %s %8x\n",insn[i],source[i]);
        printf (" %x: %s\n",start+i*4,insn[i]);
    }
}
#else
// Non-debug builds: disassembly is compiled out.
static void disassemble_inst(int i) {}
#endif // DISASM
6905
6906 #define DRC_TEST_VAL 0x74657374
6907
6908 static int new_dynarec_test(void)
6909 {
6910   int (*testfunc)(void) = (void *)out;
6911   void *beginning;
6912   int ret;
6913
6914   beginning = start_block();
6915   emit_movimm(DRC_TEST_VAL,0); // test
6916   emit_jmpreg(14);
6917   literal_pool(0);
6918   end_block(beginning);
6919   SysPrintf("testing if we can run recompiled code..\n");
6920   ret = testfunc();
6921   if (ret == DRC_TEST_VAL)
6922     SysPrintf("test passed.\n");
6923   else
6924     SysPrintf("test failed: %08x\n", ret);
6925   out = translation_cache;
6926   return ret == DRC_TEST_VAL;
6927 }
6928
6929 // clear the state completely, instead of just marking
6930 // things invalid like invalidate_all_pages() does
6931 void new_dynarec_clear_full()
6932 {
6933   int n;
6934   out = translation_cache;
6935   memset(invalid_code,1,sizeof(invalid_code));
6936   memset(hash_table,0xff,sizeof(hash_table));
6937   memset(mini_ht,-1,sizeof(mini_ht));
6938   memset(restore_candidate,0,sizeof(restore_candidate));
6939   memset(shadow,0,sizeof(shadow));
6940   copy=shadow;
6941   expirep=16384; // Expiry pointer, +2 blocks
6942   pending_exception=0;
6943   literalcount=0;
6944   stop_after_jal=0;
6945   inv_code_start=inv_code_end=~0;
6946   // TLB
6947   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6948   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6949   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6950 }
6951
// One-time dynarec startup: obtain an executable translation cache
// (strategy chosen at build time), reset all recompiler state, run the
// architecture init and the can-we-execute test, and compute the
// RAM offset used for direct memory access.
void new_dynarec_init()
{
  SysPrintf("Init new dynarec\n");

  // allocate/prepare a buffer for translation cache
  // see assem_arm.h for some explanation
#if   defined(BASE_ADDR_FIXED)
  // cache must live at a fixed address baked into the build
  if (mmap(translation_cache, 1 << TARGET_SIZE_2,
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_PRIVATE | MAP_ANONYMOUS,
            -1, 0) != translation_cache) {
    SysPrintf("mmap() failed: %s\n", strerror(errno));
    SysPrintf("disable BASE_ADDR_FIXED and recompile\n");
    abort();
  }
#elif defined(BASE_ADDR_DYNAMIC)
  #ifdef VITA
  // Vita has no mmap(); use the kernel VM block API instead
  sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
  if (sceBlock < 0)
    SysPrintf("sceKernelAllocMemBlockForVM failed\n");
  int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache);
  if (ret < 0)
    SysPrintf("sceKernelGetMemBlockBase failed\n");
  #else
  translation_cache = mmap (NULL, 1 << TARGET_SIZE_2,
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (translation_cache == MAP_FAILED) {
    SysPrintf("mmap() failed: %s\n", strerror(errno));
    abort();
  }
  #endif
#else
  // cache is a static buffer; just make it executable
  #ifndef NO_WRITE_EXEC
  // not all systems allow execute in data segment by default
  if (mprotect(translation_cache, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    SysPrintf("mprotect() failed: %s\n", strerror(errno));
  #endif
#endif
  out = translation_cache;
  cycle_multiplier=200;
  new_dynarec_clear_full();
#ifdef HOST_IMM8
  // Copy this into local area so we don't have to put it in every literal pool
  invc_ptr=invalid_code;
#endif
  arch_init();
  new_dynarec_test();
#ifndef RAM_FIXED
  // offset from PSX address space to host rdram buffer
  ram_offset=(u_int)rdram-0x80000000;
#endif
  if (ram_offset!=0)
    SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
}
7006
// Tear down the dynarec: release the translation cache mapping (when
// one was created by new_dynarec_init) and free all per-page block
// lookup lists.
void new_dynarec_cleanup()
{
  int n;
#if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC)
  #ifdef VITA
  sceKernelFreeMemBlock(sceBlock);
  sceBlock = -1;
  #else
  if (munmap(translation_cache, 1<<TARGET_SIZE_2) < 0)
    SysPrintf("munmap() failed\n");
  #endif
#endif
  // free all compiled-block lookup lists
  for(n=0;n<4096;n++) ll_clear(jump_in+n);
  for(n=0;n<4096;n++) ll_clear(jump_out+n);
  for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
  #ifdef ROM_COPY
  if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
  #endif
}
7026
7027 static u_int *get_source_start(u_int addr, u_int *limit)
7028 {
7029   if (addr < 0x00200000 ||
7030     (0xa0000000 <= addr && addr < 0xa0200000)) {
7031     // used for BIOS calls mostly?
7032     *limit = (addr&0xa0000000)|0x00200000;
7033     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7034   }
7035   else if (!Config.HLE && (
7036     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7037     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7038     // BIOS
7039     *limit = (addr & 0xfff00000) | 0x80000;
7040     return (u_int *)((u_int)psxR + (addr&0x7ffff));
7041   }
7042   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
7043     *limit = (addr & 0x80600000) + 0x00200000;
7044     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7045   }
7046   return NULL;
7047 }
7048
7049 static u_int scan_for_ret(u_int addr)
7050 {
7051   u_int limit = 0;
7052   u_int *mem;
7053
7054   mem = get_source_start(addr, &limit);
7055   if (mem == NULL)
7056     return addr;
7057
7058   if (limit > addr + 0x1000)
7059     limit = addr + 0x1000;
7060   for (; addr < limit; addr += 4, mem++) {
7061     if (*mem == 0x03e00008) // jr $ra
7062       return addr + 8;
7063   }
7064   return addr;
7065 }
7066
// One savestate entry: a compiled block's start address plus a bitmask
// of GPRs that held scratchpad (0x1f80xxxx) pointers when the block was
// compiled (used to re-seed speculation on load).
struct savestate_block {
  uint32_t addr;
  uint32_t regflags;
};

// qsort comparator ordering savestate blocks by ascending address.
// Compare explicitly rather than returning p1->addr - p2->addr: the
// unsigned 32-bit difference converted to int can wrap and report the
// wrong sign when the addresses are far apart (e.g. 0x90000000 vs
// 0x10000000), mis-sorting the block list.
static int addr_cmp(const void *p1_, const void *p2_)
{
  const struct savestate_block *p1 = p1_, *p2 = p2_;
  return (p1->addr > p2->addr) - (p1->addr < p2->addr);
}
7077
7078 int new_dynarec_save_blocks(void *save, int size)
7079 {
7080   struct savestate_block *blocks = save;
7081   int maxcount = size / sizeof(blocks[0]);
7082   struct savestate_block tmp_blocks[1024];
7083   struct ll_entry *head;
7084   int p, s, d, o, bcnt;
7085   u_int addr;
7086
7087   o = 0;
7088   for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
7089     bcnt = 0;
7090     for (head = jump_in[p]; head != NULL; head = head->next) {
7091       tmp_blocks[bcnt].addr = head->vaddr;
7092       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
7093       bcnt++;
7094     }
7095     if (bcnt < 1)
7096       continue;
7097     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
7098
7099     addr = tmp_blocks[0].addr;
7100     for (s = d = 0; s < bcnt; s++) {
7101       if (tmp_blocks[s].addr < addr)
7102         continue;
7103       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
7104         tmp_blocks[d++] = tmp_blocks[s];
7105       addr = scan_for_ret(tmp_blocks[s].addr);
7106     }
7107
7108     if (o + d > maxcount)
7109       d = maxcount - o;
7110     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
7111     o += d;
7112   }
7113
7114   return o * sizeof(blocks[0]);
7115 }
7116
// Precompile the blocks recorded by new_dynarec_save_blocks() so a
// freshly loaded savestate doesn't stall on recompilation.  GPRs are
// temporarily overwritten with speculation-friendly values while
// compiling, then restored.
void new_dynarec_load_blocks(const void *save, int size)
{
  const struct savestate_block *blocks = save;
  int count = size / sizeof(blocks[0]);
  u_int regs_save[32];
  uint32_t f;
  int i, b;

  // compile the block at the current pc first
  get_addr(psxRegs.pc);

  // change GPRs for speculation to at least partially work..
  memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
  for (i = 1; i < 32; i++)
    psxRegs.GPR.r[i] = 0x80000000;

  for (b = 0; b < count; b++) {
    // regflags marks GPRs that held scratchpad (0x1f80xxxx) pointers
    // when the block was originally compiled; recreate that state
    for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
      if (f & 1)
        psxRegs.GPR.r[i] = 0x1f800000;
    }

    get_addr(blocks[b].addr);

    // reset the flagged registers back to the speculation default
    for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
      if (f & 1)
        psxRegs.GPR.r[i] = 0x80000000;
    }
  }

  // restore the real register values
  memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
}
7148
7149 int new_recompile_block(int addr)
7150 {
7151   u_int pagelimit = 0;
7152   u_int state_rflags = 0;
7153   int i;
7154
7155   assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out);
7156   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7157   //if(debug)
7158   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7159   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7160
7161   // this is just for speculation
7162   for (i = 1; i < 32; i++) {
7163     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
7164       state_rflags |= 1 << i;
7165   }
7166
7167   start = (u_int)addr&~3;
7168   //assert(((u_int)addr&1)==0);
7169   new_dynarec_did_compile=1;
7170   if (Config.HLE && start == 0x80001000) // hlecall
7171   {
7172     // XXX: is this enough? Maybe check hleSoftCall?
7173     void *beginning=start_block();
7174     u_int page=get_page(start);
7175
7176     invalid_code[start>>12]=0;
7177     emit_movimm(start,0);
7178     emit_writeword(0,&pcaddr);
7179     emit_jmp(new_dyna_leave);
7180     literal_pool(0);
7181     end_block(beginning);
7182     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
7183     return 0;
7184   }
7185
7186   source = get_source_start(start, &pagelimit);
7187   if (source == NULL) {
7188     SysPrintf("Compile at bogus memory address: %08x\n", addr);
7189     exit(1);
7190   }
7191
7192   /* Pass 1: disassemble */
7193   /* Pass 2: register dependencies, branch targets */
7194   /* Pass 3: register allocation */
7195   /* Pass 4: branch dependencies */
7196   /* Pass 5: pre-alloc */
7197   /* Pass 6: optimize clean/dirty state */
7198   /* Pass 7: flag 32-bit registers */
7199   /* Pass 8: assembly */
7200   /* Pass 9: linker */
7201   /* Pass 10: garbage collection / free memory */
7202
7203   int j;
7204   int done=0;
7205   unsigned int type,op,op2;
7206
7207   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7208
7209   /* Pass 1 disassembly */
7210
7211   for(i=0;!done;i++) {
7212     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7213     minimum_free_regs[i]=0;
7214     opcode[i]=op=source[i]>>26;
7215     switch(op)
7216     {
7217       case 0x00: strcpy(insn[i],"special"); type=NI;
7218         op2=source[i]&0x3f;
7219         switch(op2)
7220         {
7221           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7222           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7223           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7224           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7225           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7226           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7227           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7228           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7229           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7230           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7231           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7232           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7233           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7234           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7235           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7236           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7237           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7238           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7239           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7240           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7241           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7242           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7243           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7244           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7245           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7246           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7247           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7248           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7249           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7250           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7251           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7252           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7253           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7254           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7255           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7256 #if 0
7257           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7258           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7259           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7260           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7261           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7262           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7263           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7264           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7265           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7266           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7267           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7268           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7269           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7270           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7271           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7272           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7273           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7274 #endif
7275         }
7276         break;
7277       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7278         op2=(source[i]>>16)&0x1f;
7279         switch(op2)
7280         {
7281           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7282           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7283           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7284           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7285           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7286           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7287           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7288           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7289           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7290           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7291           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7292           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7293           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7294           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7295         }
7296         break;
7297       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7298       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7299       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7300       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7301       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7302       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7303       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7304       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7305       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7306       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7307       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7308       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7309       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7310       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
      // COP0 (system control coprocessor): op2 is the rs field
      // (bits 25..21), selecting register moves or the CO function group.
      case 0x10: strcpy(insn[i],"cop0"); type=NI;
        op2=(source[i]>>21)&0x1f;
        switch(op2)
        {
          case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
          case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
          // CO group: function selected by the low 6 bits of the word.
          // The TLB ops are inherited from the N64 (Mupen64plus) version;
          // on the PS1 only RFE is expected in real code.
          case 0x10: strcpy(insn[i],"tlb"); type=NI;
          switch(source[i]&0x3f)
          {
            case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
            case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
            case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
            case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
            case 0x10: strcpy(insn[i],"RFE"); type=COP0; break; // return from exception
            //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
          }
        }
        break;
7329       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7330         op2=(source[i]>>21)&0x1f;
7331         switch(op2)
7332         {
7333           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7334           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7335           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7336           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7337           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7338           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7339           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7340           switch((source[i]>>16)&0x3)
7341           {
7342             case 0x00: strcpy(insn[i],"BC1F"); break;
7343             case 0x01: strcpy(insn[i],"BC1T"); break;
7344             case 0x02: strcpy(insn[i],"BC1FL"); break;
7345             case 0x03: strcpy(insn[i],"BC1TL"); break;
7346           }
7347           break;
7348           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7349           switch(source[i]&0x3f)
7350           {
7351             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7352             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7353             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7354             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7355             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7356             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7357             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7358             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7359             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7360             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7361             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7362             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7363             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7364             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7365             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7366             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7367             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7368             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7369             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7370             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7371             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7372             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7373             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7374             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7375             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7376             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7377             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7378             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7379             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7380             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7381             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7382             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7383             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7384             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7385             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7386           }
7387           break;
7388           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7389           switch(source[i]&0x3f)
7390           {
7391             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7392             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7393             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7394             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7395             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7396             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7397             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7398             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7399             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7400             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7401             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7402             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7403             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7404             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7405             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7406             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7407             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7408             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7409             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7410             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7411             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7412             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7413             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7414             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7415             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7416             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7417             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7418             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7419             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7420             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7421             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7422             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7423             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7424             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7425             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7426           }
7427           break;
7428           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7429           switch(source[i]&0x3f)
7430           {
7431             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7432             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7433           }
7434           break;
7435           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7436           switch(source[i]&0x3f)
7437           {
7438             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7439             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7440           }
7441           break;
7442         }
7443         break;
7444 #if 0
7445       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7446       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7447       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7448       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7449       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7450       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7451       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7452       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7453 #endif
7454       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7455       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7456       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7457       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7458       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7459       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7460       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7461 #if 0
7462       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7463 #endif
7464       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7465       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7466       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7467       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7468 #if 0
7469       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7470       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7471 #endif
7472       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7473       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7474       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7475       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7476 #if 0
7477       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7478       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7479       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7480 #endif
7481       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7482       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7483 #if 0
7484       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7485       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7486       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7487 #endif
      // COP2 (GTE): a non-zero function field (low 6 bits) means a GTE
      // command, dispatched via gte_handlers[]; a zero function field means
      // a data/control register move selected by op2 (the rs field).
      case 0x12: strcpy(insn[i],"COP2"); type=NI;
        op2=(source[i]>>21)&0x1f;
        //if (op2 & 0x10) {
        if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
          if (gte_handlers[source[i]&0x3f]!=NULL) {
            // Use a friendly name when one is known, else the raw funct value.
            if (gte_regnames[source[i]&0x3f]!=NULL)
              strcpy(insn[i],gte_regnames[source[i]&0x3f]);
            else
              snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
            type=C2OP;
          }
          // No handler registered: type stays NI.
        }
        else switch(op2)
        {
          case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
          case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
          case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
          case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
        }
        break;
7508       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7509       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7510       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7511       default: strcpy(insn[i],"???"); type=NI;
7512         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7513         break;
7514     }
    itype[i]=type;   // decoded instruction class, consumed by the later passes
    opcode2[i]=op2;  // secondary opcode (funct/rs/rt field, depending on op)
    /* Get registers/immediates */
    lt1[i]=0;
    us1[i]=0;        // regs read as full 64-bit values (64-bit ops only)
    us2[i]=0;
    dep1[i]=0;       // regs the result's upper half depends on (see ALU/IMM16 cases)
    dep2[i]=0;
    gte_rs[i]=gte_rt[i]=0;  // GTE register read/write bitmasks (COP2/C2LS/C2OP)
7524     switch(type) {
7525       case LOAD:
7526         rs1[i]=(source[i]>>21)&0x1f;
7527         rs2[i]=0;
7528         rt1[i]=(source[i]>>16)&0x1f;
7529         rt2[i]=0;
7530         imm[i]=(short)source[i];
7531         break;
7532       case STORE:
7533       case STORELR:
7534         rs1[i]=(source[i]>>21)&0x1f;
7535         rs2[i]=(source[i]>>16)&0x1f;
7536         rt1[i]=0;
7537         rt2[i]=0;
7538         imm[i]=(short)source[i];
7539         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7540         break;
7541       case LOADLR:
7542         // LWL/LWR only load part of the register,
7543         // therefore the target register must be treated as a source too
7544         rs1[i]=(source[i]>>21)&0x1f;
7545         rs2[i]=(source[i]>>16)&0x1f;
7546         rt1[i]=(source[i]>>16)&0x1f;
7547         rt2[i]=0;
7548         imm[i]=(short)source[i];
7549         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7550         if(op==0x26) dep1[i]=rt1[i]; // LWR
7551         break;
7552       case IMM16:
7553         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7554         else rs1[i]=(source[i]>>21)&0x1f;
7555         rs2[i]=0;
7556         rt1[i]=(source[i]>>16)&0x1f;
7557         rt2[i]=0;
7558         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7559           imm[i]=(unsigned short)source[i];
7560         }else{
7561           imm[i]=(short)source[i];
7562         }
7563         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7564         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7565         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7566         break;
7567       case UJUMP:
7568         rs1[i]=0;
7569         rs2[i]=0;
7570         rt1[i]=0;
7571         rt2[i]=0;
7572         // The JAL instruction writes to r31.
7573         if (op&1) {
7574           rt1[i]=31;
7575         }
7576         rs2[i]=CCREG;
7577         break;
7578       case RJUMP:
7579         rs1[i]=(source[i]>>21)&0x1f;
7580         rs2[i]=0;
7581         rt1[i]=0;
7582         rt2[i]=0;
7583         // The JALR instruction writes to rd.
7584         if (op2&1) {
7585           rt1[i]=(source[i]>>11)&0x1f;
7586         }
7587         rs2[i]=CCREG;
7588         break;
7589       case CJUMP:
7590         rs1[i]=(source[i]>>21)&0x1f;
7591         rs2[i]=(source[i]>>16)&0x1f;
7592         rt1[i]=0;
7593         rt2[i]=0;
7594         if(op&2) { // BGTZ/BLEZ
7595           rs2[i]=0;
7596         }
7597         us1[i]=rs1[i];
7598         us2[i]=rs2[i];
7599         likely[i]=op>>4;
7600         break;
7601       case SJUMP:
7602         rs1[i]=(source[i]>>21)&0x1f;
7603         rs2[i]=CCREG;
7604         rt1[i]=0;
7605         rt2[i]=0;
7606         us1[i]=rs1[i];
7607         if(op2&0x10) { // BxxAL
7608           rt1[i]=31;
7609           // NOTE: If the branch is not taken, r31 is still overwritten
7610         }
7611         likely[i]=(op2&2)>>1;
7612         break;
7613       case FJUMP:
7614         rs1[i]=FSREG;
7615         rs2[i]=CSREG;
7616         rt1[i]=0;
7617         rt2[i]=0;
7618         likely[i]=((source[i])>>17)&1;
7619         break;
7620       case ALU:
7621         rs1[i]=(source[i]>>21)&0x1f; // source
7622         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7623         rt1[i]=(source[i]>>11)&0x1f; // destination
7624         rt2[i]=0;
7625         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7626           us1[i]=rs1[i];us2[i]=rs2[i];
7627         }
7628         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7629           dep1[i]=rs1[i];dep2[i]=rs2[i];
7630         }
7631         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7632           dep1[i]=rs1[i];dep2[i]=rs2[i];
7633         }
7634         break;
7635       case MULTDIV:
7636         rs1[i]=(source[i]>>21)&0x1f; // source
7637         rs2[i]=(source[i]>>16)&0x1f; // divisor
7638         rt1[i]=HIREG;
7639         rt2[i]=LOREG;
7640         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7641           us1[i]=rs1[i];us2[i]=rs2[i];
7642         }
7643         break;
7644       case MOV:
7645         rs1[i]=0;
7646         rs2[i]=0;
7647         rt1[i]=0;
7648         rt2[i]=0;
7649         if(op2==0x10) rs1[i]=HIREG; // MFHI
7650         if(op2==0x11) rt1[i]=HIREG; // MTHI
7651         if(op2==0x12) rs1[i]=LOREG; // MFLO
7652         if(op2==0x13) rt1[i]=LOREG; // MTLO
7653         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7654         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7655         dep1[i]=rs1[i];
7656         break;
7657       case SHIFT:
7658         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7659         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7660         rt1[i]=(source[i]>>11)&0x1f; // destination
7661         rt2[i]=0;
7662         // DSLLV/DSRLV/DSRAV are 64-bit
7663         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7664         break;
7665       case SHIFTIMM:
7666         rs1[i]=(source[i]>>16)&0x1f;
7667         rs2[i]=0;
7668         rt1[i]=(source[i]>>11)&0x1f;
7669         rt2[i]=0;
7670         imm[i]=(source[i]>>6)&0x1f;
7671         // DSxx32 instructions
7672         if(op2>=0x3c) imm[i]|=0x20;
7673         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7674         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7675         break;
7676       case COP0:
7677         rs1[i]=0;
7678         rs2[i]=0;
7679         rt1[i]=0;
7680         rt2[i]=0;
7681         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7682         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7683         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7684         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7685         break;
7686       case COP1:
7687         rs1[i]=0;
7688         rs2[i]=0;
7689         rt1[i]=0;
7690         rt2[i]=0;
7691         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7692         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7693         if(op2==5) us1[i]=rs1[i]; // DMTC1
7694         rs2[i]=CSREG;
7695         break;
7696       case COP2:
7697         rs1[i]=0;
7698         rs2[i]=0;
7699         rt1[i]=0;
7700         rt2[i]=0;
7701         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7702         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7703         rs2[i]=CSREG;
7704         int gr=(source[i]>>11)&0x1F;
7705         switch(op2)
7706         {
7707           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7708           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7709           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7710           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7711         }
7712         break;
7713       case C1LS:
7714         rs1[i]=(source[i]>>21)&0x1F;
7715         rs2[i]=CSREG;
7716         rt1[i]=0;
7717         rt2[i]=0;
7718         imm[i]=(short)source[i];
7719         break;
7720       case C2LS:
7721         rs1[i]=(source[i]>>21)&0x1F;
7722         rs2[i]=0;
7723         rt1[i]=0;
7724         rt2[i]=0;
7725         imm[i]=(short)source[i];
7726         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7727         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7728         break;
7729       case C2OP:
7730         rs1[i]=0;
7731         rs2[i]=0;
7732         rt1[i]=0;
7733         rt2[i]=0;
7734         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7735         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7736         gte_rt[i]|=1ll<<63; // every op changes flags
7737         if((source[i]&0x3f)==GTE_MVMVA) {
7738           int v = (source[i] >> 15) & 3;
7739           gte_rs[i]&=~0xe3fll;
7740           if(v==3) gte_rs[i]|=0xe00ll;
7741           else gte_rs[i]|=3ll<<(v*2);
7742         }
7743         break;
7744       case FLOAT:
7745       case FCONV:
7746         rs1[i]=0;
7747         rs2[i]=CSREG;
7748         rt1[i]=0;
7749         rt2[i]=0;
7750         break;
7751       case FCOMP:
7752         rs1[i]=FSREG;
7753         rs2[i]=CSREG;
7754         rt1[i]=FSREG;
7755         rt2[i]=0;
7756         break;
7757       case SYSCALL:
7758       case HLECALL:
7759       case INTCALL:
7760         rs1[i]=CCREG;
7761         rs2[i]=0;
7762         rt1[i]=0;
7763         rt2[i]=0;
7764         break;
7765       default:
7766         rs1[i]=0;
7767         rs2[i]=0;
7768         rt1[i]=0;
7769         rt2[i]=0;
7770     }
    /* Calculate branch target addresses */
    if(type==UJUMP)
      // J/JAL: 26-bit target field shifted left 2, within the 256MB
      // segment of the delay slot address
      ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
    else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
      ba[i]=start+i*4+8; // Ignore never taken branch (BNE rs==rt / BGTZ $zero)
    else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
      ba[i]=start+i*4+8; // Ignore never taken branch (BLTZ-family on $zero)
    else if(type==CJUMP||type==SJUMP||type==FJUMP)
      // PC-relative: sign-extended 16-bit offset * 4, from the delay slot
      ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
    else ba[i]=-1; // not a branch with a statically known target
    // Hazards in the delay slot of the previous branch: both cases below
    // are rare, so they are punted to the interpreter instead of compiled.
    if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
      int do_in_intrp=0;
      // branch in delay slot?
      if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
        // don't handle first branch and call interpreter if it's hit
        SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
        do_in_intrp=1;
      }
      // basic load delay detection
      else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
        // t = index of the previous branch's target within this block
        int t=(ba[i-1]-start)/4;
        if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
          // jump target wants DS result - potential load delay effect
          SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
          do_in_intrp=1;
          bt[t+1]=1; // expected return from interpreter
        }
        else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
              !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
          // v0 overwrite like this is a sign of trouble, bail out
          // (r2/v0 written twice in a row with neither value read in between)
          SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
          do_in_intrp=1;
        }
      }
      if(do_in_intrp) {
        // Turn the branch itself into an INTCALL so the interpreter runs
        // the branch + delay slot pair, and end the block there.
        rs1[i-1]=CCREG;
        rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
        ba[i-1]=-1;
        itype[i-1]=INTCALL;
        done=2;
        i--; // don't compile the DS
      }
    }
    /* Is this the end of the block? */
    // Block-ending instructions: unconditional jumps (J/JAL/JR/JALR) and
    // "beq $0,$0" (upper halfword 0x1000), which is an unconditional branch.
    if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
      if(rt1[i-1]==0) { // no link register written: not a call, end here (tentatively)
        done=2;
      }
      else { // JAL/JALR: normally keep compiling past the subroutine call
        if(stop_after_jal) done=1;
        // Stop on BREAK
        // NOTE(review): peeks at source[i+1], one word past the current
        // insn - presumably covered by the pagelimit guard below; confirm
        if((source[i+1]&0xfc00003f)==0x0d) done=1;
      }
      // Don't recompile stuff that's already compiled
      if(check_addr(start+i*4+4)) done=1;
      // Don't get too close to the limit
      if(i>MAXBLOCK/2) done=1;
    }
    if(itype[i]==SYSCALL&&stop_after_jal) done=1;
    if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
    if(done==2) {
      // Does the block continue due to a branch?
      for(j=i-1;j>=0;j--)
      {
        if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
        if(ba[j]==start+i*4+4) done=j=0;
        if(ba[j]==start+i*4+8) done=j=0;
      }
    }
    //assert(i<MAXBLOCK-1);
    // Hard limits: never decode across the page boundary or past MAXBLOCK
    if(start+i*4==pagelimit-4) done=1;
    assert(start+i*4<pagelimit);
    if (i==MAXBLOCK-1) done=1;
    // Stop if we're compiling junk
    // (opcode 0x11 = cop1 in an unrecognized format; presumably not real
    // PS1 code, so speculative precompilation is turned off)
    if(itype[i]==NI&&opcode[i]==0x11) {
      done=stop_after_jal=1;
      SysPrintf("Disabled speculative precompilation\n");
    }
  }
  slen=i;
  // A branch as the very last insn of the page has its delay slot past
  // pagelimit; mark it SPAN so it gets special handling.
  if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
    if(start+i*4==pagelimit) {
      itype[i-1]=SPAN;
    }
  }
  assert(slen>0);
7857
  /* Pass 2 - Register dependencies and branch targets */

  unneeded_registers(0,slen-1,0);

  /* Pass 3 - Register allocation */

  struct regstat current; // Current register allocations/status
  current.is32=1;         // initially all guest regs treated as 32-bit values
  current.dirty=0;        // no allocated host reg holds an unwritten-back value yet
  current.u=unneeded_reg[0];        // dead (unneeded) guest regs at block entry
  current.uu=unneeded_reg_upper[0]; // same, for the upper 32 bits
  clear_all_regs(current.regmap);
  // The cycle-count register is allocated (and marked dirty) up front
  alloc_reg(&current,0,CCREG);
  dirty_reg(&current,CCREG);
  current.isconst=0;
  current.wasconst=0;
  current.waswritten=0;
  int ds=0;   // nonzero while the current insn is a branch delay slot
  int cc=0;   // cycle-count tracking (forced to -1 for DS-entry blocks below)
  int hr=-1;  // scratch host-register index for the loops below

  if((u_int)addr&1) {
    // First instruction is delay slot
    // (the low bit of addr is used as a flag for DS-entry blocks)
    cc=-1;
    bt[1]=1;
    ds=1;
    unneeded_reg[0]=1;
    unneeded_reg_upper[0]=1;
    current.regmap[HOST_BTREG]=BTREG;
  }
7888
7889   for(i=0;i<slen;i++)
7890   {
7891     if(bt[i])
7892     {
7893       int hr;
7894       for(hr=0;hr<HOST_REGS;hr++)
7895       {
7896         // Is this really necessary?
7897         if(current.regmap[hr]==0) current.regmap[hr]=-1;
7898       }
7899       current.isconst=0;
7900       current.waswritten=0;
7901     }
7902     if(i>1)
7903     {
7904       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
7905       {
7906         if(rs1[i-2]==0||rs2[i-2]==0)
7907         {
7908           if(rs1[i-2]) {
7909             current.is32|=1LL<<rs1[i-2];
7910             int hr=get_reg(current.regmap,rs1[i-2]|64);
7911             if(hr>=0) current.regmap[hr]=-1;
7912           }
7913           if(rs2[i-2]) {
7914             current.is32|=1LL<<rs2[i-2];
7915             int hr=get_reg(current.regmap,rs2[i-2]|64);
7916             if(hr>=0) current.regmap[hr]=-1;
7917           }
7918         }
7919       }
7920     }
7921     current.is32=-1LL;
7922
7923     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
7924     regs[i].wasconst=current.isconst;
7925     regs[i].was32=current.is32;
7926     regs[i].wasdirty=current.dirty;
7927     regs[i].loadedconst=0;
7928     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
7929       if(i+1<slen) {
7930         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7931         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
7932         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
7933         current.u|=1;
7934         current.uu|=1;
7935       } else {
7936         current.u=1;
7937         current.uu=1;
7938       }
7939     } else {
7940       if(i+1<slen) {
7941         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
7942         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
7943         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
7944         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7945         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
7946         current.u|=1;
7947         current.uu|=1;
7948       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
7949     }
7950     is_ds[i]=ds;
7951     if(ds) {
7952       ds=0; // Skip delay slot, already allocated as part of branch
7953       // ...but we need to alloc it in case something jumps here
7954       if(i+1<slen) {
7955         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
7956         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
7957       }else{
7958         current.u=branch_unneeded_reg[i-1];
7959         current.uu=branch_unneeded_reg_upper[i-1];
7960       }
7961       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7962       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
7963       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
7964       current.u|=1;
7965       current.uu|=1;
7966       struct regstat temp;
7967       memcpy(&temp,&current,sizeof(current));
7968       temp.wasdirty=temp.dirty;
7969       temp.was32=temp.is32;
7970       // TODO: Take into account unconditional branches, as below
7971       delayslot_alloc(&temp,i);
7972       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
7973       regs[i].wasdirty=temp.wasdirty;
7974       regs[i].was32=temp.was32;
7975       regs[i].dirty=temp.dirty;
7976       regs[i].is32=temp.is32;
7977       regs[i].isconst=0;
7978       regs[i].wasconst=0;
7979       current.isconst=0;
7980       // Create entry (branch target) regmap
7981       for(hr=0;hr<HOST_REGS;hr++)
7982       {
7983         int r=temp.regmap[hr];
7984         if(r>=0) {
7985           if(r!=regmap_pre[i][hr]) {
7986             regs[i].regmap_entry[hr]=-1;
7987           }
7988           else
7989           {
7990             if(r<64){
7991               if((current.u>>r)&1) {
7992                 regs[i].regmap_entry[hr]=-1;
7993                 regs[i].regmap[hr]=-1;
7994                 //Don't clear regs in the delay slot as the branch might need them
7995                 //current.regmap[hr]=-1;
7996               }else
7997                 regs[i].regmap_entry[hr]=r;
7998             }
7999             else {
8000               if((current.uu>>(r&63))&1) {
8001                 regs[i].regmap_entry[hr]=-1;
8002                 regs[i].regmap[hr]=-1;
8003                 //Don't clear regs in the delay slot as the branch might need them
8004                 //current.regmap[hr]=-1;
8005               }else
8006                 regs[i].regmap_entry[hr]=r;
8007             }
8008           }
8009         } else {
8010           // First instruction expects CCREG to be allocated
8011           if(i==0&&hr==HOST_CCREG)
8012             regs[i].regmap_entry[hr]=CCREG;
8013           else
8014             regs[i].regmap_entry[hr]=-1;
8015         }
8016       }
8017     }
8018     else { // Not delay slot
8019       switch(itype[i]) {
8020         case UJUMP:
8021           //current.isconst=0; // DEBUG
8022           //current.wasconst=0; // DEBUG
8023           //regs[i].wasconst=0; // DEBUG
8024           clear_const(&current,rt1[i]);
8025           alloc_cc(&current,i);
8026           dirty_reg(&current,CCREG);
8027           if (rt1[i]==31) {
8028             alloc_reg(&current,i,31);
8029             dirty_reg(&current,31);
8030             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8031             //assert(rt1[i+1]!=rt1[i]);
8032             #ifdef REG_PREFETCH
8033             alloc_reg(&current,i,PTEMP);
8034             #endif
8035             //current.is32|=1LL<<rt1[i];
8036           }
8037           ooo[i]=1;
8038           delayslot_alloc(&current,i+1);
8039           //current.isconst=0; // DEBUG
8040           ds=1;
8041           //printf("i=%d, isconst=%x\n",i,current.isconst);
8042           break;
8043         case RJUMP:
8044           //current.isconst=0;
8045           //current.wasconst=0;
8046           //regs[i].wasconst=0;
8047           clear_const(&current,rs1[i]);
8048           clear_const(&current,rt1[i]);
8049           alloc_cc(&current,i);
8050           dirty_reg(&current,CCREG);
8051           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8052             alloc_reg(&current,i,rs1[i]);
8053             if (rt1[i]!=0) {
8054               alloc_reg(&current,i,rt1[i]);
8055               dirty_reg(&current,rt1[i]);
8056               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8057               assert(rt1[i+1]!=rt1[i]);
8058               #ifdef REG_PREFETCH
8059               alloc_reg(&current,i,PTEMP);
8060               #endif
8061             }
8062             #ifdef USE_MINI_HT
8063             if(rs1[i]==31) { // JALR
8064               alloc_reg(&current,i,RHASH);
8065               #ifndef HOST_IMM_ADDR32
8066               alloc_reg(&current,i,RHTBL);
8067               #endif
8068             }
8069             #endif
8070             delayslot_alloc(&current,i+1);
8071           } else {
8072             // The delay slot overwrites our source register,
8073             // allocate a temporary register to hold the old value.
8074             current.isconst=0;
8075             current.wasconst=0;
8076             regs[i].wasconst=0;
8077             delayslot_alloc(&current,i+1);
8078             current.isconst=0;
8079             alloc_reg(&current,i,RTEMP);
8080           }
8081           //current.isconst=0; // DEBUG
8082           ooo[i]=1;
8083           ds=1;
8084           break;
8085         case CJUMP:
8086           //current.isconst=0;
8087           //current.wasconst=0;
8088           //regs[i].wasconst=0;
8089           clear_const(&current,rs1[i]);
8090           clear_const(&current,rs2[i]);
8091           if((opcode[i]&0x3E)==4) // BEQ/BNE
8092           {
8093             alloc_cc(&current,i);
8094             dirty_reg(&current,CCREG);
8095             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8096             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8097             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8098             {
8099               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8100               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8101             }
8102             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8103                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8104               // The delay slot overwrites one of our conditions.
8105               // Allocate the branch condition registers instead.
8106               current.isconst=0;
8107               current.wasconst=0;
8108               regs[i].wasconst=0;
8109               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8110               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8111               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8112               {
8113                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8114                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8115               }
8116             }
8117             else
8118             {
8119               ooo[i]=1;
8120               delayslot_alloc(&current,i+1);
8121             }
8122           }
8123           else
8124           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8125           {
8126             alloc_cc(&current,i);
8127             dirty_reg(&current,CCREG);
8128             alloc_reg(&current,i,rs1[i]);
8129             if(!(current.is32>>rs1[i]&1))
8130             {
8131               alloc_reg64(&current,i,rs1[i]);
8132             }
8133             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8134               // The delay slot overwrites one of our conditions.
8135               // Allocate the branch condition registers instead.
8136               current.isconst=0;
8137               current.wasconst=0;
8138               regs[i].wasconst=0;
8139               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8140               if(!((current.is32>>rs1[i])&1))
8141               {
8142                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8143               }
8144             }
8145             else
8146             {
8147               ooo[i]=1;
8148               delayslot_alloc(&current,i+1);
8149             }
8150           }
8151           else
8152           // Don't alloc the delay slot yet because we might not execute it
8153           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8154           {
8155             current.isconst=0;
8156             current.wasconst=0;
8157             regs[i].wasconst=0;
8158             alloc_cc(&current,i);
8159             dirty_reg(&current,CCREG);
8160             alloc_reg(&current,i,rs1[i]);
8161             alloc_reg(&current,i,rs2[i]);
8162             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8163             {
8164               alloc_reg64(&current,i,rs1[i]);
8165               alloc_reg64(&current,i,rs2[i]);
8166             }
8167           }
8168           else
8169           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8170           {
8171             current.isconst=0;
8172             current.wasconst=0;
8173             regs[i].wasconst=0;
8174             alloc_cc(&current,i);
8175             dirty_reg(&current,CCREG);
8176             alloc_reg(&current,i,rs1[i]);
8177             if(!(current.is32>>rs1[i]&1))
8178             {
8179               alloc_reg64(&current,i,rs1[i]);
8180             }
8181           }
8182           ds=1;
8183           //current.isconst=0;
8184           break;
8185         case SJUMP:
8186           //current.isconst=0;
8187           //current.wasconst=0;
8188           //regs[i].wasconst=0;
8189           clear_const(&current,rs1[i]);
8190           clear_const(&current,rt1[i]);
8191           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8192           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8193           {
8194             alloc_cc(&current,i);
8195             dirty_reg(&current,CCREG);
8196             alloc_reg(&current,i,rs1[i]);
8197             if(!(current.is32>>rs1[i]&1))
8198             {
8199               alloc_reg64(&current,i,rs1[i]);
8200             }
8201             if (rt1[i]==31) { // BLTZAL/BGEZAL
8202               alloc_reg(&current,i,31);
8203               dirty_reg(&current,31);
8204               //#ifdef REG_PREFETCH
8205               //alloc_reg(&current,i,PTEMP);
8206               //#endif
8207               //current.is32|=1LL<<rt1[i];
8208             }
8209             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
8210                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
8211               // Allocate the branch condition registers instead.
8212               current.isconst=0;
8213               current.wasconst=0;
8214               regs[i].wasconst=0;
8215               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8216               if(!((current.is32>>rs1[i])&1))
8217               {
8218                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8219               }
8220             }
8221             else
8222             {
8223               ooo[i]=1;
8224               delayslot_alloc(&current,i+1);
8225             }
8226           }
8227           else
8228           // Don't alloc the delay slot yet because we might not execute it
8229           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8230           {
8231             current.isconst=0;
8232             current.wasconst=0;
8233             regs[i].wasconst=0;
8234             alloc_cc(&current,i);
8235             dirty_reg(&current,CCREG);
8236             alloc_reg(&current,i,rs1[i]);
8237             if(!(current.is32>>rs1[i]&1))
8238             {
8239               alloc_reg64(&current,i,rs1[i]);
8240             }
8241           }
8242           ds=1;
8243           //current.isconst=0;
8244           break;
8245         case FJUMP:
8246           current.isconst=0;
8247           current.wasconst=0;
8248           regs[i].wasconst=0;
8249           if(likely[i]==0) // BC1F/BC1T
8250           {
8251             // TODO: Theoretically we can run out of registers here on x86.
8252             // The delay slot can allocate up to six, and we need to check
8253             // CSREG before executing the delay slot.  Possibly we can drop
8254             // the cycle count and then reload it after checking that the
8255             // FPU is in a usable state, or don't do out-of-order execution.
8256             alloc_cc(&current,i);
8257             dirty_reg(&current,CCREG);
8258             alloc_reg(&current,i,FSREG);
8259             alloc_reg(&current,i,CSREG);
8260             if(itype[i+1]==FCOMP) {
8261               // The delay slot overwrites the branch condition.
8262               // Allocate the branch condition registers instead.
8263               alloc_cc(&current,i);
8264               dirty_reg(&current,CCREG);
8265               alloc_reg(&current,i,CSREG);
8266               alloc_reg(&current,i,FSREG);
8267             }
8268             else {
8269               ooo[i]=1;
8270               delayslot_alloc(&current,i+1);
8271               alloc_reg(&current,i+1,CSREG);
8272             }
8273           }
8274           else
8275           // Don't alloc the delay slot yet because we might not execute it
8276           if(likely[i]) // BC1FL/BC1TL
8277           {
8278             alloc_cc(&current,i);
8279             dirty_reg(&current,CCREG);
8280             alloc_reg(&current,i,CSREG);
8281             alloc_reg(&current,i,FSREG);
8282           }
8283           ds=1;
8284           current.isconst=0;
8285           break;
8286         case IMM16:
8287           imm16_alloc(&current,i);
8288           break;
8289         case LOAD:
8290         case LOADLR:
8291           load_alloc(&current,i);
8292           break;
8293         case STORE:
8294         case STORELR:
8295           store_alloc(&current,i);
8296           break;
8297         case ALU:
8298           alu_alloc(&current,i);
8299           break;
8300         case SHIFT:
8301           shift_alloc(&current,i);
8302           break;
8303         case MULTDIV:
8304           multdiv_alloc(&current,i);
8305           break;
8306         case SHIFTIMM:
8307           shiftimm_alloc(&current,i);
8308           break;
8309         case MOV:
8310           mov_alloc(&current,i);
8311           break;
8312         case COP0:
8313           cop0_alloc(&current,i);
8314           break;
8315         case COP1:
8316         case COP2:
8317           cop1_alloc(&current,i);
8318           break;
8319         case C1LS:
8320           c1ls_alloc(&current,i);
8321           break;
8322         case C2LS:
8323           c2ls_alloc(&current,i);
8324           break;
8325         case C2OP:
8326           c2op_alloc(&current,i);
8327           break;
8328         case FCONV:
8329           fconv_alloc(&current,i);
8330           break;
8331         case FLOAT:
8332           float_alloc(&current,i);
8333           break;
8334         case FCOMP:
8335           fcomp_alloc(&current,i);
8336           break;
8337         case SYSCALL:
8338         case HLECALL:
8339         case INTCALL:
8340           syscall_alloc(&current,i);
8341           break;
8342         case SPAN:
8343           pagespan_alloc(&current,i);
8344           break;
8345       }
8346
8347       // Drop the upper half of registers that have become 32-bit
      // current.uu appears to track, per guest register, whether the upper
      // 32 bits are unneeded (cf. unneeded_reg_upper); current.is32 marks
      // registers whose value currently fits in 32 bits.
8348       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8349       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
        // Non-branch: the upper halves of this instruction's 64-bit source
        // registers (us1/us2) are live again, so clear their "unneeded" bits.
8350         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
        // If the destination's upper half is needed, the upper halves of the
        // registers it depends on (dep1/dep2) are needed too.
8351         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
        // Register 0 ($zero) never needs its upper half.
8352         current.uu|=1;
8353       } else {
        // Branch: fold in the delay-slot instruction (i+1) as well.
8354         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8355         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8356         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8357         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8358         current.uu|=1;
8359       }
8360
8361       // Create entry (branch target) regmap
8362       for(hr=0;hr<HOST_REGS;hr++)
8363       {
8364         int r,or;
8365         r=current.regmap[hr];
8366         if(r>=0) {
8367           if(r!=regmap_pre[i][hr]) {
8368             // TODO: delay slot (?)
8369             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8370             if(or<0||(r&63)>=TEMPREG){
8371               regs[i].regmap_entry[hr]=-1;
8372             }
8373             else
8374             {
8375               // Just move it to a different register
8376               regs[i].regmap_entry[hr]=r;
8377               // If it was dirty before, it's still dirty
8378               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8379             }
8380           }
8381           else
8382           {
8383             // Unneeded
8384             if(r==0){
8385               regs[i].regmap_entry[hr]=0;
8386             }
8387             else
8388             if(r<64){
8389               if((current.u>>r)&1) {
8390                 regs[i].regmap_entry[hr]=-1;
8391                 //regs[i].regmap[hr]=-1;
8392                 current.regmap[hr]=-1;
8393               }else
8394                 regs[i].regmap_entry[hr]=r;
8395             }
8396             else {
8397               if((current.uu>>(r&63))&1) {
8398                 regs[i].regmap_entry[hr]=-1;
8399                 //regs[i].regmap[hr]=-1;
8400                 current.regmap[hr]=-1;
8401               }else
8402                 regs[i].regmap_entry[hr]=r;
8403             }
8404           }
8405         } else {
8406           // Branches expect CCREG to be allocated at the target
8407           if(regmap_pre[i][hr]==CCREG)
8408             regs[i].regmap_entry[hr]=CCREG;
8409           else
8410             regs[i].regmap_entry[hr]=-1;
8411         }
8412       }
8413       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8414     }
8415
8416     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
      // Record base registers of recent stores with a small (<0x800) offset
      // (opcode 0x3a is SWC2, the GTE store).  NOTE(review): .waswritten is
      // presumably consumed elsewhere (e.g. store-invalidation heuristics);
      // confirm against its consumers.
8417       current.waswritten|=1<<rs1[i-1];
8418     current.waswritten&=~(1<<rt1[i]);
8419     current.waswritten&=~(1<<rt2[i]);
8420     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8421       current.waswritten&=~(1<<rs1[i]);
8422
8423     /* Branch post-alloc */
8424     if(i>0)
8425     {
8426       current.was32=current.is32;
8427       current.wasdirty=current.dirty;
8428       switch(itype[i-1]) {
8429         case UJUMP:
8430           memcpy(&branch_regs[i-1],&current,sizeof(current));
8431           branch_regs[i-1].isconst=0;
8432           branch_regs[i-1].wasconst=0;
8433           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8434           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8435           alloc_cc(&branch_regs[i-1],i-1);
8436           dirty_reg(&branch_regs[i-1],CCREG);
8437           if(rt1[i-1]==31) { // JAL
8438             alloc_reg(&branch_regs[i-1],i-1,31);
8439             dirty_reg(&branch_regs[i-1],31);
8440             branch_regs[i-1].is32|=1LL<<31;
8441           }
8442           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8443           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8444           break;
8445         case RJUMP:
8446           memcpy(&branch_regs[i-1],&current,sizeof(current));
8447           branch_regs[i-1].isconst=0;
8448           branch_regs[i-1].wasconst=0;
8449           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8450           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8451           alloc_cc(&branch_regs[i-1],i-1);
8452           dirty_reg(&branch_regs[i-1],CCREG);
8453           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8454           if(rt1[i-1]!=0) { // JALR
8455             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8456             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8457             branch_regs[i-1].is32|=1LL<<rt1[i-1];
8458           }
8459           #ifdef USE_MINI_HT
8460           if(rs1[i-1]==31) { // JALR
8461             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8462             #ifndef HOST_IMM_ADDR32
8463             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8464             #endif
8465           }
8466           #endif
8467           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8468           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8469           break;
8470         case CJUMP:
8471           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8472           {
8473             alloc_cc(&current,i-1);
8474             dirty_reg(&current,CCREG);
8475             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8476                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8477               // The delay slot overwrote one of our conditions
8478               // Delay slot goes after the test (in order)
8479               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8480               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8481               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8482               current.u|=1;
8483               current.uu|=1;
8484               delayslot_alloc(&current,i);
8485               current.isconst=0;
8486             }
8487             else
8488             {
8489               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8490               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8491               // Alloc the branch condition registers
8492               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8493               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8494               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8495               {
8496                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8497                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8498               }
8499             }
8500             memcpy(&branch_regs[i-1],&current,sizeof(current));
8501             branch_regs[i-1].isconst=0;
8502             branch_regs[i-1].wasconst=0;
8503             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8504             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8505           }
8506           else
8507           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8508           {
8509             alloc_cc(&current,i-1);
8510             dirty_reg(&current,CCREG);
8511             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8512               // The delay slot overwrote the branch condition
8513               // Delay slot goes after the test (in order)
8514               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8515               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8516               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8517               current.u|=1;
8518               current.uu|=1;
8519               delayslot_alloc(&current,i);
8520               current.isconst=0;
8521             }
8522             else
8523             {
8524               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8525               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8526               // Alloc the branch condition register
8527               alloc_reg(&current,i-1,rs1[i-1]);
8528               if(!(current.is32>>rs1[i-1]&1))
8529               {
8530                 alloc_reg64(&current,i-1,rs1[i-1]);
8531               }
8532             }
8533             memcpy(&branch_regs[i-1],&current,sizeof(current));
8534             branch_regs[i-1].isconst=0;
8535             branch_regs[i-1].wasconst=0;
8536             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8537             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8538           }
8539           else
8540           // Alloc the delay slot in case the branch is taken
8541           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8542           {
8543             memcpy(&branch_regs[i-1],&current,sizeof(current));
8544             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8545             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8546             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8547             alloc_cc(&branch_regs[i-1],i);
8548             dirty_reg(&branch_regs[i-1],CCREG);
8549             delayslot_alloc(&branch_regs[i-1],i);
8550             branch_regs[i-1].isconst=0;
8551             alloc_reg(&current,i,CCREG); // Not taken path
8552             dirty_reg(&current,CCREG);
8553             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8554           }
8555           else
8556           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8557           {
8558             memcpy(&branch_regs[i-1],&current,sizeof(current));
8559             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8560             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8561             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8562             alloc_cc(&branch_regs[i-1],i);
8563             dirty_reg(&branch_regs[i-1],CCREG);
8564             delayslot_alloc(&branch_regs[i-1],i);
8565             branch_regs[i-1].isconst=0;
8566             alloc_reg(&current,i,CCREG); // Not taken path
8567             dirty_reg(&current,CCREG);
8568             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8569           }
8570           break;
8571         case SJUMP:
8572           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8573           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8574           {
8575             alloc_cc(&current,i-1);
8576             dirty_reg(&current,CCREG);
8577             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8578               // The delay slot overwrote the branch condition
8579               // Delay slot goes after the test (in order)
8580               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8581               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8582               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8583               current.u|=1;
8584               current.uu|=1;
8585               delayslot_alloc(&current,i);
8586               current.isconst=0;
8587             }
8588             else
8589             {
8590               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8591               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8592               // Alloc the branch condition register
8593               alloc_reg(&current,i-1,rs1[i-1]);
8594               if(!(current.is32>>rs1[i-1]&1))
8595               {
8596                 alloc_reg64(&current,i-1,rs1[i-1]);
8597               }
8598             }
8599             memcpy(&branch_regs[i-1],&current,sizeof(current));
8600             branch_regs[i-1].isconst=0;
8601             branch_regs[i-1].wasconst=0;
8602             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8603             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8604           }
8605           else
8606           // Alloc the delay slot in case the branch is taken
8607           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8608           {
8609             memcpy(&branch_regs[i-1],&current,sizeof(current));
8610             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8611             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8612             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8613             alloc_cc(&branch_regs[i-1],i);
8614             dirty_reg(&branch_regs[i-1],CCREG);
8615             delayslot_alloc(&branch_regs[i-1],i);
8616             branch_regs[i-1].isconst=0;
8617             alloc_reg(&current,i,CCREG); // Not taken path
8618             dirty_reg(&current,CCREG);
8619             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8620           }
8621           // FIXME: BLTZAL/BGEZAL
8622           if(opcode2[i-1]&0x10) { // BxxZAL
8623             alloc_reg(&branch_regs[i-1],i-1,31);
8624             dirty_reg(&branch_regs[i-1],31);
8625             branch_regs[i-1].is32|=1LL<<31;
8626           }
8627           break;
8628         case FJUMP:
8629           if(likely[i-1]==0) // BC1F/BC1T
8630           {
8631             alloc_cc(&current,i-1);
8632             dirty_reg(&current,CCREG);
8633             if(itype[i]==FCOMP) {
8634               // The delay slot overwrote the branch condition
8635               // Delay slot goes after the test (in order)
8636               delayslot_alloc(&current,i);
8637               current.isconst=0;
8638             }
8639             else
8640             {
8641               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8642               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8643               // Alloc the branch condition register
8644               alloc_reg(&current,i-1,FSREG);
8645             }
8646             memcpy(&branch_regs[i-1],&current,sizeof(current));
8647             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8648           }
8649           else // BC1FL/BC1TL
8650           {
8651             // Alloc the delay slot in case the branch is taken
8652             memcpy(&branch_regs[i-1],&current,sizeof(current));
8653             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8654             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8655             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8656             alloc_cc(&branch_regs[i-1],i);
8657             dirty_reg(&branch_regs[i-1],CCREG);
8658             delayslot_alloc(&branch_regs[i-1],i);
8659             branch_regs[i-1].isconst=0;
8660             alloc_reg(&current,i,CCREG); // Not taken path
8661             dirty_reg(&current,CCREG);
8662             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8663           }
8664           break;
8665       }
8666
8667       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8668       {
        // The previous instruction was an unconditional transfer (J/JR, or
        // per (source>>16)==0x1000 what looks like an unconditional
        // "beq $0,$0" encoding -- confirm), so the code after the delay slot
        // is only reachable as a call-return point or branch target.
8669         if(rt1[i-1]==31) // JAL/JALR
8670         {
8671           // Subroutine call will return here, don't alloc any registers
8672           current.is32=1;
8673           current.dirty=0;
8674           clear_all_regs(current.regmap);
8675           alloc_reg(&current,i,CCREG);
8676           dirty_reg(&current,CCREG);
8677         }
8678         else if(i+1<slen)
8679         {
8680           // Internal branch will jump here, match registers to caller
8681           current.is32=0x3FFFFFFFFLL;
8682           current.dirty=0;
8683           clear_all_regs(current.regmap);
8684           alloc_reg(&current,i,CCREG);
8685           dirty_reg(&current,CCREG);
          // Adopt the register map of the most recent branch that targets the
          // instruction after the delay slot (start+i*4+4)...
8686           for(j=i-1;j>=0;j--)
8687           {
8688             if(ba[j]==start+i*4+4) {
8689               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8690               current.is32=branch_regs[j].is32;
8691               current.dirty=branch_regs[j].dirty;
8692               break;
8693             }
8694           }
          // ...then intersect with every earlier branch to the same target:
          // mappings that disagree are dropped, is32/dirty are AND-merged.
8695           while(j>=0) {
8696             if(ba[j]==start+i*4+4) {
8697               for(hr=0;hr<HOST_REGS;hr++) {
8698                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8699                   current.regmap[hr]=-1;
8700                 }
8701                 current.is32&=branch_regs[j].is32;
8702                 current.dirty&=branch_regs[j].dirty;
8703               }
8704             }
8705             j--;
8706           }
8707         }
8708       }
8709     }
8710
8711     // Count cycles in between branches
    // ccadj[i] records the cycle count accumulated since the last branch or
    // syscall boundary, as seen on entry to instruction i.
8712     ccadj[i]=cc;
8713     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8714     {
      // Restart counting after any branch, or at a syscall/HLE call.
8715       cc=0;
8716     }
8717 #if !defined(DRC_DBG)
8718     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8719     {
8720       // GTE runs in parallel until accessed, divide by 2 for a rough guess
8721       cc+=gte_cycletab[source[i]&0x3f]/2;
8722     }
8723     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues
8724     {
8725       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
8726     }
8727     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8728     {
      // NOTE(review): three consecutive stores (with no branch target in
      // between, !bt[i]) are charged extra -- presumably modelling
      // write-buffer stalls; confirm against timing measurements.
8729       cc+=4;
8730     }
8731     else if(itype[i]==C2LS)
8732     {
8733       cc+=4;
8734     }
8735 #endif
8736     else
8737     {
8738       cc++;
8739     }
8740
8741     flush_dirty_uppers(&current);
    // Publish end-of-instruction state for normal instructions; is_ds[i]
    // marks delay slots, whose regs[i] state was filled in by the
    // delay-slot path earlier in this pass.
8742     if(!is_ds[i]) {
8743       regs[i].is32=current.is32;
8744       regs[i].dirty=current.dirty;
8745       regs[i].isconst=current.isconst;
8746       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
8747     }
    // A host register whose guest mapping changed across this instruction
    // cannot carry a known-constant flag in from the previous mapping.
8748     for(hr=0;hr<HOST_REGS;hr++) {
8749       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8750         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8751           regs[i].wasconst&=~(1<<hr);
8752         }
8753       }
8754     }
    // NOTE(review): the BTREG (branch target) mapping is dropped here --
    // presumably it is only meaningful within a branch sequence; confirm.
8755     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8756     regs[i].waswritten=current.waswritten;
8757   }
8758
8759   /* Pass 4 - Cull unused host registers */
8760
8761   uint64_t nr=0;
8762
8763   for (i=slen-1;i>=0;i--)
8764   {
8765     int hr;
8766     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8767     {
8768       if(ba[i]<start || ba[i]>=(start+slen*4))
8769       {
8770         // Branch out of this block, don't need anything
8771         nr=0;
8772       }
8773       else
8774       {
8775         // Internal branch
8776         // Need whatever matches the target
8777         nr=0;
8778         int t=(ba[i]-start)>>2;
8779         for(hr=0;hr<HOST_REGS;hr++)
8780         {
8781           if(regs[i].regmap_entry[hr]>=0) {
8782             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8783           }
8784         }
8785       }
8786       // Conditional branch may need registers for following instructions
8787       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8788       {
8789         if(i<slen-2) {
8790           nr|=needed_reg[i+2];
8791           for(hr=0;hr<HOST_REGS;hr++)
8792           {
8793             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8794             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8795           }
8796         }
8797       }
8798       // Don't need stuff which is overwritten
8799       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8800       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8801       // Merge in delay slot
8802       for(hr=0;hr<HOST_REGS;hr++)
8803       {
8804         if(!likely[i]) {
8805           // These are overwritten unless the branch is "likely"
8806           // and the delay slot is nullified if not taken
8807           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8808           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8809         }
8810         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8811         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8812         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8813         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8814         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8815         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8816         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8817         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8818         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
8819           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8820           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8821         }
8822         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
8823           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8824           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8825         }
8826         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8827           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8828           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8829         }
8830       }
8831     }
8832     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8833     {
8834       // SYSCALL instruction (software interrupt)
8835       nr=0;
8836     }
8837     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8838     {
8839       // ERET instruction (return from interrupt)
8840       nr=0;
8841     }
8842     else // Non-branch
8843     {
8844       if(i<slen-1) {
8845         for(hr=0;hr<HOST_REGS;hr++) {
8846           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
8847           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
8848           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8849           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8850         }
8851       }
8852     }
8853     for(hr=0;hr<HOST_REGS;hr++)
8854     {
8855       // Overwritten registers are not needed
8856       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8857       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8858       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8859       // Source registers are needed
8860       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8861       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8862       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
8863       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
8864       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8865       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8866       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8867       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8868       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
8869         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8870         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8871       }
8872       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
8873         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8874         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8875       }
8876       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
8877         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8878         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8879       }
8880       // Don't store a register immediately after writing it,
8881       // may prevent dual-issue.
8882       // But do so if this is a branch target, otherwise we
8883       // might have to load the register before the branch.
8884       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
8885         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
8886            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
8887           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8888           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8889         }
8890         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
8891            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
8892           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8893           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8894         }
8895       }
8896     }
8897     // Cycle count is needed at branches.  Assume it is needed at the target too.
8898     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
8899       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8900       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8901     }
8902     // Save it
8903     needed_reg[i]=nr;
8904
8905     // Deallocate unneeded registers
8906     for(hr=0;hr<HOST_REGS;hr++)
8907     {
8908       if(!((nr>>hr)&1)) {
8909         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
8910         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8911            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8912            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
8913         {
8914           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8915           {
8916             if(likely[i]) {
8917               regs[i].regmap[hr]=-1;
8918               regs[i].isconst&=~(1<<hr);
8919               if(i<slen-2) {
8920                 regmap_pre[i+2][hr]=-1;
8921                 regs[i+2].wasconst&=~(1<<hr);
8922               }
8923             }
8924           }
8925         }
8926         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8927         {
8928           int d1=0,d2=0,map=0,temp=0;
8929           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
8930           {
8931             d1=dep1[i+1];
8932             d2=dep2[i+1];
8933           }
8934           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
8935              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8936             map=INVCP;
8937           }
8938           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
8939              itype[i+1]==C1LS || itype[i+1]==C2LS)
8940             temp=FTEMP;
8941           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8942              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8943              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
8944              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
8945              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
8946              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
8947              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
8948              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
8949              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
8950              regs[i].regmap[hr]!=map )
8951           {
8952             regs[i].regmap[hr]=-1;
8953             regs[i].isconst&=~(1<<hr);
8954             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
8955                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
8956                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
8957                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
8958                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
8959                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
8960                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
8961                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
8962                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
8963                branch_regs[i].regmap[hr]!=map)
8964             {
8965               branch_regs[i].regmap[hr]=-1;
8966               branch_regs[i].regmap_entry[hr]=-1;
8967               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8968               {
8969                 if(!likely[i]&&i<slen-2) {
8970                   regmap_pre[i+2][hr]=-1;
8971                   regs[i+2].wasconst&=~(1<<hr);
8972                 }
8973               }
8974             }
8975           }
8976         }
8977         else
8978         {
8979           // Non-branch
8980           if(i>0)
8981           {
8982             int d1=0,d2=0,map=-1,temp=-1;
8983             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
8984             {
8985               d1=dep1[i];
8986               d2=dep2[i];
8987             }
8988             if(itype[i]==STORE || itype[i]==STORELR ||
8989                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8990               map=INVCP;
8991             }
8992             if(itype[i]==LOADLR || itype[i]==STORELR ||
8993                itype[i]==C1LS || itype[i]==C2LS)
8994               temp=FTEMP;
8995             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8996                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
8997                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
8998                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
8999                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9000                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9001             {
9002               if(i<slen-1&&!is_ds[i]) {
9003                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9004                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9005                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9006                 {
9007                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9008                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9009                 }
9010                 regmap_pre[i+1][hr]=-1;
9011                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9012                 regs[i+1].wasconst&=~(1<<hr);
9013               }
9014               regs[i].regmap[hr]=-1;
9015               regs[i].isconst&=~(1<<hr);
9016             }
9017           }
9018         }
9019       }
9020     }
9021   }
9022
9023   /* Pass 5 - Pre-allocate registers */
9024
9025   // If a register is allocated during a loop, try to allocate it for the
9026   // entire loop, if possible.  This avoids loading/storing registers
9027   // inside of the loop.
9028
9029   signed char f_regmap[HOST_REGS];
9030   clear_all_regs(f_regmap);
9031   for(i=0;i<slen-1;i++)
9032   {
9033     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9034     {
9035       if(ba[i]>=start && ba[i]<(start+i*4))
9036       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9037       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9038       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9039       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9040       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9041       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9042       {
9043         int t=(ba[i]-start)>>2;
9044         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9045         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
9046         for(hr=0;hr<HOST_REGS;hr++)
9047         {
9048           if(regs[i].regmap[hr]>64) {
9049             if(!((regs[i].dirty>>hr)&1))
9050               f_regmap[hr]=regs[i].regmap[hr];
9051             else f_regmap[hr]=-1;
9052           }
9053           else if(regs[i].regmap[hr]>=0) {
9054             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9055               // dealloc old register
9056               int n;
9057               for(n=0;n<HOST_REGS;n++)
9058               {
9059                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9060               }
9061               // and alloc new one
9062               f_regmap[hr]=regs[i].regmap[hr];
9063             }
9064           }
9065           if(branch_regs[i].regmap[hr]>64) {
9066             if(!((branch_regs[i].dirty>>hr)&1))
9067               f_regmap[hr]=branch_regs[i].regmap[hr];
9068             else f_regmap[hr]=-1;
9069           }
9070           else if(branch_regs[i].regmap[hr]>=0) {
9071             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9072               // dealloc old register
9073               int n;
9074               for(n=0;n<HOST_REGS;n++)
9075               {
9076                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9077               }
9078               // and alloc new one
9079               f_regmap[hr]=branch_regs[i].regmap[hr];
9080             }
9081           }
9082           if(ooo[i]) {
9083             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
9084               f_regmap[hr]=branch_regs[i].regmap[hr];
9085           }else{
9086             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
9087               f_regmap[hr]=branch_regs[i].regmap[hr];
9088           }
9089           // Avoid dirty->clean transition
9090           #ifdef DESTRUCTIVE_WRITEBACK
9091           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9092           #endif
9093           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9094           // case above, however it's always a good idea.  We can't hoist the
9095           // load if the register was already allocated, so there's no point
9096           // wasting time analyzing most of these cases.  It only "succeeds"
9097           // when the mapping was different and the load can be replaced with
9098           // a mov, which is of negligible benefit.  So such cases are
9099           // skipped below.
9100           if(f_regmap[hr]>0) {
9101             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
9102               int r=f_regmap[hr];
9103               for(j=t;j<=i;j++)
9104               {
9105                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9106                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9107                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9108                 if(r>63) {
9109                   // NB This can exclude the case where the upper-half
9110                   // register is lower numbered than the lower-half
9111                   // register.  Not sure if it's worth fixing...
9112                   if(get_reg(regs[j].regmap,r&63)<0) break;
9113                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9114                   if(regs[j].is32&(1LL<<(r&63))) break;
9115                 }
9116                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9117                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9118                   int k;
9119                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9120                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9121                     if(r>63) {
9122                       if(get_reg(regs[i].regmap,r&63)<0) break;
9123                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9124                     }
9125                     k=i;
9126                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9127                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9128                         //printf("no free regs for store %x\n",start+(k-1)*4);
9129                         break;
9130                       }
9131                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9132                         //printf("no-match due to different register\n");
9133                         break;
9134                       }
9135                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9136                         //printf("no-match due to branch\n");
9137                         break;
9138                       }
9139                       // call/ret fast path assumes no registers allocated
9140                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
9141                         break;
9142                       }
9143                       if(r>63) {
9144                         // NB This can exclude the case where the upper-half
9145                         // register is lower numbered than the lower-half
9146                         // register.  Not sure if it's worth fixing...
9147                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9148                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9149                       }
9150                       k--;
9151                     }
9152                     if(i<slen-1) {
9153                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9154                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9155                         //printf("bad match after branch\n");
9156                         break;
9157                       }
9158                     }
9159                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9160                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9161                       while(k<i) {
9162                         regs[k].regmap_entry[hr]=f_regmap[hr];
9163                         regs[k].regmap[hr]=f_regmap[hr];
9164                         regmap_pre[k+1][hr]=f_regmap[hr];
9165                         regs[k].wasdirty&=~(1<<hr);
9166                         regs[k].dirty&=~(1<<hr);
9167                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9168                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9169                         regs[k].wasconst&=~(1<<hr);
9170                         regs[k].isconst&=~(1<<hr);
9171                         k++;
9172                       }
9173                     }
9174                     else {
9175                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9176                       break;
9177                     }
9178                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9179                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9180                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9181                       regs[i].regmap_entry[hr]=f_regmap[hr];
9182                       regs[i].regmap[hr]=f_regmap[hr];
9183                       regs[i].wasdirty&=~(1<<hr);
9184                       regs[i].dirty&=~(1<<hr);
9185                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9186                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9187                       regs[i].wasconst&=~(1<<hr);
9188                       regs[i].isconst&=~(1<<hr);
9189                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9190                       branch_regs[i].wasdirty&=~(1<<hr);
9191                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9192                       branch_regs[i].regmap[hr]=f_regmap[hr];
9193                       branch_regs[i].dirty&=~(1<<hr);
9194                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9195                       branch_regs[i].wasconst&=~(1<<hr);
9196                       branch_regs[i].isconst&=~(1<<hr);
9197                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9198                         regmap_pre[i+2][hr]=f_regmap[hr];
9199                         regs[i+2].wasdirty&=~(1<<hr);
9200                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9201                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9202                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9203                       }
9204                     }
9205                   }
9206                   for(k=t;k<j;k++) {
9207                     // Alloc register clean at beginning of loop,
9208                     // but may dirty it in pass 6
9209                     regs[k].regmap_entry[hr]=f_regmap[hr];
9210                     regs[k].regmap[hr]=f_regmap[hr];
9211                     regs[k].dirty&=~(1<<hr);
9212                     regs[k].wasconst&=~(1<<hr);
9213                     regs[k].isconst&=~(1<<hr);
9214                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
9215                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
9216                       branch_regs[k].regmap[hr]=f_regmap[hr];
9217                       branch_regs[k].dirty&=~(1<<hr);
9218                       branch_regs[k].wasconst&=~(1<<hr);
9219                       branch_regs[k].isconst&=~(1<<hr);
9220                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
9221                         regmap_pre[k+2][hr]=f_regmap[hr];
9222                         regs[k+2].wasdirty&=~(1<<hr);
9223                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
9224                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
9225                       }
9226                     }
9227                     else
9228                     {
9229                       regmap_pre[k+1][hr]=f_regmap[hr];
9230                       regs[k+1].wasdirty&=~(1<<hr);
9231                     }
9232                   }
9233                   if(regs[j].regmap[hr]==f_regmap[hr])
9234                     regs[j].regmap_entry[hr]=f_regmap[hr];
9235                   break;
9236                 }
9237                 if(j==i) break;
9238                 if(regs[j].regmap[hr]>=0)
9239                   break;
9240                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9241                   //printf("no-match due to different register\n");
9242                   break;
9243                 }
9244                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9245                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9246                   break;
9247                 }
9248                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9249                 {
9250                   // Stop on unconditional branch
9251                   break;
9252                 }
9253                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
9254                 {
9255                   if(ooo[j]) {
9256                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
9257                       break;
9258                   }else{
9259                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
9260                       break;
9261                   }
9262                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
9263                     //printf("no-match due to different register (branch)\n");
9264                     break;
9265                   }
9266                 }
9267                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9268                   //printf("No free regs for store %x\n",start+j*4);
9269                   break;
9270                 }
9271                 if(f_regmap[hr]>=64) {
9272                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9273                     break;
9274                   }
9275                   else
9276                   {
9277                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9278                       break;
9279                     }
9280                   }
9281                 }
9282               }
9283             }
9284           }
9285         }
9286       }
9287     }else{
9288       // Non branch or undetermined branch target
9289       for(hr=0;hr<HOST_REGS;hr++)
9290       {
9291         if(hr!=EXCLUDE_REG) {
9292           if(regs[i].regmap[hr]>64) {
9293             if(!((regs[i].dirty>>hr)&1))
9294               f_regmap[hr]=regs[i].regmap[hr];
9295           }
9296           else if(regs[i].regmap[hr]>=0) {
9297             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9298               // dealloc old register
9299               int n;
9300               for(n=0;n<HOST_REGS;n++)
9301               {
9302                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9303               }
9304               // and alloc new one
9305               f_regmap[hr]=regs[i].regmap[hr];
9306             }
9307           }
9308         }
9309       }
9310       // Try to restore cycle count at branch targets
9311       if(bt[i]) {
9312         for(j=i;j<slen-1;j++) {
9313           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9314           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9315             //printf("no free regs for store %x\n",start+j*4);
9316             break;
9317           }
9318         }
9319         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9320           int k=i;
9321           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9322           while(k<j) {
9323             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9324             regs[k].regmap[HOST_CCREG]=CCREG;
9325             regmap_pre[k+1][HOST_CCREG]=CCREG;
9326             regs[k+1].wasdirty|=1<<HOST_CCREG;
9327             regs[k].dirty|=1<<HOST_CCREG;
9328             regs[k].wasconst&=~(1<<HOST_CCREG);
9329             regs[k].isconst&=~(1<<HOST_CCREG);
9330             k++;
9331           }
9332           regs[j].regmap_entry[HOST_CCREG]=CCREG;
9333         }
9334         // Work backwards from the branch target
9335         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9336         {
9337           //printf("Extend backwards\n");
9338           int k;
9339           k=i;
9340           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9341             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9342               //printf("no free regs for store %x\n",start+(k-1)*4);
9343               break;
9344             }
9345             k--;
9346           }
9347           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9348             //printf("Extend CC, %x ->\n",start+k*4);
9349             while(k<=i) {
9350               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9351               regs[k].regmap[HOST_CCREG]=CCREG;
9352               regmap_pre[k+1][HOST_CCREG]=CCREG;
9353               regs[k+1].wasdirty|=1<<HOST_CCREG;
9354               regs[k].dirty|=1<<HOST_CCREG;
9355               regs[k].wasconst&=~(1<<HOST_CCREG);
9356               regs[k].isconst&=~(1<<HOST_CCREG);
9357               k++;
9358             }
9359           }
9360           else {
9361             //printf("Fail Extend CC, %x ->\n",start+k*4);
9362           }
9363         }
9364       }
9365       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9366          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9367          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9368          itype[i]!=FCONV&&itype[i]!=FCOMP)
9369       {
9370         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9371       }
9372     }
9373   }
9374
9375   // Cache memory offset or tlb map pointer if a register is available
9376   #ifndef HOST_IMM_ADDR32
9377   #ifndef RAM_OFFSET
9378   if(0)
9379   #endif
9380   {
9381     int earliest_available[HOST_REGS];
9382     int loop_start[HOST_REGS];
9383     int score[HOST_REGS];
9384     int end[HOST_REGS];
9385     int reg=ROREG;
9386
9387     // Init
9388     for(hr=0;hr<HOST_REGS;hr++) {
9389       score[hr]=0;earliest_available[hr]=0;
9390       loop_start[hr]=MAXBLOCK;
9391     }
9392     for(i=0;i<slen-1;i++)
9393     {
9394       // Can't do anything if no registers are available
9395       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
9396         for(hr=0;hr<HOST_REGS;hr++) {
9397           score[hr]=0;earliest_available[hr]=i+1;
9398           loop_start[hr]=MAXBLOCK;
9399         }
9400       }
9401       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9402         if(!ooo[i]) {
9403           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
9404             for(hr=0;hr<HOST_REGS;hr++) {
9405               score[hr]=0;earliest_available[hr]=i+1;
9406               loop_start[hr]=MAXBLOCK;
9407             }
9408           }
9409         }else{
9410           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
9411             for(hr=0;hr<HOST_REGS;hr++) {
9412               score[hr]=0;earliest_available[hr]=i+1;
9413               loop_start[hr]=MAXBLOCK;
9414             }
9415           }
9416         }
9417       }
9418       // Mark unavailable registers
9419       for(hr=0;hr<HOST_REGS;hr++) {
9420         if(regs[i].regmap[hr]>=0) {
9421           score[hr]=0;earliest_available[hr]=i+1;
9422           loop_start[hr]=MAXBLOCK;
9423         }
9424         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9425           if(branch_regs[i].regmap[hr]>=0) {
9426             score[hr]=0;earliest_available[hr]=i+2;
9427             loop_start[hr]=MAXBLOCK;
9428           }
9429         }
9430       }
9431       // No register allocations after unconditional jumps
9432       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
9433       {
9434         for(hr=0;hr<HOST_REGS;hr++) {
9435           score[hr]=0;earliest_available[hr]=i+2;
9436           loop_start[hr]=MAXBLOCK;
9437         }
9438         i++; // Skip delay slot too
9439         //printf("skip delay slot: %x\n",start+i*4);
9440       }
9441       else
9442       // Possible match
9443       if(itype[i]==LOAD||itype[i]==LOADLR||
9444          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
9445         for(hr=0;hr<HOST_REGS;hr++) {
9446           if(hr!=EXCLUDE_REG) {
9447             end[hr]=i-1;
9448             for(j=i;j<slen-1;j++) {
9449               if(regs[j].regmap[hr]>=0) break;
9450               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9451                 if(branch_regs[j].regmap[hr]>=0) break;
9452                 if(ooo[j]) {
9453                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
9454                 }else{
9455                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
9456                 }
9457               }
9458               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
9459               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9460                 int t=(ba[j]-start)>>2;
9461                 if(t<j&&t>=earliest_available[hr]) {
9462                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
9463                     // Score a point for hoisting loop invariant
9464                     if(t<loop_start[hr]) loop_start[hr]=t;
9465                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
9466                     score[hr]++;
9467                     end[hr]=j;
9468                   }
9469                 }
9470                 else if(t<j) {
9471                   if(regs[t].regmap[hr]==reg) {
9472                     // Score a point if the branch target matches this register
9473                     score[hr]++;
9474                     end[hr]=j;
9475                   }
9476                 }
9477                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
9478                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
9479                   score[hr]++;
9480                   end[hr]=j;
9481                 }
9482               }
9483               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9484               {
9485                 // Stop on unconditional branch
9486                 break;
9487               }
9488               else
9489               if(itype[j]==LOAD||itype[j]==LOADLR||
9490                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
9491                 score[hr]++;
9492                 end[hr]=j;
9493               }
9494             }
9495           }
9496         }
9497         // Find highest score and allocate that register
9498         int maxscore=0;
9499         for(hr=0;hr<HOST_REGS;hr++) {
9500           if(hr!=EXCLUDE_REG) {
9501             if(score[hr]>score[maxscore]) {
9502               maxscore=hr;
9503               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
9504             }
9505           }
9506         }
9507         if(score[maxscore]>1)
9508         {
9509           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
9510           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
9511             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
9512             assert(regs[j].regmap[maxscore]<0);
9513             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
9514             regs[j].regmap[maxscore]=reg;
9515             regs[j].dirty&=~(1<<maxscore);
9516             regs[j].wasconst&=~(1<<maxscore);
9517             regs[j].isconst&=~(1<<maxscore);
9518             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9519               branch_regs[j].regmap[maxscore]=reg;
9520               branch_regs[j].wasdirty&=~(1<<maxscore);
9521               branch_regs[j].dirty&=~(1<<maxscore);
9522               branch_regs[j].wasconst&=~(1<<maxscore);
9523               branch_regs[j].isconst&=~(1<<maxscore);
9524               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
9525                 regmap_pre[j+2][maxscore]=reg;
9526                 regs[j+2].wasdirty&=~(1<<maxscore);
9527               }
9528               // loop optimization (loop_preload)
9529               int t=(ba[j]-start)>>2;
9530               if(t==loop_start[maxscore]) {
9531                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
9532                   regs[t].regmap_entry[maxscore]=reg;
9533               }
9534             }
9535             else
9536             {
9537               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
9538                 regmap_pre[j+1][maxscore]=reg;
9539                 regs[j+1].wasdirty&=~(1<<maxscore);
9540               }
9541             }
9542           }
9543           i=j-1;
9544           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
9545           for(hr=0;hr<HOST_REGS;hr++) {
9546             score[hr]=0;earliest_available[hr]=i+i;
9547             loop_start[hr]=MAXBLOCK;
9548           }
9549         }
9550       }
9551     }
9552   }
9553   #endif
9554
9555   // This allocates registers (if possible) one instruction prior
9556   // to use, which can avoid a load-use penalty on certain CPUs.
9557   for(i=0;i<slen-1;i++)
9558   {
9559     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9560     {
9561       if(!bt[i+1])
9562       {
9563         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
9564            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
9565         {
9566           if(rs1[i+1]) {
9567             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9568             {
9569               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9570               {
9571                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9572                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9573                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9574                 regs[i].isconst&=~(1<<hr);
9575                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9576                 constmap[i][hr]=constmap[i+1][hr];
9577                 regs[i+1].wasdirty&=~(1<<hr);
9578                 regs[i].dirty&=~(1<<hr);
9579               }
9580             }
9581           }
9582           if(rs2[i+1]) {
9583             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9584             {
9585               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9586               {
9587                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9588                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9589                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9590                 regs[i].isconst&=~(1<<hr);
9591                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9592                 constmap[i][hr]=constmap[i+1][hr];
9593                 regs[i+1].wasdirty&=~(1<<hr);
9594                 regs[i].dirty&=~(1<<hr);
9595               }
9596             }
9597           }
          // Preload target address for load instruction (non-constant)
          // If the next instruction is a LOAD whose base register (rs1) has no
          // host register, map rs1 into the destination's (rt1) host register
          // one slot early so the address is already live when the LOAD runs.
          if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
            if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
            {
              // Only if hr is free in slot i and not required at entry of i+1
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=rs1[i+1];
                regmap_pre[i+1][hr]=rs1[i+1];
                regs[i+1].regmap_entry[hr]=rs1[i+1];
                // Carry constant-tracking state from slot i+1 back to slot i
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
          // Load source into target register
          // Same early mapping for instructions flagged in lt1[] whose source
          // register is not yet allocated.
          if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
            if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
            {
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=rs1[i+1];
                regmap_pre[i+1][hr]=rs1[i+1];
                regs[i+1].regmap_entry[hr]=rs1[i+1];
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
          // Address for store instruction (non-constant)
          // Pick a host register for the store's address generation (AGEN) and
          // preload the base register into it one slot early.
          if(itype[i+1]==STORE||itype[i+1]==STORELR
             ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
            if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
              // Prefer a register free in both slot i and slot i+1
              hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
              if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
              else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
              assert(hr>=0);
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=rs1[i+1];
                regmap_pre[i+1][hr]=rs1[i+1];
                regs[i+1].regmap_entry[hr]=rs1[i+1];
                // Carry constant-tracking state back to slot i
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
          // Unaligned loads and coprocessor loads use FTEMP as scratch; try to
          // preload the base register into FTEMP's host register, or shuffle
          // FTEMP elsewhere to make room.
          if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
            if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
              int nr;
              hr=get_reg(regs[i+1].regmap,FTEMP);
              assert(hr>=0);
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                // FTEMP's host register is free in slot i: preload rs1 there
                regs[i].regmap[hr]=rs1[i+1];
                regmap_pre[i+1][hr]=rs1[i+1];
                regs[i+1].regmap_entry[hr]=rs1[i+1];
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
              else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
              {
                // move it to another register
                regs[i+1].regmap[hr]=-1;
                regmap_pre[i+2][hr]=-1;
                regs[i+1].regmap[nr]=FTEMP;
                regmap_pre[i+2][nr]=FTEMP;
                regs[i].regmap[nr]=rs1[i+1];
                regmap_pre[i+1][nr]=rs1[i+1];
                regs[i+1].regmap_entry[nr]=rs1[i+1];
                regs[i].isconst&=~(1<<nr);
                regs[i+1].isconst&=~(1<<nr);
                regs[i].dirty&=~(1<<nr);
                regs[i+1].wasdirty&=~(1<<nr);
                regs[i+1].dirty&=~(1<<nr);
                regs[i+2].wasdirty&=~(1<<nr);
              }
            }
          }
9688           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
9689             if(itype[i+1]==LOAD)
9690               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9691             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
9692               hr=get_reg(regs[i+1].regmap,FTEMP);
9693             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
9694               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9695               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9696             }
9697             if(hr>=0&&regs[i].regmap[hr]<0) {
9698               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9699               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9700                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9701                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9702                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9703                 regs[i].isconst&=~(1<<hr);
9704                 regs[i+1].wasdirty&=~(1<<hr);
9705                 regs[i].dirty&=~(1<<hr);
9706               }
9707             }
9708           }
9709         }
9710       }
9711     }
9712   }
9713
  /* Pass 6 - Optimize clean/dirty state */
  clean_registers(0,slen-1,1);

  /* Pass 7 - Identify 32-bit registers */
  // NOTE(review): in this build the pass only marks extra branch targets;
  // no 32-bit width analysis is visible here.
  for (i=slen-1;i>=0;i--)
  {
    if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
    {
      // Conditional branch
      if((source[i]>>16)!=0x1000&&i<slen-2) {
        // Mark this address as a branch target since it may be called
        // upon return from interrupt
        bt[i+2]=1;
      }
    }
  }

  if(itype[slen-1]==SPAN) {
    bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
  }
9734
9735 #ifdef DISASM
9736   /* Debug/disassembly */
9737   for(i=0;i<slen;i++)
9738   {
9739     printf("U:");
9740     int r;
9741     for(r=1;r<=CCREG;r++) {
9742       if((unneeded_reg[i]>>r)&1) {
9743         if(r==HIREG) printf(" HI");
9744         else if(r==LOREG) printf(" LO");
9745         else printf(" r%d",r);
9746       }
9747     }
9748     printf("\n");
9749     #if defined(__i386__) || defined(__x86_64__)
9750     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9751     #endif
9752     #ifdef __arm__
9753     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9754     #endif
9755     printf("needs: ");
9756     if(needed_reg[i]&1) printf("eax ");
9757     if((needed_reg[i]>>1)&1) printf("ecx ");
9758     if((needed_reg[i]>>2)&1) printf("edx ");
9759     if((needed_reg[i]>>3)&1) printf("ebx ");
9760     if((needed_reg[i]>>5)&1) printf("ebp ");
9761     if((needed_reg[i]>>6)&1) printf("esi ");
9762     if((needed_reg[i]>>7)&1) printf("edi ");
9763     printf("\n");
9764     #if defined(__i386__) || defined(__x86_64__)
9765     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9766     printf("dirty: ");
9767     if(regs[i].wasdirty&1) printf("eax ");
9768     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9769     if((regs[i].wasdirty>>2)&1) printf("edx ");
9770     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9771     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9772     if((regs[i].wasdirty>>6)&1) printf("esi ");
9773     if((regs[i].wasdirty>>7)&1) printf("edi ");
9774     #endif
9775     #ifdef __arm__
9776     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9777     printf("dirty: ");
9778     if(regs[i].wasdirty&1) printf("r0 ");
9779     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9780     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9781     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9782     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9783     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9784     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9785     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9786     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9787     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9788     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9789     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9790     #endif
9791     printf("\n");
9792     disassemble_inst(i);
9793     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9794     #if defined(__i386__) || defined(__x86_64__)
9795     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9796     if(regs[i].dirty&1) printf("eax ");
9797     if((regs[i].dirty>>1)&1) printf("ecx ");
9798     if((regs[i].dirty>>2)&1) printf("edx ");
9799     if((regs[i].dirty>>3)&1) printf("ebx ");
9800     if((regs[i].dirty>>5)&1) printf("ebp ");
9801     if((regs[i].dirty>>6)&1) printf("esi ");
9802     if((regs[i].dirty>>7)&1) printf("edi ");
9803     #endif
9804     #ifdef __arm__
9805     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9806     if(regs[i].dirty&1) printf("r0 ");
9807     if((regs[i].dirty>>1)&1) printf("r1 ");
9808     if((regs[i].dirty>>2)&1) printf("r2 ");
9809     if((regs[i].dirty>>3)&1) printf("r3 ");
9810     if((regs[i].dirty>>4)&1) printf("r4 ");
9811     if((regs[i].dirty>>5)&1) printf("r5 ");
9812     if((regs[i].dirty>>6)&1) printf("r6 ");
9813     if((regs[i].dirty>>7)&1) printf("r7 ");
9814     if((regs[i].dirty>>8)&1) printf("r8 ");
9815     if((regs[i].dirty>>9)&1) printf("r9 ");
9816     if((regs[i].dirty>>10)&1) printf("r10 ");
9817     if((regs[i].dirty>>12)&1) printf("r12 ");
9818     #endif
9819     printf("\n");
9820     if(regs[i].isconst) {
9821       printf("constants: ");
9822       #if defined(__i386__) || defined(__x86_64__)
9823       if(regs[i].isconst&1) printf("eax=%x ",(u_int)constmap[i][0]);
9824       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(u_int)constmap[i][1]);
9825       if((regs[i].isconst>>2)&1) printf("edx=%x ",(u_int)constmap[i][2]);
9826       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(u_int)constmap[i][3]);
9827       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(u_int)constmap[i][5]);
9828       if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]);
9829       if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]);
9830       #endif
9831       #ifdef __arm__
9832       int r;
9833       for (r = 0; r < ARRAY_SIZE(constmap[i]); r++)
9834         if ((regs[i].isconst >> r) & 1)
9835           printf(" r%d=%x", r, (u_int)constmap[i][r]);
9836       #endif
9837       printf("\n");
9838     }
9839     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9840       #if defined(__i386__) || defined(__x86_64__)
9841       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
9842       if(branch_regs[i].dirty&1) printf("eax ");
9843       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
9844       if((branch_regs[i].dirty>>2)&1) printf("edx ");
9845       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
9846       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
9847       if((branch_regs[i].dirty>>6)&1) printf("esi ");
9848       if((branch_regs[i].dirty>>7)&1) printf("edi ");
9849       #endif
9850       #ifdef __arm__
9851       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
9852       if(branch_regs[i].dirty&1) printf("r0 ");
9853       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
9854       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
9855       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
9856       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
9857       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
9858       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
9859       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
9860       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
9861       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
9862       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
9863       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
9864       #endif
9865     }
9866   }
9867 #endif // DISASM
9868
  /* Pass 8 - Assembly */
  // Reset per-block emission state before generating host code.
  linkcount=0;stubcount=0;
  ds=0;is_delayslot=0;
  cop1_usable=0;
  uint64_t is32_pre=0;  // is32 state carried between iterations for writeback
  u_int dirty_pre=0;    // dirty state carried between iterations for writeback
  void *beginning=start_block();
  if((u_int)addr&1) {
    // Odd address flags a page-spanning delay slot entry point
    ds=1;
    pagespan_ds();
  }
  void *instr_addr0_override = NULL;

  if (start == 0x80030000) {
    // nasty hack for fastbios thing
    // override block entry to this code
    instr_addr0_override = out;
    emit_movimm(start,0);
    // abuse io address var as a flag that we
    // have already returned here once
    emit_readword(&address,1);
    emit_writeword(0,&pcaddr);
    emit_writeword(0,&address);
    emit_cmp(0,1);
    // On the second visit, bail out to the dispatcher instead of re-running
    emit_jne(new_dyna_leave);
  }
9895   for(i=0;i<slen;i++)
9896   {
9897     //if(ds) printf("ds: ");
9898     disassemble_inst(i);
9899     if(ds) {
9900       ds=0; // Skip delay slot
9901       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
9902       instr_addr[i] = NULL;
9903     } else {
9904       speculate_register_values(i);
9905       #ifndef DESTRUCTIVE_WRITEBACK
9906       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
9907       {
9908         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
9909               unneeded_reg[i],unneeded_reg_upper[i]);
9910       }
9911       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
9912         is32_pre=branch_regs[i].is32;
9913         dirty_pre=branch_regs[i].dirty;
9914       }else{
9915         is32_pre=regs[i].is32;
9916         dirty_pre=regs[i].dirty;
9917       }
9918       #endif
9919       // write back
9920       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
9921       {
9922         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
9923                       unneeded_reg[i],unneeded_reg_upper[i]);
9924         loop_preload(regmap_pre[i],regs[i].regmap_entry);
9925       }
9926       // branch target entry point
9927       instr_addr[i] = out;
9928       assem_debug("<->\n");
9929       drc_dbg_emit_do_cmp(i);
9930
9931       // load regs
9932       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
9933         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
9934       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
9935       address_generation(i,&regs[i],regs[i].regmap_entry);
9936       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
9937       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9938       {
9939         // Load the delay slot registers if necessary
9940         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
9941           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
9942         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
9943           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
9944         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
9945           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
9946       }
9947       else if(i+1<slen)
9948       {
9949         // Preload registers for following instruction
9950         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
9951           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
9952             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
9953         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
9954           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
9955             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
9956       }
9957       // TODO: if(is_ooo(i)) address_generation(i+1);
9958       if(itype[i]==CJUMP||itype[i]==FJUMP)
9959         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
9960       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
9961         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
9962       if(bt[i]) cop1_usable=0;
9963       // assemble
9964       switch(itype[i]) {
9965         case ALU:
9966           alu_assemble(i,&regs[i]);break;
9967         case IMM16:
9968           imm16_assemble(i,&regs[i]);break;
9969         case SHIFT:
9970           shift_assemble(i,&regs[i]);break;
9971         case SHIFTIMM:
9972           shiftimm_assemble(i,&regs[i]);break;
9973         case LOAD:
9974           load_assemble(i,&regs[i]);break;
9975         case LOADLR:
9976           loadlr_assemble(i,&regs[i]);break;
9977         case STORE:
9978           store_assemble(i,&regs[i]);break;
9979         case STORELR:
9980           storelr_assemble(i,&regs[i]);break;
9981         case COP0:
9982           cop0_assemble(i,&regs[i]);break;
9983         case COP1:
9984           cop1_assemble(i,&regs[i]);break;
9985         case C1LS:
9986           c1ls_assemble(i,&regs[i]);break;
9987         case COP2:
9988           cop2_assemble(i,&regs[i]);break;
9989         case C2LS:
9990           c2ls_assemble(i,&regs[i]);break;
9991         case C2OP:
9992           c2op_assemble(i,&regs[i]);break;
9993         case FCONV:
9994           fconv_assemble(i,&regs[i]);break;
9995         case FLOAT:
9996           float_assemble(i,&regs[i]);break;
9997         case FCOMP:
9998           fcomp_assemble(i,&regs[i]);break;
9999         case MULTDIV:
10000           multdiv_assemble(i,&regs[i]);break;
10001         case MOV:
10002           mov_assemble(i,&regs[i]);break;
10003         case SYSCALL:
10004           syscall_assemble(i,&regs[i]);break;
10005         case HLECALL:
10006           hlecall_assemble(i,&regs[i]);break;
10007         case INTCALL:
10008           intcall_assemble(i,&regs[i]);break;
10009         case UJUMP:
10010           ujump_assemble(i,&regs[i]);ds=1;break;
10011         case RJUMP:
10012           rjump_assemble(i,&regs[i]);ds=1;break;
10013         case CJUMP:
10014           cjump_assemble(i,&regs[i]);ds=1;break;
10015         case SJUMP:
10016           sjump_assemble(i,&regs[i]);ds=1;break;
10017         case FJUMP:
10018           fjump_assemble(i,&regs[i]);ds=1;break;
10019         case SPAN:
10020           pagespan_assemble(i,&regs[i]);break;
10021       }
10022       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10023         literal_pool(1024);
10024       else
10025         literal_pool_jumpover(256);
10026     }
10027   }
  //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
  // If the block did not end with an unconditional branch,
  // add a jump to the next instruction.
  if(i>1) {
    if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
      assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
      assert(i==slen);
      if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
        // Plain fallthrough: write back registers, charge cycles, jump on
        store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
        if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
          emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
      }
      else if(!likely[i-2])
      {
        // Last instruction was the delay slot of a conditional branch:
        // use the branch's register state
        store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
        assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
      }
      else
      {
        // Likely branch: delay slot only executes on the taken path
        store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
        assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
      }
      add_to_linker(out,start+i*4,0);
      emit_jmp(0);
    }
  }
  else
  {
    // Single-instruction block
    assert(i>0);
    assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
    store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
    if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
      emit_loadreg(CCREG,HOST_CCREG);
    emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
    add_to_linker(out,start+i*4,0);
    emit_jmp(0);
  }
10066
10067   // TODO: delay slot stubs?
10068   // Stubs
10069   for(i=0;i<stubcount;i++)
10070   {
10071     switch(stubs[i].type)
10072     {
10073       case LOADB_STUB:
10074       case LOADH_STUB:
10075       case LOADW_STUB:
10076       case LOADD_STUB:
10077       case LOADBU_STUB:
10078       case LOADHU_STUB:
10079         do_readstub(i);break;
10080       case STOREB_STUB:
10081       case STOREH_STUB:
10082       case STOREW_STUB:
10083       case STORED_STUB:
10084         do_writestub(i);break;
10085       case CC_STUB:
10086         do_ccstub(i);break;
10087       case INVCODE_STUB:
10088         do_invstub(i);break;
10089       case FP_STUB:
10090         do_cop1stub(i);break;
10091       case STORELR_STUB:
10092         do_unalignedwritestub(i);break;
10093     }
10094   }
10095
  // Apply the fastbios entry override, if one was emitted above
  if (instr_addr0_override)
    instr_addr[0] = instr_addr0_override;

  /* Pass 9 - Linker */
  // Patch every branch recorded in link_addr[] to its final destination.
  for(i=0;i<linkcount;i++)
  {
    assem_debug("%p -> %8x\n",link_addr[i].addr,link_addr[i].target);
    literal_pool(64);
    if (!link_addr[i].ext)
    {
      // Target lies outside this block: emit an exit trampoline, and if the
      // target block is already compiled, branch straight to it and register
      // the link so it can be unlinked on invalidation.
      void *stub = out;
      void *addr = check_addr(link_addr[i].target);
      emit_extjump(link_addr[i].addr, link_addr[i].target);
      if (addr) {
        set_jump_target(link_addr[i].addr, addr);
        add_link(link_addr[i].target,stub);
      }
      else
        set_jump_target(link_addr[i].addr, stub);
    }
    else
    {
      // Internal branch
      int target=(link_addr[i].target-start)>>2;
      assert(target>=0&&target<slen);
      assert(instr_addr[target]);
      //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
      //set_jump_target_fillslot(link_addr[i].addr,instr_addr[target],link_addr[i].ext>>1);
      //#else
      set_jump_target(link_addr[i].addr, instr_addr[target]);
      //#endif
    }
  }
10129   // External Branch Targets (jump_in)
10130   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10131   for(i=0;i<slen;i++)
10132   {
10133     if(bt[i]||i==0)
10134     {
10135       if(instr_addr[i]) // TODO - delay slots (=null)
10136       {
10137         u_int vaddr=start+i*4;
10138         u_int page=get_page(vaddr);
10139         u_int vpage=get_vpage(vaddr);
10140         literal_pool(256);
10141         {
10142           assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10143           assem_debug("jump_in: %x\n",start+i*4);
10144           ll_add(jump_dirty+vpage,vaddr,out);
10145           void *entry_point = do_dirty_stub(i);
10146           ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
10147           // If there was an existing entry in the hash table,
10148           // replace it with the new address.
10149           // Don't add new entries.  We'll insert the
10150           // ones that actually get used in check_addr().
10151           struct ht_entry *ht_bin = hash_table_get(vaddr);
10152           if (ht_bin->vaddr[0] == vaddr)
10153             ht_bin->tcaddr[0] = entry_point;
10154           if (ht_bin->vaddr[1] == vaddr)
10155             ht_bin->tcaddr[1] = entry_point;
10156         }
10157       }
10158     }
10159   }
10160   // Write out the literal pool if necessary
10161   literal_pool(0);
10162   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10163   // Align code
10164   if(((u_int)out)&7) emit_addnop(13);
10165   #endif
10166   assert((u_int)out-(u_int)beginning<MAX_OUTPUT_BLOCK_SIZE);
10167   //printf("shadow buffer: %p-%p\n",copy,(u_char *)copy+slen*4);
10168   memcpy(copy,source,slen*4);
10169   copy+=slen*4;
10170
10171   end_block(beginning);
10172
10173   // If we're within 256K of the end of the buffer,
10174   // start over from the beginning. (Is 256K enough?)
10175   if (out > translation_cache+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE)
10176     out = translation_cache;
10177
10178   // Trap writes to any of the pages we compiled
10179   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10180     invalid_code[i]=0;
10181   }
10182   inv_code_start=inv_code_end=~0;
10183
10184   // for PCSX we need to mark all mirrors too
10185   if(get_page(start)<(RAM_SIZE>>12))
10186     for(i=start>>12;i<=(start+slen*4)>>12;i++)
10187       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
10188       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
10189       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
10190
10191   /* Pass 10 - Free memory by expiring oldest blocks */
10192
10193   int end=(((out-translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535;
10194   while(expirep!=end)
10195   {
10196     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10197     uintptr_t base=(uintptr_t)translation_cache+((expirep>>13)<<shift); // Base address of this block
10198     inv_debug("EXP: Phase %d\n",expirep);
10199     switch((expirep>>11)&3)
10200     {
10201       case 0:
10202         // Clear jump_in and jump_dirty
10203         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10204         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10205         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10206         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10207         break;
10208       case 1:
10209         // Clear pointers
10210         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10211         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10212         break;
10213       case 2:
10214         // Clear hash table
10215         for(i=0;i<32;i++) {
10216           struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
10217           if (((uintptr_t)ht_bin->tcaddr[1]>>shift) == (base>>shift) ||
10218              (((uintptr_t)ht_bin->tcaddr[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10219             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
10220             ht_bin->vaddr[1] = -1;
10221             ht_bin->tcaddr[1] = NULL;
10222           }
10223           if (((uintptr_t)ht_bin->tcaddr[0]>>shift) == (base>>shift) ||
10224              (((uintptr_t)ht_bin->tcaddr[0]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10225             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
10226             ht_bin->vaddr[0] = ht_bin->vaddr[1];
10227             ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
10228             ht_bin->vaddr[1] = -1;
10229             ht_bin->tcaddr[1] = NULL;
10230           }
10231         }
10232         break;
10233       case 3:
10234         // Clear jump_out
10235         #ifdef __arm__
10236         if((expirep&2047)==0)
10237           do_clear_cache();
10238         #endif
10239         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10240         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10241         break;
10242     }
10243     expirep=(expirep+1)&65535;
10244   }
10245   return 0;
10246 }
10247
10248 // vim:shiftwidth=2:expandtab