libpcsxcore/new_dynarec/new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h" //emulator interface
39 #include "emu_if.h" //emulator interface
40
41 #ifndef ARRAY_SIZE
42 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
43 #endif
44
45 //#define DISASM
46 //#define assem_debug printf
47 //#define inv_debug printf
48 #define assem_debug(...)
49 #define inv_debug(...)
50
51 #ifdef __i386__
52 #include "assem_x86.h"
53 #endif
54 #ifdef __x86_64__
55 #include "assem_x64.h"
56 #endif
57 #ifdef __arm__
58 #include "assem_arm.h"
59 #endif
60
61 #define MAXBLOCK 4096
62 #define MAX_OUTPUT_BLOCK_SIZE 262144
63
64 // stubs
65 enum stub_type {
66   CC_STUB = 1,
67   FP_STUB = 2,
68   LOADB_STUB = 3,
69   LOADH_STUB = 4,
70   LOADW_STUB = 5,
71   LOADD_STUB = 6,
72   LOADBU_STUB = 7,
73   LOADHU_STUB = 8,
74   STOREB_STUB = 9,
75   STOREH_STUB = 10,
76   STOREW_STUB = 11,
77   STORED_STUB = 12,
78   STORELR_STUB = 13,
79   INVCODE_STUB = 14,
80 };
81
82 struct regstat
83 {
84   signed char regmap_entry[HOST_REGS];
85   signed char regmap[HOST_REGS];
86   uint64_t was32;
87   uint64_t is32;
88   uint64_t wasdirty;
89   uint64_t dirty;
90   uint64_t u;
91   uint64_t uu;
92   u_int wasconst;
93   u_int isconst;
94   u_int loadedconst;             // host regs that have constants loaded
95   u_int waswritten;              // MIPS regs that were used as store base before
96 };
97
98 // note: asm depends on this layout
99 struct ll_entry
100 {
101   u_int vaddr;
102   u_int reg_sv_flags;
103   void *addr;
104   struct ll_entry *next;
105 };
106
107 struct ht_entry
108 {
109   u_int vaddr[2];
110   void *tcaddr[2];
111 };
112
113 struct code_stub
114 {
115   enum stub_type type;
116   void *addr;
117   void *retaddr;
118   u_int a;
119   uintptr_t b;
120   uintptr_t c;
121   u_int d;
122   u_int e;
123 };
124
125   // used by asm:
126   u_char *out;
127   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
128   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
129   struct ll_entry *jump_dirty[4096];
130
131   static struct ll_entry *jump_out[4096];
132   static u_int start;
133   static u_int *source;
134   static char insn[MAXBLOCK][10];
135   static u_char itype[MAXBLOCK];
136   static u_char opcode[MAXBLOCK];
137   static u_char opcode2[MAXBLOCK];
138   static u_char bt[MAXBLOCK];
139   static u_char rs1[MAXBLOCK];
140   static u_char rs2[MAXBLOCK];
141   static u_char rt1[MAXBLOCK];
142   static u_char rt2[MAXBLOCK];
143   static u_char us1[MAXBLOCK];
144   static u_char us2[MAXBLOCK];
145   static u_char dep1[MAXBLOCK];
146   static u_char dep2[MAXBLOCK];
147   static u_char lt1[MAXBLOCK];
148   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
149   static uint64_t gte_rt[MAXBLOCK];
150   static uint64_t gte_unneeded[MAXBLOCK];
151   static u_int smrv[32]; // speculated MIPS register values
152   static u_int smrv_strong; // mask of regs that are likely to have correct values
153   static u_int smrv_weak; // same, but somewhat less likely
154   static u_int smrv_strong_next; // same, but after current insn executes
155   static u_int smrv_weak_next;
156   static int imm[MAXBLOCK];
157   static u_int ba[MAXBLOCK];
158   static char likely[MAXBLOCK];
159   static char is_ds[MAXBLOCK];
160   static char ooo[MAXBLOCK];
161   static uint64_t unneeded_reg[MAXBLOCK];
162   static uint64_t unneeded_reg_upper[MAXBLOCK];
163   static uint64_t branch_unneeded_reg[MAXBLOCK];
164   static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
165   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
166   static uint64_t current_constmap[HOST_REGS];
167   static uint64_t constmap[MAXBLOCK][HOST_REGS];
168   static struct regstat regs[MAXBLOCK];
169   static struct regstat branch_regs[MAXBLOCK];
170   static signed char minimum_free_regs[MAXBLOCK];
171   static u_int needed_reg[MAXBLOCK];
172   static u_int wont_dirty[MAXBLOCK];
173   static u_int will_dirty[MAXBLOCK];
174   static int ccadj[MAXBLOCK];
175   static int slen;
176   static void *instr_addr[MAXBLOCK];
177   static u_int link_addr[MAXBLOCK][3];
178   static int linkcount;
179   static struct code_stub stubs[MAXBLOCK*3];
180   static int stubcount;
181   static u_int literals[1024][2];
182   static int literalcount;
183   static int is_delayslot;
184   static int cop1_usable;
185   static char shadow[1048576]  __attribute__((aligned(16)));
186   static void *copy;
187   static int expirep;
188   static u_int stop_after_jal;
189 #ifndef RAM_FIXED
190   static u_int ram_offset;
191 #else
192   static const u_int ram_offset=0;
193 #endif
194
195   int new_dynarec_hacks;
196   int new_dynarec_did_compile;
197   extern u_char restore_candidate[512];
198   extern int cycle_count;
199
200   /* registers that may be allocated */
201   /* 1-31 gpr */
202 #define HIREG 32 // hi
203 #define LOREG 33 // lo
204 #define FSREG 34 // FPU status (FCSR)
205 #define CSREG 35 // Coprocessor status
206 #define CCREG 36 // Cycle count
207 #define INVCP 37 // Pointer to invalid_code
208 //#define MMREG 38 // Pointer to memory_map
209 #define ROREG 39 // ram offset (if rdram!=0x80000000)
210 #define TEMPREG 40
211 #define FTEMP 40 // FPU temporary register
212 #define PTEMP 41 // Prefetch temporary register
213 //#define TLREG 42 // TLB mapping offset
214 #define RHASH 43 // Return address hash
215 #define RHTBL 44 // Return address hash table address
216 #define RTEMP 45 // JR/JALR address register
217 #define MAXREG 45
218 #define AGEN1 46 // Address generation temporary register
219 //#define AGEN2 47 // Address generation temporary register
220 //#define MGEN1 48 // Maptable address generation temporary register
221 //#define MGEN2 49 // Maptable address generation temporary register
222 #define BTREG 50 // Branch target temporary register
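// Note (inferred from usage throughout this file, not from a spec): regmap[]
// entries use the plain register number for the low 32 bits of a value and
// that number with bit 6 set (|64) for its upper half, which is why lookups
// mask with &63, why get_reg(...,rt1[i]|64) finds the high half, and why
// values >=64 are treated as upper halves in flush_dirty_uppers() below.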
223
224   /* instruction types */
225 #define NOP 0     // No operation
226 #define LOAD 1    // Load
227 #define STORE 2   // Store
228 #define LOADLR 3  // Unaligned load
229 #define STORELR 4 // Unaligned store
230 #define MOV 5     // Move
231 #define ALU 6     // Arithmetic/logic
232 #define MULTDIV 7 // Multiply/divide
233 #define SHIFT 8   // Shift by register
234 #define SHIFTIMM 9// Shift by immediate
235 #define IMM16 10  // 16-bit immediate
236 #define RJUMP 11  // Unconditional jump to register
237 #define UJUMP 12  // Unconditional jump
238 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
239 #define SJUMP 14  // Conditional branch (regimm format)
240 #define COP0 15   // Coprocessor 0
241 #define COP1 16   // Coprocessor 1
242 #define C1LS 17   // Coprocessor 1 load/store
243 #define FJUMP 18  // Conditional branch (floating point)
244 #define FLOAT 19  // Floating point unit
245 #define FCONV 20  // Convert integer to float
246 #define FCOMP 21  // Floating point compare (sets FSREG)
247 #define SYSCALL 22// SYSCALL
248 #define OTHER 23  // Other
249 #define SPAN 24   // Branch/delay slot spans 2 pages
250 #define NI 25     // Not implemented
251 #define HLECALL 26// PCSX fake opcodes for HLE
252 #define COP2 27   // Coprocessor 2 move
253 #define C2LS 28   // Coprocessor 2 load/store
254 #define C2OP 29   // Coprocessor 2 operation
255 #define INTCALL 30// Call interpreter to handle rare corner cases
256
257   /* branch codes */
258 #define TAKEN 1
259 #define NOTTAKEN 2
260 #define NULLDS 3
261
262 // asm linkage
263 int new_recompile_block(int addr);
264 void *get_addr_ht(u_int vaddr);
265 void invalidate_block(u_int block);
266 void invalidate_addr(u_int addr);
267 void remove_hash(int vaddr);
268 void dyna_linker();
269 void dyna_linker_ds();
270 void verify_code();
271 void verify_code_vm();
272 void verify_code_ds();
273 void cc_interrupt();
274 void fp_exception();
275 void fp_exception_ds();
276 void jump_syscall_hle();
277 void jump_hlecall();
278 void jump_intcall();
279 void new_dyna_leave();
280
281 // Needed by assembler
282 static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
283 static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
284 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
285 static void load_all_regs(signed char i_regmap[]);
286 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
287 static void load_regs_entry(int t);
288 static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
289
290 static int verify_dirty(u_int *ptr);
291 static int get_final_value(int hr, int i, int *value);
292 static void add_stub(enum stub_type type, void *addr, void *retaddr,
293   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
294 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
295   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist);
296 static void add_to_linker(int addr,int target,int ext);
297
298 static int tracedebug=0;
299
300 static void mprotect_w_x(void *start, void *end, int is_x)
301 {
302 #ifdef NO_WRITE_EXEC
303   #if defined(VITA)
304   // *Open* enables write on all memory that was
305   // allocated by sceKernelAllocMemBlockForVM()?
306   if (is_x)
307     sceKernelCloseVMDomain();
308   else
309     sceKernelOpenVMDomain();
310   #else
311   u_long mstart = (u_long)start & ~4095ul;
312   u_long mend = (u_long)end;
313   if (mprotect((void *)mstart, mend - mstart,
314                PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
315     SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
316   #endif
317 #endif
318 }
319
320 static void start_tcache_write(void *start, void *end)
321 {
322   mprotect_w_x(start, end, 0);
323 }
324
325 static void end_tcache_write(void *start, void *end)
326 {
327 #ifdef __arm__
328   size_t len = (char *)end - (char *)start;
329   #if   defined(__BLACKBERRY_QNX__)
330   msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
331   #elif defined(__MACH__)
332   sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
333   #elif defined(VITA)
334   sceKernelSyncVMDomain(sceBlock, start, len);
335   #elif defined(_3DS)
336   ctr_flush_invalidate_cache();
337   #else
338   __clear_cache(start, end);
339   #endif
340   (void)len;
341 #endif
342
343   mprotect_w_x(start, end, 1);
344 }
345
346 static void *start_block(void)
347 {
348   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
349   if (end > (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2))
350     end = (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2);
351   start_tcache_write(out, end);
352   return out;
353 }
354
355 static void end_block(void *start)
356 {
357   end_tcache_write(start, out);
358 }
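// Rough usage pattern for the helpers above: a compilation pass would call
// start_block() before emitting code at 'out' and end_block() with the
// returned pointer when done.  On NO_WRITE_EXEC platforms this toggles the
// region between writable and executable, and on ARM targets
// end_tcache_write() also flushes/invalidates the instruction cache for the
// freshly written range.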
359
360 //#define DEBUG_CYCLE_COUNT 1
361
362 #define NO_CYCLE_PENALTY_THR 12
363
364 int cycle_multiplier; // 100 for 1.0
365
366 static int CLOCK_ADJUST(int x)
367 {
368   int s=(x>>31)|1;
369   return (x * cycle_multiplier + s * 50) / 100;
370 }
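// Worked example (illustration only): with cycle_multiplier==150 (i.e. 1.5x),
//   CLOCK_ADJUST(10)  == (10*150 + 50)/100  == 15
//   CLOCK_ADJUST(-10) == (-10*150 - 50)/100 == -15
// s is +1 or -1 depending on the sign of x, so the s*50 term rounds the scaled
// value to the nearest integer, away from zero on ties.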
371
372 static u_int get_page(u_int vaddr)
373 {
374   u_int page=vaddr&~0xe0000000;
375   if (page < 0x1000000)
376     page &= ~0x0e00000; // RAM mirrors
377   page>>=12;
378   if(page>2048) page=2048+(page&2047);
379   return page;
380 }
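// Example mappings (illustration only, following the masks above):
//   get_page(0x80031234) -> 0x00031234>>12           = 0x31   (RAM via KSEG0)
//   get_page(0x00031234) -> same RAM page            = 0x31   (KUSEG mirror)
//   get_page(0xbfc00180) -> 2048 + (0x1fc00 & 2047)  = 3072   (BIOS)
// Pages above 2048 are folded into 2048..4095 so that the 4096-entry
// jump_in[]/jump_dirty[]/jump_out[] tables cover the whole address space.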
381
382 // no virtual mem in PCSX
383 static u_int get_vpage(u_int vaddr)
384 {
385   return get_page(vaddr);
386 }
387
388 static struct ht_entry *hash_table_get(u_int vaddr)
389 {
390   return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
391 }
392
393 static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
394 {
395   ht_bin->vaddr[1] = ht_bin->vaddr[0];
396   ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
397   ht_bin->vaddr[0] = vaddr;
398   ht_bin->tcaddr[0] = tcaddr;
399 }
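// Illustration of the two helpers above: each ht_entry is a tiny 2-way cache,
// e.g. hash_table_get(0x80030000) returns
// &hash_table[((0x80030000>>16) ^ 0x80030000) & 0xFFFF], i.e. bin 0x8003.
// hash_table_add() demotes the current slot 0 to slot 1 and installs the new
// (vaddr, tcaddr) pair in slot 0, so the most recently added mapping wins and
// the older of the two entries is evicted.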
400
401 // some of ari64's messier code; it seems to rely on unsigned 32-bit overflow
402 static int doesnt_expire_soon(void *tcaddr)
403 {
404   u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
405   return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
406 }
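// Roughly (a reading of the arithmetic above, not a guarantee): the shift by
// (32-TARGET_SIZE_2) maps an offset within the translation cache onto the full
// 32-bit range, so 'diff' is the wrapped distance from the current output
// pointer to the block.  The block is reported as not expiring soon when that
// distance exceeds about 3/8 of the cache (0x60000000/2^32) plus one maximum
// output block, presumably meaning the expiry sweep will not reach it soon.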
407
408 // Get address from virtual address
409 // This is called from the recompiled JR/JALR instructions
410 void *get_addr(u_int vaddr)
411 {
412   u_int page=get_page(vaddr);
413   u_int vpage=get_vpage(vaddr);
414   struct ll_entry *head;
415   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
416   head=jump_in[page];
417   while(head!=NULL) {
418     if(head->vaddr==vaddr) {
419   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
420       hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
421       return head->addr;
422     }
423     head=head->next;
424   }
425   head=jump_dirty[vpage];
426   while(head!=NULL) {
427     if(head->vaddr==vaddr) {
428       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
429       // Don't restore blocks which are about to expire from the cache
430       if (doesnt_expire_soon(head->addr))
431       if (verify_dirty(head->addr)) {
432         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
433         invalid_code[vaddr>>12]=0;
434         inv_code_start=inv_code_end=~0;
435         if(vpage<2048) {
436           restore_candidate[vpage>>3]|=1<<(vpage&7);
437         }
438         else restore_candidate[page>>3]|=1<<(page&7);
439         struct ht_entry *ht_bin = hash_table_get(vaddr);
440         if (ht_bin->vaddr[0] == vaddr)
441           ht_bin->tcaddr[0] = head->addr; // Replace existing entry
442         else
443           hash_table_add(ht_bin, vaddr, head->addr);
444
445         return head->addr;
446       }
447     }
448     head=head->next;
449   }
450   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
451   int r=new_recompile_block(vaddr);
452   if(r==0) return get_addr(vaddr);
453   // Execute in unmapped page, generate page fault exception
454   Status|=2;
455   Cause=(vaddr<<31)|0x8;
456   EPC=(vaddr&1)?vaddr-5:vaddr;
457   BadVAddr=(vaddr&~1);
458   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
459   EntryHi=BadVAddr&0xFFFFE000;
460   return get_addr_ht(0x80000000);
461 }
462 // Look up address in hash table first
463 void *get_addr_ht(u_int vaddr)
464 {
465   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
466   const struct ht_entry *ht_bin = hash_table_get(vaddr);
467   if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
468   if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
469   return get_addr(vaddr);
470 }
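// Summary of the lookup path above: get_addr_ht() checks the 2-way hash bin
// first; on a miss get_addr() scans jump_in[page] (clean blocks), then
// jump_dirty[vpage] (blocks whose source memory may have been written, reused
// only if verify_dirty() confirms the code is unchanged), and finally falls
// back to new_recompile_block().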
471
472 void clear_all_regs(signed char regmap[])
473 {
474   int hr;
475   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
476 }
477
478 signed char get_reg(signed char regmap[],int r)
479 {
480   int hr;
481   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
482   return -1;
483 }
484
485 // Find a register that is available for two consecutive cycles
486 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
487 {
488   int hr;
489   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
490   return -1;
491 }
492
493 int count_free_regs(signed char regmap[])
494 {
495   int count=0;
496   int hr;
497   for(hr=0;hr<HOST_REGS;hr++)
498   {
499     if(hr!=EXCLUDE_REG) {
500       if(regmap[hr]<0) count++;
501     }
502   }
503   return count;
504 }
505
506 void dirty_reg(struct regstat *cur,signed char reg)
507 {
508   int hr;
509   if(!reg) return;
510   for (hr=0;hr<HOST_REGS;hr++) {
511     if((cur->regmap[hr]&63)==reg) {
512       cur->dirty|=1<<hr;
513     }
514   }
515 }
516
517 // If we dirty the lower half of a 64 bit register which is now being
518 // sign-extended, we need to dump the upper half.
519 // Note: Do this only after completion of the instruction, because
520 // some instructions may need to read the full 64-bit value even if
521 // overwriting it (eg SLTI, DSRA32).
522 static void flush_dirty_uppers(struct regstat *cur)
523 {
524   int hr,reg;
525   for (hr=0;hr<HOST_REGS;hr++) {
526     if((cur->dirty>>hr)&1) {
527       reg=cur->regmap[hr];
528       if(reg>=64)
529         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
530     }
531   }
532 }
533
534 void set_const(struct regstat *cur,signed char reg,uint64_t value)
535 {
536   int hr;
537   if(!reg) return;
538   for (hr=0;hr<HOST_REGS;hr++) {
539     if(cur->regmap[hr]==reg) {
540       cur->isconst|=1<<hr;
541       current_constmap[hr]=value;
542     }
543     else if((cur->regmap[hr]^64)==reg) {
544       cur->isconst|=1<<hr;
545       current_constmap[hr]=value>>32;
546     }
547   }
548 }
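// Illustration: constants are tracked per host register, so e.g.
// set_const(cur, 5, 0x123456789LL) records the full value for the host reg
// currently mapped to r5 and value>>32 (here 0x1) for the one mapped to r5|64
// (the upper half), marking both as constant; get_const() below returns the
// entry of the low-half mapping.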
549
550 void clear_const(struct regstat *cur,signed char reg)
551 {
552   int hr;
553   if(!reg) return;
554   for (hr=0;hr<HOST_REGS;hr++) {
555     if((cur->regmap[hr]&63)==reg) {
556       cur->isconst&=~(1<<hr);
557     }
558   }
559 }
560
561 int is_const(struct regstat *cur,signed char reg)
562 {
563   int hr;
564   if(reg<0) return 0;
565   if(!reg) return 1;
566   for (hr=0;hr<HOST_REGS;hr++) {
567     if((cur->regmap[hr]&63)==reg) {
568       return (cur->isconst>>hr)&1;
569     }
570   }
571   return 0;
572 }
573 uint64_t get_const(struct regstat *cur,signed char reg)
574 {
575   int hr;
576   if(!reg) return 0;
577   for (hr=0;hr<HOST_REGS;hr++) {
578     if(cur->regmap[hr]==reg) {
579       return current_constmap[hr];
580     }
581   }
582   SysPrintf("Unknown constant in r%d\n",reg);
583   exit(1);
584 }
585
586 // Least soon needed registers
587 // Look at the next ten instructions and see which registers
588 // will be used.  Try not to reallocate these.
589 void lsn(u_char hsn[], int i, int *preferred_reg)
590 {
591   int j;
592   int b=-1;
593   for(j=0;j<9;j++)
594   {
595     if(i+j>=slen) {
596       j=slen-i-1;
597       break;
598     }
599     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
600     {
601       // Don't go past an unconditional jump
602       j++;
603       break;
604     }
605   }
606   for(;j>=0;j--)
607   {
608     if(rs1[i+j]) hsn[rs1[i+j]]=j;
609     if(rs2[i+j]) hsn[rs2[i+j]]=j;
610     if(rt1[i+j]) hsn[rt1[i+j]]=j;
611     if(rt2[i+j]) hsn[rt2[i+j]]=j;
612     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
613       // Stores can allocate zero
614       hsn[rs1[i+j]]=j;
615       hsn[rs2[i+j]]=j;
616     }
617     // On some architectures stores need invc_ptr
618     #if defined(HOST_IMM8)
619     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
620       hsn[INVCP]=j;
621     }
622     #endif
623     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
624     {
625       hsn[CCREG]=j;
626       b=j;
627     }
628   }
629   if(b>=0)
630   {
631     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
632     {
633       // Follow first branch
634       int t=(ba[i+b]-start)>>2;
635       j=7-b;if(t+j>=slen) j=slen-t-1;
636       for(;j>=0;j--)
637       {
638         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
639         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
640         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
641         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
642       }
643     }
644     // TODO: preferred register based on backward branch
645   }
646   // Delay slot should preferably not overwrite branch conditions or cycle count
647   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
648     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
649     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
650     hsn[CCREG]=1;
651     // ...or hash tables
652     hsn[RHASH]=1;
653     hsn[RHTBL]=1;
654   }
655   // Coprocessor load/store needs FTEMP, even if not declared
656   if(itype[i]==C1LS||itype[i]==C2LS) {
657     hsn[FTEMP]=0;
658   }
659   // Load L/R also uses FTEMP as a temporary register
660   if(itype[i]==LOADLR) {
661     hsn[FTEMP]=0;
662   }
663   // Also SWL/SWR/SDL/SDR
664   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
665     hsn[FTEMP]=0;
666   }
667   // Don't remove the miniht registers
668   if(itype[i]==UJUMP||itype[i]==RJUMP)
669   {
670     hsn[RHASH]=0;
671     hsn[RHTBL]=0;
672   }
673 }
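// Note on the result: after lsn() runs, hsn[r] holds the distance in
// instructions (0..~9, including a peek down the first branch) to the next use
// of register r; entries never written keep the caller's initial value.  The
// register allocator presumably uses this to avoid evicting registers that
// will be needed again soon ("least soon needed" go first).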
674
675 // We only want to allocate registers if we're going to use them again soon
676 int needed_again(int r, int i)
677 {
678   int j;
679   int b=-1;
680   int rn=10;
681
682   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
683   {
684     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
685       return 0; // Don't need any registers if exiting the block
686   }
687   for(j=0;j<9;j++)
688   {
689     if(i+j>=slen) {
690       j=slen-i-1;
691       break;
692     }
693     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
694     {
695       // Don't go past an unconditional jump
696       j++;
697       break;
698     }
699     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
700     {
701       break;
702     }
703   }
704   for(;j>=1;j--)
705   {
706     if(rs1[i+j]==r) rn=j;
707     if(rs2[i+j]==r) rn=j;
708     if((unneeded_reg[i+j]>>r)&1) rn=10;
709     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
710     {
711       b=j;
712     }
713   }
714   /*
715   if(b>=0)
716   {
717     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
718     {
719       // Follow first branch
720       int o=rn;
721       int t=(ba[i+b]-start)>>2;
722       j=7-b;if(t+j>=slen) j=slen-t-1;
723       for(;j>=0;j--)
724       {
725         if(!((unneeded_reg[t+j]>>r)&1)) {
726           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
727           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
728         }
729         else rn=o;
730       }
731     }
732   }*/
733   if(rn<10) return 1;
734   (void)b;
735   return 0;
736 }
737
738 // Try to match register allocations at the end of a loop with those
739 // at the beginning
740 int loop_reg(int i, int r, int hr)
741 {
742   int j,k;
743   for(j=0;j<9;j++)
744   {
745     if(i+j>=slen) {
746       j=slen-i-1;
747       break;
748     }
749     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
750     {
751       // Don't go past an unconditional jump
752       j++;
753       break;
754     }
755   }
756   k=0;
757   if(i>0){
758     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
759       k--;
760   }
761   for(;k<j;k++)
762   {
763     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
764     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
765     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
766     {
767       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
768       {
769         int t=(ba[i+k]-start)>>2;
770         int reg=get_reg(regs[t].regmap_entry,r);
771         if(reg>=0) return reg;
772         //reg=get_reg(regs[t+1].regmap_entry,r);
773         //if(reg>=0) return reg;
774       }
775     }
776   }
777   return hr;
778 }
779
780
781 // Allocate every register, preserving source/target regs
782 void alloc_all(struct regstat *cur,int i)
783 {
784   int hr;
785
786   for(hr=0;hr<HOST_REGS;hr++) {
787     if(hr!=EXCLUDE_REG) {
788       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
789          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
790       {
791         cur->regmap[hr]=-1;
792         cur->dirty&=~(1<<hr);
793       }
794       // Don't need zeros
795       if((cur->regmap[hr]&63)==0)
796       {
797         cur->regmap[hr]=-1;
798         cur->dirty&=~(1<<hr);
799       }
800     }
801   }
802 }
803
804 #ifdef __i386__
805 #include "assem_x86.c"
806 #endif
807 #ifdef __x86_64__
808 #include "assem_x64.c"
809 #endif
810 #ifdef __arm__
811 #include "assem_arm.c"
812 #endif
813
814 // Add virtual address mapping to linked list
815 void ll_add(struct ll_entry **head,int vaddr,void *addr)
816 {
817   struct ll_entry *new_entry;
818   new_entry=malloc(sizeof(struct ll_entry));
819   assert(new_entry!=NULL);
820   new_entry->vaddr=vaddr;
821   new_entry->reg_sv_flags=0;
822   new_entry->addr=addr;
823   new_entry->next=*head;
824   *head=new_entry;
825 }
826
827 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
828 {
829   ll_add(head,vaddr,addr);
830   (*head)->reg_sv_flags=reg_sv_flags;
831 }
832
833 // Check if an address is already compiled
834 // but don't return addresses which are about to expire from the cache
835 void *check_addr(u_int vaddr)
836 {
837   struct ht_entry *ht_bin = hash_table_get(vaddr);
838   size_t i;
839   for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
840     if (ht_bin->vaddr[i] == vaddr)
841       if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
842         if (isclean(ht_bin->tcaddr[i]))
843           return ht_bin->tcaddr[i];
844   }
845   u_int page=get_page(vaddr);
846   struct ll_entry *head;
847   head=jump_in[page];
848   while (head != NULL) {
849     if (head->vaddr == vaddr) {
850       if (doesnt_expire_soon(head->addr)) {
851         // Update existing entry with current address
852         if (ht_bin->vaddr[0] == vaddr) {
853           ht_bin->tcaddr[0] = head->addr;
854           return head->addr;
855         }
856         if (ht_bin->vaddr[1] == vaddr) {
857           ht_bin->tcaddr[1] = head->addr;
858           return head->addr;
859         }
860         // Insert into hash table with low priority.
861         // Don't evict existing entries, as they are probably
862         // addresses that are being accessed frequently.
863         if (ht_bin->vaddr[0] == -1) {
864           ht_bin->vaddr[0] = vaddr;
865           ht_bin->tcaddr[0] = head->addr;
866         }
867         else if (ht_bin->vaddr[1] == -1) {
868           ht_bin->vaddr[1] = vaddr;
869           ht_bin->tcaddr[1] = head->addr;
870         }
871         return head->addr;
872       }
873     }
874     head=head->next;
875   }
876   return 0;
877 }
878
879 void remove_hash(int vaddr)
880 {
881   //printf("remove hash: %x\n",vaddr);
882   struct ht_entry *ht_bin = hash_table_get(vaddr);
883   if (ht_bin->vaddr[1] == vaddr) {
884     ht_bin->vaddr[1] = -1;
885     ht_bin->tcaddr[1] = NULL;
886   }
887   if (ht_bin->vaddr[0] == vaddr) {
888     ht_bin->vaddr[0] = ht_bin->vaddr[1];
889     ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
890     ht_bin->vaddr[1] = -1;
891     ht_bin->tcaddr[1] = NULL;
892   }
893 }
894
895 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
896 {
897   struct ll_entry *next;
898   while(*head) {
899     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
900        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
901     {
902       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
903       remove_hash((*head)->vaddr);
904       next=(*head)->next;
905       free(*head);
906       *head=next;
907     }
908     else
909     {
910       head=&((*head)->next);
911     }
912   }
913 }
914
915 // Remove all entries from linked list
916 void ll_clear(struct ll_entry **head)
917 {
918   struct ll_entry *cur;
919   struct ll_entry *next;
920   if((cur=*head)) {
921     *head=0;
922     while(cur) {
923       next=cur->next;
924       free(cur);
925       cur=next;
926     }
927   }
928 }
929
930 // Dereference the pointers and remove if it matches
931 static void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
932 {
933   while(head) {
934     int ptr=get_pointer(head->addr);
935     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
936     if(((ptr>>shift)==(addr>>shift)) ||
937        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
938     {
939       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
940       void *host_addr=find_extjump_insn(head->addr);
941       #ifdef __arm__
942         mark_clear_cache(host_addr);
943       #endif
944       set_jump_target(host_addr, head->addr);
945     }
946     head=head->next;
947   }
948 }
949
950 // This is called when we write to a compiled block (see do_invstub)
951 void invalidate_page(u_int page)
952 {
953   struct ll_entry *head;
954   struct ll_entry *next;
955   head=jump_in[page];
956   jump_in[page]=0;
957   while(head!=NULL) {
958     inv_debug("INVALIDATE: %x\n",head->vaddr);
959     remove_hash(head->vaddr);
960     next=head->next;
961     free(head);
962     head=next;
963   }
964   head=jump_out[page];
965   jump_out[page]=0;
966   while(head!=NULL) {
967     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
968     void *host_addr=find_extjump_insn(head->addr);
969     #ifdef __arm__
970       mark_clear_cache(host_addr);
971     #endif
972     set_jump_target(host_addr, head->addr);
973     next=head->next;
974     free(head);
975     head=next;
976   }
977 }
978
979 static void invalidate_block_range(u_int block, u_int first, u_int last)
980 {
981   u_int page=get_page(block<<12);
982   //printf("first=%d last=%d\n",first,last);
983   invalidate_page(page);
984   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
985   assert(last<page+5);
986   // Invalidate the adjacent pages if a block crosses a 4K boundary
987   while(first<page) {
988     invalidate_page(first);
989     first++;
990   }
991   for(first=page+1;first<last;first++) {
992     invalidate_page(first);
993   }
994   #ifdef __arm__
995     do_clear_cache();
996   #endif
997
998   // Don't trap writes
999   invalid_code[block]=1;
1000
1001   #ifdef USE_MINI_HT
1002   memset(mini_ht,-1,sizeof(mini_ht));
1003   #endif
1004 }
1005
1006 void invalidate_block(u_int block)
1007 {
1008   u_int page=get_page(block<<12);
1009   u_int vpage=get_vpage(block<<12);
1010   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1011   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1012   u_int first,last;
1013   first=last=page;
1014   struct ll_entry *head;
1015   head=jump_dirty[vpage];
1016   //printf("page=%d vpage=%d\n",page,vpage);
1017   while(head!=NULL) {
1018     u_int start,end;
1019     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1020       get_bounds((int)head->addr,&start,&end);
1021       //printf("start: %x end: %x\n",start,end);
1022       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
1023         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1024           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1025           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1026         }
1027       }
1028     }
1029     head=head->next;
1030   }
1031   invalidate_block_range(block,first,last);
1032 }
1033
1034 void invalidate_addr(u_int addr)
1035 {
1036   //static int rhits;
1037   // this check is done by the caller
1038   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1039   u_int page=get_vpage(addr);
1040   if(page<2048) { // RAM
1041     struct ll_entry *head;
1042     u_int addr_min=~0, addr_max=0;
1043     u_int mask=RAM_SIZE-1;
1044     u_int addr_main=0x80000000|(addr&mask);
1045     int pg1;
1046     inv_code_start=addr_main&~0xfff;
1047     inv_code_end=addr_main|0xfff;
1048     pg1=page;
1049     if (pg1>0) {
1050       // must check previous page too because of spans..
1051       pg1--;
1052       inv_code_start-=0x1000;
1053     }
1054     for(;pg1<=page;pg1++) {
1055       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1056         u_int start,end;
1057         get_bounds((int)head->addr,&start,&end);
1058         if(ram_offset) {
1059           start-=ram_offset;
1060           end-=ram_offset;
1061         }
1062         if(start<=addr_main&&addr_main<end) {
1063           if(start<addr_min) addr_min=start;
1064           if(end>addr_max) addr_max=end;
1065         }
1066         else if(addr_main<start) {
1067           if(start<inv_code_end)
1068             inv_code_end=start-1;
1069         }
1070         else {
1071           if(end>inv_code_start)
1072             inv_code_start=end;
1073         }
1074       }
1075     }
1076     if (addr_min!=~0) {
1077       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1078       inv_code_start=inv_code_end=~0;
1079       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1080       return;
1081     }
1082     else {
1083       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1084       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1085       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1086       return;
1087     }
1088   }
1089   invalidate_block(addr>>12);
1090 }
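// Summary of invalidate_addr() for RAM addresses: it scans the jump_dirty
// lists of the page containing 'addr' (and the previous page, since blocks can
// span pages).  If compiled code covers the address, those blocks are
// invalidated; otherwise it records a window [inv_code_start, inv_code_end]
// known to contain no compiled code, which the caller uses to skip repeated
// invalidation calls for nearby writes (see the commented-out check at the top).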
1091
1092 // This is called when loading a save state.
1093 // Anything could have changed, so invalidate everything.
1094 void invalidate_all_pages()
1095 {
1096   u_int page;
1097   for(page=0;page<4096;page++)
1098     invalidate_page(page);
1099   for(page=0;page<1048576;page++)
1100     if(!invalid_code[page]) {
1101       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1102       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1103     }
1104   #ifdef USE_MINI_HT
1105   memset(mini_ht,-1,sizeof(mini_ht));
1106   #endif
1107 }
1108
1109 // Add an entry to jump_out after making a link
1110 void add_link(u_int vaddr,void *src)
1111 {
1112   u_int page=get_page(vaddr);
1113   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1114   int *ptr=(int *)(src+4);
1115   assert((*ptr&0x0fff0000)==0x059f0000);
1116   (void)ptr;
1117   ll_add(jump_out+page,vaddr,src);
1118   //int ptr=get_pointer(src);
1119   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1120 }
1121
1122 // If a code block was found to be unmodified (bit was set in
1123 // restore_candidate) and it remains unmodified (bit is clear
1124 // in invalid_code) then move the entries for that 4K page from
1125 // the dirty list to the clean list.
1126 void clean_blocks(u_int page)
1127 {
1128   struct ll_entry *head;
1129   inv_debug("INV: clean_blocks page=%d\n",page);
1130   head=jump_dirty[page];
1131   while(head!=NULL) {
1132     if(!invalid_code[head->vaddr>>12]) {
1133       // Don't restore blocks which are about to expire from the cache
1134       if (doesnt_expire_soon(head->addr)) {
1135         u_int start,end;
1136         if(verify_dirty(head->addr)) {
1137           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1138           u_int i;
1139           u_int inv=0;
1140           get_bounds((int)head->addr,&start,&end);
1141           if(start-(u_int)rdram<RAM_SIZE) {
1142             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1143               inv|=invalid_code[i];
1144             }
1145           }
1146           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1147             inv=1;
1148           }
1149           if(!inv) {
1150             void *clean_addr = get_clean_addr(head->addr);
1151             if (doesnt_expire_soon(clean_addr)) {
1152               u_int ppage=page;
1153               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1154               //printf("page=%x, addr=%x\n",page,head->vaddr);
1155               //assert(head->vaddr>>12==(page|0x80000));
1156               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1157               struct ht_entry *ht_bin = hash_table_get(head->vaddr);
1158               if (ht_bin->vaddr[0] == head->vaddr)
1159                 ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
1160               if (ht_bin->vaddr[1] == head->vaddr)
1161                 ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
1162             }
1163           }
1164         }
1165       }
1166     }
1167     head=head->next;
1168   }
1169 }
1170
1171
1172 void mov_alloc(struct regstat *current,int i)
1173 {
1174   // Note: Don't need to actually alloc the source registers
1175   if((~current->is32>>rs1[i])&1) {
1176     //alloc_reg64(current,i,rs1[i]);
1177     alloc_reg64(current,i,rt1[i]);
1178     current->is32&=~(1LL<<rt1[i]);
1179   } else {
1180     //alloc_reg(current,i,rs1[i]);
1181     alloc_reg(current,i,rt1[i]);
1182     current->is32|=(1LL<<rt1[i]);
1183   }
1184   clear_const(current,rs1[i]);
1185   clear_const(current,rt1[i]);
1186   dirty_reg(current,rt1[i]);
1187 }
1188
1189 void shiftimm_alloc(struct regstat *current,int i)
1190 {
1191   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1192   {
1193     if(rt1[i]) {
1194       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1195       else lt1[i]=rs1[i];
1196       alloc_reg(current,i,rt1[i]);
1197       current->is32|=1LL<<rt1[i];
1198       dirty_reg(current,rt1[i]);
1199       if(is_const(current,rs1[i])) {
1200         int v=get_const(current,rs1[i]);
1201         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1202         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1203         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1204       }
1205       else clear_const(current,rt1[i]);
1206     }
1207   }
1208   else
1209   {
1210     clear_const(current,rs1[i]);
1211     clear_const(current,rt1[i]);
1212   }
1213
1214   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1215   {
1216     if(rt1[i]) {
1217       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1218       alloc_reg64(current,i,rt1[i]);
1219       current->is32&=~(1LL<<rt1[i]);
1220       dirty_reg(current,rt1[i]);
1221     }
1222   }
1223   if(opcode2[i]==0x3c) // DSLL32
1224   {
1225     if(rt1[i]) {
1226       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1227       alloc_reg64(current,i,rt1[i]);
1228       current->is32&=~(1LL<<rt1[i]);
1229       dirty_reg(current,rt1[i]);
1230     }
1231   }
1232   if(opcode2[i]==0x3e) // DSRL32
1233   {
1234     if(rt1[i]) {
1235       alloc_reg64(current,i,rs1[i]);
1236       if(imm[i]==32) {
1237         alloc_reg64(current,i,rt1[i]);
1238         current->is32&=~(1LL<<rt1[i]);
1239       } else {
1240         alloc_reg(current,i,rt1[i]);
1241         current->is32|=1LL<<rt1[i];
1242       }
1243       dirty_reg(current,rt1[i]);
1244     }
1245   }
1246   if(opcode2[i]==0x3f) // DSRA32
1247   {
1248     if(rt1[i]) {
1249       alloc_reg64(current,i,rs1[i]);
1250       alloc_reg(current,i,rt1[i]);
1251       current->is32|=1LL<<rt1[i];
1252       dirty_reg(current,rt1[i]);
1253     }
1254   }
1255 }
1256
1257 void shift_alloc(struct regstat *current,int i)
1258 {
1259   if(rt1[i]) {
1260     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1261     {
1262       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1263       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1264       alloc_reg(current,i,rt1[i]);
1265       if(rt1[i]==rs2[i]) {
1266         alloc_reg_temp(current,i,-1);
1267         minimum_free_regs[i]=1;
1268       }
1269       current->is32|=1LL<<rt1[i];
1270     } else { // DSLLV/DSRLV/DSRAV
1271       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1272       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1273       alloc_reg64(current,i,rt1[i]);
1274       current->is32&=~(1LL<<rt1[i]);
1275       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1276       {
1277         alloc_reg_temp(current,i,-1);
1278         minimum_free_regs[i]=1;
1279       }
1280     }
1281     clear_const(current,rs1[i]);
1282     clear_const(current,rs2[i]);
1283     clear_const(current,rt1[i]);
1284     dirty_reg(current,rt1[i]);
1285   }
1286 }
1287
1288 void alu_alloc(struct regstat *current,int i)
1289 {
1290   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1291     if(rt1[i]) {
1292       if(rs1[i]&&rs2[i]) {
1293         alloc_reg(current,i,rs1[i]);
1294         alloc_reg(current,i,rs2[i]);
1295       }
1296       else {
1297         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1298         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1299       }
1300       alloc_reg(current,i,rt1[i]);
1301     }
1302     current->is32|=1LL<<rt1[i];
1303   }
1304   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1305     if(rt1[i]) {
1306       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1307       {
1308         alloc_reg64(current,i,rs1[i]);
1309         alloc_reg64(current,i,rs2[i]);
1310         alloc_reg(current,i,rt1[i]);
1311       } else {
1312         alloc_reg(current,i,rs1[i]);
1313         alloc_reg(current,i,rs2[i]);
1314         alloc_reg(current,i,rt1[i]);
1315       }
1316     }
1317     current->is32|=1LL<<rt1[i];
1318   }
1319   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1320     if(rt1[i]) {
1321       if(rs1[i]&&rs2[i]) {
1322         alloc_reg(current,i,rs1[i]);
1323         alloc_reg(current,i,rs2[i]);
1324       }
1325       else
1326       {
1327         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1328         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1329       }
1330       alloc_reg(current,i,rt1[i]);
1331       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1332       {
1333         if(!((current->uu>>rt1[i])&1)) {
1334           alloc_reg64(current,i,rt1[i]);
1335         }
1336         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1337           if(rs1[i]&&rs2[i]) {
1338             alloc_reg64(current,i,rs1[i]);
1339             alloc_reg64(current,i,rs2[i]);
1340           }
1341           else
1342           {
1343             // Is it really worth it to keep 64-bit values in registers?
1344             #ifdef NATIVE_64BIT
1345             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1346             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1347             #endif
1348           }
1349         }
1350         current->is32&=~(1LL<<rt1[i]);
1351       } else {
1352         current->is32|=1LL<<rt1[i];
1353       }
1354     }
1355   }
1356   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1357     if(rt1[i]) {
1358       if(rs1[i]&&rs2[i]) {
1359         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1360           alloc_reg64(current,i,rs1[i]);
1361           alloc_reg64(current,i,rs2[i]);
1362           alloc_reg64(current,i,rt1[i]);
1363         } else {
1364           alloc_reg(current,i,rs1[i]);
1365           alloc_reg(current,i,rs2[i]);
1366           alloc_reg(current,i,rt1[i]);
1367         }
1368       }
1369       else {
1370         alloc_reg(current,i,rt1[i]);
1371         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1372           // DADD used as move, or zeroing
1373           // If we have a 64-bit source, then make the target 64 bits too
1374           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1375             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1376             alloc_reg64(current,i,rt1[i]);
1377           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1378             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1379             alloc_reg64(current,i,rt1[i]);
1380           }
1381           if(opcode2[i]>=0x2e&&rs2[i]) {
1382             // DSUB used as negation - 64-bit result
1383             // If we have a 32-bit register, extend it to 64 bits
1384             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1385             alloc_reg64(current,i,rt1[i]);
1386           }
1387         }
1388       }
1389       if(rs1[i]&&rs2[i]) {
1390         current->is32&=~(1LL<<rt1[i]);
1391       } else if(rs1[i]) {
1392         current->is32&=~(1LL<<rt1[i]);
1393         if((current->is32>>rs1[i])&1)
1394           current->is32|=1LL<<rt1[i];
1395       } else if(rs2[i]) {
1396         current->is32&=~(1LL<<rt1[i]);
1397         if((current->is32>>rs2[i])&1)
1398           current->is32|=1LL<<rt1[i];
1399       } else {
1400         current->is32|=1LL<<rt1[i];
1401       }
1402     }
1403   }
1404   clear_const(current,rs1[i]);
1405   clear_const(current,rs2[i]);
1406   clear_const(current,rt1[i]);
1407   dirty_reg(current,rt1[i]);
1408 }
1409
1410 void imm16_alloc(struct regstat *current,int i)
1411 {
1412   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1413   else lt1[i]=rs1[i];
1414   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1415   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1416     current->is32&=~(1LL<<rt1[i]);
1417     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1418       // TODO: Could preserve the 32-bit flag if the immediate is zero
1419       alloc_reg64(current,i,rt1[i]);
1420       alloc_reg64(current,i,rs1[i]);
1421     }
1422     clear_const(current,rs1[i]);
1423     clear_const(current,rt1[i]);
1424   }
1425   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1426     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1427     current->is32|=1LL<<rt1[i];
1428     clear_const(current,rs1[i]);
1429     clear_const(current,rt1[i]);
1430   }
1431   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1432     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1433       if(rs1[i]!=rt1[i]) {
1434         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1435         alloc_reg64(current,i,rt1[i]);
1436         current->is32&=~(1LL<<rt1[i]);
1437       }
1438     }
1439     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1440     if(is_const(current,rs1[i])) {
1441       int v=get_const(current,rs1[i]);
1442       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1443       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1444       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1445     }
1446     else clear_const(current,rt1[i]);
1447   }
1448   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1449     if(is_const(current,rs1[i])) {
1450       int v=get_const(current,rs1[i]);
1451       set_const(current,rt1[i],v+imm[i]);
1452     }
1453     else clear_const(current,rt1[i]);
1454     current->is32|=1LL<<rt1[i];
1455   }
1456   else {
1457     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1458     current->is32|=1LL<<rt1[i];
1459   }
1460   dirty_reg(current,rt1[i]);
1461 }
1462
1463 void load_alloc(struct regstat *current,int i)
1464 {
1465   clear_const(current,rt1[i]);
1466   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1467   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1468   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1469   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1470     alloc_reg(current,i,rt1[i]);
1471     assert(get_reg(current->regmap,rt1[i])>=0);
1472     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1473     {
1474       current->is32&=~(1LL<<rt1[i]);
1475       alloc_reg64(current,i,rt1[i]);
1476     }
1477     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1478     {
1479       current->is32&=~(1LL<<rt1[i]);
1480       alloc_reg64(current,i,rt1[i]);
1481       alloc_all(current,i);
1482       alloc_reg64(current,i,FTEMP);
1483       minimum_free_regs[i]=HOST_REGS;
1484     }
1485     else current->is32|=1LL<<rt1[i];
1486     dirty_reg(current,rt1[i]);
1487     // LWL/LWR need a temporary register for the old value
1488     if(opcode[i]==0x22||opcode[i]==0x26)
1489     {
1490       alloc_reg(current,i,FTEMP);
1491       alloc_reg_temp(current,i,-1);
1492       minimum_free_regs[i]=1;
1493     }
1494   }
1495   else
1496   {
1497     // Load to r0 or unneeded register (dummy load)
1498     // but we still need a register to calculate the address
1499     if(opcode[i]==0x22||opcode[i]==0x26)
1500     {
1501       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1502     }
1503     alloc_reg_temp(current,i,-1);
1504     minimum_free_regs[i]=1;
1505     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1506     {
1507       alloc_all(current,i);
1508       alloc_reg64(current,i,FTEMP);
1509       minimum_free_regs[i]=HOST_REGS;
1510     }
1511   }
1512 }
1513
1514 void store_alloc(struct regstat *current,int i)
1515 {
1516   clear_const(current,rs2[i]);
1517   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1518   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1519   alloc_reg(current,i,rs2[i]);
1520   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1521     alloc_reg64(current,i,rs2[i]);
1522     if(rs2[i]) alloc_reg(current,i,FTEMP);
1523   }
1524   #if defined(HOST_IMM8)
1525   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1526   else alloc_reg(current,i,INVCP);
1527   #endif
1528   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1529     alloc_reg(current,i,FTEMP);
1530   }
1531   // We need a temporary register for address generation
1532   alloc_reg_temp(current,i,-1);
1533   minimum_free_regs[i]=1;
1534 }
1535
1536 void c1ls_alloc(struct regstat *current,int i)
1537 {
1538   //clear_const(current,rs1[i]); // FIXME
1539   clear_const(current,rt1[i]);
1540   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1541   alloc_reg(current,i,CSREG); // Status
1542   alloc_reg(current,i,FTEMP);
1543   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1544     alloc_reg64(current,i,FTEMP);
1545   }
1546   #if defined(HOST_IMM8)
1547   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1548   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1549     alloc_reg(current,i,INVCP);
1550   #endif
1551   // We need a temporary register for address generation
1552   alloc_reg_temp(current,i,-1);
1553 }
1554
1555 void c2ls_alloc(struct regstat *current,int i)
1556 {
1557   clear_const(current,rt1[i]);
1558   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1559   alloc_reg(current,i,FTEMP);
1560   #if defined(HOST_IMM8)
1561   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1562   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1563     alloc_reg(current,i,INVCP);
1564   #endif
1565   // We need a temporary register for address generation
1566   alloc_reg_temp(current,i,-1);
1567   minimum_free_regs[i]=1;
1568 }
1569
1570 #ifndef multdiv_alloc
1571 void multdiv_alloc(struct regstat *current,int i)
1572 {
1573   //  case 0x18: MULT
1574   //  case 0x19: MULTU
1575   //  case 0x1A: DIV
1576   //  case 0x1B: DIVU
1577   //  case 0x1C: DMULT
1578   //  case 0x1D: DMULTU
1579   //  case 0x1E: DDIV
1580   //  case 0x1F: DDIVU
1581   clear_const(current,rs1[i]);
1582   clear_const(current,rs2[i]);
1583   if(rs1[i]&&rs2[i])
1584   {
1585     if((opcode2[i]&4)==0) // 32-bit
1586     {
1587       current->u&=~(1LL<<HIREG);
1588       current->u&=~(1LL<<LOREG);
1589       alloc_reg(current,i,HIREG);
1590       alloc_reg(current,i,LOREG);
1591       alloc_reg(current,i,rs1[i]);
1592       alloc_reg(current,i,rs2[i]);
1593       current->is32|=1LL<<HIREG;
1594       current->is32|=1LL<<LOREG;
1595       dirty_reg(current,HIREG);
1596       dirty_reg(current,LOREG);
1597     }
1598     else // 64-bit
1599     {
1600       current->u&=~(1LL<<HIREG);
1601       current->u&=~(1LL<<LOREG);
1602       current->uu&=~(1LL<<HIREG);
1603       current->uu&=~(1LL<<LOREG);
1604       alloc_reg64(current,i,HIREG);
1605       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1606       alloc_reg64(current,i,rs1[i]);
1607       alloc_reg64(current,i,rs2[i]);
1608       alloc_all(current,i);
1609       current->is32&=~(1LL<<HIREG);
1610       current->is32&=~(1LL<<LOREG);
1611       dirty_reg(current,HIREG);
1612       dirty_reg(current,LOREG);
1613       minimum_free_regs[i]=HOST_REGS;
1614     }
1615   }
1616   else
1617   {
1618     // Multiply by zero is zero.
1619     // MIPS does not have a divide by zero exception.
1620     // The result is undefined, so we return zero.
1621     alloc_reg(current,i,HIREG);
1622     alloc_reg(current,i,LOREG);
1623     current->is32|=1LL<<HIREG;
1624     current->is32|=1LL<<LOREG;
1625     dirty_reg(current,HIREG);
1626     dirty_reg(current,LOREG);
1627   }
1628 }
1629 #endif
1630
1631 void cop0_alloc(struct regstat *current,int i)
1632 {
1633   if(opcode2[i]==0) // MFC0
1634   {
1635     if(rt1[i]) {
1636       clear_const(current,rt1[i]);
1637       alloc_all(current,i);
1638       alloc_reg(current,i,rt1[i]);
1639       current->is32|=1LL<<rt1[i];
1640       dirty_reg(current,rt1[i]);
1641     }
1642   }
1643   else if(opcode2[i]==4) // MTC0
1644   {
1645     if(rs1[i]){
1646       clear_const(current,rs1[i]);
1647       alloc_reg(current,i,rs1[i]);
1648       alloc_all(current,i);
1649     }
1650     else {
1651       alloc_all(current,i); // FIXME: Keep r0
1652       current->u&=~1LL;
1653       alloc_reg(current,i,0);
1654     }
1655   }
1656   else
1657   {
1658     // TLBR/TLBWI/TLBWR/TLBP/ERET
1659     assert(opcode2[i]==0x10);
1660     alloc_all(current,i);
1661   }
1662   minimum_free_regs[i]=HOST_REGS;
1663 }
1664
1665 void cop1_alloc(struct regstat *current,int i)
1666 {
1667   alloc_reg(current,i,CSREG); // Load status
1668   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1669   {
1670     if(rt1[i]){
1671       clear_const(current,rt1[i]);
1672       if(opcode2[i]==1) {
1673         alloc_reg64(current,i,rt1[i]); // DMFC1
1674         current->is32&=~(1LL<<rt1[i]);
1675       }else{
1676         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1677         current->is32|=1LL<<rt1[i];
1678       }
1679       dirty_reg(current,rt1[i]);
1680     }
1681     alloc_reg_temp(current,i,-1);
1682   }
1683   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1684   {
1685     if(rs1[i]){
1686       clear_const(current,rs1[i]);
1687       if(opcode2[i]==5)
1688         alloc_reg64(current,i,rs1[i]); // DMTC1
1689       else
1690         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1691       alloc_reg_temp(current,i,-1);
1692     }
1693     else {
1694       current->u&=~1LL;
1695       alloc_reg(current,i,0);
1696       alloc_reg_temp(current,i,-1);
1697     }
1698   }
1699   minimum_free_regs[i]=1;
1700 }
1701 void fconv_alloc(struct regstat *current,int i)
1702 {
1703   alloc_reg(current,i,CSREG); // Load status
1704   alloc_reg_temp(current,i,-1);
1705   minimum_free_regs[i]=1;
1706 }
1707 void float_alloc(struct regstat *current,int i)
1708 {
1709   alloc_reg(current,i,CSREG); // Load status
1710   alloc_reg_temp(current,i,-1);
1711   minimum_free_regs[i]=1;
1712 }
1713 void c2op_alloc(struct regstat *current,int i)
1714 {
1715   alloc_reg_temp(current,i,-1);
1716 }
1717 void fcomp_alloc(struct regstat *current,int i)
1718 {
1719   alloc_reg(current,i,CSREG); // Load status
1720   alloc_reg(current,i,FSREG); // Load flags
1721   dirty_reg(current,FSREG); // Flag will be modified
1722   alloc_reg_temp(current,i,-1);
1723   minimum_free_regs[i]=1;
1724 }
1725
1726 void syscall_alloc(struct regstat *current,int i)
1727 {
1728   alloc_cc(current,i);
1729   dirty_reg(current,CCREG);
1730   alloc_all(current,i);
1731   minimum_free_regs[i]=HOST_REGS;
1732   current->isconst=0;
1733 }
1734
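// Register allocation for the instruction sitting in a branch delay slot.
// This is a plain dispatch on itype; a jump found in the delay slot is not
// supported, so speculative precompilation is disabled (stop_after_jal).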
1735 void delayslot_alloc(struct regstat *current,int i)
1736 {
1737   switch(itype[i]) {
1738     case UJUMP:
1739     case CJUMP:
1740     case SJUMP:
1741     case RJUMP:
1742     case FJUMP:
1743     case SYSCALL:
1744     case HLECALL:
1745     case SPAN:
1746       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1747       SysPrintf("Disabled speculative precompilation\n");
1748       stop_after_jal=1;
1749       break;
1750     case IMM16:
1751       imm16_alloc(current,i);
1752       break;
1753     case LOAD:
1754     case LOADLR:
1755       load_alloc(current,i);
1756       break;
1757     case STORE:
1758     case STORELR:
1759       store_alloc(current,i);
1760       break;
1761     case ALU:
1762       alu_alloc(current,i);
1763       break;
1764     case SHIFT:
1765       shift_alloc(current,i);
1766       break;
1767     case MULTDIV:
1768       multdiv_alloc(current,i);
1769       break;
1770     case SHIFTIMM:
1771       shiftimm_alloc(current,i);
1772       break;
1773     case MOV:
1774       mov_alloc(current,i);
1775       break;
1776     case COP0:
1777       cop0_alloc(current,i);
1778       break;
1779     case COP1:
1780     case COP2:
1781       cop1_alloc(current,i);
1782       break;
1783     case C1LS:
1784       c1ls_alloc(current,i);
1785       break;
1786     case C2LS:
1787       c2ls_alloc(current,i);
1788       break;
1789     case FCONV:
1790       fconv_alloc(current,i);
1791       break;
1792     case FLOAT:
1793       float_alloc(current,i);
1794       break;
1795     case FCOMP:
1796       fcomp_alloc(current,i);
1797       break;
1798     case C2OP:
1799       c2op_alloc(current,i);
1800       break;
1801   }
1802 }
1803
1804 // Special case where a branch and delay slot span two pages in virtual memory
1805 static void pagespan_alloc(struct regstat *current,int i)
1806 {
1807   current->isconst=0;
1808   current->wasconst=0;
1809   regs[i].wasconst=0;
1810   minimum_free_regs[i]=HOST_REGS;
1811   alloc_all(current,i);
1812   alloc_cc(current,i);
1813   dirty_reg(current,CCREG);
1814   if(opcode[i]==3) // JAL
1815   {
1816     alloc_reg(current,i,31);
1817     dirty_reg(current,31);
1818   }
1819   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1820   {
1821     alloc_reg(current,i,rs1[i]);
1822     if (rt1[i]!=0) {
1823       alloc_reg(current,i,rt1[i]);
1824       dirty_reg(current,rt1[i]);
1825     }
1826   }
1827   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1828   {
1829     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1830     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1831     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1832     {
1833       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1834       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1835     }
1836   }
1837   else
1838   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1839   {
1840     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1841     if(!((current->is32>>rs1[i])&1))
1842     {
1843       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1844     }
1845   }
1846   else
1847   if(opcode[i]==0x11) // BC1
1848   {
1849     alloc_reg(current,i,FSREG);
1850     alloc_reg(current,i,CSREG);
1851   }
1852   //else ...
1853 }
1854
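// Out-of-line "stubs": code paths that can't be handled on the fast path
// (slow memory access, cycle-count checks, code invalidation, ...) branch to
// a stub that is generated after the main body of the block.  add_stub()
// queues one entry; the meaning of parameters a..e depends on the stub type,
// and add_stub_r() packs the common per-instruction form (instruction index,
// address register, register state, cycle adjustment, live-register list).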
1855 static void add_stub(enum stub_type type, void *addr, void *retaddr,
1856   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
1857 {
1858   assert(a < ARRAY_SIZE(stubs));
1859   stubs[stubcount].type = type;
1860   stubs[stubcount].addr = addr;
1861   stubs[stubcount].retaddr = retaddr;
1862   stubs[stubcount].a = a;
1863   stubs[stubcount].b = b;
1864   stubs[stubcount].c = c;
1865   stubs[stubcount].d = d;
1866   stubs[stubcount].e = e;
1867   stubcount++;
1868 }
1869
1870 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
1871   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist)
1872 {
1873   add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
1874 }
1875
1876 // Write out a single register
1877 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1878 {
1879   int hr;
1880   for(hr=0;hr<HOST_REGS;hr++) {
1881     if(hr!=EXCLUDE_REG) {
1882       if((regmap[hr]&63)==r) {
1883         if((dirty>>hr)&1) {
1884           if(regmap[hr]<64) {
1885             emit_storereg(r,hr);
1886           }else{
1887             emit_storereg(r|64,hr);
1888           }
1889         }
1890       }
1891     }
1892   }
1893 }
1894
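// Leftover debug helpers: mchecksum()/rchecksum() compute simple checksums
// of RAM and the register file, and rlist() dumps the registers.  They are
// presumably only called from debugging code.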
1895 int mchecksum()
1896 {
1897   int i;
1898   int sum=0;
1899   for(i=0;i<2097152;i++) {
1900     unsigned int temp=sum;
1901     sum<<=1;
1902     sum|=(~temp)>>31;
1903     sum^=((u_int *)rdram)[i];
1904   }
1905   return sum;
1906 }
1907 int rchecksum()
1908 {
1909   int i;
1910   int sum=0;
1911   for(i=0;i<64;i++)
1912     sum^=((u_int *)reg)[i];
1913   return sum;
1914 }
1915 void rlist()
1916 {
1917   int i;
1918   printf("TRACE: ");
1919   for(i=0;i<32;i++)
1920     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1921   printf("\n");
1922 }
1923
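// Assemble a SPECIAL-opcode ALU operation.  The handler dispatches on
// opcode2 (the function field): ADD/ADDU/SUB/SUBU, the 64-bit doubleword
// forms, SLT/SLTU and AND/OR/XOR/NOR.  Operands that map to r0 are folded
// away; for example "addu rd,rs,r0" degenerates into a register move and
// "addu rd,r0,r0" into emit_zeroreg().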
1924 void alu_assemble(int i,struct regstat *i_regs)
1925 {
1926   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1927     if(rt1[i]) {
1928       signed char s1,s2,t;
1929       t=get_reg(i_regs->regmap,rt1[i]);
1930       if(t>=0) {
1931         s1=get_reg(i_regs->regmap,rs1[i]);
1932         s2=get_reg(i_regs->regmap,rs2[i]);
1933         if(rs1[i]&&rs2[i]) {
1934           assert(s1>=0);
1935           assert(s2>=0);
1936           if(opcode2[i]&2) emit_sub(s1,s2,t);
1937           else emit_add(s1,s2,t);
1938         }
1939         else if(rs1[i]) {
1940           if(s1>=0) emit_mov(s1,t);
1941           else emit_loadreg(rs1[i],t);
1942         }
1943         else if(rs2[i]) {
1944           if(s2>=0) {
1945             if(opcode2[i]&2) emit_neg(s2,t);
1946             else emit_mov(s2,t);
1947           }
1948           else {
1949             emit_loadreg(rs2[i],t);
1950             if(opcode2[i]&2) emit_neg(t,t);
1951           }
1952         }
1953         else emit_zeroreg(t);
1954       }
1955     }
1956   }
1957   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1958     if(rt1[i]) {
1959       signed char s1l,s2l,s1h,s2h,tl,th;
1960       tl=get_reg(i_regs->regmap,rt1[i]);
1961       th=get_reg(i_regs->regmap,rt1[i]|64);
1962       if(tl>=0) {
1963         s1l=get_reg(i_regs->regmap,rs1[i]);
1964         s2l=get_reg(i_regs->regmap,rs2[i]);
1965         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1966         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1967         if(rs1[i]&&rs2[i]) {
1968           assert(s1l>=0);
1969           assert(s2l>=0);
1970           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
1971           else emit_adds(s1l,s2l,tl);
1972           if(th>=0) {
1973             #ifdef INVERTED_CARRY
1974             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
1975             #else
1976             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
1977             #endif
1978             else emit_add(s1h,s2h,th);
1979           }
1980         }
1981         else if(rs1[i]) {
1982           if(s1l>=0) emit_mov(s1l,tl);
1983           else emit_loadreg(rs1[i],tl);
1984           if(th>=0) {
1985             if(s1h>=0) emit_mov(s1h,th);
1986             else emit_loadreg(rs1[i]|64,th);
1987           }
1988         }
1989         else if(rs2[i]) {
1990           if(s2l>=0) {
1991             if(opcode2[i]&2) emit_negs(s2l,tl);
1992             else emit_mov(s2l,tl);
1993           }
1994           else {
1995             emit_loadreg(rs2[i],tl);
1996             if(opcode2[i]&2) emit_negs(tl,tl);
1997           }
1998           if(th>=0) {
1999             #ifdef INVERTED_CARRY
2000             if(s2h>=0) emit_mov(s2h,th);
2001             else emit_loadreg(rs2[i]|64,th);
2002             if(opcode2[i]&2) {
2003               emit_adcimm(-1,th); // x86 has inverted carry flag
2004               emit_not(th,th);
2005             }
2006             #else
2007             if(opcode2[i]&2) {
2008               if(s2h>=0) emit_rscimm(s2h,0,th);
2009               else {
2010                 emit_loadreg(rs2[i]|64,th);
2011                 emit_rscimm(th,0,th);
2012               }
2013             }else{
2014               if(s2h>=0) emit_mov(s2h,th);
2015               else emit_loadreg(rs2[i]|64,th);
2016             }
2017             #endif
2018           }
2019         }
2020         else {
2021           emit_zeroreg(tl);
2022           if(th>=0) emit_zeroreg(th);
2023         }
2024       }
2025     }
2026   }
2027   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2028     if(rt1[i]) {
2029       signed char s1l,s1h,s2l,s2h,t;
2030       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2031       {
2032         t=get_reg(i_regs->regmap,rt1[i]);
2033         //assert(t>=0);
2034         if(t>=0) {
2035           s1l=get_reg(i_regs->regmap,rs1[i]);
2036           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2037           s2l=get_reg(i_regs->regmap,rs2[i]);
2038           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2039           if(rs2[i]==0) // rx<r0
2040           {
2041             assert(s1h>=0);
2042             if(opcode2[i]==0x2a) // SLT
2043               emit_shrimm(s1h,31,t);
2044             else // SLTU (unsigned cannot be less than zero)
2045               emit_zeroreg(t);
2046           }
2047           else if(rs1[i]==0) // r0<rx
2048           {
2049             assert(s2h>=0);
2050             if(opcode2[i]==0x2a) // SLT
2051               emit_set_gz64_32(s2h,s2l,t);
2052             else // SLTU (set if not zero)
2053               emit_set_nz64_32(s2h,s2l,t);
2054           }
2055           else {
2056             assert(s1l>=0);assert(s1h>=0);
2057             assert(s2l>=0);assert(s2h>=0);
2058             if(opcode2[i]==0x2a) // SLT
2059               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2060             else // SLTU
2061               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2062           }
2063         }
2064       } else {
2065         t=get_reg(i_regs->regmap,rt1[i]);
2066         //assert(t>=0);
2067         if(t>=0) {
2068           s1l=get_reg(i_regs->regmap,rs1[i]);
2069           s2l=get_reg(i_regs->regmap,rs2[i]);
2070           if(rs2[i]==0) // rx<r0
2071           {
2072             assert(s1l>=0);
2073             if(opcode2[i]==0x2a) // SLT
2074               emit_shrimm(s1l,31,t);
2075             else // SLTU (unsigned cannot be less than zero)
2076               emit_zeroreg(t);
2077           }
2078           else if(rs1[i]==0) // r0<rx
2079           {
2080             assert(s2l>=0);
2081             if(opcode2[i]==0x2a) // SLT
2082               emit_set_gz32(s2l,t);
2083             else // SLTU (set if not zero)
2084               emit_set_nz32(s2l,t);
2085           }
2086           else{
2087             assert(s1l>=0);assert(s2l>=0);
2088             if(opcode2[i]==0x2a) // SLT
2089               emit_set_if_less32(s1l,s2l,t);
2090             else // SLTU
2091               emit_set_if_carry32(s1l,s2l,t);
2092           }
2093         }
2094       }
2095     }
2096   }
2097   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2098     if(rt1[i]) {
2099       signed char s1l,s1h,s2l,s2h,th,tl;
2100       tl=get_reg(i_regs->regmap,rt1[i]);
2101       th=get_reg(i_regs->regmap,rt1[i]|64);
2102       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2103       {
2104         assert(tl>=0);
2105         if(tl>=0) {
2106           s1l=get_reg(i_regs->regmap,rs1[i]);
2107           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2108           s2l=get_reg(i_regs->regmap,rs2[i]);
2109           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2110           if(rs1[i]&&rs2[i]) {
2111             assert(s1l>=0);assert(s1h>=0);
2112             assert(s2l>=0);assert(s2h>=0);
2113             if(opcode2[i]==0x24) { // AND
2114               emit_and(s1l,s2l,tl);
2115               emit_and(s1h,s2h,th);
2116             } else
2117             if(opcode2[i]==0x25) { // OR
2118               emit_or(s1l,s2l,tl);
2119               emit_or(s1h,s2h,th);
2120             } else
2121             if(opcode2[i]==0x26) { // XOR
2122               emit_xor(s1l,s2l,tl);
2123               emit_xor(s1h,s2h,th);
2124             } else
2125             if(opcode2[i]==0x27) { // NOR
2126               emit_or(s1l,s2l,tl);
2127               emit_or(s1h,s2h,th);
2128               emit_not(tl,tl);
2129               emit_not(th,th);
2130             }
2131           }
2132           else
2133           {
2134             if(opcode2[i]==0x24) { // AND
2135               emit_zeroreg(tl);
2136               emit_zeroreg(th);
2137             } else
2138             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2139               if(rs1[i]){
2140                 if(s1l>=0) emit_mov(s1l,tl);
2141                 else emit_loadreg(rs1[i],tl);
2142                 if(s1h>=0) emit_mov(s1h,th);
2143                 else emit_loadreg(rs1[i]|64,th);
2144               }
2145               else
2146               if(rs2[i]){
2147                 if(s2l>=0) emit_mov(s2l,tl);
2148                 else emit_loadreg(rs2[i],tl);
2149                 if(s2h>=0) emit_mov(s2h,th);
2150                 else emit_loadreg(rs2[i]|64,th);
2151               }
2152               else{
2153                 emit_zeroreg(tl);
2154                 emit_zeroreg(th);
2155               }
2156             } else
2157             if(opcode2[i]==0x27) { // NOR
2158               if(rs1[i]){
2159                 if(s1l>=0) emit_not(s1l,tl);
2160                 else{
2161                   emit_loadreg(rs1[i],tl);
2162                   emit_not(tl,tl);
2163                 }
2164                 if(s1h>=0) emit_not(s1h,th);
2165                 else{
2166                   emit_loadreg(rs1[i]|64,th);
2167                   emit_not(th,th);
2168                 }
2169               }
2170               else
2171               if(rs2[i]){
2172                 if(s2l>=0) emit_not(s2l,tl);
2173                 else{
2174                   emit_loadreg(rs2[i],tl);
2175                   emit_not(tl,tl);
2176                 }
2177                 if(s2h>=0) emit_not(s2h,th);
2178                 else{
2179                   emit_loadreg(rs2[i]|64,th);
2180                   emit_not(th,th);
2181                 }
2182               }
2183               else {
2184                 emit_movimm(-1,tl);
2185                 emit_movimm(-1,th);
2186               }
2187             }
2188           }
2189         }
2190       }
2191       else
2192       {
2193         // 32 bit
2194         if(tl>=0) {
2195           s1l=get_reg(i_regs->regmap,rs1[i]);
2196           s2l=get_reg(i_regs->regmap,rs2[i]);
2197           if(rs1[i]&&rs2[i]) {
2198             assert(s1l>=0);
2199             assert(s2l>=0);
2200             if(opcode2[i]==0x24) { // AND
2201               emit_and(s1l,s2l,tl);
2202             } else
2203             if(opcode2[i]==0x25) { // OR
2204               emit_or(s1l,s2l,tl);
2205             } else
2206             if(opcode2[i]==0x26) { // XOR
2207               emit_xor(s1l,s2l,tl);
2208             } else
2209             if(opcode2[i]==0x27) { // NOR
2210               emit_or(s1l,s2l,tl);
2211               emit_not(tl,tl);
2212             }
2213           }
2214           else
2215           {
2216             if(opcode2[i]==0x24) { // AND
2217               emit_zeroreg(tl);
2218             } else
2219             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2220               if(rs1[i]){
2221                 if(s1l>=0) emit_mov(s1l,tl);
2222                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2223               }
2224               else
2225               if(rs2[i]){
2226                 if(s2l>=0) emit_mov(s2l,tl);
2227                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2228               }
2229               else emit_zeroreg(tl);
2230             } else
2231             if(opcode2[i]==0x27) { // NOR
2232               if(rs1[i]){
2233                 if(s1l>=0) emit_not(s1l,tl);
2234                 else {
2235                   emit_loadreg(rs1[i],tl);
2236                   emit_not(tl,tl);
2237                 }
2238               }
2239               else
2240               if(rs2[i]){
2241                 if(s2l>=0) emit_not(s2l,tl);
2242                 else {
2243                   emit_loadreg(rs2[i],tl);
2244                   emit_not(tl,tl);
2245                 }
2246               }
2247               else emit_movimm(-1,tl);
2248             }
2249           }
2250         }
2251       }
2252     }
2253   }
2254 }
2255
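// Assemble immediate-operand instructions (LUI/ADDI(U)/DADDI(U)/SLTI(U)/
// ANDI/ORI/XORI).  When constant propagation already knows the source value
// (wasconst/constmap), the operation is folded at compile time; for example
// an ORI on a known-constant register becomes a single
// emit_movimm(constmap[i][sl]|imm[i],tl) instead of a load plus an OR.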
2256 void imm16_assemble(int i,struct regstat *i_regs)
2257 {
2258   if (opcode[i]==0x0f) { // LUI
2259     if(rt1[i]) {
2260       signed char t;
2261       t=get_reg(i_regs->regmap,rt1[i]);
2262       //assert(t>=0);
2263       if(t>=0) {
2264         if(!((i_regs->isconst>>t)&1))
2265           emit_movimm(imm[i]<<16,t);
2266       }
2267     }
2268   }
2269   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2270     if(rt1[i]) {
2271       signed char s,t;
2272       t=get_reg(i_regs->regmap,rt1[i]);
2273       s=get_reg(i_regs->regmap,rs1[i]);
2274       if(rs1[i]) {
2275         //assert(t>=0);
2276         //assert(s>=0);
2277         if(t>=0) {
2278           if(!((i_regs->isconst>>t)&1)) {
2279             if(s<0) {
2280               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2281               emit_addimm(t,imm[i],t);
2282             }else{
2283               if(!((i_regs->wasconst>>s)&1))
2284                 emit_addimm(s,imm[i],t);
2285               else
2286                 emit_movimm(constmap[i][s]+imm[i],t);
2287             }
2288           }
2289         }
2290       } else {
2291         if(t>=0) {
2292           if(!((i_regs->isconst>>t)&1))
2293             emit_movimm(imm[i],t);
2294         }
2295       }
2296     }
2297   }
2298   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2299     if(rt1[i]) {
2300       signed char sh,sl,th,tl;
2301       th=get_reg(i_regs->regmap,rt1[i]|64);
2302       tl=get_reg(i_regs->regmap,rt1[i]);
2303       sh=get_reg(i_regs->regmap,rs1[i]|64);
2304       sl=get_reg(i_regs->regmap,rs1[i]);
2305       if(tl>=0) {
2306         if(rs1[i]) {
2307           assert(sh>=0);
2308           assert(sl>=0);
2309           if(th>=0) {
2310             emit_addimm64_32(sh,sl,imm[i],th,tl);
2311           }
2312           else {
2313             emit_addimm(sl,imm[i],tl);
2314           }
2315         } else {
2316           emit_movimm(imm[i],tl);
2317           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2318         }
2319       }
2320     }
2321   }
2322   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2323     if(rt1[i]) {
2324       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2325       signed char sh,sl,t;
2326       t=get_reg(i_regs->regmap,rt1[i]);
2327       sh=get_reg(i_regs->regmap,rs1[i]|64);
2328       sl=get_reg(i_regs->regmap,rs1[i]);
2329       //assert(t>=0);
2330       if(t>=0) {
2331         if(rs1[i]>0) {
2332           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2333           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2334             if(opcode[i]==0x0a) { // SLTI
2335               if(sl<0) {
2336                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2337                 emit_slti32(t,imm[i],t);
2338               }else{
2339                 emit_slti32(sl,imm[i],t);
2340               }
2341             }
2342             else { // SLTIU
2343               if(sl<0) {
2344                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2345                 emit_sltiu32(t,imm[i],t);
2346               }else{
2347                 emit_sltiu32(sl,imm[i],t);
2348               }
2349             }
2350           }else{ // 64-bit
2351             assert(sl>=0);
2352             if(opcode[i]==0x0a) // SLTI
2353               emit_slti64_32(sh,sl,imm[i],t);
2354             else // SLTIU
2355               emit_sltiu64_32(sh,sl,imm[i],t);
2356           }
2357         }else{
2358           // SLTI(U) with r0 is pointless,
2359           // but examples of it can nonetheless be found
2360           if(opcode[i]==0x0a) { // SLTI
2361             if(0<imm[i]) emit_movimm(1,t);
2362             else emit_zeroreg(t);
2363           } else // SLTIU
2364           {
2365             if(imm[i]) emit_movimm(1,t);
2366             else emit_zeroreg(t);
2367           }
2368         }
2369       }
2370     }
2371   }
2372   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2373     if(rt1[i]) {
2374       signed char sh,sl,th,tl;
2375       th=get_reg(i_regs->regmap,rt1[i]|64);
2376       tl=get_reg(i_regs->regmap,rt1[i]);
2377       sh=get_reg(i_regs->regmap,rs1[i]|64);
2378       sl=get_reg(i_regs->regmap,rs1[i]);
2379       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2380         if(opcode[i]==0x0c) //ANDI
2381         {
2382           if(rs1[i]) {
2383             if(sl<0) {
2384               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2385               emit_andimm(tl,imm[i],tl);
2386             }else{
2387               if(!((i_regs->wasconst>>sl)&1))
2388                 emit_andimm(sl,imm[i],tl);
2389               else
2390                 emit_movimm(constmap[i][sl]&imm[i],tl);
2391             }
2392           }
2393           else
2394             emit_zeroreg(tl);
2395           if(th>=0) emit_zeroreg(th);
2396         }
2397         else
2398         {
2399           if(rs1[i]) {
2400             if(sl<0) {
2401               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2402             }
2403             if(th>=0) {
2404               if(sh<0) {
2405                 emit_loadreg(rs1[i]|64,th);
2406               }else{
2407                 emit_mov(sh,th);
2408               }
2409             }
2410             if(opcode[i]==0x0d) { // ORI
2411               if(sl<0) {
2412                 emit_orimm(tl,imm[i],tl);
2413               }else{
2414                 if(!((i_regs->wasconst>>sl)&1))
2415                   emit_orimm(sl,imm[i],tl);
2416                 else
2417                   emit_movimm(constmap[i][sl]|imm[i],tl);
2418               }
2419             }
2420             if(opcode[i]==0x0e) { // XORI
2421               if(sl<0) {
2422                 emit_xorimm(tl,imm[i],tl);
2423               }else{
2424                 if(!((i_regs->wasconst>>sl)&1))
2425                   emit_xorimm(sl,imm[i],tl);
2426                 else
2427                   emit_movimm(constmap[i][sl]^imm[i],tl);
2428               }
2429             }
2430           }
2431           else {
2432             emit_movimm(imm[i],tl);
2433             if(th>=0) emit_zeroreg(th);
2434           }
2435         }
2436       }
2437     }
2438   }
2439 }
2440
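// Assemble shift-by-immediate instructions (SLL/SRL/SRA and the 64-bit
// DSLL/DSRL/DSRA variants, including the "+32" forms).  A shift amount of
// zero is emitted as a plain register move.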
2441 void shiftimm_assemble(int i,struct regstat *i_regs)
2442 {
2443   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2444   {
2445     if(rt1[i]) {
2446       signed char s,t;
2447       t=get_reg(i_regs->regmap,rt1[i]);
2448       s=get_reg(i_regs->regmap,rs1[i]);
2449       //assert(t>=0);
2450       if(t>=0&&!((i_regs->isconst>>t)&1)){
2451         if(rs1[i]==0)
2452         {
2453           emit_zeroreg(t);
2454         }
2455         else
2456         {
2457           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2458           if(imm[i]) {
2459             if(opcode2[i]==0) // SLL
2460             {
2461               emit_shlimm(s<0?t:s,imm[i],t);
2462             }
2463             if(opcode2[i]==2) // SRL
2464             {
2465               emit_shrimm(s<0?t:s,imm[i],t);
2466             }
2467             if(opcode2[i]==3) // SRA
2468             {
2469               emit_sarimm(s<0?t:s,imm[i],t);
2470             }
2471           }else{
2472             // Shift by zero
2473             if(s>=0 && s!=t) emit_mov(s,t);
2474           }
2475         }
2476       }
2477       //emit_storereg(rt1[i],t); //DEBUG
2478     }
2479   }
2480   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2481   {
2482     if(rt1[i]) {
2483       signed char sh,sl,th,tl;
2484       th=get_reg(i_regs->regmap,rt1[i]|64);
2485       tl=get_reg(i_regs->regmap,rt1[i]);
2486       sh=get_reg(i_regs->regmap,rs1[i]|64);
2487       sl=get_reg(i_regs->regmap,rs1[i]);
2488       if(tl>=0) {
2489         if(rs1[i]==0)
2490         {
2491           emit_zeroreg(tl);
2492           if(th>=0) emit_zeroreg(th);
2493         }
2494         else
2495         {
2496           assert(sl>=0);
2497           assert(sh>=0);
2498           if(imm[i]) {
2499             if(opcode2[i]==0x38) // DSLL
2500             {
2501               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2502               emit_shlimm(sl,imm[i],tl);
2503             }
2504             if(opcode2[i]==0x3a) // DSRL
2505             {
2506               emit_shrdimm(sl,sh,imm[i],tl);
2507               if(th>=0) emit_shrimm(sh,imm[i],th);
2508             }
2509             if(opcode2[i]==0x3b) // DSRA
2510             {
2511               emit_shrdimm(sl,sh,imm[i],tl);
2512               if(th>=0) emit_sarimm(sh,imm[i],th);
2513             }
2514           }else{
2515             // Shift by zero
2516             if(sl!=tl) emit_mov(sl,tl);
2517             if(th>=0&&sh!=th) emit_mov(sh,th);
2518           }
2519         }
2520       }
2521     }
2522   }
2523   if(opcode2[i]==0x3c) // DSLL32
2524   {
2525     if(rt1[i]) {
2526       signed char sl,tl,th;
2527       tl=get_reg(i_regs->regmap,rt1[i]);
2528       th=get_reg(i_regs->regmap,rt1[i]|64);
2529       sl=get_reg(i_regs->regmap,rs1[i]);
2530       if(th>=0||tl>=0){
2531         assert(tl>=0);
2532         assert(th>=0);
2533         assert(sl>=0);
2534         emit_mov(sl,th);
2535         emit_zeroreg(tl);
2536         if(imm[i]>32)
2537         {
2538           emit_shlimm(th,imm[i]&31,th);
2539         }
2540       }
2541     }
2542   }
2543   if(opcode2[i]==0x3e) // DSRL32
2544   {
2545     if(rt1[i]) {
2546       signed char sh,tl,th;
2547       tl=get_reg(i_regs->regmap,rt1[i]);
2548       th=get_reg(i_regs->regmap,rt1[i]|64);
2549       sh=get_reg(i_regs->regmap,rs1[i]|64);
2550       if(tl>=0){
2551         assert(sh>=0);
2552         emit_mov(sh,tl);
2553         if(th>=0) emit_zeroreg(th);
2554         if(imm[i]>32)
2555         {
2556           emit_shrimm(tl,imm[i]&31,tl);
2557         }
2558       }
2559     }
2560   }
2561   if(opcode2[i]==0x3f) // DSRA32
2562   {
2563     if(rt1[i]) {
2564       signed char sh,tl;
2565       tl=get_reg(i_regs->regmap,rt1[i]);
2566       sh=get_reg(i_regs->regmap,rs1[i]|64);
2567       if(tl>=0){
2568         assert(sh>=0);
2569         emit_mov(sh,tl);
2570         if(imm[i]>32)
2571         {
2572           emit_sarimm(tl,imm[i]&31,tl);
2573         }
2574       }
2575     }
2576   }
2577 }
2578
2579 #ifndef shift_assemble
2580 void shift_assemble(int i,struct regstat *i_regs)
2581 {
2582   printf("Need shift_assemble for this architecture.\n");
2583   exit(1);
2584 }
2585 #endif
2586
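// Assemble a load (LB/LH/LW/LBU/LHU/LWU/LD).  If the address is a known
// constant targeting RAM, the access is inlined directly; otherwise
// emit_fastpath_cmp_jump() emits a range check whose out-of-range branch is
// recorded as a LOADx_STUB that handles I/O and invalid addresses at run
// time.  Roughly (a host-specific sketch, not literal output):
//     cmp   addr, RAM_SIZE       ; fast-path range check
//     b.xx  -> LOADx_STUB        ; slow path: I/O, mirrors, exceptions
//     load  rt, [ram_base+addr]  ; fast path: direct RAM access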
2587 void load_assemble(int i,struct regstat *i_regs)
2588 {
2589   int s,th,tl,addr,map=-1;
2590   int offset;
2591   void *jaddr=0;
2592   int memtarget=0,c=0;
2593   int fastload_reg_override=0;
2594   u_int hr,reglist=0;
2595   th=get_reg(i_regs->regmap,rt1[i]|64);
2596   tl=get_reg(i_regs->regmap,rt1[i]);
2597   s=get_reg(i_regs->regmap,rs1[i]);
2598   offset=imm[i];
2599   for(hr=0;hr<HOST_REGS;hr++) {
2600     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2601   }
2602   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2603   if(s>=0) {
2604     c=(i_regs->wasconst>>s)&1;
2605     if (c) {
2606       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2607     }
2608   }
2609   //printf("load_assemble: c=%d\n",c);
2610   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2611   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2612   if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
2613     ||rt1[i]==0) {
2614       // the target could be a hardware FIFO/I/O port, so the read must still
2615       // be performed (this also covers dummy reads with rt==r0)
2616       assem_debug("(forced read)\n");
2617       tl=get_reg(i_regs->regmap,-1);
2618       assert(tl>=0);
2619   }
2620   if(offset||s<0||c) addr=tl;
2621   else addr=s;
2622   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2623  if(tl>=0) {
2624   //printf("load_assemble: c=%d\n",c);
2625   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2626   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2627   reglist&=~(1<<tl);
2628   if(th>=0) reglist&=~(1<<th);
2629   if(!c) {
2630     #ifdef RAM_OFFSET
2631     map=get_reg(i_regs->regmap,ROREG);
2632     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2633     #endif
2634     #ifdef R29_HACK
2635     // Strmnnrmn's speed hack
2636     if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2637     #endif
2638     {
2639       jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2640     }
2641   }
2642   else if(ram_offset&&memtarget) {
2643     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2644     fastload_reg_override=HOST_TEMPREG;
2645   }
2646   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2647   if (opcode[i]==0x20) { // LB
2648     if(!c||memtarget) {
2649       if(!dummy) {
2650         #ifdef HOST_IMM_ADDR32
2651         if(c)
2652           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2653         else
2654         #endif
2655         {
2656           //emit_xorimm(addr,3,tl);
2657           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2658           int x=0,a=tl;
2659 #ifdef BIG_ENDIAN_MIPS
2660           if(!c) emit_xorimm(addr,3,tl);
2661           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2662 #else
2663           if(!c) a=addr;
2664 #endif
2665           if(fastload_reg_override) a=fastload_reg_override;
2666
2667           emit_movsbl_indexed_tlb(x,a,map,tl);
2668         }
2669       }
2670       if(jaddr)
2671         add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2672     }
2673     else
2674       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2675   }
2676   if (opcode[i]==0x21) { // LH
2677     if(!c||memtarget) {
2678       if(!dummy) {
2679         #ifdef HOST_IMM_ADDR32
2680         if(c)
2681           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2682         else
2683         #endif
2684         {
2685           int x=0,a=tl;
2686 #ifdef BIG_ENDIAN_MIPS
2687           if(!c) emit_xorimm(addr,2,tl);
2688           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2689 #else
2690           if(!c) a=addr;
2691 #endif
2692           if(fastload_reg_override) a=fastload_reg_override;
2693           //#ifdef
2694           //emit_movswl_indexed_tlb(x,tl,map,tl);
2695           //else
2696           if(map>=0) {
2697             emit_movswl_indexed(x,a,tl);
2698           }else{
2699             #if 1 //def RAM_OFFSET
2700             emit_movswl_indexed(x,a,tl);
2701             #else
2702             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2703             #endif
2704           }
2705         }
2706       }
2707       if(jaddr)
2708         add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2709     }
2710     else
2711       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2712   }
2713   if (opcode[i]==0x23) { // LW
2714     if(!c||memtarget) {
2715       if(!dummy) {
2716         int a=addr;
2717         if(fastload_reg_override) a=fastload_reg_override;
2718         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2719         #ifdef HOST_IMM_ADDR32
2720         if(c)
2721           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2722         else
2723         #endif
2724         emit_readword_indexed_tlb(0,a,map,tl);
2725       }
2726       if(jaddr)
2727         add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2728     }
2729     else
2730       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2731   }
2732   if (opcode[i]==0x24) { // LBU
2733     if(!c||memtarget) {
2734       if(!dummy) {
2735         #ifdef HOST_IMM_ADDR32
2736         if(c)
2737           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2738         else
2739         #endif
2740         {
2741           //emit_xorimm(addr,3,tl);
2742           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2743           int x=0,a=tl;
2744 #ifdef BIG_ENDIAN_MIPS
2745           if(!c) emit_xorimm(addr,3,tl);
2746           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2747 #else
2748           if(!c) a=addr;
2749 #endif
2750           if(fastload_reg_override) a=fastload_reg_override;
2751
2752           emit_movzbl_indexed_tlb(x,a,map,tl);
2753         }
2754       }
2755       if(jaddr)
2756         add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2757     }
2758     else
2759       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2760   }
2761   if (opcode[i]==0x25) { // LHU
2762     if(!c||memtarget) {
2763       if(!dummy) {
2764         #ifdef HOST_IMM_ADDR32
2765         if(c)
2766           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2767         else
2768         #endif
2769         {
2770           int x=0,a=tl;
2771 #ifdef BIG_ENDIAN_MIPS
2772           if(!c) emit_xorimm(addr,2,tl);
2773           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2774 #else
2775           if(!c) a=addr;
2776 #endif
2777           if(fastload_reg_override) a=fastload_reg_override;
2778           //#ifdef
2779           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2780           //#else
2781           if(map>=0) {
2782             emit_movzwl_indexed(x,a,tl);
2783           }else{
2784             #if 1 //def RAM_OFFSET
2785             emit_movzwl_indexed(x,a,tl);
2786             #else
2787             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
2788             #endif
2789           }
2790         }
2791       }
2792       if(jaddr)
2793         add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2794     }
2795     else
2796       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2797   }
2798   if (opcode[i]==0x27) { // LWU
2799     assert(th>=0);
2800     if(!c||memtarget) {
2801       if(!dummy) {
2802         int a=addr;
2803         if(fastload_reg_override) a=fastload_reg_override;
2804         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2805         #ifdef HOST_IMM_ADDR32
2806         if(c)
2807           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2808         else
2809         #endif
2810         emit_readword_indexed_tlb(0,a,map,tl);
2811       }
2812       if(jaddr)
2813         add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2814     }
2815     else {
2816       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2817     }
2818     emit_zeroreg(th);
2819   }
2820   if (opcode[i]==0x37) { // LD
2821     if(!c||memtarget) {
2822       if(!dummy) {
2823         int a=addr;
2824         if(fastload_reg_override) a=fastload_reg_override;
2825         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2826         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2827         #ifdef HOST_IMM_ADDR32
2828         if(c)
2829           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2830         else
2831         #endif
2832         emit_readdword_indexed_tlb(0,a,map,th,tl);
2833       }
2834       if(jaddr)
2835         add_stub_r(LOADD_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2836     }
2837     else
2838       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2839   }
2840  }
2841 }
2842
2843 #ifndef loadlr_assemble
2844 void loadlr_assemble(int i,struct regstat *i_regs)
2845 {
2846   printf("Need loadlr_assemble for this architecture.\n");
2847   exit(1);
2848 }
2849 #endif
2850
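// Assemble a store (SB/SH/SW/SD).  Uses the same fast-path/stub scheme as
// load_assemble().  After the store, unless NDHACK_NO_SMC_CHECK is set or
// the same base register was already used for a store (waswritten), the
// invalid_code[] byte for the written page is tested and, if the page holds
// compiled code, an INVCODE_STUB (or a direct invalidate call) flushes it;
// this is how self-modifying code is caught.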
2851 void store_assemble(int i,struct regstat *i_regs)
2852 {
2853   int s,th,tl,map=-1;
2854   int addr,temp;
2855   int offset;
2856   void *jaddr=0;
2857   enum stub_type type;
2858   int memtarget=0,c=0;
2859   int agr=AGEN1+(i&1);
2860   int faststore_reg_override=0;
2861   u_int hr,reglist=0;
2862   th=get_reg(i_regs->regmap,rs2[i]|64);
2863   tl=get_reg(i_regs->regmap,rs2[i]);
2864   s=get_reg(i_regs->regmap,rs1[i]);
2865   temp=get_reg(i_regs->regmap,agr);
2866   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2867   offset=imm[i];
2868   if(s>=0) {
2869     c=(i_regs->wasconst>>s)&1;
2870     if(c) {
2871       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2872     }
2873   }
2874   assert(tl>=0);
2875   assert(temp>=0);
2876   for(hr=0;hr<HOST_REGS;hr++) {
2877     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2878   }
2879   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2880   if(offset||s<0||c) addr=temp;
2881   else addr=s;
2882   if(!c) {
2883     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2884   }
2885   else if(ram_offset&&memtarget) {
2886     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2887     faststore_reg_override=HOST_TEMPREG;
2888   }
2889
2890   if (opcode[i]==0x28) { // SB
2891     if(!c||memtarget) {
2892       int x=0,a=temp;
2893 #ifdef BIG_ENDIAN_MIPS
2894       if(!c) emit_xorimm(addr,3,temp);
2895       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2896 #else
2897       if(!c) a=addr;
2898 #endif
2899       if(faststore_reg_override) a=faststore_reg_override;
2900       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2901       emit_writebyte_indexed_tlb(tl,x,a,map,a);
2902     }
2903     type=STOREB_STUB;
2904   }
2905   if (opcode[i]==0x29) { // SH
2906     if(!c||memtarget) {
2907       int x=0,a=temp;
2908 #ifdef BIG_ENDIAN_MIPS
2909       if(!c) emit_xorimm(addr,2,temp);
2910       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2911 #else
2912       if(!c) a=addr;
2913 #endif
2914       if(faststore_reg_override) a=faststore_reg_override;
2915       //#ifdef
2916       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2917       //#else
2918       if(map>=0) {
2919         emit_writehword_indexed(tl,x,a);
2920       }else
2921         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
2922         emit_writehword_indexed(tl,x,a);
2923     }
2924     type=STOREH_STUB;
2925   }
2926   if (opcode[i]==0x2B) { // SW
2927     if(!c||memtarget) {
2928       int a=addr;
2929       if(faststore_reg_override) a=faststore_reg_override;
2930       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
2931       emit_writeword_indexed_tlb(tl,0,a,map,temp);
2932     }
2933     type=STOREW_STUB;
2934   }
2935   if (opcode[i]==0x3F) { // SD
2936     if(!c||memtarget) {
2937       int a=addr;
2938       if(faststore_reg_override) a=faststore_reg_override;
2939       if(rs2[i]) {
2940         assert(th>=0);
2941         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
2942         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
2943         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
2944       }else{
2945         // Store zero
2946         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
2947         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
2948         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
2949       }
2950     }
2951     type=STORED_STUB;
2952   }
2953   if(jaddr) {
2954     // PCSX store handlers don't check invcode again
2955     reglist|=1<<addr;
2956     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2957     jaddr=0;
2958   }
2959   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2960     if(!c||memtarget) {
2961       #ifdef DESTRUCTIVE_SHIFT
2962       // The x86 shift operation is 'destructive'; it overwrites the
2963       // source register, so we need to make a copy first and use that.
2964       addr=temp;
2965       #endif
2966       #if defined(HOST_IMM8)
2967       int ir=get_reg(i_regs->regmap,INVCP);
2968       assert(ir>=0);
2969       emit_cmpmem_indexedsr12_reg(ir,addr,1);
2970       #else
2971       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
2972       #endif
2973       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2974       emit_callne(invalidate_addr_reg[addr]);
2975       #else
2976       void *jaddr2 = out;
2977       emit_jne(0);
2978       add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
2979       #endif
2980     }
2981   }
2982   u_int addr_val=constmap[i][s]+offset;
2983   if(jaddr) {
2984     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2985   } else if(c&&!memtarget) {
2986     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
2987   }
2988   // Basic detection of stores that modify the current block.
2989   // We don't look backwards, as earlier code should already be in the MIPS i-cache.
2990   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
2991     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
2992     assert(i_regs->regmap==regs[i].regmap); // not delay slot
2993     if(i_regs->regmap==regs[i].regmap) {
2994       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
2995       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
2996       emit_movimm(start+i*4+4,0);
2997       emit_writeword(0,(int)&pcaddr);
2998       emit_jmp(do_interrupt);
2999     }
3000   }
3001 }
3002
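// Assemble the unaligned stores SWL/SWR (and SDL/SDR).  The low address bits
// select one of four cases; each case writes only the byte/halfword/word
// portions the instruction is defined to touch, and the SDL/SDR cases write
// the remaining word of the doubleword afterwards.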
3003 void storelr_assemble(int i,struct regstat *i_regs)
3004 {
3005   int s,th,tl;
3006   int temp;
3007   int temp2=-1;
3008   int offset;
3009   void *jaddr=0;
3010   void *case1, *case2, *case3;
3011   void *done0, *done1, *done2;
3012   int memtarget=0,c=0;
3013   int agr=AGEN1+(i&1);
3014   u_int hr,reglist=0;
3015   th=get_reg(i_regs->regmap,rs2[i]|64);
3016   tl=get_reg(i_regs->regmap,rs2[i]);
3017   s=get_reg(i_regs->regmap,rs1[i]);
3018   temp=get_reg(i_regs->regmap,agr);
3019   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3020   offset=imm[i];
3021   if(s>=0) {
3022     c=(i_regs->isconst>>s)&1;
3023     if(c) {
3024       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3025     }
3026   }
3027   assert(tl>=0);
3028   for(hr=0;hr<HOST_REGS;hr++) {
3029     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3030   }
3031   assert(temp>=0);
3032   if(!c) {
3033     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3034     if(!offset&&s!=temp) emit_mov(s,temp);
3035     jaddr=out;
3036     emit_jno(0);
3037   }
3038   else
3039   {
3040     if(!memtarget||!rs1[i]) {
3041       jaddr=out;
3042       emit_jmp(0);
3043     }
3044   }
3045   #ifdef RAM_OFFSET
3046   int map=get_reg(i_regs->regmap,ROREG);
3047   if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3048   #else
3049   if((u_int)rdram!=0x80000000)
3050     emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3051   #endif
3052
3053   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3054     temp2=get_reg(i_regs->regmap,FTEMP);
3055     if(!rs2[i]) temp2=th=tl;
3056   }
3057
3058 #ifndef BIG_ENDIAN_MIPS
3059     emit_xorimm(temp,3,temp);
3060 #endif
3061   emit_testimm(temp,2);
3062   case2=out;
3063   emit_jne(0);
3064   emit_testimm(temp,1);
3065   case1=out;
3066   emit_jne(0);
3067   // 0
3068   if (opcode[i]==0x2A) { // SWL
3069     emit_writeword_indexed(tl,0,temp);
3070   }
3071   if (opcode[i]==0x2E) { // SWR
3072     emit_writebyte_indexed(tl,3,temp);
3073   }
3074   if (opcode[i]==0x2C) { // SDL
3075     emit_writeword_indexed(th,0,temp);
3076     if(rs2[i]) emit_mov(tl,temp2);
3077   }
3078   if (opcode[i]==0x2D) { // SDR
3079     emit_writebyte_indexed(tl,3,temp);
3080     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3081   }
3082   done0=out;
3083   emit_jmp(0);
3084   // 1
3085   set_jump_target(case1, out);
3086   if (opcode[i]==0x2A) { // SWL
3087     // Write 3 msb into three least significant bytes
3088     if(rs2[i]) emit_rorimm(tl,8,tl);
3089     emit_writehword_indexed(tl,-1,temp);
3090     if(rs2[i]) emit_rorimm(tl,16,tl);
3091     emit_writebyte_indexed(tl,1,temp);
3092     if(rs2[i]) emit_rorimm(tl,8,tl);
3093   }
3094   if (opcode[i]==0x2E) { // SWR
3095     // Write two lsb into two most significant bytes
3096     emit_writehword_indexed(tl,1,temp);
3097   }
3098   if (opcode[i]==0x2C) { // SDL
3099     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3100     // Write 3 msb into three least significant bytes
3101     if(rs2[i]) emit_rorimm(th,8,th);
3102     emit_writehword_indexed(th,-1,temp);
3103     if(rs2[i]) emit_rorimm(th,16,th);
3104     emit_writebyte_indexed(th,1,temp);
3105     if(rs2[i]) emit_rorimm(th,8,th);
3106   }
3107   if (opcode[i]==0x2D) { // SDR
3108     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3109     // Write two lsb into two most significant bytes
3110     emit_writehword_indexed(tl,1,temp);
3111   }
3112   done1=out;
3113   emit_jmp(0);
3114   // 2
3115   set_jump_target(case2, out);
3116   emit_testimm(temp,1);
3117   case3=out;
3118   emit_jne(0);
3119   if (opcode[i]==0x2A) { // SWL
3120     // Write two msb into two least significant bytes
3121     if(rs2[i]) emit_rorimm(tl,16,tl);
3122     emit_writehword_indexed(tl,-2,temp);
3123     if(rs2[i]) emit_rorimm(tl,16,tl);
3124   }
3125   if (opcode[i]==0x2E) { // SWR
3126     // Write 3 lsb into three most significant bytes
3127     emit_writebyte_indexed(tl,-1,temp);
3128     if(rs2[i]) emit_rorimm(tl,8,tl);
3129     emit_writehword_indexed(tl,0,temp);
3130     if(rs2[i]) emit_rorimm(tl,24,tl);
3131   }
3132   if (opcode[i]==0x2C) { // SDL
3133     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3134     // Write two msb into two least significant bytes
3135     if(rs2[i]) emit_rorimm(th,16,th);
3136     emit_writehword_indexed(th,-2,temp);
3137     if(rs2[i]) emit_rorimm(th,16,th);
3138   }
3139   if (opcode[i]==0x2D) { // SDR
3140     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3141     // Write 3 lsb into three most significant bytes
3142     emit_writebyte_indexed(tl,-1,temp);
3143     if(rs2[i]) emit_rorimm(tl,8,tl);
3144     emit_writehword_indexed(tl,0,temp);
3145     if(rs2[i]) emit_rorimm(tl,24,tl);
3146   }
3147   done2=out;
3148   emit_jmp(0);
3149   // 3
3150   set_jump_target(case3, out);
3151   if (opcode[i]==0x2A) { // SWL
3152     // Write msb into least significant byte
3153     if(rs2[i]) emit_rorimm(tl,24,tl);
3154     emit_writebyte_indexed(tl,-3,temp);
3155     if(rs2[i]) emit_rorimm(tl,8,tl);
3156   }
3157   if (opcode[i]==0x2E) { // SWR
3158     // Write entire word
3159     emit_writeword_indexed(tl,-3,temp);
3160   }
3161   if (opcode[i]==0x2C) { // SDL
3162     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3163     // Write msb into least significant byte
3164     if(rs2[i]) emit_rorimm(th,24,th);
3165     emit_writebyte_indexed(th,-3,temp);
3166     if(rs2[i]) emit_rorimm(th,8,th);
3167   }
3168   if (opcode[i]==0x2D) { // SDR
3169     if(rs2[i]) emit_mov(th,temp2);
3170     // Write entire word
3171     emit_writeword_indexed(tl,-3,temp);
3172   }
3173   set_jump_target(done0, out);
3174   set_jump_target(done1, out);
3175   set_jump_target(done2, out);
3176   if (opcode[i]==0x2C) { // SDL
3177     emit_testimm(temp,4);
3178     done0=out;
3179     emit_jne(0);
3180     emit_andimm(temp,~3,temp);
3181     emit_writeword_indexed(temp2,4,temp);
3182     set_jump_target(done0, out);
3183   }
3184   if (opcode[i]==0x2D) { // SDR
3185     emit_testimm(temp,4);
3186     done0=out;
3187     emit_jeq(0);
3188     emit_andimm(temp,~3,temp);
3189     emit_writeword_indexed(temp2,-4,temp);
3190     set_jump_target(done0, out);
3191   }
3192   if(!c||!memtarget)
3193     add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
3194   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3195     #ifdef RAM_OFFSET
3196     int map=get_reg(i_regs->regmap,ROREG);
3197     if(map<0) map=HOST_TEMPREG;
3198     gen_orig_addr_w(temp,map);
3199     #else
3200     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3201     #endif
3202     #if defined(HOST_IMM8)
3203     int ir=get_reg(i_regs->regmap,INVCP);
3204     assert(ir>=0);
3205     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3206     #else
3207     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3208     #endif
3209     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3210     emit_callne(invalidate_addr_reg[temp]);
3211     #else
3212     void *jaddr2 = out;
3213     emit_jne(0);
3214     add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3215     #endif
3216   }
3217 }
3218
3219 void c1ls_assemble(int i,struct regstat *i_regs)
3220 {
3221   cop1_unusable(i, i_regs);
3222 }
3223
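// Assemble GTE (COP2) memory access: LWC2 loads a word into FTEMP and then
// cop2_put_dreg() moves it into the selected GTE data register; SWC2 fetches
// the register with cop2_get_dreg() and stores it.  The same fast-path/stub
// and SMC-check machinery as the ordinary loads/stores is reused.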
3224 void c2ls_assemble(int i,struct regstat *i_regs)
3225 {
3226   int s,tl;
3227   int ar;
3228   int offset;
3229   int memtarget=0,c=0;
3230   void *jaddr2=NULL;
3231   enum stub_type type;
3232   int agr=AGEN1+(i&1);
3233   int fastio_reg_override=0;
3234   u_int hr,reglist=0;
3235   u_int copr=(source[i]>>16)&0x1f;
3236   s=get_reg(i_regs->regmap,rs1[i]);
3237   tl=get_reg(i_regs->regmap,FTEMP);
3238   offset=imm[i];
3239   assert(rs1[i]>0);
3240   assert(tl>=0);
3241
3242   for(hr=0;hr<HOST_REGS;hr++) {
3243     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3244   }
3245   if(i_regs->regmap[HOST_CCREG]==CCREG)
3246     reglist&=~(1<<HOST_CCREG);
3247
3248   // get the address
3249   if (opcode[i]==0x3a) { // SWC2
3250     ar=get_reg(i_regs->regmap,agr);
3251     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3252     reglist|=1<<ar;
3253   } else { // LWC2
3254     ar=tl;
3255   }
3256   if(s>=0) c=(i_regs->wasconst>>s)&1;
3257   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3258   if (!offset&&!c&&s>=0) ar=s;
3259   assert(ar>=0);
3260
3261   if (opcode[i]==0x3a) { // SWC2
3262     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3263     type=STOREW_STUB;
3264   }
3265   else
3266     type=LOADW_STUB;
3267
3268   if(c&&!memtarget) {
3269     jaddr2=out;
3270     emit_jmp(0); // inline_readstub/inline_writestub?
3271   }
3272   else {
3273     if(!c) {
3274       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3275     }
3276     else if(ram_offset&&memtarget) {
3277       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3278       fastio_reg_override=HOST_TEMPREG;
3279     }
3280     if (opcode[i]==0x32) { // LWC2
3281       int a=ar; // declared before the #ifdef so the conditional 'else' below binds to a statement
3282       if(fastio_reg_override) a=fastio_reg_override;
3283       #ifdef HOST_IMM_ADDR32
3284       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3285       else
3286       #endif
3287       emit_readword_indexed(0,a,tl);
3288     }
3289     if (opcode[i]==0x3a) { // SWC2
3290       #ifdef DESTRUCTIVE_SHIFT
3291       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3292       #endif
3293       int a=ar;
3294       if(fastio_reg_override) a=fastio_reg_override;
3295       emit_writeword_indexed(tl,0,a);
3296     }
3297   }
3298   if(jaddr2)
3299     add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
3300   if(opcode[i]==0x3a) // SWC2
3301   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3302 #if defined(HOST_IMM8)
3303     int ir=get_reg(i_regs->regmap,INVCP);
3304     assert(ir>=0);
3305     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3306 #else
3307     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3308 #endif
3309     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3310     emit_callne(invalidate_addr_reg[ar]);
3311     #else
3312     void *jaddr3 = out;
3313     emit_jne(0);
3314     add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3315     #endif
3316   }
3317   if (opcode[i]==0x32) { // LWC2
3318     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3319   }
3320 }
3321
3322 #ifndef multdiv_assemble
3323 void multdiv_assemble(int i,struct regstat *i_regs)
3324 {
3325   printf("Need multdiv_assemble for this architecture.\n");
3326   exit(1);
3327 }
3328 #endif
3329
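// MFHI/MFLO/MTHI/MTLO are assembled as plain register moves; by this point
// the decoder is assumed to have set rs1/rt1 to reference HI/LO as needed,
// so only a 32- or 64-bit copy remains.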
3330 void mov_assemble(int i,struct regstat *i_regs)
3331 {
3332   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3333   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3334   if(rt1[i]) {
3335     signed char sh,sl,th,tl;
3336     th=get_reg(i_regs->regmap,rt1[i]|64);
3337     tl=get_reg(i_regs->regmap,rt1[i]);
3338     //assert(tl>=0);
3339     if(tl>=0) {
3340       sh=get_reg(i_regs->regmap,rs1[i]|64);
3341       sl=get_reg(i_regs->regmap,rs1[i]);
3342       if(sl>=0) emit_mov(sl,tl);
3343       else emit_loadreg(rs1[i],tl);
3344       if(th>=0) {
3345         if(sh>=0) emit_mov(sh,th);
3346         else emit_loadreg(rs1[i]|64,th);
3347       }
3348     }
3349   }
3350 }
3351
3352 #ifndef fconv_assemble
3353 void fconv_assemble(int i,struct regstat *i_regs)
3354 {
3355   printf("Need fconv_assemble for this architecture.\n");
3356   exit(1);
3357 }
3358 #endif
3359
3360 #if 0
3361 void float_assemble(int i,struct regstat *i_regs)
3362 {
3363   printf("Need float_assemble for this architecture.\n");
3364   exit(1);
3365 }
3366 #endif
3367
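// SYSCALL and the PCSX HLE BIOS calls don't return to generated code
// directly: they load the current PC into a host register, add the cycles
// consumed so far to the cycle counter, and jump to a C handler
// (jump_syscall_hle / jump_hlecall / jump_intcall).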
3368 void syscall_assemble(int i,struct regstat *i_regs)
3369 {
3370   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3371   assert(ccreg==HOST_CCREG);
3372   assert(!is_delayslot);
3373   (void)ccreg;
3374   emit_movimm(start+i*4,EAX); // Get PC
3375   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3376   emit_jmp(jump_syscall_hle); // XXX
3377 }
3378
3379 void hlecall_assemble(int i,struct regstat *i_regs)
3380 {
3381   extern void psxNULL();
3382   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3383   assert(ccreg==HOST_CCREG);
3384   assert(!is_delayslot);
3385   (void)ccreg;
3386   emit_movimm(start+i*4+4,0); // Get PC
3387   uint32_t hleCode = source[i] & 0x03ffffff;
3388   if (hleCode >= ARRAY_SIZE(psxHLEt))
3389     emit_movimm((int)psxNULL,1);
3390   else
3391     emit_movimm((int)psxHLEt[hleCode],1);
3392   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3393   emit_jmp(jump_hlecall);
3394 }
3395
3396 void intcall_assemble(int i,struct regstat *i_regs)
3397 {
3398   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3399   assert(ccreg==HOST_CCREG);
3400   assert(!is_delayslot);
3401   (void)ccreg;
3402   emit_movimm(start+i*4,0); // Get PC
3403   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3404   emit_jmp(jump_intcall);
3405 }
3406
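// Assemble the instruction in a branch delay slot (is_delayslot is set so
// the per-type assemblers can special-case it where necessary).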
3407 void ds_assemble(int i,struct regstat *i_regs)
3408 {
3409   speculate_register_values(i);
3410   is_delayslot=1;
3411   switch(itype[i]) {
3412     case ALU:
3413       alu_assemble(i,i_regs);break;
3414     case IMM16:
3415       imm16_assemble(i,i_regs);break;
3416     case SHIFT:
3417       shift_assemble(i,i_regs);break;
3418     case SHIFTIMM:
3419       shiftimm_assemble(i,i_regs);break;
3420     case LOAD:
3421       load_assemble(i,i_regs);break;
3422     case LOADLR:
3423       loadlr_assemble(i,i_regs);break;
3424     case STORE:
3425       store_assemble(i,i_regs);break;
3426     case STORELR:
3427       storelr_assemble(i,i_regs);break;
3428     case COP0:
3429       cop0_assemble(i,i_regs);break;
3430     case COP1:
3431       cop1_assemble(i,i_regs);break;
3432     case C1LS:
3433       c1ls_assemble(i,i_regs);break;
3434     case COP2:
3435       cop2_assemble(i,i_regs);break;
3436     case C2LS:
3437       c2ls_assemble(i,i_regs);break;
3438     case C2OP:
3439       c2op_assemble(i,i_regs);break;
3440     case FCONV:
3441       fconv_assemble(i,i_regs);break;
3442     case FLOAT:
3443       float_assemble(i,i_regs);break;
3444     case FCOMP:
3445       fcomp_assemble(i,i_regs);break;
3446     case MULTDIV:
3447       multdiv_assemble(i,i_regs);break;
3448     case MOV:
3449       mov_assemble(i,i_regs);break;
3450     case SYSCALL:
3451     case HLECALL:
3452     case INTCALL:
3453     case SPAN:
3454     case UJUMP:
3455     case RJUMP:
3456     case CJUMP:
3457     case SJUMP:
3458     case FJUMP:
3459       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
3460   }
3461   is_delayslot=0;
3462 }
3463
3464 // Is the branch target a valid internal jump?
3465 int internal_branch(uint64_t i_is32,int addr)
3466 {
3467   if(addr&1) return 0; // Indirect (register) jump
3468   if(addr>=start && addr<start+slen*4-4)
3469   {
3470     //int t=(addr-start)>>2;
3471     // Delay slots are not valid branch targets
3472     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3473     // 64 -> 32 bit transition requires a recompile
3474     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3475     {
3476       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3477       else printf("optimizable: yes\n");
3478     }*/
3479     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3480     return 1;
3481   }
3482   return 0;
3483 }
3484
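// When the register mapping changes from 'pre' to 'entry', write back dirty
// values that are still needed but no longer mapped anywhere, then move values
// that are simply switching host registers (no writeback needed for those).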
3485 #ifndef wb_invalidate
3486 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3487   uint64_t u,uint64_t uu)
3488 {
3489   int hr;
3490   for(hr=0;hr<HOST_REGS;hr++) {
3491     if(hr!=EXCLUDE_REG) {
3492       if(pre[hr]!=entry[hr]) {
3493         if(pre[hr]>=0) {
3494           if((dirty>>hr)&1) {
3495             if(get_reg(entry,pre[hr])<0) {
3496               if(pre[hr]<64) {
3497                 if(!((u>>pre[hr])&1)) {
3498                   emit_storereg(pre[hr],hr);
3499                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3500                     emit_sarimm(hr,31,hr);
3501                     emit_storereg(pre[hr]|64,hr);
3502                   }
3503                 }
3504               }else{
3505                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3506                   emit_storereg(pre[hr],hr);
3507                 }
3508               }
3509             }
3510           }
3511         }
3512       }
3513     }
3514   }
3515   // Move from one register to another (no writeback)
3516   for(hr=0;hr<HOST_REGS;hr++) {
3517     if(hr!=EXCLUDE_REG) {
3518       if(pre[hr]!=entry[hr]) {
3519         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3520           int nr;
3521           if((nr=get_reg(entry,pre[hr]))>=0) {
3522             emit_mov(hr,nr);
3523           }
3524         }
3525       }
3526     }
3527   }
3528 }
3529 #endif
3530
3531 // Load the specified registers
3532 // This only loads the registers given as arguments because
3533 // we don't want to load things that will be overwritten
3534 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3535 {
3536   int hr;
3537   // Load 32-bit regs
3538   for(hr=0;hr<HOST_REGS;hr++) {
3539     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3540       if(entry[hr]!=regmap[hr]) {
3541         if(regmap[hr]==rs1||regmap[hr]==rs2)
3542         {
3543           if(regmap[hr]==0) {
3544             emit_zeroreg(hr);
3545           }
3546           else
3547           {
3548             emit_loadreg(regmap[hr],hr);
3549           }
3550         }
3551       }
3552     }
3553   }
3554   // Load 64-bit regs
3555   for(hr=0;hr<HOST_REGS;hr++) {
3556     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3557       if(entry[hr]!=regmap[hr]) {
3558         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3559         {
3560           assert(regmap[hr]!=64);
3561           if((is32>>(regmap[hr]&63))&1) {
3562             int lr=get_reg(regmap,regmap[hr]-64);
3563             if(lr>=0)
3564               emit_sarimm(lr,31,hr);
3565             else
3566               emit_loadreg(regmap[hr],hr);
3567           }
3568           else
3569           {
3570             emit_loadreg(regmap[hr],hr);
3571           }
3572         }
3573       }
3574     }
3575   }
3576 }
3577
3578 // Load registers prior to the start of a loop
3579 // so that they are not loaded within the loop
3580 static void loop_preload(signed char pre[],signed char entry[])
3581 {
3582   int hr;
3583   for(hr=0;hr<HOST_REGS;hr++) {
3584     if(hr!=EXCLUDE_REG) {
3585       if(pre[hr]!=entry[hr]) {
3586         if(entry[hr]>=0) {
3587           if(get_reg(pre,entry[hr])<0) {
3588             assem_debug("loop preload:\n");
3589             //printf("loop preload: %d\n",hr);
3590             if(entry[hr]==0) {
3591               emit_zeroreg(hr);
3592             }
3593             else if(entry[hr]<TEMPREG)
3594             {
3595               emit_loadreg(entry[hr],hr);
3596             }
3597             else if(entry[hr]-64<TEMPREG)
3598             {
3599               emit_loadreg(entry[hr],hr);
3600             }
3601           }
3602         }
3603       }
3604     }
3605   }
3606 }
3607
3608 // Generate the effective address for a load/store instruction:
3609 // AGEN for stores, FTEMP for LOADLR and cop1/cop2 loads, the destination register (or a temporary) for plain loads
3610 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3611 {
3612   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
3613     int ra=-1;
3614     int agr=AGEN1+(i&1);
3615     if(itype[i]==LOAD) {
3616       ra=get_reg(i_regs->regmap,rt1[i]);
3617       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3618       assert(ra>=0);
3619     }
3620     if(itype[i]==LOADLR) {
3621       ra=get_reg(i_regs->regmap,FTEMP);
3622     }
3623     if(itype[i]==STORE||itype[i]==STORELR) {
3624       ra=get_reg(i_regs->regmap,agr);
3625       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3626     }
3627     if(itype[i]==C1LS||itype[i]==C2LS) {
3628       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
3629         ra=get_reg(i_regs->regmap,FTEMP);
3630       else { // SWC1/SDC1/SWC2/SDC2
3631         ra=get_reg(i_regs->regmap,agr);
3632         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3633       }
3634     }
3635     int rs=get_reg(i_regs->regmap,rs1[i]);
3636     if(ra>=0) {
3637       int offset=imm[i];
3638       int c=(i_regs->wasconst>>rs)&1;
3639       if(rs1[i]==0) {
3640         // Using r0 as a base address
3641         if(!entry||entry[ra]!=agr) {
3642           if (opcode[i]==0x22||opcode[i]==0x26) {
3643             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3644           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3645             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3646           }else{
3647             emit_movimm(offset,ra);
3648           }
3649         } // else did it in the previous cycle
3650       }
3651       else if(rs<0) {
3652         if(!entry||entry[ra]!=rs1[i])
3653           emit_loadreg(rs1[i],ra);
3654         //if(!entry||entry[ra]!=rs1[i])
3655         //  printf("poor load scheduling!\n");
3656       }
3657       else if(c) {
3658         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3659           if(!entry||entry[ra]!=agr) {
3660             if (opcode[i]==0x22||opcode[i]==0x26) {
3661               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3662             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3663               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3664             }else{
3665               #ifdef HOST_IMM_ADDR32
3666               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3667               #endif
3668               emit_movimm(constmap[i][rs]+offset,ra);
3669               regs[i].loadedconst|=1<<ra;
3670             }
3671           } // else did it in the previous cycle
3672         } // else load_consts already did it
3673       }
3674       if(offset&&!c&&rs1[i]) {
3675         if(rs>=0) {
3676           emit_addimm(rs,offset,ra);
3677         }else{
3678           emit_addimm(ra,offset,ra);
3679         }
3680       }
3681     }
3682   }
3683   // Preload constants for next instruction
3684   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
3685     int agr,ra;
3686     // Actual address
3687     agr=AGEN1+((i+1)&1);
3688     ra=get_reg(i_regs->regmap,agr);
3689     if(ra>=0) {
3690       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3691       int offset=imm[i+1];
3692       int c=(regs[i+1].wasconst>>rs)&1;
3693       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3694         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3695           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3696         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3697           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3698         }else{
3699           #ifdef HOST_IMM_ADDR32
3700           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3701           #endif
3702           emit_movimm(constmap[i+1][rs]+offset,ra);
3703           regs[i+1].loadedconst|=1<<ra;
3704         }
3705       }
3706       else if(rs1[i+1]==0) {
3707         // Using r0 as a base address
3708         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3709           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3710         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3711           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3712         }else{
3713           emit_movimm(offset,ra);
3714         }
3715       }
3716     }
3717   }
3718 }
3719
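// Scan forward while host register hr keeps the same propagated constant and
// return the last value it will need (possibly a precomputed load address), so
// intermediate immediate loads can be skipped.
// Returns zero if the constant turns out to be unneeded.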
3720 static int get_final_value(int hr, int i, int *value)
3721 {
3722   int reg=regs[i].regmap[hr];
3723   while(i<slen-1) {
3724     if(regs[i+1].regmap[hr]!=reg) break;
3725     if(!((regs[i+1].isconst>>hr)&1)) break;
3726     if(bt[i+1]) break;
3727     i++;
3728   }
3729   if(i<slen-1) {
3730     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3731       *value=constmap[i][hr];
3732       return 1;
3733     }
3734     if(!bt[i+1]) {
3735       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3736         // Load in delay slot, out-of-order execution
3737         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3738         {
3739           // Precompute load address
3740           *value=constmap[i][hr]+imm[i+2];
3741           return 1;
3742         }
3743       }
3744       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3745       {
3746         // Precompute load address
3747         *value=constmap[i][hr]+imm[i+1];
3748         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3749         return 1;
3750       }
3751     }
3752   }
3753   *value=constmap[i][hr];
3754   //printf("c=%x\n",(int)constmap[i][hr]);
3755   if(i==slen-1) return 1;
3756   if(reg<64) {
3757     return !((unneeded_reg[i+1]>>reg)&1);
3758   }else{
3759     return !((unneeded_reg_upper[i+1]>>reg)&1);
3760   }
3761 }
3762
3763 // Load registers with known constants
3764 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3765 {
3766   int hr,hr2;
3767   // Propagate loaded-constant flags from the previous instruction
3768   if(i==0||bt[i])
3769     regs[i].loadedconst=0;
3770   else {
3771     for(hr=0;hr<HOST_REGS;hr++) {
3772       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
3773          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
3774       {
3775         regs[i].loadedconst|=1<<hr;
3776       }
3777     }
3778   }
3779   // Load 32-bit regs
3780   for(hr=0;hr<HOST_REGS;hr++) {
3781     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3782       //if(entry[hr]!=regmap[hr]) {
3783       if(!((regs[i].loadedconst>>hr)&1)) {
3784         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3785           int value,similar=0;
3786           if(get_final_value(hr,i,&value)) {
3787             // see if some other register has similar value
3788             for(hr2=0;hr2<HOST_REGS;hr2++) {
3789               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
3790                 if(is_similar_value(value,constmap[i][hr2])) {
3791                   similar=1;
3792                   break;
3793                 }
3794               }
3795             }
3796             if(similar) {
3797               int value2;
3798               if(get_final_value(hr2,i,&value2)) // is this needed?
3799                 emit_movimm_from(value2,hr2,value,hr);
3800               else
3801                 emit_movimm(value,hr);
3802             }
3803             else if(value==0) {
3804               emit_zeroreg(hr);
3805             }
3806             else {
3807               emit_movimm(value,hr);
3808             }
3809           }
3810           regs[i].loadedconst|=1<<hr;
3811         }
3812       }
3813     }
3814   }
3815   // Load 64-bit regs
3816   for(hr=0;hr<HOST_REGS;hr++) {
3817     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3818       //if(entry[hr]!=regmap[hr]) {
3819       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3820         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3821           if((is32>>(regmap[hr]&63))&1) {
3822             int lr=get_reg(regmap,regmap[hr]-64);
3823             assert(lr>=0);
3824             emit_sarimm(lr,31,hr);
3825           }
3826           else
3827           {
3828             int value;
3829             if(get_final_value(hr,i,&value)) {
3830               if(value==0) {
3831                 emit_zeroreg(hr);
3832               }
3833               else {
3834                 emit_movimm(value,hr);
3835               }
3836             }
3837           }
3838         }
3839       }
3840     }
3841   }
3842 }
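// Like load_consts, but loads the constant for every dirty constant register
// directly from constmap[i], without look-ahead.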
3843 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3844 {
3845   int hr;
3846   // Load 32-bit regs
3847   for(hr=0;hr<HOST_REGS;hr++) {
3848     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3849       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3850         int value=constmap[i][hr];
3851         if(value==0) {
3852           emit_zeroreg(hr);
3853         }
3854         else {
3855           emit_movimm(value,hr);
3856         }
3857       }
3858     }
3859   }
3860   // Load 64-bit regs
3861   for(hr=0;hr<HOST_REGS;hr++) {
3862     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3863       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3864         if((is32>>(regmap[hr]&63))&1) {
3865           int lr=get_reg(regmap,regmap[hr]-64);
3866           assert(lr>=0);
3867           emit_sarimm(lr,31,hr);
3868         }
3869         else
3870         {
3871           int value=constmap[i][hr];
3872           if(value==0) {
3873             emit_zeroreg(hr);
3874           }
3875           else {
3876             emit_movimm(value,hr);
3877           }
3878         }
3879       }
3880     }
3881   }
3882 }
3883
3884 // Write out all dirty registers (except cycle count)
3885 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3886 {
3887   int hr;
3888   for(hr=0;hr<HOST_REGS;hr++) {
3889     if(hr!=EXCLUDE_REG) {
3890       if(i_regmap[hr]>0) {
3891         if(i_regmap[hr]!=CCREG) {
3892           if((i_dirty>>hr)&1) {
3893             if(i_regmap[hr]<64) {
3894               emit_storereg(i_regmap[hr],hr);
3895             }else{
3896               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3897                 emit_storereg(i_regmap[hr],hr);
3898               }
3899             }
3900           }
3901         }
3902       }
3903     }
3904   }
3905 }
3906 // Write out the dirty registers that will need to be reloaded (paired with load_needed_regs).
3907 // This writes the registers not written by store_regs_bt.
3908 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
3909 {
3910   int hr;
3911   int t=(addr-start)>>2;
3912   for(hr=0;hr<HOST_REGS;hr++) {
3913     if(hr!=EXCLUDE_REG) {
3914       if(i_regmap[hr]>0) {
3915         if(i_regmap[hr]!=CCREG) {
3916           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
3917             if((i_dirty>>hr)&1) {
3918               if(i_regmap[hr]<64) {
3919                 emit_storereg(i_regmap[hr],hr);
3920               }else{
3921                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3922                   emit_storereg(i_regmap[hr],hr);
3923                 }
3924               }
3925             }
3926           }
3927         }
3928       }
3929     }
3930   }
3931 }
3932
3933 // Load all registers (except cycle count)
3934 void load_all_regs(signed char i_regmap[])
3935 {
3936   int hr;
3937   for(hr=0;hr<HOST_REGS;hr++) {
3938     if(hr!=EXCLUDE_REG) {
3939       if(i_regmap[hr]==0) {
3940         emit_zeroreg(hr);
3941       }
3942       else
3943       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3944       {
3945         emit_loadreg(i_regmap[hr],hr);
3946       }
3947     }
3948   }
3949 }
3950
3951 // Load all current registers also needed by next instruction
3952 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
3953 {
3954   int hr;
3955   for(hr=0;hr<HOST_REGS;hr++) {
3956     if(hr!=EXCLUDE_REG) {
3957       if(get_reg(next_regmap,i_regmap[hr])>=0) {
3958         if(i_regmap[hr]==0) {
3959           emit_zeroreg(hr);
3960         }
3961         else
3962         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3963         {
3964           emit_loadreg(i_regmap[hr],hr);
3965         }
3966       }
3967     }
3968   }
3969 }
3970
3971 // Load all regs, storing cycle count if necessary
3972 void load_regs_entry(int t)
3973 {
3974   int hr;
3975   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
3976   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
3977   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
3978     emit_storereg(CCREG,HOST_CCREG);
3979   }
3980   // Load 32-bit regs
3981   for(hr=0;hr<HOST_REGS;hr++) {
3982     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
3983       if(regs[t].regmap_entry[hr]==0) {
3984         emit_zeroreg(hr);
3985       }
3986       else if(regs[t].regmap_entry[hr]!=CCREG)
3987       {
3988         emit_loadreg(regs[t].regmap_entry[hr],hr);
3989       }
3990     }
3991   }
3992   // Load 64-bit regs
3993   for(hr=0;hr<HOST_REGS;hr++) {
3994     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
3995       assert(regs[t].regmap_entry[hr]!=64);
3996       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
3997         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
3998         if(lr<0) {
3999           emit_loadreg(regs[t].regmap_entry[hr],hr);
4000         }
4001         else
4002         {
4003           emit_sarimm(lr,31,hr);
4004         }
4005       }
4006       else
4007       {
4008         emit_loadreg(regs[t].regmap_entry[hr],hr);
4009       }
4010     }
4011   }
4012 }
4013
4014 // Store dirty registers prior to branch
4015 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4016 {
4017   if(internal_branch(i_is32,addr))
4018   {
4019     int t=(addr-start)>>2;
4020     int hr;
4021     for(hr=0;hr<HOST_REGS;hr++) {
4022       if(hr!=EXCLUDE_REG) {
4023         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4024           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4025             if((i_dirty>>hr)&1) {
4026               if(i_regmap[hr]<64) {
4027                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4028                   emit_storereg(i_regmap[hr],hr);
4029                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4030                     #ifdef DESTRUCTIVE_WRITEBACK
4031                     emit_sarimm(hr,31,hr);
4032                     emit_storereg(i_regmap[hr]|64,hr);
4033                     #else
4034                     emit_sarimm(hr,31,HOST_TEMPREG);
4035                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4036                     #endif
4037                   }
4038                 }
4039               }else{
4040                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4041                   emit_storereg(i_regmap[hr],hr);
4042                 }
4043               }
4044             }
4045           }
4046         }
4047       }
4048     }
4049   }
4050   else
4051   {
4052     // Branch out of this block, write out all dirty regs
4053     wb_dirtys(i_regmap,i_is32,i_dirty);
4054   }
4055 }
4056
4057 // Load all needed registers for branch target
4058 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4059 {
4060   //if(addr>=start && addr<(start+slen*4))
4061   if(internal_branch(i_is32,addr))
4062   {
4063     int t=(addr-start)>>2;
4064     int hr;
4065     // Store the cycle count before loading something else
4066     if(i_regmap[HOST_CCREG]!=CCREG) {
4067       assert(i_regmap[HOST_CCREG]==-1);
4068     }
4069     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4070       emit_storereg(CCREG,HOST_CCREG);
4071     }
4072     // Load 32-bit regs
4073     for(hr=0;hr<HOST_REGS;hr++) {
4074       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4075         #ifdef DESTRUCTIVE_WRITEBACK
4076         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4077         #else
4078         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4079         #endif
4080           if(regs[t].regmap_entry[hr]==0) {
4081             emit_zeroreg(hr);
4082           }
4083           else if(regs[t].regmap_entry[hr]!=CCREG)
4084           {
4085             emit_loadreg(regs[t].regmap_entry[hr],hr);
4086           }
4087         }
4088       }
4089     }
4090     // Load 64-bit regs
4091     for(hr=0;hr<HOST_REGS;hr++) {
4092       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4093         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4094           assert(regs[t].regmap_entry[hr]!=64);
4095           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4096             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4097             if(lr<0) {
4098               emit_loadreg(regs[t].regmap_entry[hr],hr);
4099             }
4100             else
4101             {
4102               emit_sarimm(lr,31,hr);
4103             }
4104           }
4105           else
4106           {
4107             emit_loadreg(regs[t].regmap_entry[hr],hr);
4108           }
4109         }
4110         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4111           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4112           assert(lr>=0);
4113           emit_sarimm(lr,31,hr);
4114         }
4115       }
4116     }
4117   }
4118 }
4119
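// Check whether the register allocation at the branch target is compatible
// with the current one, so the branch can be taken without writeback or reload.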
4120 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4121 {
4122   if(addr>=start && addr<start+slen*4-4)
4123   {
4124     int t=(addr-start)>>2;
4125     int hr;
4126     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4127     for(hr=0;hr<HOST_REGS;hr++)
4128     {
4129       if(hr!=EXCLUDE_REG)
4130       {
4131         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4132         {
4133           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4134           {
4135             return 0;
4136           }
4137           else
4138           if((i_dirty>>hr)&1)
4139           {
4140             if(i_regmap[hr]<TEMPREG)
4141             {
4142               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4143                 return 0;
4144             }
4145             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4146             {
4147               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4148                 return 0;
4149             }
4150           }
4151         }
4152         else // Same register but is it 32-bit or dirty?
4153         if(i_regmap[hr]>=0)
4154         {
4155           if(!((regs[t].dirty>>hr)&1))
4156           {
4157             if((i_dirty>>hr)&1)
4158             {
4159               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4160               {
4161                 //printf("%x: dirty no match\n",addr);
4162                 return 0;
4163               }
4164             }
4165           }
4166           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4167           {
4168             //printf("%x: is32 no match\n",addr);
4169             return 0;
4170           }
4171         }
4172       }
4173     }
4174     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4175     // Delay slots are not valid branch targets
4176     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4177     // Delay slots require additional processing, so do not match
4178     if(is_ds[t]) return 0;
4179   }
4180   else
4181   {
4182     int hr;
4183     for(hr=0;hr<HOST_REGS;hr++)
4184     {
4185       if(hr!=EXCLUDE_REG)
4186       {
4187         if(i_regmap[hr]>=0)
4188         {
4189           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4190           {
4191             if((i_dirty>>hr)&1)
4192             {
4193               return 0;
4194             }
4195           }
4196         }
4197       }
4198     }
4199   }
4200   return 1;
4201 }
4202
4203 #ifdef DRC_DBG
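// Debug aid: save the register state, record the PC and call do_insn_cmp to
// cross-check this instruction, then restore the registers.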
4204 static void drc_dbg_emit_do_cmp(int i)
4205 {
4206   extern void do_insn_cmp();
4207   extern int cycle;
4208   u_int hr,reglist=0;
4209
4210   for(hr=0;hr<HOST_REGS;hr++)
4211     if(regs[i].regmap[hr]>=0) reglist|=1<<hr;
4212   save_regs(reglist);
4213   emit_movimm(start+i*4,0);
4214   emit_writeword(0,(int)&pcaddr);
4215   emit_call((int)do_insn_cmp);
4216   //emit_readword((int)&cycle,0);
4217   //emit_addimm(0,2,0);
4218   //emit_writeword(0,(int)&cycle);
4219   restore_regs(reglist);
4220 }
4221 #else
4222 #define drc_dbg_emit_do_cmp(x)
4223 #endif
4224
4225 // Used when a branch jumps into the delay slot of another branch
4226 void ds_assemble_entry(int i)
4227 {
4228   int t=(ba[i]-start)>>2;
4229   if (!instr_addr[t])
4230     instr_addr[t] = out;
4231   assem_debug("Assemble delay slot at %x\n",ba[i]);
4232   assem_debug("<->\n");
4233   drc_dbg_emit_do_cmp(t);
4234   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4235     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4236   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4237   address_generation(t,&regs[t],regs[t].regmap_entry);
4238   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4239     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4240   cop1_usable=0;
4241   is_delayslot=0;
4242   switch(itype[t]) {
4243     case ALU:
4244       alu_assemble(t,&regs[t]);break;
4245     case IMM16:
4246       imm16_assemble(t,&regs[t]);break;
4247     case SHIFT:
4248       shift_assemble(t,&regs[t]);break;
4249     case SHIFTIMM:
4250       shiftimm_assemble(t,&regs[t]);break;
4251     case LOAD:
4252       load_assemble(t,&regs[t]);break;
4253     case LOADLR:
4254       loadlr_assemble(t,&regs[t]);break;
4255     case STORE:
4256       store_assemble(t,&regs[t]);break;
4257     case STORELR:
4258       storelr_assemble(t,&regs[t]);break;
4259     case COP0:
4260       cop0_assemble(t,&regs[t]);break;
4261     case COP1:
4262       cop1_assemble(t,&regs[t]);break;
4263     case C1LS:
4264       c1ls_assemble(t,&regs[t]);break;
4265     case COP2:
4266       cop2_assemble(t,&regs[t]);break;
4267     case C2LS:
4268       c2ls_assemble(t,&regs[t]);break;
4269     case C2OP:
4270       c2op_assemble(t,&regs[t]);break;
4271     case FCONV:
4272       fconv_assemble(t,&regs[t]);break;
4273     case FLOAT:
4274       float_assemble(t,&regs[t]);break;
4275     case FCOMP:
4276       fcomp_assemble(t,&regs[t]);break;
4277     case MULTDIV:
4278       multdiv_assemble(t,&regs[t]);break;
4279     case MOV:
4280       mov_assemble(t,&regs[t]);break;
4281     case SYSCALL:
4282     case HLECALL:
4283     case INTCALL:
4284     case SPAN:
4285     case UJUMP:
4286     case RJUMP:
4287     case CJUMP:
4288     case SJUMP:
4289     case FJUMP:
4290       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4291   }
4292   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4293   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4294   if(internal_branch(regs[t].is32,ba[i]+4))
4295     assem_debug("branch: internal\n");
4296   else
4297     assem_debug("branch: external\n");
4298   assert(internal_branch(regs[t].is32,ba[i]+4));
4299   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4300   emit_jmp(0);
4301 }
4302
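// Emit the cycle count test before a branch: detect idle loops, apply any
// cycle adjustment, and add a CC_STUB for the out-of-line interrupt check.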
4303 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4304 {
4305   int count;
4306   void *jaddr;
4307   void *idle=NULL;
4308   int t=0;
4309   if(itype[i]==RJUMP)
4310   {
4311     *adj=0;
4312   }
4313   //if(ba[i]>=start && ba[i]<(start+slen*4))
4314   if(internal_branch(branch_regs[i].is32,ba[i]))
4315   {
4316     t=(ba[i]-start)>>2;
4317     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4318     else *adj=ccadj[t];
4319   }
4320   else
4321   {
4322     *adj=0;
4323   }
4324   count=ccadj[i];
4325   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4326     // Idle loop
4327     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4328     idle=out;
4329     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4330     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4331     jaddr=out;
4332     emit_jmp(0);
4333   }
4334   else if(*adj==0||invert) {
4335     int cycles=CLOCK_ADJUST(count+2);
4336     // faster loop HACK
4337     if (t&&*adj) {
4338       int rel=t-i;
4339       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4340         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4341     }
4342     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4343     jaddr=out;
4344     emit_jns(0);
4345   }
4346   else
4347   {
4348     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4349     jaddr=out;
4350     emit_jns(0);
4351   }
4352   add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4353 }
4354
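// Generate the out-of-line cycle count stub: write back dirty registers, store
// the return PC in pcaddr (re-evaluating the branch condition if necessary),
// call cc_interrupt, then reload registers and jump back to the compiled code.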
4355 static void do_ccstub(int n)
4356 {
4357   literal_pool(256);
4358   assem_debug("do_ccstub %x\n",start+stubs[n].b*4);
4359   set_jump_target(stubs[n].addr, out);
4360   int i=stubs[n].b;
4361   if(stubs[n].d==NULLDS) {
4362     // Delay slot instruction is nullified ("likely" branch)
4363     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4364   }
4365   else if(stubs[n].d!=TAKEN) {
4366     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4367   }
4368   else {
4369     if(internal_branch(branch_regs[i].is32,ba[i]))
4370       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4371   }
4372   if(stubs[n].c!=-1)
4373   {
4374     // Save PC as return address
4375     emit_movimm(stubs[n].c,EAX);
4376     emit_writeword(EAX,(int)&pcaddr);
4377   }
4378   else
4379   {
4380     // Return address depends on which way the branch goes
4381     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4382     {
4383       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4384       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4385       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4386       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4387       if(rs1[i]==0)
4388       {
4389         s1l=s2l;s1h=s2h;
4390         s2l=s2h=-1;
4391       }
4392       else if(rs2[i]==0)
4393       {
4394         s2l=s2h=-1;
4395       }
4396       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4397         s1h=s2h=-1;
4398       }
4399       assert(s1l>=0);
4400       #ifdef DESTRUCTIVE_WRITEBACK
4401       if(rs1[i]) {
4402         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4403           emit_loadreg(rs1[i],s1l);
4404       }
4405       else {
4406         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4407           emit_loadreg(rs2[i],s1l);
4408       }
4409       if(s2l>=0)
4410         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4411           emit_loadreg(rs2[i],s2l);
4412       #endif
4413       int hr=0;
4414       int addr=-1,alt=-1,ntaddr=-1;
4415       while(hr<HOST_REGS)
4416       {
4417         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4418            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4419            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4420         {
4421           addr=hr++;break;
4422         }
4423         hr++;
4424       }
4425       while(hr<HOST_REGS)
4426       {
4427         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4428            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4429            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4430         {
4431           alt=hr++;break;
4432         }
4433         hr++;
4434       }
4435       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4436       {
4437         while(hr<HOST_REGS)
4438         {
4439           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4440              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4441              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4442           {
4443             ntaddr=hr;break;
4444           }
4445           hr++;
4446         }
4447         assert(hr<HOST_REGS);
4448       }
4449       if((opcode[i]&0x2f)==4) // BEQ
4450       {
4451         #ifdef HAVE_CMOV_IMM
4452         if(s1h<0) {
4453           if(s2l>=0) emit_cmp(s1l,s2l);
4454           else emit_test(s1l,s1l);
4455           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4456         }
4457         else
4458         #endif
4459         {
4460           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4461           if(s1h>=0) {
4462             if(s2h>=0) emit_cmp(s1h,s2h);
4463             else emit_test(s1h,s1h);
4464             emit_cmovne_reg(alt,addr);
4465           }
4466           if(s2l>=0) emit_cmp(s1l,s2l);
4467           else emit_test(s1l,s1l);
4468           emit_cmovne_reg(alt,addr);
4469         }
4470       }
4471       if((opcode[i]&0x2f)==5) // BNE
4472       {
4473         #ifdef HAVE_CMOV_IMM
4474         if(s1h<0) {
4475           if(s2l>=0) emit_cmp(s1l,s2l);
4476           else emit_test(s1l,s1l);
4477           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4478         }
4479         else
4480         #endif
4481         {
4482           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4483           if(s1h>=0) {
4484             if(s2h>=0) emit_cmp(s1h,s2h);
4485             else emit_test(s1h,s1h);
4486             emit_cmovne_reg(alt,addr);
4487           }
4488           if(s2l>=0) emit_cmp(s1l,s2l);
4489           else emit_test(s1l,s1l);
4490           emit_cmovne_reg(alt,addr);
4491         }
4492       }
4493       if((opcode[i]&0x2f)==6) // BLEZ
4494       {
4495         //emit_movimm(ba[i],alt);
4496         //emit_movimm(start+i*4+8,addr);
4497         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4498         emit_cmpimm(s1l,1);
4499         if(s1h>=0) emit_mov(addr,ntaddr);
4500         emit_cmovl_reg(alt,addr);
4501         if(s1h>=0) {
4502           emit_test(s1h,s1h);
4503           emit_cmovne_reg(ntaddr,addr);
4504           emit_cmovs_reg(alt,addr);
4505         }
4506       }
4507       if((opcode[i]&0x2f)==7) // BGTZ
4508       {
4509         //emit_movimm(ba[i],addr);
4510         //emit_movimm(start+i*4+8,ntaddr);
4511         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4512         emit_cmpimm(s1l,1);
4513         if(s1h>=0) emit_mov(addr,alt);
4514         emit_cmovl_reg(ntaddr,addr);
4515         if(s1h>=0) {
4516           emit_test(s1h,s1h);
4517           emit_cmovne_reg(alt,addr);
4518           emit_cmovs_reg(ntaddr,addr);
4519         }
4520       }
4521       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4522       {
4523         //emit_movimm(ba[i],alt);
4524         //emit_movimm(start+i*4+8,addr);
4525         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4526         if(s1h>=0) emit_test(s1h,s1h);
4527         else emit_test(s1l,s1l);
4528         emit_cmovs_reg(alt,addr);
4529       }
4530       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4531       {
4532         //emit_movimm(ba[i],addr);
4533         //emit_movimm(start+i*4+8,alt);
4534         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4535         if(s1h>=0) emit_test(s1h,s1h);
4536         else emit_test(s1l,s1l);
4537         emit_cmovs_reg(alt,addr);
4538       }
4539       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4540         if(source[i]&0x10000) // BC1T
4541         {
4542           //emit_movimm(ba[i],alt);
4543           //emit_movimm(start+i*4+8,addr);
4544           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4545           emit_testimm(s1l,0x800000);
4546           emit_cmovne_reg(alt,addr);
4547         }
4548         else // BC1F
4549         {
4550           //emit_movimm(ba[i],addr);
4551           //emit_movimm(start+i*4+8,alt);
4552           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4553           emit_testimm(s1l,0x800000);
4554           emit_cmovne_reg(alt,addr);
4555         }
4556       }
4557       emit_writeword(addr,(int)&pcaddr);
4558     }
4559     else
4560     if(itype[i]==RJUMP)
4561     {
4562       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4563       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4564         r=get_reg(branch_regs[i].regmap,RTEMP);
4565       }
4566       emit_writeword(r,(int)&pcaddr);
4567     }
4568     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
4569   }
4570   // Update cycle count
4571   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4572   if(stubs[n].a) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n].a),HOST_CCREG);
4573   emit_call((int)cc_interrupt);
4574   if(stubs[n].a) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n].a),HOST_CCREG);
4575   if(stubs[n].d==TAKEN) {
4576     if(internal_branch(branch_regs[i].is32,ba[i]))
4577       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4578     else if(itype[i]==RJUMP) {
4579       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4580         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4581       else
4582         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4583     }
4584   }else if(stubs[n].d==NOTTAKEN) {
4585     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4586     else load_all_regs(branch_regs[i].regmap);
4587   }else if(stubs[n].d==NULLDS) {
4588     // Delay slot instruction is nullified ("likely" branch)
4589     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4590     else load_all_regs(regs[i].regmap);
4591   }else{
4592     load_all_regs(branch_regs[i].regmap);
4593   }
4594   emit_jmp(stubs[n].retaddr);
4595 }
4596
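// Record a patch site so the emitted jump can later be linked to its target block.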
4597 static void add_to_linker(int addr,int target,int ext)
4598 {
4599   link_addr[linkcount][0]=addr;
4600   link_addr[linkcount][1]=target;
4601   link_addr[linkcount][2]=ext;
4602   linkcount++;
4603 }
4604
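// Write the return address (PC+8) into the link register for JAL.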
4605 static void ujump_assemble_write_ra(int i)
4606 {
4607   int rt;
4608   unsigned int return_address;
4609   rt=get_reg(branch_regs[i].regmap,31);
4610   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4611   //assert(rt>=0);
4612   return_address=start+i*4+8;
4613   if(rt>=0) {
4614     #ifdef USE_MINI_HT
4615     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
4616       int temp=-1; // note: must be ds-safe
4617       #ifdef HOST_TEMPREG
4618       temp=HOST_TEMPREG;
4619       #endif
4620       if(temp>=0) do_miniht_insert(return_address,rt,temp);
4621       else emit_movimm(return_address,rt);
4622     }
4623     else
4624     #endif
4625     {
4626       #ifdef REG_PREFETCH
4627       if(temp>=0)
4628       {
4629         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table_get(return_address),temp);
4630       }
4631       #endif
4632       emit_movimm(return_address,rt); // PC into link register
4633       #ifdef IMM_PREFETCH
4634       emit_prefetch(hash_table_get(return_address));
4635       #endif
4636     }
4637   }
4638 }
4639
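// Assemble an unconditional jump (J/JAL) together with its delay slot.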
4640 void ujump_assemble(int i,struct regstat *i_regs)
4641 {
4642   int ra_done=0;
4643   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4644   address_generation(i+1,i_regs,regs[i].regmap_entry);
4645   #ifdef REG_PREFETCH
4646   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4647   if(rt1[i]==31&&temp>=0)
4648   {
4649     signed char *i_regmap=i_regs->regmap;
4650     int return_address=start+i*4+8;
4651     if(get_reg(branch_regs[i].regmap,31)>0)
4652     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table_get(return_address),temp);
4653   }
4654   #endif
4655   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4656     ujump_assemble_write_ra(i); // the delay slot reads $ra, so set it before assembling the DS
4657     ra_done=1;
4658   }
4659   ds_assemble(i+1,i_regs);
4660   uint64_t bc_unneeded=branch_regs[i].u;
4661   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4662   bc_unneeded|=1|(1LL<<rt1[i]);
4663   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4664   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4665                 bc_unneeded,bc_unneeded_upper);
4666   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4667   if(!ra_done&&rt1[i]==31)
4668     ujump_assemble_write_ra(i);
4669   int cc,adj;
4670   cc=get_reg(branch_regs[i].regmap,CCREG);
4671   assert(cc==HOST_CCREG);
4672   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4673   #ifdef REG_PREFETCH
4674   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4675   #endif
4676   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4677   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4678   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4679   if(internal_branch(branch_regs[i].is32,ba[i]))
4680     assem_debug("branch: internal\n");
4681   else
4682     assem_debug("branch: external\n");
4683   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4684     ds_assemble_entry(i);
4685   }
4686   else {
4687     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4688     emit_jmp(0);
4689   }
4690 }
4691
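// Write the return address (PC+8) into the link register for JALR.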
4692 static void rjump_assemble_write_ra(int i)
4693 {
4694   int rt,return_address;
4695   assert(rt1[i+1]!=rt1[i]);
4696   assert(rt2[i+1]!=rt1[i]);
4697   rt=get_reg(branch_regs[i].regmap,rt1[i]);
4698   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4699   assert(rt>=0);
4700   return_address=start+i*4+8;
4701   #ifdef REG_PREFETCH
4702   if(temp>=0)
4703   {
4704     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table_get(return_address),temp);
4705   }
4706   #endif
4707   emit_movimm(return_address,rt); // PC into link register
4708   #ifdef IMM_PREFETCH
4709   emit_prefetch(hash_table_get(return_address));
4710   #endif
4711 }
4712
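// Assemble a register jump (JR/JALR) together with its delay slot.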
4713 void rjump_assemble(int i,struct regstat *i_regs)
4714 {
4715   int temp;
4716   int rs,cc;
4717   int ra_done=0;
4718   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4719   assert(rs>=0);
4720   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4721     // Delay slot abuse, make a copy of the branch address register
4722     temp=get_reg(branch_regs[i].regmap,RTEMP);
4723     assert(temp>=0);
4724     assert(regs[i].regmap[temp]==RTEMP);
4725     emit_mov(rs,temp);
4726     rs=temp;
4727   }
4728   address_generation(i+1,i_regs,regs[i].regmap_entry);
4729   #ifdef REG_PREFETCH
4730   if(rt1[i]==31)
4731   {
4732     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4733       signed char *i_regmap=i_regs->regmap;
4734       int return_address=start+i*4+8;
4735       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table_get(return_address),temp);
4736     }
4737   }
4738   #endif
4739   #ifdef USE_MINI_HT
4740   if(rs1[i]==31) {
4741     int rh=get_reg(regs[i].regmap,RHASH);
4742     if(rh>=0) do_preload_rhash(rh);
4743   }
4744   #endif
4745   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4746     rjump_assemble_write_ra(i);
4747     ra_done=1;
4748   }
4749   ds_assemble(i+1,i_regs);
4750   uint64_t bc_unneeded=branch_regs[i].u;
4751   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4752   bc_unneeded|=1|(1LL<<rt1[i]);
4753   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4754   bc_unneeded&=~(1LL<<rs1[i]);
4755   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4756                 bc_unneeded,bc_unneeded_upper);
4757   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4758   if(!ra_done&&rt1[i]!=0)
4759     rjump_assemble_write_ra(i);
4760   cc=get_reg(branch_regs[i].regmap,CCREG);
4761   assert(cc==HOST_CCREG);
4762   (void)cc;
4763   #ifdef USE_MINI_HT
4764   int rh=get_reg(branch_regs[i].regmap,RHASH);
4765   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4766   if(rs1[i]==31) {
4767     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4768     do_preload_rhtbl(ht);
4769     do_rhash(rs,rh);
4770   }
4771   #endif
4772   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4773   #ifdef DESTRUCTIVE_WRITEBACK
4774   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4775     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4776       emit_loadreg(rs1[i],rs);
4777     }
4778   }
4779   #endif
4780   #ifdef REG_PREFETCH
4781   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4782   #endif
4783   #ifdef USE_MINI_HT
4784   if(rs1[i]==31) {
4785     do_miniht_load(ht,rh);
4786   }
4787   #endif
4788   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4789   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4790   //assert(adj==0);
4791   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4792   add_stub(CC_STUB,out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4793   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
4794     // special case for RFE
4795     emit_jmp(0);
4796   else
4797     emit_jns(0);
4798   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4799   #ifdef USE_MINI_HT
4800   if(rs1[i]==31) {
4801     do_miniht_jump(rs,rh,ht);
4802   }
4803   else
4804   #endif
4805   {
4806     emit_jmp(jump_vaddr_reg[rs]);
4807   }
4808   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4809   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4810   #endif
4811 }
4812
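// Assemble a conditional branch (BEQ/BNE/BLEZ/BGTZ); the delay slot may be
// scheduled before (out-of-order) or after (in-order) the compare.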
4813 void cjump_assemble(int i,struct regstat *i_regs)
4814 {
4815   signed char *i_regmap=i_regs->regmap;
4816   int cc;
4817   int match;
4818   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4819   assem_debug("match=%d\n",match);
4820   int s1h,s1l,s2h,s2l;
4821   int prev_cop1_usable=cop1_usable;
4822   int unconditional=0,nop=0;
4823   int only32=0;
4824   int invert=0;
4825   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4826   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4827   if(!match) invert=1;
4828   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4829   if(i>(ba[i]-start)>>2) invert=1;
4830   #endif
4831
4832   if(ooo[i]) {
4833     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4834     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4835     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4836     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4837   }
4838   else {
4839     s1l=get_reg(i_regmap,rs1[i]);
4840     s1h=get_reg(i_regmap,rs1[i]|64);
4841     s2l=get_reg(i_regmap,rs2[i]);
4842     s2h=get_reg(i_regmap,rs2[i]|64);
4843   }
4844   if(rs1[i]==0&&rs2[i]==0)
4845   {
4846     if(opcode[i]&1) nop=1;
4847     else unconditional=1;
4848     //assert(opcode[i]!=5);
4849     //assert(opcode[i]!=7);
4850     //assert(opcode[i]!=0x15);
4851     //assert(opcode[i]!=0x17);
4852   }
4853   else if(rs1[i]==0)
4854   {
4855     s1l=s2l;s1h=s2h;
4856     s2l=s2h=-1;
4857     only32=(regs[i].was32>>rs2[i])&1;
4858   }
4859   else if(rs2[i]==0)
4860   {
4861     s2l=s2h=-1;
4862     only32=(regs[i].was32>>rs1[i])&1;
4863   }
4864   else {
4865     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
4866   }
4867
4868   if(ooo[i]) {
4869     // Out of order execution (delay slot first)
4870     //printf("OOOE\n");
4871     address_generation(i+1,i_regs,regs[i].regmap_entry);
4872     ds_assemble(i+1,i_regs);
4873     int adj;
4874     uint64_t bc_unneeded=branch_regs[i].u;
4875     uint64_t bc_unneeded_upper=branch_regs[i].uu;
4876     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
4877     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
4878     bc_unneeded|=1;
4879     bc_unneeded_upper|=1;
4880     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4881                   bc_unneeded,bc_unneeded_upper);
4882     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
4883     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4884     cc=get_reg(branch_regs[i].regmap,CCREG);
4885     assert(cc==HOST_CCREG);
4886     if(unconditional)
4887       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4888     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
4889     //assem_debug("cycle count (adj)\n");
4890     if(unconditional) {
4891       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4892       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
4893         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4894         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4895         if(internal)
4896           assem_debug("branch: internal\n");
4897         else
4898           assem_debug("branch: external\n");
4899         if(internal&&is_ds[(ba[i]-start)>>2]) {
4900           ds_assemble_entry(i);
4901         }
4902         else {
4903           add_to_linker((int)out,ba[i],internal);
4904           emit_jmp(0);
4905         }
4906         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4907         if(((u_int)out)&7) emit_addnop(0);
4908         #endif
4909       }
4910     }
4911     else if(nop) {
4912       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
4913       void *jaddr=out;
4914       emit_jns(0);
4915       add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
4916     }
4917     else {
4918       void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
4919       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
4920       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4921       if(!only32)
4922       {
4923         assert(s1h>=0);
4924         if(opcode[i]==4) // BEQ
4925         {
4926           if(s2h>=0) emit_cmp(s1h,s2h);
4927           else emit_test(s1h,s1h);
4928           nottaken1=out;
4929           emit_jne(1);
4930         }
4931         if(opcode[i]==5) // BNE
4932         {
4933           if(s2h>=0) emit_cmp(s1h,s2h);
4934           else emit_test(s1h,s1h);
4935           if(invert) taken=out;
4936           else add_to_linker((int)out,ba[i],internal);
4937           emit_jne(0);
4938         }
4939         if(opcode[i]==6) // BLEZ
4940         {
4941           emit_test(s1h,s1h);
4942           if(invert) taken=out;
4943           else add_to_linker((int)out,ba[i],internal);
4944           emit_js(0);
4945           nottaken1=out;
4946           emit_jne(1);
4947         }
4948         if(opcode[i]==7) // BGTZ
4949         {
4950           emit_test(s1h,s1h);
4951           nottaken1=out;
4952           emit_js(1);
4953           if(invert) taken=out;
4954           else add_to_linker((int)out,ba[i],internal);
4955           emit_jne(0);
4956         }
4957       } // if(!only32)
4958
4959       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4960       assert(s1l>=0);
4961       if(opcode[i]==4) // BEQ
4962       {
4963         if(s2l>=0) emit_cmp(s1l,s2l);
4964         else emit_test(s1l,s1l);
4965         if(invert){
4966           nottaken=out;
4967           emit_jne(1);
4968         }else{
4969           add_to_linker((int)out,ba[i],internal);
4970           emit_jeq(0);
4971         }
4972       }
4973       if(opcode[i]==5) // BNE
4974       {
4975         if(s2l>=0) emit_cmp(s1l,s2l);
4976         else emit_test(s1l,s1l);
4977         if(invert){
4978           nottaken=out;
4979           emit_jeq(1);
4980         }else{
4981           add_to_linker((int)out,ba[i],internal);
4982           emit_jne(0);
4983         }
4984       }
4985       if(opcode[i]==6) // BLEZ
4986       {
4987         emit_cmpimm(s1l,1);
4988         if(invert){
4989           nottaken=out;
4990           emit_jge(1);
4991         }else{
4992           add_to_linker((int)out,ba[i],internal);
4993           emit_jl(0);
4994         }
4995       }
4996       if(opcode[i]==7) // BGTZ
4997       {
4998         emit_cmpimm(s1l,1);
4999         if(invert){
5000           nottaken=out;
5001           emit_jl(1);
5002         }else{
5003           add_to_linker((int)out,ba[i],internal);
5004           emit_jge(0);
5005         }
5006       }
5007       if(invert) {
5008         if(taken) set_jump_target(taken, out);
5009         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5010         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5011           if(adj) {
5012             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5013             add_to_linker((int)out,ba[i],internal);
5014           }else{
5015             emit_addnop(13);
5016             add_to_linker((int)out,ba[i],internal*2);
5017           }
5018           emit_jmp(0);
5019         }else
5020         #endif
5021         {
5022           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5023           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5024           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5025           if(internal)
5026             assem_debug("branch: internal\n");
5027           else
5028             assem_debug("branch: external\n");
5029           if(internal&&is_ds[(ba[i]-start)>>2]) {
5030             ds_assemble_entry(i);
5031           }
5032           else {
5033             add_to_linker((int)out,ba[i],internal);
5034             emit_jmp(0);
5035           }
5036         }
5037         set_jump_target(nottaken, out);
5038       }
5039
5040       if(nottaken1) set_jump_target(nottaken1, out);
5041       if(adj) {
5042         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5043       }
5044     } // (!unconditional)
5045   } // if(ooo)
5046   else
5047   {
5048     // In-order execution (branch first)
5049     //if(likely[i]) printf("IOL\n");
5050     //else
5051     //printf("IOE\n");
5052     void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
5053     if(!unconditional&&!nop) {
5054       if(!only32)
5055       {
5056         assert(s1h>=0);
5057         if((opcode[i]&0x2f)==4) // BEQ
5058         {
5059           if(s2h>=0) emit_cmp(s1h,s2h);
5060           else emit_test(s1h,s1h);
5061           nottaken1=out;
5062           emit_jne(2);
5063         }
5064         if((opcode[i]&0x2f)==5) // BNE
5065         {
5066           if(s2h>=0) emit_cmp(s1h,s2h);
5067           else emit_test(s1h,s1h);
5068           taken=out;
5069           emit_jne(1);
5070         }
5071         if((opcode[i]&0x2f)==6) // BLEZ
5072         {
5073           emit_test(s1h,s1h);
5074           taken=out;
5075           emit_js(1);
5076           nottaken1=out;
5077           emit_jne(2);
5078         }
5079         if((opcode[i]&0x2f)==7) // BGTZ
5080         {
5081           emit_test(s1h,s1h);
5082           nottaken1=out;
5083           emit_js(2);
5084           taken=out;
5085           emit_jne(1);
5086         }
5087       } // if(!only32)
5088
5089       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5090       assert(s1l>=0);
5091       if((opcode[i]&0x2f)==4) // BEQ
5092       {
5093         if(s2l>=0) emit_cmp(s1l,s2l);
5094         else emit_test(s1l,s1l);
5095         nottaken=out;
5096         emit_jne(2);
5097       }
5098       if((opcode[i]&0x2f)==5) // BNE
5099       {
5100         if(s2l>=0) emit_cmp(s1l,s2l);
5101         else emit_test(s1l,s1l);
5102         nottaken=out;
5103         emit_jeq(2);
5104       }
5105       if((opcode[i]&0x2f)==6) // BLEZ
5106       {
5107         emit_cmpimm(s1l,1);
5108         nottaken=out;
5109         emit_jge(2);
5110       }
5111       if((opcode[i]&0x2f)==7) // BGTZ
5112       {
5113         emit_cmpimm(s1l,1);
5114         nottaken=out;
5115         emit_jl(2);
5116       }
5117     } // if(!unconditional)
5118     int adj;
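    // Registers read by the delay slot must stay live, so clear them from the unneeded set.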
5119     uint64_t ds_unneeded=branch_regs[i].u;
5120     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5121     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5122     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5123     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5124     ds_unneeded|=1;
5125     ds_unneeded_upper|=1;
5126     // branch taken
5127     if(!nop) {
5128       if(taken) set_jump_target(taken, out);
5129       assem_debug("1:\n");
5130       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5131                     ds_unneeded,ds_unneeded_upper);
5132       // load regs
5133       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5134       address_generation(i+1,&branch_regs[i],0);
5135       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5136       ds_assemble(i+1,&branch_regs[i]);
5137       cc=get_reg(branch_regs[i].regmap,CCREG);
5138       if(cc==-1) {
5139         emit_loadreg(CCREG,cc=HOST_CCREG);
5140         // CHECK: Is the following instruction (fall thru) allocated ok?
5141       }
5142       assert(cc==HOST_CCREG);
5143       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5144       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5145       assem_debug("cycle count (adj)\n");
5146       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5147       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5148       if(internal)
5149         assem_debug("branch: internal\n");
5150       else
5151         assem_debug("branch: external\n");
5152       if(internal&&is_ds[(ba[i]-start)>>2]) {
5153         ds_assemble_entry(i);
5154       }
5155       else {
5156         add_to_linker((int)out,ba[i],internal);
5157         emit_jmp(0);
5158       }
5159     }
5160     // branch not taken
5161     cop1_usable=prev_cop1_usable;
5162     if(!unconditional) {
5163       if(nottaken1) set_jump_target(nottaken1, out);
5164       set_jump_target(nottaken, out);
5165       assem_debug("2:\n");
5166       if(!likely[i]) {
5167         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5168                       ds_unneeded,ds_unneeded_upper);
5169         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5170         address_generation(i+1,&branch_regs[i],0);
5171         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5172         ds_assemble(i+1,&branch_regs[i]);
5173       }
5174       cc=get_reg(branch_regs[i].regmap,CCREG);
5175       if(cc==-1&&!likely[i]) {
5176         // Cycle count isn't in a register, temporarily load it then write it out
5177         emit_loadreg(CCREG,HOST_CCREG);
5178         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5179         void *jaddr=out;
5180         emit_jns(0);
5181         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5182         emit_storereg(CCREG,HOST_CCREG);
5183       }
5184       else{
5185         cc=get_reg(i_regmap,CCREG);
5186         assert(cc==HOST_CCREG);
5187         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5188         void *jaddr=out;
5189         emit_jns(0);
5190         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5191       }
5192     }
5193   }
5194 }
5195
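// Assemble a REGIMM branch (BLTZ/BGEZ and their -AL link variants) along with its delay slot.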
5196 void sjump_assemble(int i,struct regstat *i_regs)
5197 {
5198   signed char *i_regmap=i_regs->regmap;
5199   int cc;
5200   int match;
5201   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5202   assem_debug("smatch=%d\n",match);
5203   int s1h,s1l;
5204   int prev_cop1_usable=cop1_usable;
5205   int unconditional=0,nevertaken=0;
5206   int only32=0;
5207   int invert=0;
5208   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5209   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5210   if(!match) invert=1;
5211   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5212   if(i>(ba[i]-start)>>2) invert=1;
5213   #endif
5214
5215   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5216   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5217
5218   if(ooo[i]) {
5219     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5220     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5221   }
5222   else {
5223     s1l=get_reg(i_regmap,rs1[i]);
5224     s1h=get_reg(i_regmap,rs1[i]|64);
5225   }
5226   if(rs1[i]==0)
5227   {
5228     if(opcode2[i]&1) unconditional=1;
5229     else nevertaken=1;
5230     // These are never taken (r0 is never less than zero)
5231     //assert(opcode2[i]!=0);
5232     //assert(opcode2[i]!=2);
5233     //assert(opcode2[i]!=0x10);
5234     //assert(opcode2[i]!=0x12);
5235   }
5236   else {
5237     only32=(regs[i].was32>>rs1[i])&1;
5238   }
5239
5240   if(ooo[i]) {
5241     // Out of order execution (delay slot first)
5242     //printf("OOOE\n");
5243     address_generation(i+1,i_regs,regs[i].regmap_entry);
5244     ds_assemble(i+1,i_regs);
5245     int adj;
5246     uint64_t bc_unneeded=branch_regs[i].u;
5247     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5248     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5249     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5250     bc_unneeded|=1;
5251     bc_unneeded_upper|=1;
5252     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5253                   bc_unneeded,bc_unneeded_upper);
5254     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5255     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5256     if(rt1[i]==31) {
5257       int rt,return_address;
5258       rt=get_reg(branch_regs[i].regmap,31);
5259       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5260       if(rt>=0) {
5261         // Save the PC even if the branch is not taken
5262         return_address=start+i*4+8;
5263         emit_movimm(return_address,rt); // PC into link register
5264         #ifdef IMM_PREFETCH
5265         if(!nevertaken) emit_prefetch(hash_table_get(return_address));
5266         #endif
5267       }
5268     }
5269     cc=get_reg(branch_regs[i].regmap,CCREG);
5270     assert(cc==HOST_CCREG);
5271     if(unconditional)
5272       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5273     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5274     assem_debug("cycle count (adj)\n");
5275     if(unconditional) {
5276       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5277       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5278         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5279         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5280         if(internal)
5281           assem_debug("branch: internal\n");
5282         else
5283           assem_debug("branch: external\n");
5284         if(internal&&is_ds[(ba[i]-start)>>2]) {
5285           ds_assemble_entry(i);
5286         }
5287         else {
5288           add_to_linker((int)out,ba[i],internal);
5289           emit_jmp(0);
5290         }
5291         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5292         if(((u_int)out)&7) emit_addnop(0);
5293         #endif
5294       }
5295     }
5296     else if(nevertaken) {
5297       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5298       void *jaddr=out;
5299       emit_jns(0);
5300       add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5301     }
5302     else {
5303       void *nottaken = NULL;
5304       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5305       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5306       if(!only32)
5307       {
5308         assert(s1h>=0);
5309         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5310         {
5311           emit_test(s1h,s1h);
5312           if(invert){
5313             nottaken=out;
5314             emit_jns(1);
5315           }else{
5316             add_to_linker((int)out,ba[i],internal);
5317             emit_js(0);
5318           }
5319         }
5320         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5321         {
5322           emit_test(s1h,s1h);
5323           if(invert){
5324             nottaken=out;
5325             emit_js(1);
5326           }else{
5327             add_to_linker((int)out,ba[i],internal);
5328             emit_jns(0);
5329           }
5330         }
5331       } // if(!only32)
5332       else
5333       {
5334         assert(s1l>=0);
5335         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5336         {
5337           emit_test(s1l,s1l);
5338           if(invert){
5339             nottaken=out;
5340             emit_jns(1);
5341           }else{
5342             add_to_linker((int)out,ba[i],internal);
5343             emit_js(0);
5344           }
5345         }
5346         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5347         {
5348           emit_test(s1l,s1l);
5349           if(invert){
5350             nottaken=out;
5351             emit_js(1);
5352           }else{
5353             add_to_linker((int)out,ba[i],internal);
5354             emit_jns(0);
5355           }
5356         }
5357       } // if(!only32)
5358
5359       if(invert) {
5360         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5361         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5362           if(adj) {
5363             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5364             add_to_linker((int)out,ba[i],internal);
5365           }else{
5366             emit_addnop(13);
5367             add_to_linker((int)out,ba[i],internal*2);
5368           }
5369           emit_jmp(0);
5370         }else
5371         #endif
5372         {
5373           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5374           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5375           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5376           if(internal)
5377             assem_debug("branch: internal\n");
5378           else
5379             assem_debug("branch: external\n");
5380           if(internal&&is_ds[(ba[i]-start)>>2]) {
5381             ds_assemble_entry(i);
5382           }
5383           else {
5384             add_to_linker((int)out,ba[i],internal);
5385             emit_jmp(0);
5386           }
5387         }
5388         set_jump_target(nottaken, out);
5389       }
5390
5391       if(adj) {
5392         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5393       }
5394     } // (!unconditional)
5395   } // if(ooo)
5396   else
5397   {
5398     // In-order execution (branch first)
5399     //printf("IOE\n");
5400     void *nottaken = NULL;
5401     if(rt1[i]==31) {
5402       int rt,return_address;
5403       rt=get_reg(branch_regs[i].regmap,31);
5404       if(rt>=0) {
5405         // Save the PC even if the branch is not taken
5406         return_address=start+i*4+8;
5407         emit_movimm(return_address,rt); // PC into link register
5408         #ifdef IMM_PREFETCH
5409         emit_prefetch(hash_table_get(return_address));
5410         #endif
5411       }
5412     }
5413     if(!unconditional) {
5414       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5415       if(!only32)
5416       {
5417         assert(s1h>=0);
5418         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5419         {
5420           emit_test(s1h,s1h);
5421           nottaken=out;
5422           emit_jns(1);
5423         }
5424         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5425         {
5426           emit_test(s1h,s1h);
5427           nottaken=out;
5428           emit_js(1);
5429         }
5430       } // if(!only32)
5431       else
5432       {
5433         assert(s1l>=0);
5434         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5435         {
5436           emit_test(s1l,s1l);
5437           nottaken=out;
5438           emit_jns(1);
5439         }
5440         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5441         {
5442           emit_test(s1l,s1l);
5443           nottaken=out;
5444           emit_js(1);
5445         }
5446       }
5447     } // if(!unconditional)
5448     int adj;
5449     uint64_t ds_unneeded=branch_regs[i].u;
5450     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5451     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5452     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5453     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5454     ds_unneeded|=1;
5455     ds_unneeded_upper|=1;
5456     // branch taken
5457     if(!nevertaken) {
5458       //assem_debug("1:\n");
5459       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5460                     ds_unneeded,ds_unneeded_upper);
5461       // load regs
5462       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5463       address_generation(i+1,&branch_regs[i],0);
5464       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5465       ds_assemble(i+1,&branch_regs[i]);
5466       cc=get_reg(branch_regs[i].regmap,CCREG);
5467       if(cc==-1) {
5468         emit_loadreg(CCREG,cc=HOST_CCREG);
5469         // CHECK: Is the following instruction (fall thru) allocated ok?
5470       }
5471       assert(cc==HOST_CCREG);
5472       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5473       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5474       assem_debug("cycle count (adj)\n");
5475       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5476       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5477       if(internal)
5478         assem_debug("branch: internal\n");
5479       else
5480         assem_debug("branch: external\n");
5481       if(internal&&is_ds[(ba[i]-start)>>2]) {
5482         ds_assemble_entry(i);
5483       }
5484       else {
5485         add_to_linker((int)out,ba[i],internal);
5486         emit_jmp(0);
5487       }
5488     }
5489     // branch not taken
5490     cop1_usable=prev_cop1_usable;
5491     if(!unconditional) {
5492       set_jump_target(nottaken, out);
5493       assem_debug("1:\n");
5494       if(!likely[i]) {
5495         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5496                       ds_unneeded,ds_unneeded_upper);
5497         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5498         address_generation(i+1,&branch_regs[i],0);
5499         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5500         ds_assemble(i+1,&branch_regs[i]);
5501       }
5502       cc=get_reg(branch_regs[i].regmap,CCREG);
5503       if(cc==-1&&!likely[i]) {
5504         // Cycle count isn't in a register, temporarily load it then write it out
5505         emit_loadreg(CCREG,HOST_CCREG);
5506         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5507         void *jaddr=out;
5508         emit_jns(0);
5509         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5510         emit_storereg(CCREG,HOST_CCREG);
5511       }
5512       else{
5513         cc=get_reg(i_regmap,CCREG);
5514         assert(cc==HOST_CCREG);
5515         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5516         void *jaddr=out;
5517         emit_jns(0);
5518         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5519       }
5520     }
5521   }
5522 }
5523
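// Assemble a coprocessor 1 condition branch (BC1F/BC1T) and its delay slot.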
5524 void fjump_assemble(int i,struct regstat *i_regs)
5525 {
5526   signed char *i_regmap=i_regs->regmap;
5527   int cc;
5528   int match;
5529   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5530   assem_debug("fmatch=%d\n",match);
5531   int fs,cs;
5532   void *eaddr;
5533   int invert=0;
5534   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5535   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5536   if(!match) invert=1;
5537   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5538   if(i>(ba[i]-start)>>2) invert=1;
5539   #endif
5540
5541   if(ooo[i]) {
5542     fs=get_reg(branch_regs[i].regmap,FSREG);
5543     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5544   }
5545   else {
5546     fs=get_reg(i_regmap,FSREG);
5547   }
5548
5549   // Check cop1 unusable
5550   if(!cop1_usable) {
5551     cs=get_reg(i_regmap,CSREG);
5552     assert(cs>=0);
5553     emit_testimm(cs,0x20000000);
5554     eaddr=out;
5555     emit_jeq(0);
5556     add_stub_r(FP_STUB,eaddr,out,i,cs,i_regs,0,0);
5557     cop1_usable=1;
5558   }
5559
5560   if(ooo[i]) {
5561     // Out of order execution (delay slot first)
5562     //printf("OOOE\n");
5563     ds_assemble(i+1,i_regs);
5564     int adj;
5565     uint64_t bc_unneeded=branch_regs[i].u;
5566     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5567     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5568     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5569     bc_unneeded|=1;
5570     bc_unneeded_upper|=1;
5571     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5572                   bc_unneeded,bc_unneeded_upper);
5573     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5574     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5575     cc=get_reg(branch_regs[i].regmap,CCREG);
5576     assert(cc==HOST_CCREG);
5577     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5578     assem_debug("cycle count (adj)\n");
5579     if(1) {
5580       void *nottaken = NULL;
5581       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5582       if(1) {
5583         assert(fs>=0);
5584         emit_testimm(fs,0x800000);
5585         if(source[i]&0x10000) // BC1T
5586         {
5587           if(invert){
5588             nottaken=out;
5589             emit_jeq(1);
5590           }else{
5591             add_to_linker((int)out,ba[i],internal);
5592             emit_jne(0);
5593           }
5594         }
5595         else // BC1F
5596         {
5597           if(invert){
5598             nottaken=out;
5599             emit_jne(1);
5600           }else{
5601             add_to_linker((int)out,ba[i],internal);
5602             emit_jeq(0);
5603           }
5604         }
5605       }
5606
5607       if(invert) {
5608         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5609         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5610         else if(match) emit_addnop(13);
5611         #endif
5612         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5613         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5614         if(internal)
5615           assem_debug("branch: internal\n");
5616         else
5617           assem_debug("branch: external\n");
5618         if(internal&&is_ds[(ba[i]-start)>>2]) {
5619           ds_assemble_entry(i);
5620         }
5621         else {
5622           add_to_linker((int)out,ba[i],internal);
5623           emit_jmp(0);
5624         }
5625         set_jump_target(nottaken, out);
5626       }
5627
5628       if(adj) {
5629         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5630       }
5631     }
5632   } // if(ooo)
5633   else
5634   {
5635     // In-order execution (branch first)
5636     //printf("IOE\n");
5637     void *nottaken = NULL;
5638     if(1) {
5639       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5640       if(1) {
5641         assert(fs>=0);
5642         emit_testimm(fs,0x800000);
5643         if(source[i]&0x10000) // BC1T
5644         {
5645           nottaken=out;
5646           emit_jeq(1);
5647         }
5648         else // BC1F
5649         {
5650           nottaken=out;
5651           emit_jne(1);
5652         }
5653       }
5654     }
5655     int adj;
5656     uint64_t ds_unneeded=branch_regs[i].u;
5657     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5658     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5659     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5660     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5661     ds_unneeded|=1;
5662     ds_unneeded_upper|=1;
5663     // branch taken
5664     //assem_debug("1:\n");
5665     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5666                   ds_unneeded,ds_unneeded_upper);
5667     // load regs
5668     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5669     address_generation(i+1,&branch_regs[i],0);
5670     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5671     ds_assemble(i+1,&branch_regs[i]);
5672     cc=get_reg(branch_regs[i].regmap,CCREG);
5673     if(cc==-1) {
5674       emit_loadreg(CCREG,cc=HOST_CCREG);
5675       // CHECK: Is the following instruction (fall thru) allocated ok?
5676     }
5677     assert(cc==HOST_CCREG);
5678     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5679     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5680     assem_debug("cycle count (adj)\n");
5681     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5682     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5683     if(internal)
5684       assem_debug("branch: internal\n");
5685     else
5686       assem_debug("branch: external\n");
5687     if(internal&&is_ds[(ba[i]-start)>>2]) {
5688       ds_assemble_entry(i);
5689     }
5690     else {
5691       add_to_linker((int)out,ba[i],internal);
5692       emit_jmp(0);
5693     }
5694
5695     // branch not taken
5696     if(1) { // <- FIXME (don't need this)
5697       set_jump_target(nottaken, out);
5698       assem_debug("1:\n");
5699       if(!likely[i]) {
5700         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5701                       ds_unneeded,ds_unneeded_upper);
5702         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5703         address_generation(i+1,&branch_regs[i],0);
5704         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5705         ds_assemble(i+1,&branch_regs[i]);
5706       }
5707       cc=get_reg(branch_regs[i].regmap,CCREG);
5708       if(cc==-1&&!likely[i]) {
5709         // Cycle count isn't in a register, temporarily load it then write it out
5710         emit_loadreg(CCREG,HOST_CCREG);
5711         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5712         void *jaddr=out;
5713         emit_jns(0);
5714         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5715         emit_storereg(CCREG,HOST_CCREG);
5716       }
5717       else{
5718         cc=get_reg(i_regmap,CCREG);
5719         assert(cc==HOST_CCREG);
5720         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5721         void *jaddr=out;
5722         emit_jns(0);
5723         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5724       }
5725     }
5726   }
5727 }
5728
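// Assemble a branch that spans a page boundary: its delay slot falls on the next page
// and is assembled as the start of the following block (see pagespan_ds below).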
5729 static void pagespan_assemble(int i,struct regstat *i_regs)
5730 {
5731   int s1l=get_reg(i_regs->regmap,rs1[i]);
5732   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5733   int s2l=get_reg(i_regs->regmap,rs2[i]);
5734   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5735   void *taken = NULL;
5736   void *nottaken = NULL;
5737   int unconditional=0;
5738   if(rs1[i]==0)
5739   {
5740     s1l=s2l;s1h=s2h;
5741     s2l=s2h=-1;
5742   }
5743   else if(rs2[i]==0)
5744   {
5745     s2l=s2h=-1;
5746   }
5747   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5748     s1h=s2h=-1;
5749   }
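  // Pick host registers (addr/alt/ntaddr) to hold the candidate branch target addresses.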
5750   int hr=0;
5751   int addr=-1,alt=-1,ntaddr=-1;
5752   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5753   else {
5754     while(hr<HOST_REGS)
5755     {
5756       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5757          (i_regs->regmap[hr]&63)!=rs1[i] &&
5758          (i_regs->regmap[hr]&63)!=rs2[i] )
5759       {
5760         addr=hr++;break;
5761       }
5762       hr++;
5763     }
5764   }
5765   while(hr<HOST_REGS)
5766   {
5767     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5768        (i_regs->regmap[hr]&63)!=rs1[i] &&
5769        (i_regs->regmap[hr]&63)!=rs2[i] )
5770     {
5771       alt=hr++;break;
5772     }
5773     hr++;
5774   }
5775   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5776   {
5777     while(hr<HOST_REGS)
5778     {
5779       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5780          (i_regs->regmap[hr]&63)!=rs1[i] &&
5781          (i_regs->regmap[hr]&63)!=rs2[i] )
5782       {
5783         ntaddr=hr;break;
5784       }
5785       hr++;
5786     }
5787   }
5788   assert(hr<HOST_REGS);
5789   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
5790     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
5791   }
5792   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5793   if(opcode[i]==2) // J
5794   {
5795     unconditional=1;
5796   }
5797   if(opcode[i]==3) // JAL
5798   {
5799     // TODO: mini_ht
5800     int rt=get_reg(i_regs->regmap,31);
5801     emit_movimm(start+i*4+8,rt);
5802     unconditional=1;
5803   }
5804   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
5805   {
5806     emit_mov(s1l,addr);
5807     if(opcode2[i]==9) // JALR
5808     {
5809       int rt=get_reg(i_regs->regmap,rt1[i]);
5810       emit_movimm(start+i*4+8,rt);
5811     }
5812   }
5813   if((opcode[i]&0x3f)==4) // BEQ
5814   {
5815     if(rs1[i]==rs2[i])
5816     {
5817       unconditional=1;
5818     }
5819     else
5820     #ifdef HAVE_CMOV_IMM
5821     if(s1h<0) {
5822       if(s2l>=0) emit_cmp(s1l,s2l);
5823       else emit_test(s1l,s1l);
5824       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5825     }
5826     else
5827     #endif
5828     {
5829       assert(s1l>=0);
5830       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5831       if(s1h>=0) {
5832         if(s2h>=0) emit_cmp(s1h,s2h);
5833         else emit_test(s1h,s1h);
5834         emit_cmovne_reg(alt,addr);
5835       }
5836       if(s2l>=0) emit_cmp(s1l,s2l);
5837       else emit_test(s1l,s1l);
5838       emit_cmovne_reg(alt,addr);
5839     }
5840   }
5841   if((opcode[i]&0x3f)==5) // BNE
5842   {
5843     #ifdef HAVE_CMOV_IMM
5844     if(s1h<0) {
5845       if(s2l>=0) emit_cmp(s1l,s2l);
5846       else emit_test(s1l,s1l);
5847       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5848     }
5849     else
5850     #endif
5851     {
5852       assert(s1l>=0);
5853       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5854       if(s1h>=0) {
5855         if(s2h>=0) emit_cmp(s1h,s2h);
5856         else emit_test(s1h,s1h);
5857         emit_cmovne_reg(alt,addr);
5858       }
5859       if(s2l>=0) emit_cmp(s1l,s2l);
5860       else emit_test(s1l,s1l);
5861       emit_cmovne_reg(alt,addr);
5862     }
5863   }
5864   if((opcode[i]&0x3f)==0x14) // BEQL
5865   {
5866     if(s1h>=0) {
5867       if(s2h>=0) emit_cmp(s1h,s2h);
5868       else emit_test(s1h,s1h);
5869       nottaken=out;
5870       emit_jne(0);
5871     }
5872     if(s2l>=0) emit_cmp(s1l,s2l);
5873     else emit_test(s1l,s1l);
5874     if(nottaken) set_jump_target(nottaken, out);
5875     nottaken=out;
5876     emit_jne(0);
5877   }
5878   if((opcode[i]&0x3f)==0x15) // BNEL
5879   {
5880     if(s1h>=0) {
5881       if(s2h>=0) emit_cmp(s1h,s2h);
5882       else emit_test(s1h,s1h);
5883       taken=out;
5884       emit_jne(0);
5885     }
5886     if(s2l>=0) emit_cmp(s1l,s2l);
5887     else emit_test(s1l,s1l);
5888     nottaken=out;
5889     emit_jeq(0);
5890     if(taken) set_jump_target(taken, out);
5891   }
5892   if((opcode[i]&0x3f)==6) // BLEZ
5893   {
5894     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5895     emit_cmpimm(s1l,1);
5896     if(s1h>=0) emit_mov(addr,ntaddr);
5897     emit_cmovl_reg(alt,addr);
5898     if(s1h>=0) {
5899       emit_test(s1h,s1h);
5900       emit_cmovne_reg(ntaddr,addr);
5901       emit_cmovs_reg(alt,addr);
5902     }
5903   }
5904   if((opcode[i]&0x3f)==7) // BGTZ
5905   {
5906     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5907     emit_cmpimm(s1l,1);
5908     if(s1h>=0) emit_mov(addr,alt);
5909     emit_cmovl_reg(ntaddr,addr);
5910     if(s1h>=0) {
5911       emit_test(s1h,s1h);
5912       emit_cmovne_reg(alt,addr);
5913       emit_cmovs_reg(ntaddr,addr);
5914     }
5915   }
5916   if((opcode[i]&0x3f)==0x16) // BLEZL
5917   {
5918     assert((opcode[i]&0x3f)!=0x16);
5919   }
5920   if((opcode[i]&0x3f)==0x17) // BGTZL
5921   {
5922     assert((opcode[i]&0x3f)!=0x17);
5923   }
5924   assert(opcode[i]!=1); // BLTZ/BGEZ
5925
5926   //FIXME: Check CSREG
5927   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5928     if((source[i]&0x30000)==0) // BC1F
5929     {
5930       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5931       emit_testimm(s1l,0x800000);
5932       emit_cmovne_reg(alt,addr);
5933     }
5934     if((source[i]&0x30000)==0x10000) // BC1T
5935     {
5936       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5937       emit_testimm(s1l,0x800000);
5938       emit_cmovne_reg(alt,addr);
5939     }
5940     if((source[i]&0x30000)==0x20000) // BC1FL
5941     {
5942       emit_testimm(s1l,0x800000);
5943       nottaken=out;
5944       emit_jne(0);
5945     }
5946     if((source[i]&0x30000)==0x30000) // BC1TL
5947     {
5948       emit_testimm(s1l,0x800000);
5949       nottaken=out;
5950       emit_jeq(0);
5951     }
5952   }
5953
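  // Write back dirty registers and leave the chosen target address in BTREG for the next block.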
5954   assert(i_regs->regmap[HOST_CCREG]==CCREG);
5955   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
5956   if(likely[i]||unconditional)
5957   {
5958     emit_movimm(ba[i],HOST_BTREG);
5959   }
5960   else if(addr!=HOST_BTREG)
5961   {
5962     emit_mov(addr,HOST_BTREG);
5963   }
5964   void *branch_addr=out;
5965   emit_jmp(0);
5966   int target_addr=start+i*4+5;
5967   void *stub=out;
5968   void *compiled_target_addr=check_addr(target_addr);
5969   emit_extjump_ds((int)branch_addr,target_addr);
5970   if(compiled_target_addr) {
5971     set_jump_target(branch_addr, compiled_target_addr);
5972     add_link(target_addr,stub);
5973   }
5974   else set_jump_target(branch_addr, stub);
5975   if(likely[i]) {
5976     // Not-taken path
5977     set_jump_target(nottaken, out);
5978     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
5979     void *branch_addr=out;
5980     emit_jmp(0);
5981     int target_addr=start+i*4+8;
5982     void *stub=out;
5983     void *compiled_target_addr=check_addr(target_addr);
5984     emit_extjump_ds((int)branch_addr,target_addr);
5985     if(compiled_target_addr) {
5986       set_jump_target(branch_addr, compiled_target_addr);
5987       add_link(target_addr,stub);
5988     }
5989     else set_jump_target(branch_addr, stub);
5990   }
5991 }
5992
5993 // Assemble the delay slot for the above
5994 static void pagespan_ds()
5995 {
5996   assem_debug("initial delay slot:\n");
5997   u_int vaddr=start+1;
5998   u_int page=get_page(vaddr);
5999   u_int vpage=get_vpage(vaddr);
6000   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6001   do_dirty_stub_ds();
6002   ll_add(jump_in+page,vaddr,(void *)out);
6003   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6004   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6005     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6006   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6007     emit_writeword(HOST_BTREG,(int)&branch_target);
6008   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6009   address_generation(0,&regs[0],regs[0].regmap_entry);
6010   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6011     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6012   cop1_usable=0;
6013   is_delayslot=0;
6014   switch(itype[0]) {
6015     case ALU:
6016       alu_assemble(0,&regs[0]);break;
6017     case IMM16:
6018       imm16_assemble(0,&regs[0]);break;
6019     case SHIFT:
6020       shift_assemble(0,&regs[0]);break;
6021     case SHIFTIMM:
6022       shiftimm_assemble(0,&regs[0]);break;
6023     case LOAD:
6024       load_assemble(0,&regs[0]);break;
6025     case LOADLR:
6026       loadlr_assemble(0,&regs[0]);break;
6027     case STORE:
6028       store_assemble(0,&regs[0]);break;
6029     case STORELR:
6030       storelr_assemble(0,&regs[0]);break;
6031     case COP0:
6032       cop0_assemble(0,&regs[0]);break;
6033     case COP1:
6034       cop1_assemble(0,&regs[0]);break;
6035     case C1LS:
6036       c1ls_assemble(0,&regs[0]);break;
6037     case COP2:
6038       cop2_assemble(0,&regs[0]);break;
6039     case C2LS:
6040       c2ls_assemble(0,&regs[0]);break;
6041     case C2OP:
6042       c2op_assemble(0,&regs[0]);break;
6043     case FCONV:
6044       fconv_assemble(0,&regs[0]);break;
6045     case FLOAT:
6046       float_assemble(0,&regs[0]);break;
6047     case FCOMP:
6048       fcomp_assemble(0,&regs[0]);break;
6049     case MULTDIV:
6050       multdiv_assemble(0,&regs[0]);break;
6051     case MOV:
6052       mov_assemble(0,&regs[0]);break;
6053     case SYSCALL:
6054     case HLECALL:
6055     case INTCALL:
6056     case SPAN:
6057     case UJUMP:
6058     case RJUMP:
6059     case CJUMP:
6060     case SJUMP:
6061     case FJUMP:
6062       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6063   }
6064   int btaddr=get_reg(regs[0].regmap,BTREG);
6065   if(btaddr<0) {
6066     btaddr=get_reg(regs[0].regmap,-1);
6067     emit_readword((int)&branch_target,btaddr);
6068   }
6069   assert(btaddr!=HOST_CCREG);
6070   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6071 #ifdef HOST_IMM8
6072   emit_movimm(start+4,HOST_TEMPREG);
6073   emit_cmp(btaddr,HOST_TEMPREG);
6074 #else
6075   emit_cmpimm(btaddr,start+4);
6076 #endif
6077   void *branch = out;
6078   emit_jeq(0);
6079   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6080   emit_jmp(jump_vaddr_reg[btaddr]);
6081   set_jump_target(branch, out);
6082   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6083   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6084 }
6085
6086 // Basic liveness analysis for MIPS registers
6087 void unneeded_registers(int istart,int iend,int r)
6088 {
6089   int i;
6090   uint64_t u,uu,gte_u,b,bu,gte_bu;
6091   uint64_t temp_u,temp_uu,temp_gte_u=0;
6092   uint64_t tdep;
6093   uint64_t gte_u_unknown=0;
6094   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6095     gte_u_unknown=~0ll;
6096   if(iend==slen-1) {
6097     u=1;uu=1;
6098     gte_u=gte_u_unknown;
6099   }else{
6100     u=unneeded_reg[iend+1];
6101     uu=unneeded_reg_upper[iend+1];
6102     u=1;uu=1;
6103     gte_u=gte_unneeded[iend+1];
6104   }
6105
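  // Scan the block backwards, propagating which registers are unneeded across branches.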
6106   for (i=iend;i>=istart;i--)
6107   {
6108     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6109     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6110     {
6111       // If subroutine call, flag return address as a possible branch target
6112       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6113
6114       if(ba[i]<start || ba[i]>=(start+slen*4))
6115       {
6116         // Branch out of this block, flush all regs
6117         u=1;
6118         uu=1;
6119         gte_u=gte_u_unknown;
6120         /* Hexagon hack
6121         if(itype[i]==UJUMP&&rt1[i]==31)
6122         {
6123           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6124         }
6125         if(itype[i]==RJUMP&&rs1[i]==31)
6126         {
6127           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6128         }
6129         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6130           if(itype[i]==UJUMP&&rt1[i]==31)
6131           {
6132             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6133             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6134           }
6135           if(itype[i]==RJUMP&&rs1[i]==31)
6136           {
6137             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6138             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6139           }
6140         }*/
6141         branch_unneeded_reg[i]=u;
6142         branch_unneeded_reg_upper[i]=uu;
6143         // Merge in delay slot
6144         tdep=(~uu>>rt1[i+1])&1;
6145         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6146         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6147         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6148         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6149         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6150         u|=1;uu|=1;
6151         gte_u|=gte_rt[i+1];
6152         gte_u&=~gte_rs[i+1];
6153         // If branch is "likely" (and conditional)
6154         // then we skip the delay slot on the fall-thru path
6155         if(likely[i]) {
6156           if(i<slen-1) {
6157             u&=unneeded_reg[i+2];
6158             uu&=unneeded_reg_upper[i+2];
6159             gte_u&=gte_unneeded[i+2];
6160           }
6161           else
6162           {
6163             u=1;
6164             uu=1;
6165             gte_u=gte_u_unknown;
6166           }
6167         }
6168       }
6169       else
6170       {
6171         // Internal branch, flag target
6172         bt[(ba[i]-start)>>2]=1;
6173         if(ba[i]<=start+i*4) {
6174           // Backward branch
6175           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6176           {
6177             // Unconditional branch
6178             temp_u=1;temp_uu=1;
6179             temp_gte_u=0;
6180           } else {
6181             // Conditional branch (not taken case)
6182             temp_u=unneeded_reg[i+2];
6183             temp_uu=unneeded_reg_upper[i+2];
6184             temp_gte_u&=gte_unneeded[i+2];
6185           }
6186           // Merge in delay slot
6187           tdep=(~temp_uu>>rt1[i+1])&1;
6188           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6189           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6190           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6191           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6192           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6193           temp_u|=1;temp_uu|=1;
6194           temp_gte_u|=gte_rt[i+1];
6195           temp_gte_u&=~gte_rs[i+1];
6196           // If branch is "likely" (and conditional)
6197           // then we skip the delay slot on the fall-thru path
6198           if(likely[i]) {
6199             if(i<slen-1) {
6200               temp_u&=unneeded_reg[i+2];
6201               temp_uu&=unneeded_reg_upper[i+2];
6202               temp_gte_u&=gte_unneeded[i+2];
6203             }
6204             else
6205             {
6206               temp_u=1;
6207               temp_uu=1;
6208               temp_gte_u=gte_u_unknown;
6209             }
6210           }
6211           tdep=(~temp_uu>>rt1[i])&1;
6212           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6213           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6214           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6215           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6216           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6217           temp_u|=1;temp_uu|=1;
6218           temp_gte_u|=gte_rt[i];
6219           temp_gte_u&=~gte_rs[i];
6220           unneeded_reg[i]=temp_u;
6221           unneeded_reg_upper[i]=temp_uu;
6222           gte_unneeded[i]=temp_gte_u;
6223           // Only go three levels deep.  This recursion can take an
6224           // excessive amount of time if there are a lot of nested loops.
6225           if(r<2) {
6226             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6227           }else{
6228             unneeded_reg[(ba[i]-start)>>2]=1;
6229             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6230             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6231           }
6232         } /*else*/ if(1) {
6233           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6234           {
6235             // Unconditional branch
6236             u=unneeded_reg[(ba[i]-start)>>2];
6237             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6238             gte_u=gte_unneeded[(ba[i]-start)>>2];
6239             branch_unneeded_reg[i]=u;
6240             branch_unneeded_reg_upper[i]=uu;
6241         //u=1;
6242         //uu=1;
6243         //branch_unneeded_reg[i]=u;
6244         //branch_unneeded_reg_upper[i]=uu;
6245             // Merge in delay slot
6246             tdep=(~uu>>rt1[i+1])&1;
6247             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6248             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6249             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6250             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6251             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6252             u|=1;uu|=1;
6253             gte_u|=gte_rt[i+1];
6254             gte_u&=~gte_rs[i+1];
6255           } else {
6256             // Conditional branch
6257             b=unneeded_reg[(ba[i]-start)>>2];
6258             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6259             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6260             branch_unneeded_reg[i]=b;
6261             branch_unneeded_reg_upper[i]=bu;
6262         //b=1;
6263         //bu=1;
6264         //branch_unneeded_reg[i]=b;
6265         //branch_unneeded_reg_upper[i]=bu;
6266             // Branch delay slot
6267             tdep=(~uu>>rt1[i+1])&1;
6268             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6269             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6270             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6271             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6272             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6273             b|=1;bu|=1;
6274             gte_bu|=gte_rt[i+1];
6275             gte_bu&=~gte_rs[i+1];
6276             // If branch is "likely" then we skip the
6277             // delay slot on the fall-thru path
6278             if(likely[i]) {
6279               u=b;
6280               uu=bu;
6281               gte_u=gte_bu;
6282               if(i<slen-1) {
6283                 u&=unneeded_reg[i+2];
6284                 uu&=unneeded_reg_upper[i+2];
6285                 gte_u&=gte_unneeded[i+2];
6286         //u=1;
6287         //uu=1;
6288               }
6289             } else {
6290               u&=b;
6291               uu&=bu;
6292               gte_u&=gte_bu;
6293         //u=1;
6294         //uu=1;
6295             }
6296             if(i<slen-1) {
6297               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6298               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6299         //branch_unneeded_reg[i]=1;
6300         //branch_unneeded_reg_upper[i]=1;
6301             } else {
6302               branch_unneeded_reg[i]=1;
6303               branch_unneeded_reg_upper[i]=1;
6304             }
6305           }
6306         }
6307       }
6308     }
6309     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6310     {
6311       // SYSCALL instruction (software interrupt)
6312       u=1;
6313       uu=1;
6314     }
6315     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6316     {
6317       // ERET instruction (return from interrupt)
6318       u=1;
6319       uu=1;
6320     }
6321     //u=uu=1; // DEBUG
6322     tdep=(~uu>>rt1[i])&1;
6323     // Written registers are unneeded
6324     u|=1LL<<rt1[i];
6325     u|=1LL<<rt2[i];
6326     uu|=1LL<<rt1[i];
6327     uu|=1LL<<rt2[i];
6328     gte_u|=gte_rt[i];
6329     // Accessed registers are needed
6330     u&=~(1LL<<rs1[i]);
6331     u&=~(1LL<<rs2[i]);
6332     uu&=~(1LL<<us1[i]);
6333     uu&=~(1LL<<us2[i]);
6334     gte_u&=~gte_rs[i];
6335     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
6336       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
6337     // Source-target dependencies
6338     uu&=~(tdep<<dep1[i]);
6339     uu&=~(tdep<<dep2[i]);
6340     // R0 is always unneeded
6341     u|=1;uu|=1;
6342     // Save it
6343     unneeded_reg[i]=u;
6344     unneeded_reg_upper[i]=uu;
6345     gte_unneeded[i]=gte_u;
6346     /*
6347     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6348     printf("U:");
6349     int r;
6350     for(r=1;r<=CCREG;r++) {
6351       if((unneeded_reg[i]>>r)&1) {
6352         if(r==HIREG) printf(" HI");
6353         else if(r==LOREG) printf(" LO");
6354         else printf(" r%d",r);
6355       }
6356     }
6357     printf(" UU:");
6358     for(r=1;r<=CCREG;r++) {
6359       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6360         if(r==HIREG) printf(" HI");
6361         else if(r==LOREG) printf(" LO");
6362         else printf(" r%d",r);
6363       }
6364     }
6365     printf("\n");*/
6366   }
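  // The PSX R3000A has no 64-bit registers, so the upper halves are never needed.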
6367   for (i=iend;i>=istart;i--)
6368   {
6369     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6370   }
6371 }
6372
6373 // Write back dirty registers as soon as we will no longer modify them,
6374 // so that we don't end up with lots of writes at the branches.
6375 void clean_registers(int istart,int iend,int wr)
6376 {
6377   int i;
6378   int r;
6379   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6380   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6381   if(iend==slen-1) {
6382     will_dirty_i=will_dirty_next=0;
6383     wont_dirty_i=wont_dirty_next=0;
6384   }else{
6385     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6386     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6387   }
6388   for (i=iend;i>=istart;i--)
6389   {
6390     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6391     {
6392       if(ba[i]<start || ba[i]>=(start+slen*4))
6393       {
6394         // Branch out of this block, flush all regs
6395         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6396         {
6397           // Unconditional branch
6398           will_dirty_i=0;
6399           wont_dirty_i=0;
6400           // Merge in delay slot (will dirty)
6401           for(r=0;r<HOST_REGS;r++) {
6402             if(r!=EXCLUDE_REG) {
6403               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6404               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6405               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6406               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6407               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6408               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6409               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6410               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6411               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6412               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6413               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6414               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6415               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6416               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6417             }
6418           }
6419         }
6420         else
6421         {
6422           // Conditional branch
6423           will_dirty_i=0;
6424           wont_dirty_i=wont_dirty_next;
6425           // Merge in delay slot (will dirty)
6426           for(r=0;r<HOST_REGS;r++) {
6427             if(r!=EXCLUDE_REG) {
6428               if(!likely[i]) {
6429                 // Might not dirty if likely branch is not taken
6430                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6431                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6432                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6433                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6434                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6435                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6436                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6437                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6438                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6439                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6440                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6441                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6442                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6443                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6444               }
6445             }
6446           }
6447         }
6448         // Merge in delay slot (wont dirty)
6449         for(r=0;r<HOST_REGS;r++) {
6450           if(r!=EXCLUDE_REG) {
6451             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6452             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6453             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6454             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6455             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6456             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6457             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6458             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6459             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6460             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6461           }
6462         }
6463         if(wr) {
6464           #ifndef DESTRUCTIVE_WRITEBACK
6465           branch_regs[i].dirty&=wont_dirty_i;
6466           #endif
6467           branch_regs[i].dirty|=will_dirty_i;
6468         }
6469       }
6470       else
6471       {
6472         // Internal branch
6473         if(ba[i]<=start+i*4) {
6474           // Backward branch
6475           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6476           {
6477             // Unconditional branch
6478             temp_will_dirty=0;
6479             temp_wont_dirty=0;
6480             // Merge in delay slot (will dirty)
6481             for(r=0;r<HOST_REGS;r++) {
6482               if(r!=EXCLUDE_REG) {
6483                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6484                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6485                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6486                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6487                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6488                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6489                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6490                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6491                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6492                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6493                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6494                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6495                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6496                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6497               }
6498             }
6499           } else {
6500             // Conditional branch (not taken case)
6501             temp_will_dirty=will_dirty_next;
6502             temp_wont_dirty=wont_dirty_next;
6503             // Merge in delay slot (will dirty)
6504             for(r=0;r<HOST_REGS;r++) {
6505               if(r!=EXCLUDE_REG) {
6506                 if(!likely[i]) {
6507                   // Will not dirty if likely branch is not taken
6508                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6509                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6510                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6511                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6512                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6513                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6514                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6515                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6516                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6517                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6518                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6519                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6520                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6521                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6522                 }
6523               }
6524             }
6525           }
6526           // Merge in delay slot (wont dirty)
6527           for(r=0;r<HOST_REGS;r++) {
6528             if(r!=EXCLUDE_REG) {
6529               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6530               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6531               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6532               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6533               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6534               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6535               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6536               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6537               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6538               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6539             }
6540           }
6541           // Deal with changed mappings
6542           if(i<iend) {
6543             for(r=0;r<HOST_REGS;r++) {
6544               if(r!=EXCLUDE_REG) {
6545                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6546                   temp_will_dirty&=~(1<<r);
6547                   temp_wont_dirty&=~(1<<r);
6548                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6549                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6550                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6551                   } else {
6552                     temp_will_dirty|=1<<r;
6553                     temp_wont_dirty|=1<<r;
6554                   }
6555                 }
6556               }
6557             }
6558           }
6559           if(wr) {
6560             will_dirty[i]=temp_will_dirty;
6561             wont_dirty[i]=temp_wont_dirty;
6562             clean_registers((ba[i]-start)>>2,i-1,0);
6563           }else{
6564             // Limit recursion.  It can take an excessive amount
6565             // of time if there are a lot of nested loops.
6566             will_dirty[(ba[i]-start)>>2]=0;
6567             wont_dirty[(ba[i]-start)>>2]=-1;
6568           }
6569         }
6570         /*else*/ if(1)
6571         {
6572           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6573           {
6574             // Unconditional branch
6575             will_dirty_i=0;
6576             wont_dirty_i=0;
6577           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6578             for(r=0;r<HOST_REGS;r++) {
6579               if(r!=EXCLUDE_REG) {
6580                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6581                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6582                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6583                 }
6584                 if(branch_regs[i].regmap[r]>=0) {
6585                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6586                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6587                 }
6588               }
6589             }
6590           //}
6591             // Merge in delay slot
6592             for(r=0;r<HOST_REGS;r++) {
6593               if(r!=EXCLUDE_REG) {
6594                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6595                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6596                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6597                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6598                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6599                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6600                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6601                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6602                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6603                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6604                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6605                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6606                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6607                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6608               }
6609             }
6610           } else {
6611             // Conditional branch
6612             will_dirty_i=will_dirty_next;
6613             wont_dirty_i=wont_dirty_next;
6614           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6615             for(r=0;r<HOST_REGS;r++) {
6616               if(r!=EXCLUDE_REG) {
6617                 signed char target_reg=branch_regs[i].regmap[r];
6618                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6619                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6620                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6621                 }
6622                 else if(target_reg>=0) {
6623                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6624                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6625                 }
6626                 // Treat delay slot as part of branch too
6627                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6628                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6629                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6630                 }
6631                 else
6632                 {
6633                   will_dirty[i+1]&=~(1<<r);
6634                 }*/
6635               }
6636             }
6637           //}
6638             // Merge in delay slot
6639             for(r=0;r<HOST_REGS;r++) {
6640               if(r!=EXCLUDE_REG) {
6641                 if(!likely[i]) {
6642                   // Might not dirty if likely branch is not taken
6643                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6644                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6645                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6646                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6647                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6648                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6649                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6650                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6651                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6652                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6653                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6654                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6655                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6656                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6657                 }
6658               }
6659             }
6660           }
6661           // Merge in delay slot (won't dirty)
6662           for(r=0;r<HOST_REGS;r++) {
6663             if(r!=EXCLUDE_REG) {
6664               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6665               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6666               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6667               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6668               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6669               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6670               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6671               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6672               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6673               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6674             }
6675           }
6676           if(wr) {
6677             #ifndef DESTRUCTIVE_WRITEBACK
6678             branch_regs[i].dirty&=wont_dirty_i;
6679             #endif
6680             branch_regs[i].dirty|=will_dirty_i;
6681           }
6682         }
6683       }
6684     }
6685     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6686     {
6687       // SYSCALL instruction (software interrupt)
6688       will_dirty_i=0;
6689       wont_dirty_i=0;
6690     }
6691     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6692     {
6693       // ERET instruction (return from interrupt)
6694       will_dirty_i=0;
6695       wont_dirty_i=0;
6696     }
6697     will_dirty_next=will_dirty_i;
6698     wont_dirty_next=wont_dirty_i;
6699     for(r=0;r<HOST_REGS;r++) {
6700       if(r!=EXCLUDE_REG) {
6701         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6702         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6703         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6704         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6705         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6706         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6707         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6708         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6709         if(i>istart) {
6710           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
6711           {
6712             // Don't store a register immediately after writing it,
6713             // as it may prevent dual-issue.
6714             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6715             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6716           }
6717         }
6718       }
6719     }
6720     // Save it
6721     will_dirty[i]=will_dirty_i;
6722     wont_dirty[i]=wont_dirty_i;
6723     // Mark registers that won't be dirtied as not dirty
6724     if(wr) {
6725       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6726       for(r=0;r<HOST_REGS;r++) {
6727         if((will_dirty_i>>r)&1) {
6728           printf(" r%d",r);
6729         }
6730       }
6731       printf("\n");*/
6732
6733       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
6734         regs[i].dirty|=will_dirty_i;
6735         #ifndef DESTRUCTIVE_WRITEBACK
6736         regs[i].dirty&=wont_dirty_i;
6737         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6738         {
6739           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
6740             for(r=0;r<HOST_REGS;r++) {
6741               if(r!=EXCLUDE_REG) {
6742                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6743                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6744                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6745               }
6746             }
6747           }
6748         }
6749         else
6750         {
6751           if(i<iend) {
6752             for(r=0;r<HOST_REGS;r++) {
6753               if(r!=EXCLUDE_REG) {
6754                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6755                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6756                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6757               }
6758             }
6759           }
6760         }
6761         #endif
6762       //}
6763     }
6764     // Deal with changed mappings
6765     temp_will_dirty=will_dirty_i;
6766     temp_wont_dirty=wont_dirty_i;
6767     for(r=0;r<HOST_REGS;r++) {
6768       if(r!=EXCLUDE_REG) {
6769         int nr;
6770         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6771           if(wr) {
6772             #ifndef DESTRUCTIVE_WRITEBACK
6773             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6774             #endif
6775             regs[i].wasdirty|=will_dirty_i&(1<<r);
6776           }
6777         }
6778         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6779           // Register moved to a different register
6780           will_dirty_i&=~(1<<r);
6781           wont_dirty_i&=~(1<<r);
6782           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6783           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6784           if(wr) {
6785             #ifndef DESTRUCTIVE_WRITEBACK
6786             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6787             #endif
6788             regs[i].wasdirty|=will_dirty_i&(1<<r);
6789           }
6790         }
6791         else {
6792           will_dirty_i&=~(1<<r);
6793           wont_dirty_i&=~(1<<r);
6794           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6795             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6796             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6797           } else {
6798             wont_dirty_i|=1<<r;
6799             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
6800           }
6801         }
6802       }
6803     }
6804   }
6805 }
6806
6807 #ifdef DISASM
6808   /* disassembly */
6809 void disassemble_inst(int i)
6810 {
6811     if (bt[i]) printf("*"); else printf(" ");
6812     switch(itype[i]) {
6813       case UJUMP:
6814         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6815       case CJUMP:
6816         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
6817       case SJUMP:
6818         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
6819       case FJUMP:
6820         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6821       case RJUMP:
6822         if (opcode[i]==0x9&&rt1[i]!=31)
6823           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
6824         else
6825           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6826         break;
6827       case SPAN:
6828         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
6829       case IMM16:
6830         if(opcode[i]==0xf) //LUI
6831           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
6832         else
6833           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6834         break;
6835       case LOAD:
6836       case LOADLR:
6837         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6838         break;
6839       case STORE:
6840       case STORELR:
6841         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
6842         break;
6843       case ALU:
6844       case SHIFT:
6845         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
6846         break;
6847       case MULTDIV:
6848         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
6849         break;
6850       case SHIFTIMM:
6851         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6852         break;
6853       case MOV:
6854         if((opcode2[i]&0x1d)==0x10)
6855           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
6856         else if((opcode2[i]&0x1d)==0x11)
6857           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6858         else
6859           printf (" %x: %s\n",start+i*4,insn[i]);
6860         break;
6861       case COP0:
6862         if(opcode2[i]==0)
6863           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
6864         else if(opcode2[i]==4)
6865           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
6866         else printf (" %x: %s\n",start+i*4,insn[i]);
6867         break;
6868       case COP1:
6869         if(opcode2[i]<3)
6870           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
6871         else if(opcode2[i]>3)
6872           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
6873         else printf (" %x: %s\n",start+i*4,insn[i]);
6874         break;
6875       case COP2:
6876         if(opcode2[i]<3)
6877           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
6878         else if(opcode2[i]>3)
6879           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
6880         else printf (" %x: %s\n",start+i*4,insn[i]);
6881         break;
6882       case C1LS:
6883         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
6884         break;
6885       case C2LS:
6886         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
6887         break;
6888       case INTCALL:
6889         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
6890         break;
6891       default:
6892         //printf (" %s %8x\n",insn[i],source[i]);
6893         printf (" %x: %s\n",start+i*4,insn[i]);
6894     }
6895 }
6896 #else
6897 static void disassemble_inst(int i) {}
6898 #endif // DISASM
6899
6900 #define DRC_TEST_VAL 0x74657374  // "test" in ASCII
6901
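// Self-test: emit a tiny block that loads DRC_TEST_VAL and returns,
// then call it to check that the translation cache is really executable.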
6902 static int new_dynarec_test(void)
6903 {
6904   int (*testfunc)(void) = (void *)out;
6905   void *beginning;
6906   int ret;
6907
6908   beginning = start_block();
6909   emit_movimm(DRC_TEST_VAL,0); // test
6910   emit_jmpreg(14);
6911   literal_pool(0);
6912   end_block(beginning);
6913   SysPrintf("testing if we can run recompiled code..\n");
6914   ret = testfunc();
6915   if (ret == DRC_TEST_VAL)
6916     SysPrintf("test passed.\n");
6917   else
6918     SysPrintf("test failed: %08x\n", ret);
6919   out=(u_char *)BASE_ADDR;
6920   return ret == DRC_TEST_VAL;
6921 }
6922
6923 // clear the state completely, instead of just marking
6924 // things invalid like invalidate_all_pages() does
6925 void new_dynarec_clear_full()
6926 {
6927   int n;
6928   out=(u_char *)BASE_ADDR;
6929   memset(invalid_code,1,sizeof(invalid_code));
6930   memset(hash_table,0xff,sizeof(hash_table));
6931   memset(mini_ht,-1,sizeof(mini_ht));
6932   memset(restore_candidate,0,sizeof(restore_candidate));
6933   memset(shadow,0,sizeof(shadow));
6934   copy=shadow;
6935   expirep=16384; // Expiry pointer, +2 blocks
6936   pending_exception=0;
6937   literalcount=0;
6938   stop_after_jal=0;
6939   inv_code_start=inv_code_end=~0;
6940   // clear block lookup tables (jump_in/jump_out/jump_dirty)
6941   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6942   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6943   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6944 }
6945
6946 void new_dynarec_init()
6947 {
6948   SysPrintf("Init new dynarec\n");
6949
6950   // allocate/prepare a buffer for translation cache
6951   // see assem_arm.h for some explanation
6952 #if   defined(BASE_ADDR_FIXED)
6953   if (mmap (translation_cache, 1 << TARGET_SIZE_2,
6954             PROT_READ | PROT_WRITE | PROT_EXEC,
6955             MAP_PRIVATE | MAP_ANONYMOUS,
6956             -1, 0) != translation_cache) {
6957     SysPrintf("mmap() failed: %s\n", strerror(errno));
6958     SysPrintf("disable BASE_ADDR_FIXED and recompile\n");
6959     abort();
6960   }
6961 #elif defined(BASE_ADDR_DYNAMIC)
6962   #ifdef VITA
6963   sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
6964   if (sceBlock < 0)
6965     SysPrintf("sceKernelAllocMemBlockForVM failed\n");
6966   int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache);
6967   if (ret < 0)
6968     SysPrintf("sceKernelGetMemBlockBase failed\n");
6969   #else
6970   translation_cache = mmap (NULL, 1 << TARGET_SIZE_2,
6971             PROT_READ | PROT_WRITE | PROT_EXEC,
6972             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
6973   if (translation_cache == MAP_FAILED) {
6974     SysPrintf("mmap() failed: %s\n", strerror(errno));
6975     abort();
6976   }
6977   #endif
6978 #else
6979   #ifndef NO_WRITE_EXEC
6980   // not all systems allow execute in data segment by default
6981   if (mprotect((void *)BASE_ADDR, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
6982     SysPrintf("mprotect() failed: %s\n", strerror(errno));
6983   #endif
6984 #endif
6985   out=(u_char *)BASE_ADDR;
6986   cycle_multiplier=200;
6987   new_dynarec_clear_full();
6988 #ifdef HOST_IMM8
6989   // Copy this into local area so we don't have to put it in every literal pool
6990   invc_ptr=invalid_code;
6991 #endif
6992   arch_init();
6993   new_dynarec_test();
6994 #ifndef RAM_FIXED
6995   ram_offset=(u_int)rdram-0x80000000;
6996 #endif
6997   if (ram_offset!=0)
6998     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
6999 }
7000
7001 void new_dynarec_cleanup()
7002 {
7003   int n;
7004 #if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC)
7005   #ifdef VITA
7006   sceKernelFreeMemBlock(sceBlock);
7007   sceBlock = -1;
7008   #else
7009   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0)
7010     SysPrintf("munmap() failed\n");
7011   #endif
7012 #endif
7013   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7014   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7015   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7016   #ifdef ROM_COPY
7017   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
7018   #endif
7019 }
7020
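// Map a guest code address to a host pointer into RAM or BIOS, setting
// *limit to the end of the mapped region; returns NULL if unmapped.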
7021 static u_int *get_source_start(u_int addr, u_int *limit)
7022 {
7023   if (addr < 0x00200000 ||
7024     (0xa0000000 <= addr && addr < 0xa0200000)) {
7025     // used for BIOS calls mostly?
7026     *limit = (addr&0xa0000000)|0x00200000;
7027     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7028   }
7029   else if (!Config.HLE && (
7030     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7031     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7032     // BIOS
7033     *limit = (addr & 0xfff00000) | 0x80000;
7034     return (u_int *)((u_int)psxR + (addr&0x7ffff));
7035   }
7036   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
7037     *limit = (addr & 0x80600000) + 0x00200000;
7038     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7039   }
7040   return NULL;
7041 }
7042
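// Scan forward (at most 0x1000 bytes) for a 'jr $ra' and return the address
// just past its delay slot, or the original address if none is found.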
7043 static u_int scan_for_ret(u_int addr)
7044 {
7045   u_int limit = 0;
7046   u_int *mem;
7047
7048   mem = get_source_start(addr, &limit);
7049   if (mem == NULL)
7050     return addr;
7051
7052   if (limit > addr + 0x1000)
7053     limit = addr + 0x1000;
7054   for (; addr < limit; addr += 4, mem++) {
7055     if (*mem == 0x03e00008) // jr $ra
7056       return addr + 8;
7057   }
7058   return addr;
7059 }
7060
7061 struct savestate_block {
7062   uint32_t addr;
7063   uint32_t regflags;
7064 };
7065
7066 static int addr_cmp(const void *p1_, const void *p2_)
7067 {
7068   const struct savestate_block *p1 = p1_, *p2 = p2_;
7069   return p1->addr - p2->addr;
7070 }
7071
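// Record the entry addresses and speculated register flags of compiled
// blocks for a savestate; returns the number of bytes written to 'save'.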
7072 int new_dynarec_save_blocks(void *save, int size)
7073 {
7074   struct savestate_block *blocks = save;
7075   int maxcount = size / sizeof(blocks[0]);
7076   struct savestate_block tmp_blocks[1024];
7077   struct ll_entry *head;
7078   int p, s, d, o, bcnt;
7079   u_int addr;
7080
7081   o = 0;
7082   for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
7083     bcnt = 0;
7084     for (head = jump_in[p]; head != NULL; head = head->next) {
7085       tmp_blocks[bcnt].addr = head->vaddr;
7086       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
7087       bcnt++;
7088     }
7089     if (bcnt < 1)
7090       continue;
7091     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
7092
7093     addr = tmp_blocks[0].addr;
7094     for (s = d = 0; s < bcnt; s++) {
7095       if (tmp_blocks[s].addr < addr)
7096         continue;
7097       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
7098         tmp_blocks[d++] = tmp_blocks[s];
7099       addr = scan_for_ret(tmp_blocks[s].addr);
7100     }
7101
7102     if (o + d > maxcount)
7103       d = maxcount - o;
7104     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
7105     o += d;
7106   }
7107
7108   return o * sizeof(blocks[0]);
7109 }
7110
7111 void new_dynarec_load_blocks(const void *save, int size)
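// Precompile the blocks recorded in a savestate, temporarily faking GPR
// values so the same speculative assumptions are made as when it was saved.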
7112 {
7113   const struct savestate_block *blocks = save;
7114   int count = size / sizeof(blocks[0]);
7115   u_int regs_save[32];
7116   uint32_t f;
7117   int i, b;
7118
7119   get_addr(psxRegs.pc);
7120
7121   // change GPRs for speculation to at least partially work..
7122   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
7123   for (i = 1; i < 32; i++)
7124     psxRegs.GPR.r[i] = 0x80000000;
7125
7126   for (b = 0; b < count; b++) {
7127     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7128       if (f & 1)
7129         psxRegs.GPR.r[i] = 0x1f800000;
7130     }
7131
7132     get_addr(blocks[b].addr);
7133
7134     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7135       if (f & 1)
7136         psxRegs.GPR.r[i] = 0x80000000;
7137     }
7138   }
7139
7140   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
7141 }
7142
7143 int new_recompile_block(int addr)
7144 {
7145   u_int pagelimit = 0;
7146   u_int state_rflags = 0;
7147   int i;
7148
7149   assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out);
7150   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7151   //if(debug)
7152   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7153   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7154
7155   // this is just for speculation
7156   for (i = 1; i < 32; i++) {
7157     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
7158       state_rflags |= 1 << i;
7159   }
7160
7161   start = (u_int)addr&~3;
7162   //assert(((u_int)addr&1)==0);
7163   new_dynarec_did_compile=1;
7164   if (Config.HLE && start == 0x80001000) // hlecall
7165   {
7166     // XXX: is this enough? Maybe check hleSoftCall?
7167     void *beginning=start_block();
7168     u_int page=get_page(start);
7169
7170     invalid_code[start>>12]=0;
7171     emit_movimm(start,0);
7172     emit_writeword(0,(int)&pcaddr);
7173     emit_jmp(new_dyna_leave);
7174     literal_pool(0);
7175     end_block(beginning);
7176     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
7177     return 0;
7178   }
7179
7180   source = get_source_start(start, &pagelimit);
7181   if (source == NULL) {
7182     SysPrintf("Compile at bogus memory address: %08x\n", addr);
7183     exit(1);
7184   }
7185
7186   /* Pass 1: disassemble */
7187   /* Pass 2: register dependencies, branch targets */
7188   /* Pass 3: register allocation */
7189   /* Pass 4: branch dependencies */
7190   /* Pass 5: pre-alloc */
7191   /* Pass 6: optimize clean/dirty state */
7192   /* Pass 7: flag 32-bit registers */
7193   /* Pass 8: assembly */
7194   /* Pass 9: linker */
7195   /* Pass 10: garbage collection / free memory */
7196
7197   int j;
7198   int done=0;
7199   unsigned int type,op,op2;
7200
7201   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7202
7203   /* Pass 1 disassembly */
7204
7205   for(i=0;!done;i++) {
7206     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7207     minimum_free_regs[i]=0;
7208     opcode[i]=op=source[i]>>26;
7209     switch(op)
7210     {
7211       case 0x00: strcpy(insn[i],"special"); type=NI;
7212         op2=source[i]&0x3f;
7213         switch(op2)
7214         {
7215           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7216           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7217           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7218           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7219           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7220           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7221           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7222           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7223           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7224           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7225           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7226           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7227           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7228           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7229           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7230           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7231           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7232           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7233           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7234           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7235           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7236           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7237           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7238           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7239           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7240           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7241           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7242           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7243           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7244           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7245           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7246           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7247           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7248           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7249           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7250 #if 0
7251           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7252           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7253           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7254           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7255           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7256           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7257           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7258           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7259           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7260           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7261           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7262           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7263           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7264           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7265           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7266           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7267           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7268 #endif
7269         }
7270         break;
7271       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7272         op2=(source[i]>>16)&0x1f;
7273         switch(op2)
7274         {
7275           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7276           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7277           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7278           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7279           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7280           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7281           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7282           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7283           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7284           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7285           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7286           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7287           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7288           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7289         }
7290         break;
7291       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7292       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7293       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7294       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7295       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7296       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7297       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7298       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7299       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7300       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7301       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7302       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7303       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7304       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7305       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7306         op2=(source[i]>>21)&0x1f;
7307         switch(op2)
7308         {
7309           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7310           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7311           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7312           switch(source[i]&0x3f)
7313           {
7314             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7315             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7316             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7317             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7318             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7319             //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7320           }
7321         }
7322         break;
7323       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7324         op2=(source[i]>>21)&0x1f;
7325         switch(op2)
7326         {
7327           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7328           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7329           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7330           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7331           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7332           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7333           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7334           switch((source[i]>>16)&0x3)
7335           {
7336             case 0x00: strcpy(insn[i],"BC1F"); break;
7337             case 0x01: strcpy(insn[i],"BC1T"); break;
7338             case 0x02: strcpy(insn[i],"BC1FL"); break;
7339             case 0x03: strcpy(insn[i],"BC1TL"); break;
7340           }
7341           break;
7342           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7343           switch(source[i]&0x3f)
7344           {
7345             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7346             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7347             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7348             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7349             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7350             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7351             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7352             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7353             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7354             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7355             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7356             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7357             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7358             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7359             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7360             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7361             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7362             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7363             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7364             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7365             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7366             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7367             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7368             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7369             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7370             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7371             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7372             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7373             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7374             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7375             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7376             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7377             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7378             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7379             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7380           }
7381           break;
7382           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7383           switch(source[i]&0x3f)
7384           {
7385             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7386             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7387             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7388             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7389             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7390             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7391             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7392             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7393             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7394             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7395             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7396             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7397             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7398             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7399             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7400             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7401             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7402             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7403             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7404             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7405             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7406             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7407             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7408             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7409             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7410             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7411             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7412             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7413             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7414             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7415             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7416             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7417             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7418             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7419             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7420           }
7421           break;
7422           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7423           switch(source[i]&0x3f)
7424           {
7425             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7426             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7427           }
7428           break;
7429           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7430           switch(source[i]&0x3f)
7431           {
7432             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7433             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7434           }
7435           break;
7436         }
7437         break;
7438 #if 0
7439       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7440       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7441       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7442       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7443       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7444       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7445       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7446       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7447 #endif
7448       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7449       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7450       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7451       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7452       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7453       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7454       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7455 #if 0
7456       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7457 #endif
7458       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7459       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7460       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7461       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7462 #if 0
7463       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7464       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7465 #endif
7466       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7467       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7468       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7469       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7470 #if 0
7471       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7472       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7473       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7474 #endif
7475       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7476       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7477 #if 0
7478       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7479       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7480       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7481 #endif
7482       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7483         op2=(source[i]>>21)&0x1f;
7484         //if (op2 & 0x10) {
7485         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7486           if (gte_handlers[source[i]&0x3f]!=NULL) {
7487             if (gte_regnames[source[i]&0x3f]!=NULL)
7488               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7489             else
7490               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7491             type=C2OP;
7492           }
7493         }
7494         else switch(op2)
7495         {
7496           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7497           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7498           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7499           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7500         }
7501         break;
7502       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7503       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7504       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7505       default: strcpy(insn[i],"???"); type=NI;
7506         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7507         break;
7508     }
7509     itype[i]=type;
7510     opcode2[i]=op2;
7511     /* Get registers/immediates */
7512     lt1[i]=0;
7513     us1[i]=0;
7514     us2[i]=0;
7515     dep1[i]=0;
7516     dep2[i]=0;
7517     gte_rs[i]=gte_rt[i]=0;
7518     switch(type) {
7519       case LOAD:
7520         rs1[i]=(source[i]>>21)&0x1f;
7521         rs2[i]=0;
7522         rt1[i]=(source[i]>>16)&0x1f;
7523         rt2[i]=0;
7524         imm[i]=(short)source[i];
7525         break;
7526       case STORE:
7527       case STORELR:
7528         rs1[i]=(source[i]>>21)&0x1f;
7529         rs2[i]=(source[i]>>16)&0x1f;
7530         rt1[i]=0;
7531         rt2[i]=0;
7532         imm[i]=(short)source[i];
7533         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7534         break;
7535       case LOADLR:
7536         // LWL/LWR only load part of the register,
7537         // therefore the target register must be treated as a source too
7538         rs1[i]=(source[i]>>21)&0x1f;
7539         rs2[i]=(source[i]>>16)&0x1f;
7540         rt1[i]=(source[i]>>16)&0x1f;
7541         rt2[i]=0;
7542         imm[i]=(short)source[i];
7543         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7544         if(op==0x26) dep1[i]=rt1[i]; // LWR
7545         break;
7546       case IMM16:
7547         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7548         else rs1[i]=(source[i]>>21)&0x1f;
7549         rs2[i]=0;
7550         rt1[i]=(source[i]>>16)&0x1f;
7551         rt2[i]=0;
7552         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7553           imm[i]=(unsigned short)source[i];
7554         }else{
7555           imm[i]=(short)source[i];
7556         }
7557         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7558         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7559         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7560         break;
7561       case UJUMP:
7562         rs1[i]=0;
7563         rs2[i]=0;
7564         rt1[i]=0;
7565         rt2[i]=0;
7566         // The JAL instruction writes to r31.
7567         if (op&1) {
7568           rt1[i]=31;
7569         }
7570         rs2[i]=CCREG;
7571         break;
7572       case RJUMP:
7573         rs1[i]=(source[i]>>21)&0x1f;
7574         rs2[i]=0;
7575         rt1[i]=0;
7576         rt2[i]=0;
7577         // The JALR instruction writes to rd.
7578         if (op2&1) {
7579           rt1[i]=(source[i]>>11)&0x1f;
7580         }
7581         rs2[i]=CCREG;
7582         break;
7583       case CJUMP:
7584         rs1[i]=(source[i]>>21)&0x1f;
7585         rs2[i]=(source[i]>>16)&0x1f;
7586         rt1[i]=0;
7587         rt2[i]=0;
7588         if(op&2) { // BGTZ/BLEZ
7589           rs2[i]=0;
7590         }
7591         us1[i]=rs1[i];
7592         us2[i]=rs2[i];
7593         likely[i]=op>>4;
7594         break;
7595       case SJUMP:
7596         rs1[i]=(source[i]>>21)&0x1f;
7597         rs2[i]=CCREG;
7598         rt1[i]=0;
7599         rt2[i]=0;
7600         us1[i]=rs1[i];
7601         if(op2&0x10) { // BxxAL
7602           rt1[i]=31;
7603           // NOTE: If the branch is not taken, r31 is still overwritten
7604         }
7605         likely[i]=(op2&2)>>1;
7606         break;
7607       case FJUMP:
7608         rs1[i]=FSREG;
7609         rs2[i]=CSREG;
7610         rt1[i]=0;
7611         rt2[i]=0;
7612         likely[i]=((source[i])>>17)&1;
7613         break;
7614       case ALU:
7615         rs1[i]=(source[i]>>21)&0x1f; // source
7616         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7617         rt1[i]=(source[i]>>11)&0x1f; // destination
7618         rt2[i]=0;
7619         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7620           us1[i]=rs1[i];us2[i]=rs2[i];
7621         }
7622         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7623           dep1[i]=rs1[i];dep2[i]=rs2[i];
7624         }
7625         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7626           dep1[i]=rs1[i];dep2[i]=rs2[i];
7627         }
7628         break;
7629       case MULTDIV:
7630         rs1[i]=(source[i]>>21)&0x1f; // source
7631         rs2[i]=(source[i]>>16)&0x1f; // divisor
7632         rt1[i]=HIREG;
7633         rt2[i]=LOREG;
7634         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7635           us1[i]=rs1[i];us2[i]=rs2[i];
7636         }
7637         break;
7638       case MOV:
7639         rs1[i]=0;
7640         rs2[i]=0;
7641         rt1[i]=0;
7642         rt2[i]=0;
7643         if(op2==0x10) rs1[i]=HIREG; // MFHI
7644         if(op2==0x11) rt1[i]=HIREG; // MTHI
7645         if(op2==0x12) rs1[i]=LOREG; // MFLO
7646         if(op2==0x13) rt1[i]=LOREG; // MTLO
7647         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7648         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7649         dep1[i]=rs1[i];
7650         break;
7651       case SHIFT:
7652         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7653         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7654         rt1[i]=(source[i]>>11)&0x1f; // destination
7655         rt2[i]=0;
7656         // DSLLV/DSRLV/DSRAV are 64-bit
7657         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7658         break;
7659       case SHIFTIMM:
7660         rs1[i]=(source[i]>>16)&0x1f;
7661         rs2[i]=0;
7662         rt1[i]=(source[i]>>11)&0x1f;
7663         rt2[i]=0;
7664         imm[i]=(source[i]>>6)&0x1f;
7665         // DSxx32 instructions
7666         if(op2>=0x3c) imm[i]|=0x20;
7667         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7668         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7669         break;
7670       case COP0:
7671         rs1[i]=0;
7672         rs2[i]=0;
7673         rt1[i]=0;
7674         rt2[i]=0;
7675         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7676         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7677         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7678         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7679         break;
7680       case COP1:
7681         rs1[i]=0;
7682         rs2[i]=0;
7683         rt1[i]=0;
7684         rt2[i]=0;
7685         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7686         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7687         if(op2==5) us1[i]=rs1[i]; // DMTC1
7688         rs2[i]=CSREG;
7689         break;
7690       case COP2:
7691         rs1[i]=0;
7692         rs2[i]=0;
7693         rt1[i]=0;
7694         rt2[i]=0;
7695         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7696         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7697         rs2[i]=CSREG;
7698         int gr=(source[i]>>11)&0x1F;
7699         switch(op2)
7700         {
7701           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7702           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7703           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7704           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7705         }
7706         break;
7707       case C1LS:
7708         rs1[i]=(source[i]>>21)&0x1F;
7709         rs2[i]=CSREG;
7710         rt1[i]=0;
7711         rt2[i]=0;
7712         imm[i]=(short)source[i];
7713         break;
7714       case C2LS:
7715         rs1[i]=(source[i]>>21)&0x1F;
7716         rs2[i]=0;
7717         rt1[i]=0;
7718         rt2[i]=0;
7719         imm[i]=(short)source[i];
7720         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7721         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7722         break;
7723       case C2OP:
7724         rs1[i]=0;
7725         rs2[i]=0;
7726         rt1[i]=0;
7727         rt2[i]=0;
7728         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7729         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7730         gte_rt[i]|=1ll<<63; // every op changes flags
7731         if((source[i]&0x3f)==GTE_MVMVA) {
7732           int v = (source[i] >> 15) & 3;
7733           gte_rs[i]&=~0xe3fll;
7734           if(v==3) gte_rs[i]|=0xe00ll;
7735           else gte_rs[i]|=3ll<<(v*2);
7736         }
7737         break;
7738       case FLOAT:
7739       case FCONV:
7740         rs1[i]=0;
7741         rs2[i]=CSREG;
7742         rt1[i]=0;
7743         rt2[i]=0;
7744         break;
7745       case FCOMP:
7746         rs1[i]=FSREG;
7747         rs2[i]=CSREG;
7748         rt1[i]=FSREG;
7749         rt2[i]=0;
7750         break;
7751       case SYSCALL:
7752       case HLECALL:
7753       case INTCALL:
7754         rs1[i]=CCREG;
7755         rs2[i]=0;
7756         rt1[i]=0;
7757         rt2[i]=0;
7758         break;
7759       default:
7760         rs1[i]=0;
7761         rs2[i]=0;
7762         rt1[i]=0;
7763         rt2[i]=0;
7764     }
7765     /* Calculate branch target addresses */
7766     if(type==UJUMP)
7767       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7768     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7769       ba[i]=start+i*4+8; // Ignore never taken branch
7770     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7771       ba[i]=start+i*4+8; // Ignore never taken branch
7772     else if(type==CJUMP||type==SJUMP||type==FJUMP)
7773       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7774     else ba[i]=-1;
7775     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
7776       int do_in_intrp=0;
7777       // branch in delay slot?
7778       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7779         // not handled; turn the first branch into an interpreter call if it's hit
7780         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7781         do_in_intrp=1;
7782       }
7783       // basic load delay detection
7784       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7785         int t=(ba[i-1]-start)/4;
7786         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7787           // jump target wants DS result - potential load delay effect
7788           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7789           do_in_intrp=1;
7790           bt[t+1]=1; // expected return from interpreter
7791         }
7792         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7793               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7794           // v0 overwrite like this is a sign of trouble, bail out
7795           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7796           do_in_intrp=1;
7797         }
7798       }
7799       if(do_in_intrp) {
7800         rs1[i-1]=CCREG;
7801         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7802         ba[i-1]=-1;
7803         itype[i-1]=INTCALL;
7804         done=2;
7805         i--; // don't compile the DS
7806       }
7807     }
7808     /* Is this the end of the block? */
7809     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7810       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7811         done=2;
7812       }
7813       else {
7814         if(stop_after_jal) done=1;
7815         // Stop on BREAK
7816         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7817       }
7818       // Don't recompile stuff that's already compiled
7819       if(check_addr(start+i*4+4)) done=1;
7820       // Don't get too close to the limit
7821       if(i>MAXBLOCK/2) done=1;
7822     }
7823     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7824     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7825     if(done==2) {
7826       // Does the block continue due to a branch?
7827       for(j=i-1;j>=0;j--)
7828       {
7829         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7830         if(ba[j]==start+i*4+4) done=j=0;
7831         if(ba[j]==start+i*4+8) done=j=0;
7832       }
7833     }
7834     //assert(i<MAXBLOCK-1);
7835     if(start+i*4==pagelimit-4) done=1;
7836     assert(start+i*4<pagelimit);
7837     if (i==MAXBLOCK-1) done=1;
7838     // Stop if we're compiling junk
7839     if(itype[i]==NI&&opcode[i]==0x11) {
7840       done=stop_after_jal=1;
7841       SysPrintf("Disabled speculative precompilation\n");
7842     }
7843   }
7844   slen=i;
7845   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
7846     if(start+i*4==pagelimit) {
7847       itype[i-1]=SPAN;
7848     }
7849   }
7850   assert(slen>0);
7851
7852   /* Pass 2 - Register dependencies and branch targets */
7853
7854   unneeded_registers(0,slen-1,0);
7855
7856   /* Pass 3 - Register allocation */
7857
7858   struct regstat current; // Current register allocations/status
7859   current.is32=1;
7860   current.dirty=0;
7861   current.u=unneeded_reg[0];
7862   current.uu=unneeded_reg_upper[0];
7863   clear_all_regs(current.regmap);
7864   alloc_reg(&current,0,CCREG);
7865   dirty_reg(&current,CCREG);
7866   current.isconst=0;
7867   current.wasconst=0;
7868   current.waswritten=0;
7869   int ds=0;
7870   int cc=0;
7871   int hr=-1;
7872
7873   if((u_int)addr&1) {
7874     // First instruction is delay slot
7875     cc=-1;
7876     bt[1]=1;
7877     ds=1;
7878     unneeded_reg[0]=1;
7879     unneeded_reg_upper[0]=1;
7880     current.regmap[HOST_BTREG]=BTREG;
7881   }
7882
7883   for(i=0;i<slen;i++)
7884   {
7885     if(bt[i])
7886     {
7887       int hr;
7888       for(hr=0;hr<HOST_REGS;hr++)
7889       {
7890         // Is this really necessary?
7891         if(current.regmap[hr]==0) current.regmap[hr]=-1;
7892       }
7893       current.isconst=0;
7894       current.waswritten=0;
7895     }
7896     if(i>1)
7897     {
7898       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
7899       {
7900         if(rs1[i-2]==0||rs2[i-2]==0)
7901         {
7902           if(rs1[i-2]) {
7903             current.is32|=1LL<<rs1[i-2];
7904             int hr=get_reg(current.regmap,rs1[i-2]|64);
7905             if(hr>=0) current.regmap[hr]=-1;
7906           }
7907           if(rs2[i-2]) {
7908             current.is32|=1LL<<rs2[i-2];
7909             int hr=get_reg(current.regmap,rs2[i-2]|64);
7910             if(hr>=0) current.regmap[hr]=-1;
7911           }
7912         }
7913       }
7914     }
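         // PSX: treat every GPR as 32-bit.  This makes the is32/64-bit value
         // tracking inherited from the 64-bit MIPS dynarec (including the
         // BNE/BNEL special case above) effectively a no-op here.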
7915     current.is32=-1LL;
7916
7917     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
7918     regs[i].wasconst=current.isconst;
7919     regs[i].was32=current.is32;
7920     regs[i].wasdirty=current.dirty;
7921     regs[i].loadedconst=0;
7922     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
7923       if(i+1<slen) {
7924         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7925         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
7926         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
7927         current.u|=1;
7928         current.uu|=1;
7929       } else {
7930         current.u=1;
7931         current.uu=1;
7932       }
7933     } else {
7934       if(i+1<slen) {
7935         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
7936         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
7937         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
7938         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7939         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
7940         current.u|=1;
7941         current.uu|=1;
7942       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
7943     }
7944     is_ds[i]=ds;
7945     if(ds) {
7946       ds=0; // Skip delay slot, already allocated as part of branch
7947       // ...but we need to alloc it in case something jumps here
7948       if(i+1<slen) {
7949         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
7950         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
7951       }else{
7952         current.u=branch_unneeded_reg[i-1];
7953         current.uu=branch_unneeded_reg_upper[i-1];
7954       }
7955       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7956       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
7957       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
7958       current.u|=1;
7959       current.uu|=1;
7960       struct regstat temp;
7961       memcpy(&temp,&current,sizeof(current));
7962       temp.wasdirty=temp.dirty;
7963       temp.was32=temp.is32;
7964       // TODO: Take into account unconditional branches, as below
7965       delayslot_alloc(&temp,i);
7966       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
7967       regs[i].wasdirty=temp.wasdirty;
7968       regs[i].was32=temp.was32;
7969       regs[i].dirty=temp.dirty;
7970       regs[i].is32=temp.is32;
7971       regs[i].isconst=0;
7972       regs[i].wasconst=0;
7973       current.isconst=0;
7974       // Create entry (branch target) regmap
7975       for(hr=0;hr<HOST_REGS;hr++)
7976       {
7977         int r=temp.regmap[hr];
7978         if(r>=0) {
7979           if(r!=regmap_pre[i][hr]) {
7980             regs[i].regmap_entry[hr]=-1;
7981           }
7982           else
7983           {
7984             if(r<64){
7985               if((current.u>>r)&1) {
7986                 regs[i].regmap_entry[hr]=-1;
7987                 regs[i].regmap[hr]=-1;
7988                 // Don't clear regs in the delay slot as the branch might need them
7989                 //current.regmap[hr]=-1;
7990               }else
7991                 regs[i].regmap_entry[hr]=r;
7992             }
7993             else {
7994               if((current.uu>>(r&63))&1) {
7995                 regs[i].regmap_entry[hr]=-1;
7996                 regs[i].regmap[hr]=-1;
7997                 // Don't clear regs in the delay slot as the branch might need them
7998                 //current.regmap[hr]=-1;
7999               }else
8000                 regs[i].regmap_entry[hr]=r;
8001             }
8002           }
8003         } else {
8004           // First instruction expects CCREG to be allocated
8005           if(i==0&&hr==HOST_CCREG)
8006             regs[i].regmap_entry[hr]=CCREG;
8007           else
8008             regs[i].regmap_entry[hr]=-1;
8009         }
8010       }
8011     }
8012     else { // Not delay slot
8013       switch(itype[i]) {
8014         case UJUMP:
8015           //current.isconst=0; // DEBUG
8016           //current.wasconst=0; // DEBUG
8017           //regs[i].wasconst=0; // DEBUG
8018           clear_const(&current,rt1[i]);
8019           alloc_cc(&current,i);
8020           dirty_reg(&current,CCREG);
8021           if (rt1[i]==31) {
8022             alloc_reg(&current,i,31);
8023             dirty_reg(&current,31);
8024             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8025             //assert(rt1[i+1]!=rt1[i]);
8026             #ifdef REG_PREFETCH
8027             alloc_reg(&current,i,PTEMP);
8028             #endif
8029             //current.is32|=1LL<<rt1[i];
8030           }
8031           ooo[i]=1;
8032           delayslot_alloc(&current,i+1);
8033           //current.isconst=0; // DEBUG
8034           ds=1;
8035           //printf("i=%d, isconst=%x\n",i,current.isconst);
8036           break;
8037         case RJUMP:
8038           //current.isconst=0;
8039           //current.wasconst=0;
8040           //regs[i].wasconst=0;
8041           clear_const(&current,rs1[i]);
8042           clear_const(&current,rt1[i]);
8043           alloc_cc(&current,i);
8044           dirty_reg(&current,CCREG);
8045           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8046             alloc_reg(&current,i,rs1[i]);
8047             if (rt1[i]!=0) {
8048               alloc_reg(&current,i,rt1[i]);
8049               dirty_reg(&current,rt1[i]);
8050               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8051               assert(rt1[i+1]!=rt1[i]);
8052               #ifdef REG_PREFETCH
8053               alloc_reg(&current,i,PTEMP);
8054               #endif
8055             }
8056             #ifdef USE_MINI_HT
8057             if(rs1[i]==31) { // JALR
8058               alloc_reg(&current,i,RHASH);
8059               #ifndef HOST_IMM_ADDR32
8060               alloc_reg(&current,i,RHTBL);
8061               #endif
8062             }
8063             #endif
8064             delayslot_alloc(&current,i+1);
8065           } else {
8066             // The delay slot overwrites our source register, so allocate
8067             // a temporary register to hold the old value.
8068             current.isconst=0;
8069             current.wasconst=0;
8070             regs[i].wasconst=0;
8071             delayslot_alloc(&current,i+1);
8072             current.isconst=0;
8073             alloc_reg(&current,i,RTEMP);
8074           }
8075           //current.isconst=0; // DEBUG
8076           ooo[i]=1;
8077           ds=1;
8078           break;
8079         case CJUMP:
8080           //current.isconst=0;
8081           //current.wasconst=0;
8082           //regs[i].wasconst=0;
8083           clear_const(&current,rs1[i]);
8084           clear_const(&current,rs2[i]);
8085           if((opcode[i]&0x3E)==4) // BEQ/BNE
8086           {
8087             alloc_cc(&current,i);
8088             dirty_reg(&current,CCREG);
8089             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8090             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8091             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8092             {
8093               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8094               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8095             }
8096             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8097                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8098               // The delay slot overwrites one of our conditions.
8099               // Allocate the branch condition registers instead.
8100               current.isconst=0;
8101               current.wasconst=0;
8102               regs[i].wasconst=0;
8103               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8104               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8105               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8106               {
8107                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8108                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8109               }
8110             }
8111             else
8112             {
8113               ooo[i]=1;
8114               delayslot_alloc(&current,i+1);
8115             }
8116           }
8117           else
8118           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8119           {
8120             alloc_cc(&current,i);
8121             dirty_reg(&current,CCREG);
8122             alloc_reg(&current,i,rs1[i]);
8123             if(!(current.is32>>rs1[i]&1))
8124             {
8125               alloc_reg64(&current,i,rs1[i]);
8126             }
8127             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8128               // The delay slot overwrites one of our conditions.
8129               // Allocate the branch condition registers instead.
8130               current.isconst=0;
8131               current.wasconst=0;
8132               regs[i].wasconst=0;
8133               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8134               if(!((current.is32>>rs1[i])&1))
8135               {
8136                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8137               }
8138             }
8139             else
8140             {
8141               ooo[i]=1;
8142               delayslot_alloc(&current,i+1);
8143             }
8144           }
8145           else
8146           // Don't alloc the delay slot yet because we might not execute it
8147           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8148           {
8149             current.isconst=0;
8150             current.wasconst=0;
8151             regs[i].wasconst=0;
8152             alloc_cc(&current,i);
8153             dirty_reg(&current,CCREG);
8154             alloc_reg(&current,i,rs1[i]);
8155             alloc_reg(&current,i,rs2[i]);
8156             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8157             {
8158               alloc_reg64(&current,i,rs1[i]);
8159               alloc_reg64(&current,i,rs2[i]);
8160             }
8161           }
8162           else
8163           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8164           {
8165             current.isconst=0;
8166             current.wasconst=0;
8167             regs[i].wasconst=0;
8168             alloc_cc(&current,i);
8169             dirty_reg(&current,CCREG);
8170             alloc_reg(&current,i,rs1[i]);
8171             if(!(current.is32>>rs1[i]&1))
8172             {
8173               alloc_reg64(&current,i,rs1[i]);
8174             }
8175           }
8176           ds=1;
8177           //current.isconst=0;
8178           break;
8179         case SJUMP:
8180           //current.isconst=0;
8181           //current.wasconst=0;
8182           //regs[i].wasconst=0;
8183           clear_const(&current,rs1[i]);
8184           clear_const(&current,rt1[i]);
8185           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8186           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8187           {
8188             alloc_cc(&current,i);
8189             dirty_reg(&current,CCREG);
8190             alloc_reg(&current,i,rs1[i]);
8191             if(!(current.is32>>rs1[i]&1))
8192             {
8193               alloc_reg64(&current,i,rs1[i]);
8194             }
8195             if (rt1[i]==31) { // BLTZAL/BGEZAL
8196               alloc_reg(&current,i,31);
8197               dirty_reg(&current,31);
8198               //#ifdef REG_PREFETCH
8199               //alloc_reg(&current,i,PTEMP);
8200               //#endif
8201               //current.is32|=1LL<<rt1[i];
8202             }
8203             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
8204                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
8205               // Allocate the branch condition registers instead.
8206               current.isconst=0;
8207               current.wasconst=0;
8208               regs[i].wasconst=0;
8209               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8210               if(!((current.is32>>rs1[i])&1))
8211               {
8212                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8213               }
8214             }
8215             else
8216             {
8217               ooo[i]=1;
8218               delayslot_alloc(&current,i+1);
8219             }
8220           }
8221           else
8222           // Don't alloc the delay slot yet because we might not execute it
8223           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8224           {
8225             current.isconst=0;
8226             current.wasconst=0;
8227             regs[i].wasconst=0;
8228             alloc_cc(&current,i);
8229             dirty_reg(&current,CCREG);
8230             alloc_reg(&current,i,rs1[i]);
8231             if(!(current.is32>>rs1[i]&1))
8232             {
8233               alloc_reg64(&current,i,rs1[i]);
8234             }
8235           }
8236           ds=1;
8237           //current.isconst=0;
8238           break;
8239         case FJUMP:
8240           current.isconst=0;
8241           current.wasconst=0;
8242           regs[i].wasconst=0;
8243           if(likely[i]==0) // BC1F/BC1T
8244           {
8245             // TODO: Theoretically we can run out of registers here on x86.
8246             // The delay slot can allocate up to six, and we need to check
8247             // CSREG before executing the delay slot.  Possibly we can drop
8248             // the cycle count and then reload it after checking that the
8249             // FPU is in a usable state, or don't do out-of-order execution.
8250             alloc_cc(&current,i);
8251             dirty_reg(&current,CCREG);
8252             alloc_reg(&current,i,FSREG);
8253             alloc_reg(&current,i,CSREG);
8254             if(itype[i+1]==FCOMP) {
8255               // The delay slot overwrites the branch condition.
8256               // Allocate the branch condition registers instead.
8257               alloc_cc(&current,i);
8258               dirty_reg(&current,CCREG);
8259               alloc_reg(&current,i,CSREG);
8260               alloc_reg(&current,i,FSREG);
8261             }
8262             else {
8263               ooo[i]=1;
8264               delayslot_alloc(&current,i+1);
8265               alloc_reg(&current,i+1,CSREG);
8266             }
8267           }
8268           else
8269           // Don't alloc the delay slot yet because we might not execute it
8270           if(likely[i]) // BC1FL/BC1TL
8271           {
8272             alloc_cc(&current,i);
8273             dirty_reg(&current,CCREG);
8274             alloc_reg(&current,i,CSREG);
8275             alloc_reg(&current,i,FSREG);
8276           }
8277           ds=1;
8278           current.isconst=0;
8279           break;
8280         case IMM16:
8281           imm16_alloc(&current,i);
8282           break;
8283         case LOAD:
8284         case LOADLR:
8285           load_alloc(&current,i);
8286           break;
8287         case STORE:
8288         case STORELR:
8289           store_alloc(&current,i);
8290           break;
8291         case ALU:
8292           alu_alloc(&current,i);
8293           break;
8294         case SHIFT:
8295           shift_alloc(&current,i);
8296           break;
8297         case MULTDIV:
8298           multdiv_alloc(&current,i);
8299           break;
8300         case SHIFTIMM:
8301           shiftimm_alloc(&current,i);
8302           break;
8303         case MOV:
8304           mov_alloc(&current,i);
8305           break;
8306         case COP0:
8307           cop0_alloc(&current,i);
8308           break;
8309         case COP1:
8310         case COP2:
8311           cop1_alloc(&current,i);
8312           break;
8313         case C1LS:
8314           c1ls_alloc(&current,i);
8315           break;
8316         case C2LS:
8317           c2ls_alloc(&current,i);
8318           break;
8319         case C2OP:
8320           c2op_alloc(&current,i);
8321           break;
8322         case FCONV:
8323           fconv_alloc(&current,i);
8324           break;
8325         case FLOAT:
8326           float_alloc(&current,i);
8327           break;
8328         case FCOMP:
8329           fcomp_alloc(&current,i);
8330           break;
8331         case SYSCALL:
8332         case HLECALL:
8333         case INTCALL:
8334           syscall_alloc(&current,i);
8335           break;
8336         case SPAN:
8337           pagespan_alloc(&current,i);
8338           break;
8339       }
8340
8341       // Drop the upper half of registers that have become 32-bit
8342       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8343       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8344         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8345         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8346         current.uu|=1;
8347       } else {
8348         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8349         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8350         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8351         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8352         current.uu|=1;
8353       }
8354
8355       // Create entry (branch target) regmap
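           // regmap_entry[hr] records which guest register must already be in
           // host register hr when this instruction is reached via a jump
           // (-1 means no requirement); regmap[hr] is the mapping after this
           // instruction's own allocation has been applied.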
8356       for(hr=0;hr<HOST_REGS;hr++)
8357       {
8358         int r,or;
8359         r=current.regmap[hr];
8360         if(r>=0) {
8361           if(r!=regmap_pre[i][hr]) {
8362             // TODO: delay slot (?)
8363             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8364             if(or<0||(r&63)>=TEMPREG){
8365               regs[i].regmap_entry[hr]=-1;
8366             }
8367             else
8368             {
8369               // Just move it to a different register
8370               regs[i].regmap_entry[hr]=r;
8371               // If it was dirty before, it's still dirty
8372               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8373             }
8374           }
8375           else
8376           {
8377             // Unneeded
8378             if(r==0){
8379               regs[i].regmap_entry[hr]=0;
8380             }
8381             else
8382             if(r<64){
8383               if((current.u>>r)&1) {
8384                 regs[i].regmap_entry[hr]=-1;
8385                 //regs[i].regmap[hr]=-1;
8386                 current.regmap[hr]=-1;
8387               }else
8388                 regs[i].regmap_entry[hr]=r;
8389             }
8390             else {
8391               if((current.uu>>(r&63))&1) {
8392                 regs[i].regmap_entry[hr]=-1;
8393                 //regs[i].regmap[hr]=-1;
8394                 current.regmap[hr]=-1;
8395               }else
8396                 regs[i].regmap_entry[hr]=r;
8397             }
8398           }
8399         } else {
8400           // Branches expect CCREG to be allocated at the target
8401           if(regmap_pre[i][hr]==CCREG)
8402             regs[i].regmap_entry[hr]=CCREG;
8403           else
8404             regs[i].regmap_entry[hr]=-1;
8405         }
8406       }
8407       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8408     }
8409
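         // waswritten appears to track guest registers recently used as the
         // base of a store with a small offset, so that later stores through
         // the same base can skip the self-modifying-code invalidation check
         // (interpretation based on how the flag is set and cleared here).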
8410     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
8411       current.waswritten|=1<<rs1[i-1];
8412     current.waswritten&=~(1<<rt1[i]);
8413     current.waswritten&=~(1<<rt2[i]);
8414     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8415       current.waswritten&=~(1<<rs1[i]);
8416
8417     /* Branch post-alloc */
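         // branch_regs[i-1] is the register state used when assembling branch
         // i-1 itself, after its delay slot has been accounted for; keeping it
         // separate from regs[] lets the taken path and the fall-through path
         // use different allocations.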
8418     if(i>0)
8419     {
8420       current.was32=current.is32;
8421       current.wasdirty=current.dirty;
8422       switch(itype[i-1]) {
8423         case UJUMP:
8424           memcpy(&branch_regs[i-1],&current,sizeof(current));
8425           branch_regs[i-1].isconst=0;
8426           branch_regs[i-1].wasconst=0;
8427           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8428           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8429           alloc_cc(&branch_regs[i-1],i-1);
8430           dirty_reg(&branch_regs[i-1],CCREG);
8431           if(rt1[i-1]==31) { // JAL
8432             alloc_reg(&branch_regs[i-1],i-1,31);
8433             dirty_reg(&branch_regs[i-1],31);
8434             branch_regs[i-1].is32|=1LL<<31;
8435           }
8436           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8437           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8438           break;
8439         case RJUMP:
8440           memcpy(&branch_regs[i-1],&current,sizeof(current));
8441           branch_regs[i-1].isconst=0;
8442           branch_regs[i-1].wasconst=0;
8443           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8444           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8445           alloc_cc(&branch_regs[i-1],i-1);
8446           dirty_reg(&branch_regs[i-1],CCREG);
8447           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8448           if(rt1[i-1]!=0) { // JALR
8449             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8450             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8451             branch_regs[i-1].is32|=1LL<<rt1[i-1];
8452           }
8453           #ifdef USE_MINI_HT
8454           if(rs1[i-1]==31) { // JALR
8455             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8456             #ifndef HOST_IMM_ADDR32
8457             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8458             #endif
8459           }
8460           #endif
8461           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8462           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8463           break;
8464         case CJUMP:
8465           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8466           {
8467             alloc_cc(&current,i-1);
8468             dirty_reg(&current,CCREG);
8469             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8470                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8471               // The delay slot overwrote one of our conditions
8472               // Delay slot goes after the test (in order)
8473               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8474               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8475               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8476               current.u|=1;
8477               current.uu|=1;
8478               delayslot_alloc(&current,i);
8479               current.isconst=0;
8480             }
8481             else
8482             {
8483               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8484               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8485               // Alloc the branch condition registers
8486               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8487               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8488               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8489               {
8490                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8491                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8492               }
8493             }
8494             memcpy(&branch_regs[i-1],&current,sizeof(current));
8495             branch_regs[i-1].isconst=0;
8496             branch_regs[i-1].wasconst=0;
8497             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8498             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8499           }
8500           else
8501           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8502           {
8503             alloc_cc(&current,i-1);
8504             dirty_reg(&current,CCREG);
8505             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8506               // The delay slot overwrote the branch condition
8507               // Delay slot goes after the test (in order)
8508               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8509               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8510               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8511               current.u|=1;
8512               current.uu|=1;
8513               delayslot_alloc(&current,i);
8514               current.isconst=0;
8515             }
8516             else
8517             {
8518               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8519               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8520               // Alloc the branch condition register
8521               alloc_reg(&current,i-1,rs1[i-1]);
8522               if(!(current.is32>>rs1[i-1]&1))
8523               {
8524                 alloc_reg64(&current,i-1,rs1[i-1]);
8525               }
8526             }
8527             memcpy(&branch_regs[i-1],&current,sizeof(current));
8528             branch_regs[i-1].isconst=0;
8529             branch_regs[i-1].wasconst=0;
8530             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8531             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8532           }
8533           else
8534           // Alloc the delay slot in case the branch is taken
8535           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8536           {
8537             memcpy(&branch_regs[i-1],&current,sizeof(current));
8538             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8539             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8540             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8541             alloc_cc(&branch_regs[i-1],i);
8542             dirty_reg(&branch_regs[i-1],CCREG);
8543             delayslot_alloc(&branch_regs[i-1],i);
8544             branch_regs[i-1].isconst=0;
8545             alloc_reg(&current,i,CCREG); // Not taken path
8546             dirty_reg(&current,CCREG);
8547             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8548           }
8549           else
8550           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8551           {
8552             memcpy(&branch_regs[i-1],&current,sizeof(current));
8553             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8554             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8555             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8556             alloc_cc(&branch_regs[i-1],i);
8557             dirty_reg(&branch_regs[i-1],CCREG);
8558             delayslot_alloc(&branch_regs[i-1],i);
8559             branch_regs[i-1].isconst=0;
8560             alloc_reg(&current,i,CCREG); // Not taken path
8561             dirty_reg(&current,CCREG);
8562             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8563           }
8564           break;
8565         case SJUMP:
8566           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8567           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8568           {
8569             alloc_cc(&current,i-1);
8570             dirty_reg(&current,CCREG);
8571             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8572               // The delay slot overwrote the branch condition
8573               // Delay slot goes after the test (in order)
8574               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8575               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8576               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8577               current.u|=1;
8578               current.uu|=1;
8579               delayslot_alloc(&current,i);
8580               current.isconst=0;
8581             }
8582             else
8583             {
8584               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8585               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8586               // Alloc the branch condition register
8587               alloc_reg(&current,i-1,rs1[i-1]);
8588               if(!(current.is32>>rs1[i-1]&1))
8589               {
8590                 alloc_reg64(&current,i-1,rs1[i-1]);
8591               }
8592             }
8593             memcpy(&branch_regs[i-1],&current,sizeof(current));
8594             branch_regs[i-1].isconst=0;
8595             branch_regs[i-1].wasconst=0;
8596             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8597             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8598           }
8599           else
8600           // Alloc the delay slot in case the branch is taken
8601           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8602           {
8603             memcpy(&branch_regs[i-1],&current,sizeof(current));
8604             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8605             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8606             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8607             alloc_cc(&branch_regs[i-1],i);
8608             dirty_reg(&branch_regs[i-1],CCREG);
8609             delayslot_alloc(&branch_regs[i-1],i);
8610             branch_regs[i-1].isconst=0;
8611             alloc_reg(&current,i,CCREG); // Not taken path
8612             dirty_reg(&current,CCREG);
8613             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8614           }
8615           // FIXME: BLTZAL/BGEZAL
8616           if(opcode2[i-1]&0x10) { // BxxZAL
8617             alloc_reg(&branch_regs[i-1],i-1,31);
8618             dirty_reg(&branch_regs[i-1],31);
8619             branch_regs[i-1].is32|=1LL<<31;
8620           }
8621           break;
8622         case FJUMP:
8623           if(likely[i-1]==0) // BC1F/BC1T
8624           {
8625             alloc_cc(&current,i-1);
8626             dirty_reg(&current,CCREG);
8627             if(itype[i]==FCOMP) {
8628               // The delay slot overwrote the branch condition
8629               // Delay slot goes after the test (in order)
8630               delayslot_alloc(&current,i);
8631               current.isconst=0;
8632             }
8633             else
8634             {
8635               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8636               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8637               // Alloc the branch condition register
8638               alloc_reg(&current,i-1,FSREG);
8639             }
8640             memcpy(&branch_regs[i-1],&current,sizeof(current));
8641             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8642           }
8643           else // BC1FL/BC1TL
8644           {
8645             // Alloc the delay slot in case the branch is taken
8646             memcpy(&branch_regs[i-1],&current,sizeof(current));
8647             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8648             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8649             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8650             alloc_cc(&branch_regs[i-1],i);
8651             dirty_reg(&branch_regs[i-1],CCREG);
8652             delayslot_alloc(&branch_regs[i-1],i);
8653             branch_regs[i-1].isconst=0;
8654             alloc_reg(&current,i,CCREG); // Not taken path
8655             dirty_reg(&current,CCREG);
8656             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8657           }
8658           break;
8659       }
8660
8661       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8662       {
8663         if(rt1[i-1]==31) // JAL/JALR
8664         {
8665           // Subroutine call will return here, don't alloc any registers
8666           current.is32=1;
8667           current.dirty=0;
8668           clear_all_regs(current.regmap);
8669           alloc_reg(&current,i,CCREG);
8670           dirty_reg(&current,CCREG);
8671         }
8672         else if(i+1<slen)
8673         {
8674           // Internal branch will jump here; match registers to the branch source
8675           current.is32=0x3FFFFFFFFLL;
8676           current.dirty=0;
8677           clear_all_regs(current.regmap);
8678           alloc_reg(&current,i,CCREG);
8679           dirty_reg(&current,CCREG);
8680           for(j=i-1;j>=0;j--)
8681           {
8682             if(ba[j]==start+i*4+4) {
8683               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8684               current.is32=branch_regs[j].is32;
8685               current.dirty=branch_regs[j].dirty;
8686               break;
8687             }
8688           }
8689           while(j>=0) {
8690             if(ba[j]==start+i*4+4) {
8691               for(hr=0;hr<HOST_REGS;hr++) {
8692                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8693                   current.regmap[hr]=-1;
8694                 }
8695                 current.is32&=branch_regs[j].is32;
8696                 current.dirty&=branch_regs[j].dirty;
8697               }
8698             }
8699             j--;
8700           }
8701         }
8702       }
8703     }
8704
8705     // Count cycles in between branches
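         // ccadj[i] is the cycle cost accumulated since the last branch or
         // syscall; the cycle counter (CCREG) is only adjusted at branches and
         // block exits using these batched values.  The chain below adds rough
         // extra costs for GTE operations and coprocessor loads/stores.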
8706     ccadj[i]=cc;
8707     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8708     {
8709       cc=0;
8710     }
8711 #if !defined(DRC_DBG)
8712     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8713     {
8714       // GTE runs in parallel until accessed, divide by 2 for a rough guess
8715       cc+=gte_cycletab[source[i]&0x3f]/2;
8716     }
8717     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // loads/stores cause weird timing issues
8718     {
8719       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
8720     }
8721     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8722     {
8723       cc+=4;
8724     }
8725     else if(itype[i]==C2LS)
8726     {
8727       cc+=4;
8728     }
8729 #endif
8730     else
8731     {
8732       cc++;
8733     }
8734
8735     flush_dirty_uppers(&current);
8736     if(!is_ds[i]) {
8737       regs[i].is32=current.is32;
8738       regs[i].dirty=current.dirty;
8739       regs[i].isconst=current.isconst;
8740       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
8741     }
8742     for(hr=0;hr<HOST_REGS;hr++) {
8743       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8744         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8745           regs[i].wasconst&=~(1<<hr);
8746         }
8747       }
8748     }
8749     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8750     regs[i].waswritten=current.waswritten;
8751   }
8752
8753   /* Pass 4 - Cull unused host registers */
8754
8755   uint64_t nr=0;
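       // nr is a bitmask over host registers, built by this backwards scan:
       // bit hr set means the value currently mapped to host register hr is
       // still needed by a later instruction or branch target and must not be
       // deallocated yet.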
8756
8757   for (i=slen-1;i>=0;i--)
8758   {
8759     int hr;
8760     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8761     {
8762       if(ba[i]<start || ba[i]>=(start+slen*4))
8763       {
8764         // Branch out of this block, don't need anything
8765         nr=0;
8766       }
8767       else
8768       {
8769         // Internal branch
8770         // Need whatever matches the target
8771         nr=0;
8772         int t=(ba[i]-start)>>2;
8773         for(hr=0;hr<HOST_REGS;hr++)
8774         {
8775           if(regs[i].regmap_entry[hr]>=0) {
8776             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8777           }
8778         }
8779       }
8780       // Conditional branch may need registers for following instructions
8781       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8782       {
8783         if(i<slen-2) {
8784           nr|=needed_reg[i+2];
8785           for(hr=0;hr<HOST_REGS;hr++)
8786           {
8787             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8788             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8789           }
8790         }
8791       }
8792       // Overwritten registers are not needed
8793       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8794       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8795       // Merge in delay slot
8796       for(hr=0;hr<HOST_REGS;hr++)
8797       {
8798         if(!likely[i]) {
8799           // These are overwritten unless the branch is "likely"
8800           // and the delay slot is nullified if not taken
8801           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8802           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8803         }
8804         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8805         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8806         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8807         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8808         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8809         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8810         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8811         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8812         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
8813           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8814           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8815         }
8816         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
8817           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8818           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8819         }
8820         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8821           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8822           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8823         }
8824       }
8825     }
8826     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8827     {
8828       // SYSCALL instruction (software interrupt)
8829       nr=0;
8830     }
8831     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8832     {
8833       // ERET instruction (return from interrupt)
8834       nr=0;
8835     }
8836     else // Non-branch
8837     {
8838       if(i<slen-1) {
8839         for(hr=0;hr<HOST_REGS;hr++) {
8840           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
8841           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
8842           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8843           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8844         }
8845       }
8846     }
8847     for(hr=0;hr<HOST_REGS;hr++)
8848     {
8849       // Overwritten registers are not needed
8850       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8851       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8852       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8853       // Source registers are needed
8854       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8855       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8856       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
8857       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
8858       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8859       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8860       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8861       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8862       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
8863         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8864         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8865       }
8866       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
8867         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8868         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8869       }
8870       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
8871         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8872         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8873       }
8874       // Don't store a register immediately after writing it,
8875       // as that may prevent dual-issue.
8876       // But do so if this is a branch target, otherwise we
8877       // might have to load the register before the branch.
8878       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
8879         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
8880            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
8881           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8882           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8883         }
8884         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
8885            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
8886           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8887           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8888         }
8889       }
8890     }
8891     // Cycle count is needed at branches.  Assume it is needed at the target too.
8892     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
8893       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8894       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8895     }
8896     // Save it
8897     needed_reg[i]=nr;
8898
8899     // Deallocate unneeded registers
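         // Host registers not marked in nr are dropped from the entry/exit
         // maps below, unless they are still referenced by this instruction
         // (or its delay slot) or hold one of the special values (CCREG,
         // PTEMP, RTEMP, RHASH, RHTBL, INVCP, FTEMP) that are checked for.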
8900     for(hr=0;hr<HOST_REGS;hr++)
8901     {
8902       if(!((nr>>hr)&1)) {
8903         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
8904         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8905            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8906            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
8907         {
8908           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8909           {
8910             if(likely[i]) {
8911               regs[i].regmap[hr]=-1;
8912               regs[i].isconst&=~(1<<hr);
8913               if(i<slen-2) {
8914                 regmap_pre[i+2][hr]=-1;
8915                 regs[i+2].wasconst&=~(1<<hr);
8916               }
8917             }
8918           }
8919         }
8920         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8921         {
8922           int d1=0,d2=0,map=0,temp=0;
8923           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
8924           {
8925             d1=dep1[i+1];
8926             d2=dep2[i+1];
8927           }
8928           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
8929              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8930             map=INVCP;
8931           }
8932           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
8933              itype[i+1]==C1LS || itype[i+1]==C2LS)
8934             temp=FTEMP;
8935           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8936              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8937              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
8938              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
8939              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
8940              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
8941              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
8942              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
8943              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
8944              regs[i].regmap[hr]!=map )
8945           {
8946             regs[i].regmap[hr]=-1;
8947             regs[i].isconst&=~(1<<hr);
8948             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
8949                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
8950                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
8951                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
8952                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
8953                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
8954                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
8955                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
8956                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
8957                branch_regs[i].regmap[hr]!=map)
8958             {
8959               branch_regs[i].regmap[hr]=-1;
8960               branch_regs[i].regmap_entry[hr]=-1;
8961               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8962               {
8963                 if(!likely[i]&&i<slen-2) {
8964                   regmap_pre[i+2][hr]=-1;
8965                   regs[i+2].wasconst&=~(1<<hr);
8966                 }
8967               }
8968             }
8969           }
8970         }
8971         else
8972         {
8973           // Non-branch
8974           if(i>0)
8975           {
8976             int d1=0,d2=0,map=-1,temp=-1;
8977             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
8978             {
8979               d1=dep1[i];
8980               d2=dep2[i];
8981             }
8982             if(itype[i]==STORE || itype[i]==STORELR ||
8983                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8984               map=INVCP;
8985             }
8986             if(itype[i]==LOADLR || itype[i]==STORELR ||
8987                itype[i]==C1LS || itype[i]==C2LS)
8988               temp=FTEMP;
8989             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8990                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
8991                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
8992                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
8993                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
8994                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
8995             {
8996               if(i<slen-1&&!is_ds[i]) {
8997                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
8998                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
8999                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9000                 {
9001                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9002                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9003                 }
9004                 regmap_pre[i+1][hr]=-1;
9005                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9006                 regs[i+1].wasconst&=~(1<<hr);
9007               }
9008               regs[i].regmap[hr]=-1;
9009               regs[i].isconst&=~(1<<hr);
9010             }
9011           }
9012         }
9013       }
9014     }
9015   }
9016
9017   /* Pass 5 - Pre-allocate registers */
9018
9019   // If a register is allocated during a loop, try to allocate it for the
9020   // entire loop, if possible.  This avoids loading/storing registers
9021   // inside of the loop.
9022
9023   signed char f_regmap[HOST_REGS];
9024   clear_all_regs(f_regmap);
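       // f_regmap[] is the tentative whole-loop mapping: for each backwards
       // branch, candidate guest registers are recorded per host register and
       // the allocation is then extended over the loop body (from the branch
       // target up to the branch), as long as no conflicting allocation or
       // dead register is hit.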
9025   for(i=0;i<slen-1;i++)
9026   {
9027     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9028     {
9029       if(ba[i]>=start && ba[i]<(start+i*4))
9030       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9031       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9032       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9033       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9034       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9035       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9036       {
9037         int t=(ba[i]-start)>>2;
9038         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9039         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
9040         for(hr=0;hr<HOST_REGS;hr++)
9041         {
9042           if(regs[i].regmap[hr]>64) {
9043             if(!((regs[i].dirty>>hr)&1))
9044               f_regmap[hr]=regs[i].regmap[hr];
9045             else f_regmap[hr]=-1;
9046           }
9047           else if(regs[i].regmap[hr]>=0) {
9048             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9049               // dealloc old register
9050               int n;
9051               for(n=0;n<HOST_REGS;n++)
9052               {
9053                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9054               }
9055               // and alloc new one
9056               f_regmap[hr]=regs[i].regmap[hr];
9057             }
9058           }
9059           if(branch_regs[i].regmap[hr]>64) {
9060             if(!((branch_regs[i].dirty>>hr)&1))
9061               f_regmap[hr]=branch_regs[i].regmap[hr];
9062             else f_regmap[hr]=-1;
9063           }
9064           else if(branch_regs[i].regmap[hr]>=0) {
9065             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9066               // dealloc old register
9067               int n;
9068               for(n=0;n<HOST_REGS;n++)
9069               {
9070                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9071               }
9072               // and alloc new one
9073               f_regmap[hr]=branch_regs[i].regmap[hr];
9074             }
9075           }
9076           if(ooo[i]) {
9077             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
9078               f_regmap[hr]=branch_regs[i].regmap[hr];
9079           }else{
9080             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
9081               f_regmap[hr]=branch_regs[i].regmap[hr];
9082           }
9083           // Avoid dirty->clean transition
9084           #ifdef DESTRUCTIVE_WRITEBACK
9085           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9086           #endif
9087           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9088           // case above; however, it's always a good idea.  We can't hoist the
9089           // load if the register was already allocated, so there's no point
9090           // wasting time analyzing most of these cases.  It only "succeeds"
9091           // when the mapping was different and the load can be replaced with
9092           // a mov, which is of negligible benefit.  So such cases are
9093           // skipped below.
9094           if(f_regmap[hr]>0) {
9095             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
9096               int r=f_regmap[hr];
9097               for(j=t;j<=i;j++)
9098               {
9099                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9100                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9101                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9102                 if(r>63) {
9103                   // NB This can exclude the case where the upper-half
9104                   // register is lower numbered than the lower-half
9105                   // register.  Not sure if it's worth fixing...
9106                   if(get_reg(regs[j].regmap,r&63)<0) break;
9107                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9108                   if(regs[j].is32&(1LL<<(r&63))) break;
9109                 }
9110                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9111                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9112                   int k;
9113                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9114                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9115                     if(r>63) {
9116                       if(get_reg(regs[i].regmap,r&63)<0) break;
9117                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9118                     }
9119                     k=i;
9120                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9121                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9122                         //printf("no free regs for store %x\n",start+(k-1)*4);
9123                         break;
9124                       }
9125                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9126                         //printf("no-match due to different register\n");
9127                         break;
9128                       }
9129                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9130                         //printf("no-match due to branch\n");
9131                         break;
9132                       }
9133                       // call/ret fast path assumes no registers allocated
9134                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
9135                         break;
9136                       }
9137                       if(r>63) {
9138                         // NB This can exclude the case where the upper-half
9139                         // register is lower numbered than the lower-half
9140                         // register.  Not sure if it's worth fixing...
9141                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9142                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9143                       }
9144                       k--;
9145                     }
9146                     if(i<slen-1) {
9147                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9148                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9149                         //printf("bad match after branch\n");
9150                         break;
9151                       }
9152                     }
9153                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9154                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9155                       while(k<i) {
9156                         regs[k].regmap_entry[hr]=f_regmap[hr];
9157                         regs[k].regmap[hr]=f_regmap[hr];
9158                         regmap_pre[k+1][hr]=f_regmap[hr];
9159                         regs[k].wasdirty&=~(1<<hr);
9160                         regs[k].dirty&=~(1<<hr);
9161                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9162                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9163                         regs[k].wasconst&=~(1<<hr);
9164                         regs[k].isconst&=~(1<<hr);
9165                         k++;
9166                       }
9167                     }
9168                     else {
9169                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9170                       break;
9171                     }
9172                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9173                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9174                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9175                       regs[i].regmap_entry[hr]=f_regmap[hr];
9176                       regs[i].regmap[hr]=f_regmap[hr];
9177                       regs[i].wasdirty&=~(1<<hr);
9178                       regs[i].dirty&=~(1<<hr);
9179                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9180                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9181                       regs[i].wasconst&=~(1<<hr);
9182                       regs[i].isconst&=~(1<<hr);
9183                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9184                       branch_regs[i].wasdirty&=~(1<<hr);
9185                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9186                       branch_regs[i].regmap[hr]=f_regmap[hr];
9187                       branch_regs[i].dirty&=~(1<<hr);
9188                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9189                       branch_regs[i].wasconst&=~(1<<hr);
9190                       branch_regs[i].isconst&=~(1<<hr);
9191                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9192                         regmap_pre[i+2][hr]=f_regmap[hr];
9193                         regs[i+2].wasdirty&=~(1<<hr);
9194                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9195                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9196                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9197                       }
9198                     }
9199                   }
9200                   for(k=t;k<j;k++) {
9201                     // Alloc register clean at beginning of loop,
9202                     // but may dirty it in pass 6
9203                     regs[k].regmap_entry[hr]=f_regmap[hr];
9204                     regs[k].regmap[hr]=f_regmap[hr];
9205                     regs[k].dirty&=~(1<<hr);
9206                     regs[k].wasconst&=~(1<<hr);
9207                     regs[k].isconst&=~(1<<hr);
9208                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
9209                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
9210                       branch_regs[k].regmap[hr]=f_regmap[hr];
9211                       branch_regs[k].dirty&=~(1<<hr);
9212                       branch_regs[k].wasconst&=~(1<<hr);
9213                       branch_regs[k].isconst&=~(1<<hr);
9214                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
9215                         regmap_pre[k+2][hr]=f_regmap[hr];
9216                         regs[k+2].wasdirty&=~(1<<hr);
9217                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
9218                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
9219                       }
9220                     }
9221                     else
9222                     {
9223                       regmap_pre[k+1][hr]=f_regmap[hr];
9224                       regs[k+1].wasdirty&=~(1<<hr);
9225                     }
9226                   }
9227                   if(regs[j].regmap[hr]==f_regmap[hr])
9228                     regs[j].regmap_entry[hr]=f_regmap[hr];
9229                   break;
9230                 }
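                // No hit yet: the checks below keep scanning forward and stop
                // if the candidate register gets taken, its 32/64-bit width
                // changes, an unconditional branch ends the path, or no free
                // host registers remain to hold it.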
9231                 if(j==i) break;
9232                 if(regs[j].regmap[hr]>=0)
9233                   break;
9234                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9235                   //printf("no-match due to different register\n");
9236                   break;
9237                 }
9238                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9239                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9240                   break;
9241                 }
9242                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9243                 {
9244                   // Stop on unconditional branch
9245                   break;
9246                 }
9247                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
9248                 {
9249                   if(ooo[j]) {
9250                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
9251                       break;
9252                   }else{
9253                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
9254                       break;
9255                   }
9256                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
9257                     //printf("no-match due to different register (branch)\n");
9258                     break;
9259                   }
9260                 }
9261                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9262                   //printf("No free regs for store %x\n",start+j*4);
9263                   break;
9264                 }
9265                 if(f_regmap[hr]>=64) {
9266                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9267                     break;
9268                   }
9269                   else
9270                   {
9271                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9272                       break;
9273                     }
9274                   }
9275                 }
9276               }
9277             }
9278           }
9279         }
9280       }
9281     }else{
9282       // Non branch or undetermined branch target
9283       for(hr=0;hr<HOST_REGS;hr++)
9284       {
9285         if(hr!=EXCLUDE_REG) {
9286           if(regs[i].regmap[hr]>64) {
9287             if(!((regs[i].dirty>>hr)&1))
9288               f_regmap[hr]=regs[i].regmap[hr];
9289           }
9290           else if(regs[i].regmap[hr]>=0) {
9291             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9292               // dealloc old register
9293               int n;
9294               for(n=0;n<HOST_REGS;n++)
9295               {
9296                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9297               }
9298               // and alloc new one
9299               f_regmap[hr]=regs[i].regmap[hr];
9300             }
9301           }
9302         }
9303       }
9304       // Try to restore cycle count at branch targets
9305       if(bt[i]) {
9306         for(j=i;j<slen-1;j++) {
9307           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9308           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9309             //printf("no free regs for store %x\n",start+j*4);
9310             break;
9311           }
9312         }
9313         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9314           int k=i;
9315           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9316           while(k<j) {
9317             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9318             regs[k].regmap[HOST_CCREG]=CCREG;
9319             regmap_pre[k+1][HOST_CCREG]=CCREG;
9320             regs[k+1].wasdirty|=1<<HOST_CCREG;
9321             regs[k].dirty|=1<<HOST_CCREG;
9322             regs[k].wasconst&=~(1<<HOST_CCREG);
9323             regs[k].isconst&=~(1<<HOST_CCREG);
9324             k++;
9325           }
9326           regs[j].regmap_entry[HOST_CCREG]=CCREG;
9327         }
9328         // Work backwards from the branch target
9329         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9330         {
9331           //printf("Extend backwards\n");
9332           int k;
9333           k=i;
9334           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9335             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9336               //printf("no free regs for store %x\n",start+(k-1)*4);
9337               break;
9338             }
9339             k--;
9340           }
9341           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9342             //printf("Extend CC, %x ->\n",start+k*4);
9343             while(k<=i) {
9344               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9345               regs[k].regmap[HOST_CCREG]=CCREG;
9346               regmap_pre[k+1][HOST_CCREG]=CCREG;
9347               regs[k+1].wasdirty|=1<<HOST_CCREG;
9348               regs[k].dirty|=1<<HOST_CCREG;
9349               regs[k].wasconst&=~(1<<HOST_CCREG);
9350               regs[k].isconst&=~(1<<HOST_CCREG);
9351               k++;
9352             }
9353           }
9354           else {
9355             //printf("Fail Extend CC, %x ->\n",start+k*4);
9356           }
9357         }
9358       }
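      // For anything more complex than the simple ALU/load/store/move types
      // listed below, take the current allocation as the preferred map that
      // later iterations try to extend across loops.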
9359       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9360          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9361          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9362          itype[i]!=FCONV&&itype[i]!=FCOMP)
9363       {
9364         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9365       }
9366     }
9367   }
9368
9369   // Cache memory offset or tlb map pointer if a register is available
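  // Heuristic: scan runs of load/store instructions and score each host
  // register by how long it stays free across the run; if the best score is
  // high enough, pre-allocate that register to ROREG so the RAM offset (or
  // map pointer) does not have to be reloaded for every access.
  // loop_start[] pulls the allocation back to a loop head when a backward
  // branch inside the run targets it.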
9370   #ifndef HOST_IMM_ADDR32
9371   #ifndef RAM_OFFSET
9372   if(0)
9373   #endif
9374   {
9375     int earliest_available[HOST_REGS];
9376     int loop_start[HOST_REGS];
9377     int score[HOST_REGS];
9378     int end[HOST_REGS];
9379     int reg=ROREG;
9380
9381     // Init
9382     for(hr=0;hr<HOST_REGS;hr++) {
9383       score[hr]=0;earliest_available[hr]=0;
9384       loop_start[hr]=MAXBLOCK;
9385     }
9386     for(i=0;i<slen-1;i++)
9387     {
9388       // Can't do anything if no registers are available
9389       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
9390         for(hr=0;hr<HOST_REGS;hr++) {
9391           score[hr]=0;earliest_available[hr]=i+1;
9392           loop_start[hr]=MAXBLOCK;
9393         }
9394       }
9395       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9396         if(!ooo[i]) {
9397           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
9398             for(hr=0;hr<HOST_REGS;hr++) {
9399               score[hr]=0;earliest_available[hr]=i+1;
9400               loop_start[hr]=MAXBLOCK;
9401             }
9402           }
9403         }else{
9404           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
9405             for(hr=0;hr<HOST_REGS;hr++) {
9406               score[hr]=0;earliest_available[hr]=i+1;
9407               loop_start[hr]=MAXBLOCK;
9408             }
9409           }
9410         }
9411       }
9412       // Mark unavailable registers
9413       for(hr=0;hr<HOST_REGS;hr++) {
9414         if(regs[i].regmap[hr]>=0) {
9415           score[hr]=0;earliest_available[hr]=i+1;
9416           loop_start[hr]=MAXBLOCK;
9417         }
9418         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9419           if(branch_regs[i].regmap[hr]>=0) {
9420             score[hr]=0;earliest_available[hr]=i+2;
9421             loop_start[hr]=MAXBLOCK;
9422           }
9423         }
9424       }
9425       // No register allocations after unconditional jumps
9426       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
9427       {
9428         for(hr=0;hr<HOST_REGS;hr++) {
9429           score[hr]=0;earliest_available[hr]=i+2;
9430           loop_start[hr]=MAXBLOCK;
9431         }
9432         i++; // Skip delay slot too
9433         //printf("skip delay slot: %x\n",start+i*4);
9434       }
9435       else
9436       // Possible match
9437       if(itype[i]==LOAD||itype[i]==LOADLR||
9438          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
9439         for(hr=0;hr<HOST_REGS;hr++) {
9440           if(hr!=EXCLUDE_REG) {
9441             end[hr]=i-1;
9442             for(j=i;j<slen-1;j++) {
9443               if(regs[j].regmap[hr]>=0) break;
9444               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9445                 if(branch_regs[j].regmap[hr]>=0) break;
9446                 if(ooo[j]) {
9447                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
9448                 }else{
9449                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
9450                 }
9451               }
9452               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
9453               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9454                 int t=(ba[j]-start)>>2;
9455                 if(t<j&&t>=earliest_available[hr]) {
9456                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
9457                     // Score a point for hoisting loop invariant
9458                     if(t<loop_start[hr]) loop_start[hr]=t;
9459                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
9460                     score[hr]++;
9461                     end[hr]=j;
9462                   }
9463                 }
9464                 else if(t<j) {
9465                   if(regs[t].regmap[hr]==reg) {
9466                     // Score a point if the branch target matches this register
9467                     score[hr]++;
9468                     end[hr]=j;
9469                   }
9470                 }
9471                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
9472                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
9473                   score[hr]++;
9474                   end[hr]=j;
9475                 }
9476               }
9477               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9478               {
9479                 // Stop on unconditional branch
9480                 break;
9481               }
9482               else
9483               if(itype[j]==LOAD||itype[j]==LOADLR||
9484                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
9485                 score[hr]++;
9486                 end[hr]=j;
9487               }
9488             }
9489           }
9490         }
9491         // Find highest score and allocate that register
9492         int maxscore=0;
9493         for(hr=0;hr<HOST_REGS;hr++) {
9494           if(hr!=EXCLUDE_REG) {
9495             if(score[hr]>score[maxscore]) {
9496               maxscore=hr;
9497               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
9498             }
9499           }
9500         }
9501         if(score[maxscore]>1)
9502         {
9503           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
9504           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
9505             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
9506             assert(regs[j].regmap[maxscore]<0);
9507             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
9508             regs[j].regmap[maxscore]=reg;
9509             regs[j].dirty&=~(1<<maxscore);
9510             regs[j].wasconst&=~(1<<maxscore);
9511             regs[j].isconst&=~(1<<maxscore);
9512             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9513               branch_regs[j].regmap[maxscore]=reg;
9514               branch_regs[j].wasdirty&=~(1<<maxscore);
9515               branch_regs[j].dirty&=~(1<<maxscore);
9516               branch_regs[j].wasconst&=~(1<<maxscore);
9517               branch_regs[j].isconst&=~(1<<maxscore);
9518               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
9519                 regmap_pre[j+2][maxscore]=reg;
9520                 regs[j+2].wasdirty&=~(1<<maxscore);
9521               }
9522               // loop optimization (loop_preload)
9523               int t=(ba[j]-start)>>2;
9524               if(t==loop_start[maxscore]) {
9525                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
9526                   regs[t].regmap_entry[maxscore]=reg;
9527               }
9528             }
9529             else
9530             {
9531               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
9532                 regmap_pre[j+1][maxscore]=reg;
9533                 regs[j+1].wasdirty&=~(1<<maxscore);
9534               }
9535             }
9536           }
9537           i=j-1;
9538           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
9539           for(hr=0;hr<HOST_REGS;hr++) {
9540             score[hr]=0;earliest_available[hr]=i+1;
9541             loop_start[hr]=MAXBLOCK;
9542           }
9543         }
9544       }
9545     }
9546   }
9547   #endif
9548
9549   // This allocates registers (if possible) one instruction prior
9550   // to use, which can avoid a load-use penalty on certain CPUs.
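  // Cases handled below include: the source registers of the next
  // instruction, the target register of a load reused early to hold its
  // address, the AGEN temporaries for stores with non-constant addresses,
  // and FTEMP for LWC/SWC-style accesses.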
9551   for(i=0;i<slen-1;i++)
9552   {
9553     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9554     {
9555       if(!bt[i+1])
9556       {
9557         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
9558            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
9559         {
9560           if(rs1[i+1]) {
9561             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9562             {
9563               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9564               {
9565                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9566                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9567                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9568                 regs[i].isconst&=~(1<<hr);
9569                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9570                 constmap[i][hr]=constmap[i+1][hr];
9571                 regs[i+1].wasdirty&=~(1<<hr);
9572                 regs[i].dirty&=~(1<<hr);
9573               }
9574             }
9575           }
9576           if(rs2[i+1]) {
9577             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9578             {
9579               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9580               {
9581                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9582                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9583                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9584                 regs[i].isconst&=~(1<<hr);
9585                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9586                 constmap[i][hr]=constmap[i+1][hr];
9587                 regs[i+1].wasdirty&=~(1<<hr);
9588                 regs[i].dirty&=~(1<<hr);
9589               }
9590             }
9591           }
9592           // Preload target address for load instruction (non-constant)
9593           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9594             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9595             {
9596               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9597               {
9598                 regs[i].regmap[hr]=rs1[i+1];
9599                 regmap_pre[i+1][hr]=rs1[i+1];
9600                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9601                 regs[i].isconst&=~(1<<hr);
9602                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9603                 constmap[i][hr]=constmap[i+1][hr];
9604                 regs[i+1].wasdirty&=~(1<<hr);
9605                 regs[i].dirty&=~(1<<hr);
9606               }
9607             }
9608           }
9609           // Load source into target register
9610           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9611             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9612             {
9613               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9614               {
9615                 regs[i].regmap[hr]=rs1[i+1];
9616                 regmap_pre[i+1][hr]=rs1[i+1];
9617                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9618                 regs[i].isconst&=~(1<<hr);
9619                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9620                 constmap[i][hr]=constmap[i+1][hr];
9621                 regs[i+1].wasdirty&=~(1<<hr);
9622                 regs[i].dirty&=~(1<<hr);
9623               }
9624             }
9625           }
9626           // Address for store instruction (non-constant)
9627           if(itype[i+1]==STORE||itype[i+1]==STORELR
9628              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
9629             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9630               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9631               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9632               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9633               assert(hr>=0);
9634               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9635               {
9636                 regs[i].regmap[hr]=rs1[i+1];
9637                 regmap_pre[i+1][hr]=rs1[i+1];
9638                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9639                 regs[i].isconst&=~(1<<hr);
9640                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9641                 constmap[i][hr]=constmap[i+1][hr];
9642                 regs[i+1].wasdirty&=~(1<<hr);
9643                 regs[i].dirty&=~(1<<hr);
9644               }
9645             }
9646           }
9647           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
9648             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9649               int nr;
9650               hr=get_reg(regs[i+1].regmap,FTEMP);
9651               assert(hr>=0);
9652               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9653               {
9654                 regs[i].regmap[hr]=rs1[i+1];
9655                 regmap_pre[i+1][hr]=rs1[i+1];
9656                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9657                 regs[i].isconst&=~(1<<hr);
9658                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9659                 constmap[i][hr]=constmap[i+1][hr];
9660                 regs[i+1].wasdirty&=~(1<<hr);
9661                 regs[i].dirty&=~(1<<hr);
9662               }
9663               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9664               {
9665                 // move it to another register
9666                 regs[i+1].regmap[hr]=-1;
9667                 regmap_pre[i+2][hr]=-1;
9668                 regs[i+1].regmap[nr]=FTEMP;
9669                 regmap_pre[i+2][nr]=FTEMP;
9670                 regs[i].regmap[nr]=rs1[i+1];
9671                 regmap_pre[i+1][nr]=rs1[i+1];
9672                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9673                 regs[i].isconst&=~(1<<nr);
9674                 regs[i+1].isconst&=~(1<<nr);
9675                 regs[i].dirty&=~(1<<nr);
9676                 regs[i+1].wasdirty&=~(1<<nr);
9677                 regs[i+1].dirty&=~(1<<nr);
9678                 regs[i+2].wasdirty&=~(1<<nr);
9679               }
9680             }
9681           }
9682           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
9683             if(itype[i+1]==LOAD)
9684               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9685             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
9686               hr=get_reg(regs[i+1].regmap,FTEMP);
9687             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
9688               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9689               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9690             }
9691             if(hr>=0&&regs[i].regmap[hr]<0) {
9692               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9693               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9694                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9695                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9696                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9697                 regs[i].isconst&=~(1<<hr);
9698                 regs[i+1].wasdirty&=~(1<<hr);
9699                 regs[i].dirty&=~(1<<hr);
9700               }
9701             }
9702           }
9703         }
9704       }
9705     }
9706   }
9707
9708   /* Pass 6 - Optimize clean/dirty state */
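  // clean_registers() (defined earlier in this file) settles the final
  // dirty bits for the whole block so that pass 8 only writes back
  // registers whose cached value can actually differ from memory.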
9709   clean_registers(0,slen-1,1);
9710
9711   /* Pass 7 - Identify 32-bit registers */
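  // (Only the branch-target marking below is left of this pass in the PSX
  //  build; the R3000 is 32-bit only, so no width analysis is needed.)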
9712   for (i=slen-1;i>=0;i--)
9713   {
9714     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9715     {
9716       // Conditional branch
9717       if((source[i]>>16)!=0x1000&&i<slen-2) {
9718         // Mark this address as a branch target since it may be called
9719         // upon return from interrupt
9720         bt[i+2]=1;
9721       }
9722     }
9723   }
9724
9725   if(itype[slen-1]==SPAN) {
9726     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
9727   }
9728
9729 #ifdef DISASM
9730   /* Debug/disassembly */
9731   for(i=0;i<slen;i++)
9732   {
9733     printf("U:");
9734     int r;
9735     for(r=1;r<=CCREG;r++) {
9736       if((unneeded_reg[i]>>r)&1) {
9737         if(r==HIREG) printf(" HI");
9738         else if(r==LOREG) printf(" LO");
9739         else printf(" r%d",r);
9740       }
9741     }
9742     printf("\n");
9743     #if defined(__i386__) || defined(__x86_64__)
9744     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9745     #endif
9746     #ifdef __arm__
9747     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9748     #endif
9749     printf("needs: ");
9750     if(needed_reg[i]&1) printf("eax ");
9751     if((needed_reg[i]>>1)&1) printf("ecx ");
9752     if((needed_reg[i]>>2)&1) printf("edx ");
9753     if((needed_reg[i]>>3)&1) printf("ebx ");
9754     if((needed_reg[i]>>5)&1) printf("ebp ");
9755     if((needed_reg[i]>>6)&1) printf("esi ");
9756     if((needed_reg[i]>>7)&1) printf("edi ");
9757     printf("\n");
9758     #if defined(__i386__) || defined(__x86_64__)
9759     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9760     printf("dirty: ");
9761     if(regs[i].wasdirty&1) printf("eax ");
9762     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9763     if((regs[i].wasdirty>>2)&1) printf("edx ");
9764     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9765     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9766     if((regs[i].wasdirty>>6)&1) printf("esi ");
9767     if((regs[i].wasdirty>>7)&1) printf("edi ");
9768     #endif
9769     #ifdef __arm__
9770     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9771     printf("dirty: ");
9772     if(regs[i].wasdirty&1) printf("r0 ");
9773     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9774     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9775     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9776     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9777     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9778     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9779     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9780     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9781     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9782     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9783     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9784     #endif
9785     printf("\n");
9786     disassemble_inst(i);
9787     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9788     #if defined(__i386__) || defined(__x86_64__)
9789     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9790     if(regs[i].dirty&1) printf("eax ");
9791     if((regs[i].dirty>>1)&1) printf("ecx ");
9792     if((regs[i].dirty>>2)&1) printf("edx ");
9793     if((regs[i].dirty>>3)&1) printf("ebx ");
9794     if((regs[i].dirty>>5)&1) printf("ebp ");
9795     if((regs[i].dirty>>6)&1) printf("esi ");
9796     if((regs[i].dirty>>7)&1) printf("edi ");
9797     #endif
9798     #ifdef __arm__
9799     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9800     if(regs[i].dirty&1) printf("r0 ");
9801     if((regs[i].dirty>>1)&1) printf("r1 ");
9802     if((regs[i].dirty>>2)&1) printf("r2 ");
9803     if((regs[i].dirty>>3)&1) printf("r3 ");
9804     if((regs[i].dirty>>4)&1) printf("r4 ");
9805     if((regs[i].dirty>>5)&1) printf("r5 ");
9806     if((regs[i].dirty>>6)&1) printf("r6 ");
9807     if((regs[i].dirty>>7)&1) printf("r7 ");
9808     if((regs[i].dirty>>8)&1) printf("r8 ");
9809     if((regs[i].dirty>>9)&1) printf("r9 ");
9810     if((regs[i].dirty>>10)&1) printf("r10 ");
9811     if((regs[i].dirty>>12)&1) printf("r12 ");
9812     #endif
9813     printf("\n");
9814     if(regs[i].isconst) {
9815       printf("constants: ");
9816       #if defined(__i386__) || defined(__x86_64__)
9817       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
9818       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
9819       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
9820       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
9821       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
9822       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
9823       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
9824       #endif
9825       #ifdef __arm__
9826       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
9827       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
9828       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
9829       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
9830       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
9831       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
9832       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
9833       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
9834       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
9835       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
9836       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
9837       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
9838       #endif
9839       printf("\n");
9840     }
9841     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9842       #if defined(__i386__) || defined(__x86_64__)
9843       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
9844       if(branch_regs[i].dirty&1) printf("eax ");
9845       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
9846       if((branch_regs[i].dirty>>2)&1) printf("edx ");
9847       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
9848       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
9849       if((branch_regs[i].dirty>>6)&1) printf("esi ");
9850       if((branch_regs[i].dirty>>7)&1) printf("edi ");
9851       #endif
9852       #ifdef __arm__
9853       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
9854       if(branch_regs[i].dirty&1) printf("r0 ");
9855       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
9856       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
9857       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
9858       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
9859       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
9860       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
9861       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
9862       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
9863       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
9864       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
9865       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
9866       #endif
9867     }
9868   }
9869 #endif // DISASM
9870
9871   /* Pass 8 - Assembly */
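  // Final code generation: for each instruction, write back or reload host
  // registers as decided by the passes above, then hand off to the per-type
  // assembler; stub and linker fixups recorded here are resolved after the
  // loop.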
9872   linkcount=0;stubcount=0;
9873   ds=0;is_delayslot=0;
9874   cop1_usable=0;
9875   uint64_t is32_pre=0;
9876   u_int dirty_pre=0;
9877   void *beginning=start_block();
9878   if((u_int)addr&1) {
9879     ds=1;
9880     pagespan_ds();
9881   }
9882   void *instr_addr0_override = NULL;
9883
9884   if (start == 0x80030000) {
9885     // nasty hack for fastbios thing
9886     // override block entry to this code
9887     instr_addr0_override = out;
9888     emit_movimm(start,0);
9889     // abuse io address var as a flag that we
9890     // have already returned here once
9891     emit_readword((int)&address,1);
9892     emit_writeword(0,(int)&pcaddr);
9893     emit_writeword(0,(int)&address);
9894     emit_cmp(0,1);
9895     emit_jne((int)new_dyna_leave);
9896   }
9897   for(i=0;i<slen;i++)
9898   {
9899     //if(ds) printf("ds: ");
9900     disassemble_inst(i);
9901     if(ds) {
9902       ds=0; // Skip delay slot
9903       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
9904       instr_addr[i] = NULL;
9905     } else {
9906       speculate_register_values(i);
9907       #ifndef DESTRUCTIVE_WRITEBACK
9908       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
9909       {
9910         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
9911               unneeded_reg[i],unneeded_reg_upper[i]);
9912       }
9913       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
9914         is32_pre=branch_regs[i].is32;
9915         dirty_pre=branch_regs[i].dirty;
9916       }else{
9917         is32_pre=regs[i].is32;
9918         dirty_pre=regs[i].dirty;
9919       }
9920       #endif
9921       // write back
9922       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
9923       {
9924         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
9925                       unneeded_reg[i],unneeded_reg_upper[i]);
9926         loop_preload(regmap_pre[i],regs[i].regmap_entry);
9927       }
9928       // branch target entry point
9929       instr_addr[i] = out;
9930       assem_debug("<->\n");
9931       drc_dbg_emit_do_cmp(i);
9932
9933       // load regs
9934       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
9935         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
9936       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
9937       address_generation(i,&regs[i],regs[i].regmap_entry);
9938       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
9939       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9940       {
9941         // Load the delay slot registers if necessary
9942         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
9943           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
9944         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
9945           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
9946         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
9947           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
9948       }
9949       else if(i+1<slen)
9950       {
9951         // Preload registers for following instruction
9952         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
9953           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
9954             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
9955         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
9956           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
9957             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
9958       }
9959       // TODO: if(is_ooo(i)) address_generation(i+1);
9960       if(itype[i]==CJUMP||itype[i]==FJUMP)
9961         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
9962       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
9963         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
9964       if(bt[i]) cop1_usable=0;
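      // A branch target can be reached from a path where the COP1-usable
      // check has not been emitted yet, so the flag cannot be carried
      // across it.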
9965       // assemble
9966       switch(itype[i]) {
9967         case ALU:
9968           alu_assemble(i,&regs[i]);break;
9969         case IMM16:
9970           imm16_assemble(i,&regs[i]);break;
9971         case SHIFT:
9972           shift_assemble(i,&regs[i]);break;
9973         case SHIFTIMM:
9974           shiftimm_assemble(i,&regs[i]);break;
9975         case LOAD:
9976           load_assemble(i,&regs[i]);break;
9977         case LOADLR:
9978           loadlr_assemble(i,&regs[i]);break;
9979         case STORE:
9980           store_assemble(i,&regs[i]);break;
9981         case STORELR:
9982           storelr_assemble(i,&regs[i]);break;
9983         case COP0:
9984           cop0_assemble(i,&regs[i]);break;
9985         case COP1:
9986           cop1_assemble(i,&regs[i]);break;
9987         case C1LS:
9988           c1ls_assemble(i,&regs[i]);break;
9989         case COP2:
9990           cop2_assemble(i,&regs[i]);break;
9991         case C2LS:
9992           c2ls_assemble(i,&regs[i]);break;
9993         case C2OP:
9994           c2op_assemble(i,&regs[i]);break;
9995         case FCONV:
9996           fconv_assemble(i,&regs[i]);break;
9997         case FLOAT:
9998           float_assemble(i,&regs[i]);break;
9999         case FCOMP:
10000           fcomp_assemble(i,&regs[i]);break;
10001         case MULTDIV:
10002           multdiv_assemble(i,&regs[i]);break;
10003         case MOV:
10004           mov_assemble(i,&regs[i]);break;
10005         case SYSCALL:
10006           syscall_assemble(i,&regs[i]);break;
10007         case HLECALL:
10008           hlecall_assemble(i,&regs[i]);break;
10009         case INTCALL:
10010           intcall_assemble(i,&regs[i]);break;
10011         case UJUMP:
10012           ujump_assemble(i,&regs[i]);ds=1;break;
10013         case RJUMP:
10014           rjump_assemble(i,&regs[i]);ds=1;break;
10015         case CJUMP:
10016           cjump_assemble(i,&regs[i]);ds=1;break;
10017         case SJUMP:
10018           sjump_assemble(i,&regs[i]);ds=1;break;
10019         case FJUMP:
10020           fjump_assemble(i,&regs[i]);ds=1;break;
10021         case SPAN:
10022           pagespan_assemble(i,&regs[i]);break;
10023       }
10024       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10025         literal_pool(1024);
10026       else
10027         literal_pool_jumpover(256);
10028     }
10029   }
10030   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10031   // If the block did not end with an unconditional branch,
10032   // add a jump to the next instruction.
10033   if(i>1) {
10034     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10035       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10036       assert(i==slen);
10037       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10038         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10039         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10040           emit_loadreg(CCREG,HOST_CCREG);
10041         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10042       }
10043       else if(!likely[i-2])
10044       {
10045         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10046         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10047       }
10048       else
10049       {
10050         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10051         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10052       }
10053       add_to_linker((int)out,start+i*4,0);
10054       emit_jmp(0);
10055     }
10056   }
10057   else
10058   {
10059     assert(i>0);
10060     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10061     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10062     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10063       emit_loadreg(CCREG,HOST_CCREG);
10064     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10065     add_to_linker((int)out,start+i*4,0);
10066     emit_jmp(0);
10067   }
10068
10069   // TODO: delay slot stubs?
10070   // Stubs
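  // Fill in the out-of-line stubs recorded during assembly: slow-path
  // memory access handlers, cycle-count/interrupt checks (CC_STUB),
  // coprocessor-unusable exceptions (FP_STUB), store invalidation checks
  // (INVCODE_STUB) and unaligned store handling (STORELR_STUB).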
10071   for(i=0;i<stubcount;i++)
10072   {
10073     switch(stubs[i].type)
10074     {
10075       case LOADB_STUB:
10076       case LOADH_STUB:
10077       case LOADW_STUB:
10078       case LOADD_STUB:
10079       case LOADBU_STUB:
10080       case LOADHU_STUB:
10081         do_readstub(i);break;
10082       case STOREB_STUB:
10083       case STOREH_STUB:
10084       case STOREW_STUB:
10085       case STORED_STUB:
10086         do_writestub(i);break;
10087       case CC_STUB:
10088         do_ccstub(i);break;
10089       case INVCODE_STUB:
10090         do_invstub(i);break;
10091       case FP_STUB:
10092         do_cop1stub(i);break;
10093       case STORELR_STUB:
10094         do_unalignedwritestub(i);break;
10095     }
10096   }
10097
10098   if (instr_addr0_override)
10099     instr_addr[0] = instr_addr0_override;
10100
10101   /* Pass 9 - Linker */
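  // Resolve the branches recorded with add_to_linker(): external targets go
  // through an extjump stub (or are patched directly when check_addr()
  // already knows the compiled target), while internal targets are patched
  // straight to the matching instr_addr[] entry.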
10102   for(i=0;i<linkcount;i++)
10103   {
10104     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10105     literal_pool(64);
10106     if(!link_addr[i][2])
10107     {
10108       void *stub=out;
10109       void *addr=check_addr(link_addr[i][1]);
10110       emit_extjump(link_addr[i][0],link_addr[i][1]);
10111       if(addr) {
10112         set_jump_target(link_addr[i][0], addr);
10113         add_link(link_addr[i][1],stub);
10114       }
10115       else set_jump_target(link_addr[i][0], stub);
10116     }
10117     else
10118     {
10119       // Internal branch
10120       int target=(link_addr[i][1]-start)>>2;
10121       assert(target>=0&&target<slen);
10122       assert(instr_addr[target]);
10123       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10124       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10125       //#else
10126       set_jump_target(link_addr[i][0],instr_addr[target]);
10127       //#endif
10128     }
10129   }
10130   // External Branch Targets (jump_in)
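  // Every branch target in the block (and the block entry itself) is
  // registered so the dispatcher and other blocks can find it: an entry on
  // jump_dirty, a dirty-check stub via do_dirty_stub(), an entry on
  // jump_in, and any existing hash table slot for the same vaddr is
  // repointed rather than a new one added.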
10131   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10132   for(i=0;i<slen;i++)
10133   {
10134     if(bt[i]||i==0)
10135     {
10136       if(instr_addr[i]) // TODO - delay slots (=null)
10137       {
10138         u_int vaddr=start+i*4;
10139         u_int page=get_page(vaddr);
10140         u_int vpage=get_vpage(vaddr);
10141         literal_pool(256);
10142         {
10143           assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10144           assem_debug("jump_in: %x\n",start+i*4);
10145           ll_add(jump_dirty+vpage,vaddr,out);
10146           void *entry_point = do_dirty_stub(i);
10147           ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
10148           // If there was an existing entry in the hash table,
10149           // replace it with the new address.
10150           // Don't add new entries.  We'll insert the
10151           // ones that actually get used in check_addr().
10152           struct ht_entry *ht_bin = hash_table_get(vaddr);
10153           if (ht_bin->vaddr[0] == vaddr)
10154             ht_bin->tcaddr[0] = entry_point;
10155           if (ht_bin->vaddr[1] == vaddr)
10156             ht_bin->tcaddr[1] = entry_point;
10157         }
10158       }
10159     }
10160   }
10161   // Write out the literal pool if necessary
10162   literal_pool(0);
10163   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10164   // Align code
10165   if(((u_int)out)&7) emit_addnop(13);
10166   #endif
10167   assert((u_int)out-(u_int)beginning<MAX_OUTPUT_BLOCK_SIZE);
10168   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
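  // Keep a copy of the source MIPS code in the shadow buffer; the dirty
  // stubs compare against it later to detect whether the original code has
  // been overwritten before this block is re-entered.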
10169   memcpy(copy,source,slen*4);
10170   copy+=slen*4;
10171
10172   end_block(beginning);
10173
10174   // If we're within 256K of the end of the buffer,
10175   // start over from the beginning. (Is 256K enough?)
10176   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10177
10178   // Trap writes to any of the pages we compiled
10179   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10180     invalid_code[i]=0;
10181   }
10182   inv_code_start=inv_code_end=~0;
10183
10184   // for PCSX we need to mark all mirrors too
10185   if(get_page(start)<(RAM_SIZE>>12))
10186     for(i=start>>12;i<=(start+slen*4)>>12;i++)
10187       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
10188       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
10189       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
10190
10191   /* Pass 10 - Free memory by expiring oldest blocks */
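  // The translation cache is treated as a ring buffer: expirep chases the
  // output pointer around it in 65536 steps, and each step clears one slice
  // of the jump_in/jump_dirty lists, jump_out pointers, or hash table
  // entries that reference the region about to be reused.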
10192
10193   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
10194   while(expirep!=end)
10195   {
10196     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10197     uintptr_t base=(uintptr_t)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10198     inv_debug("EXP: Phase %d\n",expirep);
10199     switch((expirep>>11)&3)
10200     {
10201       case 0:
10202         // Clear jump_in and jump_dirty
10203         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10204         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10205         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10206         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10207         break;
10208       case 1:
10209         // Clear pointers
10210         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10211         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10212         break;
10213       case 2:
10214         // Clear hash table
10215         for(i=0;i<32;i++) {
10216           struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
10217           if (((uintptr_t)ht_bin->tcaddr[1]>>shift) == (base>>shift) ||
10218              (((uintptr_t)ht_bin->tcaddr[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10219             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
10220             ht_bin->vaddr[1] = -1;
10221             ht_bin->tcaddr[1] = NULL;
10222           }
10223           if (((uintptr_t)ht_bin->tcaddr[0]>>shift) == (base>>shift) ||
10224              (((uintptr_t)ht_bin->tcaddr[0]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10225             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
10226             ht_bin->vaddr[0] = ht_bin->vaddr[1];
10227             ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
10228             ht_bin->vaddr[1] = -1;
10229             ht_bin->tcaddr[1] = NULL;
10230           }
10231         }
10232         break;
10233       case 3:
10234         // Clear jump_out
10235         #ifdef __arm__
10236         if((expirep&2047)==0)
10237           do_clear_cache();
10238         #endif
10239         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10240         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10241         break;
10242     }
10243     expirep=(expirep+1)&65535;
10244   }
10245   return 0;
10246 }
10247
10248 // vim:shiftwidth=2:expandtab