Fix dynarec crashes on 3DS
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> // for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 int getVMBlock();
36 #endif
37
38 #include "new_dynarec_config.h"
39 #include "backends/psx/emu_if.h" //emulator interface
40
41 //#define DISASM
42 //#define assem_debug printf
43 //#define inv_debug printf
44 #define assem_debug(...)
45 #define inv_debug(...)
46
47 #ifdef __i386__
48 #include "x86/assem_x86.h"
49 #endif
50 #ifdef __x86_64__
51 #include "x64/assem_x64.h"
52 #endif
53 #ifdef __arm__
54 #include "arm/assem_arm.h"
55 #endif
56
57 #ifdef VITA
58 int _newlib_vm_size_user = 1 << TARGET_SIZE_2;
59 #endif
60
61 #define MAXBLOCK 4096
62 #define MAX_OUTPUT_BLOCK_SIZE 262144
63
64 struct regstat
65 {
66   signed char regmap_entry[HOST_REGS];
67   signed char regmap[HOST_REGS];
68   uint64_t was32;
69   uint64_t is32;
70   uint64_t wasdirty;
71   uint64_t dirty;
72   uint64_t u;
73   uint64_t uu;
74   u_int wasconst;
75   u_int isconst;
76   u_int loadedconst;             // host regs that have constants loaded
77   u_int waswritten;              // MIPS regs that were used as store base before
78 };
79
80 // note: asm depends on this layout
81 struct ll_entry
82 {
83   u_int vaddr;
84   u_int reg_sv_flags;
85   void *addr;
86   struct ll_entry *next;
87 };
88
89   // used by asm:
90   u_char *out;
91   u_int hash_table[65536][4]  __attribute__((aligned(16)));
92   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
93   struct ll_entry *jump_dirty[4096];
94
95   static struct ll_entry *jump_out[4096];
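  // Per-page lookup lists (see get_addr()/invalidate_page() below):
  //   jump_in[page]    - compiled entry points that are currently valid
  //   jump_dirty[page] - entry points whose source may have been overwritten;
  //                      reused only after verify_dirty() succeeds
  //   jump_out[page]   - direct jumps patched into other blocks that target
  //                      this page; walked on invalidation to unlink them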
96   static u_int start;
97   static u_int *source;
98   static char insn[MAXBLOCK][10];
99   static u_char itype[MAXBLOCK];
100   static u_char opcode[MAXBLOCK];
101   static u_char opcode2[MAXBLOCK];
102   static u_char bt[MAXBLOCK];
103   static u_char rs1[MAXBLOCK];
104   static u_char rs2[MAXBLOCK];
105   static u_char rt1[MAXBLOCK];
106   static u_char rt2[MAXBLOCK];
107   static u_char us1[MAXBLOCK];
108   static u_char us2[MAXBLOCK];
109   static u_char dep1[MAXBLOCK];
110   static u_char dep2[MAXBLOCK];
111   static u_char lt1[MAXBLOCK];
112   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
113   static uint64_t gte_rt[MAXBLOCK];
114   static uint64_t gte_unneeded[MAXBLOCK];
115   static u_int smrv[32]; // speculated MIPS register values
116   static u_int smrv_strong; // mask of regs that are likely to have correct values
117   static u_int smrv_weak; // same, but somewhat less likely
118   static u_int smrv_strong_next; // same, but after current insn executes
119   static u_int smrv_weak_next;
120   static int imm[MAXBLOCK];
121   static u_int ba[MAXBLOCK];
122   static char likely[MAXBLOCK];
123   static char is_ds[MAXBLOCK];
124   static char ooo[MAXBLOCK];
125   static uint64_t unneeded_reg[MAXBLOCK];
126   static uint64_t unneeded_reg_upper[MAXBLOCK];
127   static uint64_t branch_unneeded_reg[MAXBLOCK];
128   static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
129   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
130   static uint64_t current_constmap[HOST_REGS];
131   static uint64_t constmap[MAXBLOCK][HOST_REGS];
132   static struct regstat regs[MAXBLOCK];
133   static struct regstat branch_regs[MAXBLOCK];
134   static signed char minimum_free_regs[MAXBLOCK];
135   static u_int needed_reg[MAXBLOCK];
136   static u_int wont_dirty[MAXBLOCK];
137   static u_int will_dirty[MAXBLOCK];
138   static int ccadj[MAXBLOCK];
139   static int slen;
140   static u_int instr_addr[MAXBLOCK];
141   static u_int link_addr[MAXBLOCK][3];
142   static int linkcount;
143   static u_int stubs[MAXBLOCK*3][8];
144   static int stubcount;
145   static u_int literals[1024][2];
146   static int literalcount;
147   static int is_delayslot;
148   static int cop1_usable;
149   static char shadow[1048576]  __attribute__((aligned(16)));
150   static void *copy;
151   static int expirep;
152   static u_int stop_after_jal;
153 #ifndef RAM_FIXED
154   static u_int ram_offset;
155 #else
156   static const u_int ram_offset=0;
157 #endif
158
159   int new_dynarec_hacks;
160   int new_dynarec_did_compile;
161   extern u_char restore_candidate[512];
162   extern int cycle_count;
163
164   /* registers that may be allocated */
165   /* 1-31 gpr */
166 #define HIREG 32 // hi
167 #define LOREG 33 // lo
168 #define FSREG 34 // FPU status (FCSR)
169 #define CSREG 35 // Coprocessor status
170 #define CCREG 36 // Cycle count
171 #define INVCP 37 // Pointer to invalid_code
172 //#define MMREG 38 // Pointer to memory_map
173 #define ROREG 39 // ram offset (if rdram!=0x80000000)
174 #define TEMPREG 40
175 #define FTEMP 40 // FPU temporary register
176 #define PTEMP 41 // Prefetch temporary register
177 //#define TLREG 42 // TLB mapping offset
178 #define RHASH 43 // Return address hash
179 #define RHTBL 44 // Return address hash table address
180 #define RTEMP 45 // JR/JALR address register
181 #define MAXREG 45
182 #define AGEN1 46 // Address generation temporary register
183 //#define AGEN2 47 // Address generation temporary register
184 //#define MGEN1 48 // Maptable address generation temporary register
185 //#define MGEN2 49 // Maptable address generation temporary register
186 #define BTREG 50 // Branch target temporary register
187
188   /* instruction types */
189 #define NOP 0     // No operation
190 #define LOAD 1    // Load
191 #define STORE 2   // Store
192 #define LOADLR 3  // Unaligned load
193 #define STORELR 4 // Unaligned store
194 #define MOV 5     // Move
195 #define ALU 6     // Arithmetic/logic
196 #define MULTDIV 7 // Multiply/divide
197 #define SHIFT 8   // Shift by register
198 #define SHIFTIMM 9// Shift by immediate
199 #define IMM16 10  // 16-bit immediate
200 #define RJUMP 11  // Unconditional jump to register
201 #define UJUMP 12  // Unconditional jump
202 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
203 #define SJUMP 14  // Conditional branch (regimm format)
204 #define COP0 15   // Coprocessor 0
205 #define COP1 16   // Coprocessor 1
206 #define C1LS 17   // Coprocessor 1 load/store
207 #define FJUMP 18  // Conditional branch (floating point)
208 #define FLOAT 19  // Floating point unit
209 #define FCONV 20  // Convert integer to float
210 #define FCOMP 21  // Floating point compare (sets FSREG)
211 #define SYSCALL 22// SYSCALL
212 #define OTHER 23  // Other
213 #define SPAN 24   // Branch/delay slot spans 2 pages
214 #define NI 25     // Not implemented
215 #define HLECALL 26// PCSX fake opcodes for HLE
216 #define COP2 27   // Coprocessor 2 move
217 #define C2LS 28   // Coprocessor 2 load/store
218 #define C2OP 29   // Coprocessor 2 operation
219 #define INTCALL 30// Call interpreter to handle rare corner cases
220
221   /* stubs */
222 #define CC_STUB 1
223 #define FP_STUB 2
224 #define LOADB_STUB 3
225 #define LOADH_STUB 4
226 #define LOADW_STUB 5
227 #define LOADD_STUB 6
228 #define LOADBU_STUB 7
229 #define LOADHU_STUB 8
230 #define STOREB_STUB 9
231 #define STOREH_STUB 10
232 #define STOREW_STUB 11
233 #define STORED_STUB 12
234 #define STORELR_STUB 13
235 #define INVCODE_STUB 14
236
237   /* branch codes */
238 #define TAKEN 1
239 #define NOTTAKEN 2
240 #define NULLDS 3
241
242 // asm linkage
243 int new_recompile_block(int addr);
244 void *get_addr_ht(u_int vaddr);
245 void invalidate_block(u_int block);
246 void invalidate_addr(u_int addr);
247 void remove_hash(int vaddr);
248 void dyna_linker();
249 void dyna_linker_ds();
250 void verify_code();
251 void verify_code_vm();
252 void verify_code_ds();
253 void cc_interrupt();
254 void fp_exception();
255 void fp_exception_ds();
256 void jump_syscall_hle();
257 void jump_hlecall();
258 void jump_intcall();
259 void new_dyna_leave();
260
261 // Needed by assembler
262 static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
263 static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
264 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
265 static void load_all_regs(signed char i_regmap[]);
266 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
267 static void load_regs_entry(int t);
268 static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
269
270 static int verify_dirty(u_int *ptr);
271 static int get_final_value(int hr, int i, int *value);
272 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e);
273 static void add_to_linker(int addr,int target,int ext);
274
275 static int tracedebug=0;
276
277 static void mprotect_w_x(void *start, void *end, int is_x)
278 {
279 #ifdef NO_WRITE_EXEC
280   #if defined(VITA)
281   // *Open* enables write on all memory that was
282   // allocated by sceKernelAllocMemBlockForVM()?
283   if (is_x)
284     sceKernelCloseVMDomain();
285   else
286     sceKernelOpenVMDomain();
287   #else
288   u_long mstart = (u_long)start & ~4095ul;
289   u_long mend = (u_long)end;
290   if (mprotect((void *)mstart, mend - mstart,
291                PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
292     SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
293   #endif
294 #endif
295 }
296
297 static void start_tcache_write(void *start, void *end)
298 {
299   mprotect_w_x(start, end, 0);
300 }
301
302 static void end_tcache_write(void *start, void *end)
303 {
304 #ifdef __arm__
305   size_t len = (char *)end - (char *)start;
306   #if   defined(__BLACKBERRY_QNX__)
307   msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
308   #elif defined(__MACH__)
309   sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
310   #elif defined(VITA)
311   sceKernelSyncVMDomain(sceBlock, start, len);
312   #elif defined(_3DS)
313   ctr_flush_invalidate_cache();
314   #else
315   __clear_cache(start, end);
316   #endif
317   (void)len;
318 #endif
319
320   mprotect_w_x(start, end, 1);
321 }
322
323 static void *start_block(void)
324 {
325   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
326   if (end > (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2))
327     end = (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2);
328   start_tcache_write(out, end);
329   return out;
330 }
331
332 static void end_block(void *start)
333 {
334   end_tcache_write(start, out);
335 }
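// Typical emit sequence for one block (illustrative sketch of the calls above):
//   void *beg = start_block();   // make the window after 'out' writable
//   ... emit native code, advancing 'out' ...
//   end_block(beg);              // flush icache and restore exec permission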
336
337 //#define DEBUG_CYCLE_COUNT 1
338
339 #define NO_CYCLE_PENALTY_THR 12
340
341 int cycle_multiplier; // 100 for 1.0
342
343 static int CLOCK_ADJUST(int x)
344 {
345   int s=(x>>31)|1;
346   return (x * cycle_multiplier + s * 50) / 100;
347 }
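// CLOCK_ADJUST scales a cycle count by cycle_multiplier/100, rounding to the
// nearest integer (ties away from zero, via s=+/-1).  For example, with
// cycle_multiplier=150, CLOCK_ADJUST(3) = (3*150+50)/100 = 5.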
348
349 static u_int get_page(u_int vaddr)
350 {
351   u_int page=vaddr&~0xe0000000;
352   if (page < 0x1000000)
353     page &= ~0x0e00000; // RAM mirrors
354   page>>=12;
355   if(page>2048) page=2048+(page&2047);
356   return page;
357 }
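// Examples: 0x80030000 and its uncached mirror 0xa0030000 both map to page
// 0x30 (&~0xe0000000 strips the segment bits and the RAM-mirror mask folds
// the 2MB RAM aliases together); higher addresses such as the BIOS at
// 0xbfc00000 land in the 2048..4095 range via the final 2048+(page&2047) step.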
358
359 // no virtual mem in PCSX
360 static u_int get_vpage(u_int vaddr)
361 {
362   return get_page(vaddr);
363 }
364
365 // Get address from virtual address
366 // This is called from the recompiled JR/JALR instructions
367 void *get_addr(u_int vaddr)
368 {
369   struct ll_entry *head = NULL;
370   u_int page            = get_page(vaddr);
371   u_int vpage           = get_vpage(vaddr);
372   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
373   head=jump_in[page];
374   while(head!=NULL)
375   {
376     if(head->vaddr==vaddr)
377     {
378       //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
379       u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
380       ht_bin[3]=ht_bin[1];
381       ht_bin[2]=ht_bin[0];
382       ht_bin[1]=(u_int)head->addr;
383       ht_bin[0]=vaddr;
384       return head->addr;
385     }
386     head=head->next;
387   }
388   head=jump_dirty[vpage];
389   while(head!=NULL)
390   {
391     if(head->vaddr==vaddr)
392     {
393       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
394       // Don't restore blocks which are about to expire from the cache
395       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
396         if(verify_dirty(head->addr))
397         {
398           //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
399           invalid_code[vaddr>>12]=0;
400           inv_code_start=inv_code_end=~0;
401           if(vpage<2048)
402           {
403             restore_candidate[vpage>>3]|=1<<(vpage&7);
404           }
405           else
406           {
407             restore_candidate[page>>3]|=1<<(page&7);
408           }
409           u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
410
411           if(ht_bin[0]==vaddr)
412             ht_bin[1]=(u_int)head->addr; // Replace existing entry
413           else
414           {
415             ht_bin[3]=ht_bin[1];
416             ht_bin[2]=ht_bin[0];
417             ht_bin[1]=(int)head->addr;
418             ht_bin[0]=vaddr;
419           }
420           return head->addr;
421         }
422     }
423     head=head->next;
424   }
425   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
426   int r=new_recompile_block(vaddr);
427   if(r==0)
428     return get_addr(vaddr);
429   // Execute in unmapped page, generate pagefault exception
430   Status|=2;
431   Cause=(vaddr<<31)|0x8;
432   EPC=(vaddr&1)?vaddr-5:vaddr;
433   BadVAddr=(vaddr&~1);
434   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
435   EntryHi=BadVAddr&0xFFFFE000;
436   return get_addr_ht(0x80000000);
437 }
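// Note on the "about to expire" test above: ((u_int)addr-(u_int)out) is the
// distance from the translation-cache write pointer; shifting it left by
// (32-TARGET_SIZE_2) reduces it modulo the cache size and scales it to a
// 32-bit fraction, so the >0x60000000+... comparison (roughly) rejects blocks
// sitting in the part of the circular cache that will be overwritten next.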
438
439 // Look up address in hash table first
440 void *get_addr_ht(u_int vaddr)
441 {
442   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
443   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
444   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
445   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
446   return get_addr(vaddr);
447 }
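// Each hash_table bin holds two {vaddr,addr} pairs: slots [0]/[1] are the most
// recently used entry and [2]/[3] the runner-up; get_addr() shifts the old
// pair down before installing a new one, so hot addresses stay in slot 0.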
448
449 void clear_all_regs(signed char regmap[])
450 {
451   int hr;
452   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
453 }
454
455 signed char get_reg(signed char regmap[],int r)
456 {
457   int hr;
458   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
459   return -1;
460 }
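// Illustrative use: find which host register (if any) currently holds a
// given MIPS register, e.g.
//   signed char hr = get_reg(regs[i].regmap, rs1[i]);
//   if(hr >= 0) { /* rs1[i] is live in host register hr */ }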
461
462 // Find a register that is available for two consecutive cycles
463 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
464 {
465   int hr;
466   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
467   return -1;
468 }
469
470 int count_free_regs(signed char regmap[])
471 {
472   int count=0;
473   int hr;
474   for(hr=0;hr<HOST_REGS;hr++)
475   {
476     if(hr!=EXCLUDE_REG) {
477       if(regmap[hr]<0) count++;
478     }
479   }
480   return count;
481 }
482
483 void dirty_reg(struct regstat *cur,signed char reg)
484 {
485   int hr;
486   if(!reg) return;
487   for (hr=0;hr<HOST_REGS;hr++) {
488     if((cur->regmap[hr]&63)==reg) {
489       cur->dirty|=1<<hr;
490     }
491   }
492 }
493
494 // If we dirty the lower half of a 64 bit register which is now being
495 // sign-extended, we need to dump the upper half.
496 // Note: Do this only after completion of the instruction, because
497 // some instructions may need to read the full 64-bit value even if
498 // overwriting it (eg SLTI, DSRA32).
499 static void flush_dirty_uppers(struct regstat *cur)
500 {
501   int hr,reg;
502   for (hr=0;hr<HOST_REGS;hr++) {
503     if((cur->dirty>>hr)&1) {
504       reg=cur->regmap[hr];
505       if(reg>=64)
506         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
507     }
508   }
509 }
510
511 void set_const(struct regstat *cur,signed char reg,uint64_t value)
512 {
513   int hr;
514   if(!reg) return;
515   for (hr=0;hr<HOST_REGS;hr++) {
516     if(cur->regmap[hr]==reg) {
517       cur->isconst|=1<<hr;
518       current_constmap[hr]=value;
519     }
520     else if((cur->regmap[hr]^64)==reg) {
521       cur->isconst|=1<<hr;
522       current_constmap[hr]=value>>32;
523     }
524   }
525 }
526
527 void clear_const(struct regstat *cur,signed char reg)
528 {
529   int hr;
530   if(!reg) return;
531   for (hr=0;hr<HOST_REGS;hr++) {
532     if((cur->regmap[hr]&63)==reg) {
533       cur->isconst&=~(1<<hr);
534     }
535   }
536 }
537
538 int is_const(struct regstat *cur,signed char reg)
539 {
540   int hr;
541   if(reg<0) return 0;
542   if(!reg) return 1;
543   for (hr=0;hr<HOST_REGS;hr++) {
544     if((cur->regmap[hr]&63)==reg) {
545       return (cur->isconst>>hr)&1;
546     }
547   }
548   return 0;
549 }
550 uint64_t get_const(struct regstat *cur,signed char reg)
551 {
552   int hr;
553   if(!reg) return 0;
554   for (hr=0;hr<HOST_REGS;hr++) {
555     if(cur->regmap[hr]==reg) {
556       return current_constmap[hr];
557     }
558   }
559   SysPrintf("Unknown constant in r%d\n",reg);
560   exit(1);
561 }
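// Constant-propagation sketch, as used by the *_alloc routines further down
// (this is the ADDI/ADDIU case from imm16_alloc):
//   if(is_const(current,rs1[i])) {
//     int v=get_const(current,rs1[i]);
//     set_const(current,rt1[i],v+imm[i]);
//   }
//   else clear_const(current,rt1[i]);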
562
563 // Least soon needed registers
564 // Look at the next ten instructions and see which registers
565 // will be used.  Try not to reallocate these.
566 void lsn(u_char hsn[], int i, int *preferred_reg)
567 {
568   int j;
569   int b=-1;
570   for(j=0;j<9;j++)
571   {
572     if(i+j>=slen) {
573       j=slen-i-1;
574       break;
575     }
576     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
577     {
578       // Don't go past an unconditional jump
579       j++;
580       break;
581     }
582   }
583   for(;j>=0;j--)
584   {
585     if(rs1[i+j]) hsn[rs1[i+j]]=j;
586     if(rs2[i+j]) hsn[rs2[i+j]]=j;
587     if(rt1[i+j]) hsn[rt1[i+j]]=j;
588     if(rt2[i+j]) hsn[rt2[i+j]]=j;
589     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
590       // Stores can allocate zero
591       hsn[rs1[i+j]]=j;
592       hsn[rs2[i+j]]=j;
593     }
594     // On some architectures stores need invc_ptr
595     #if defined(HOST_IMM8)
596     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
597       hsn[INVCP]=j;
598     }
599     #endif
600     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
601     {
602       hsn[CCREG]=j;
603       b=j;
604     }
605   }
606   if(b>=0)
607   {
608     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
609     {
610       // Follow first branch
611       int t=(ba[i+b]-start)>>2;
612       j=7-b;if(t+j>=slen) j=slen-t-1;
613       for(;j>=0;j--)
614       {
615         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
616         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
617         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
618         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
619       }
620     }
621     // TODO: preferred register based on backward branch
622   }
623   // Delay slot should preferably not overwrite branch conditions or cycle count
624   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
625     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
626     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
627     hsn[CCREG]=1;
628     // ...or hash tables
629     hsn[RHASH]=1;
630     hsn[RHTBL]=1;
631   }
632   // Coprocessor load/store needs FTEMP, even if not declared
633   if(itype[i]==C1LS||itype[i]==C2LS) {
634     hsn[FTEMP]=0;
635   }
636   // Load L/R also uses FTEMP as a temporary register
637   if(itype[i]==LOADLR) {
638     hsn[FTEMP]=0;
639   }
640   // Also SWL/SWR/SDL/SDR
641   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
642     hsn[FTEMP]=0;
643   }
644   // Don't remove the miniht registers
645   if(itype[i]==UJUMP||itype[i]==RJUMP)
646   {
647     hsn[RHASH]=0;
648     hsn[RHTBL]=0;
649   }
650 }
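// On return, hsn[r] holds (approximately) the number of instructions until
// MIPS register r is next used within the scanned window: 0 means the current
// instruction needs it, larger values mean it is a better eviction candidate.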
651
652 // We only want to allocate registers if we're going to use them again soon
653 int needed_again(int r, int i)
654 {
655   int j;
656   int b=-1;
657   int rn=10;
658
659   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
660   {
661     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
662       return 0; // Don't need any registers if exiting the block
663   }
664   for(j=0;j<9;j++)
665   {
666     if(i+j>=slen) {
667       j=slen-i-1;
668       break;
669     }
670     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
671     {
672       // Don't go past an unconditional jump
673       j++;
674       break;
675     }
676     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
677     {
678       break;
679     }
680   }
681   for(;j>=1;j--)
682   {
683     if(rs1[i+j]==r) rn=j;
684     if(rs2[i+j]==r) rn=j;
685     if((unneeded_reg[i+j]>>r)&1) rn=10;
686     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
687     {
688       b=j;
689     }
690   }
691   /*
692   if(b>=0)
693   {
694     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
695     {
696       // Follow first branch
697       int o=rn;
698       int t=(ba[i+b]-start)>>2;
699       j=7-b;if(t+j>=slen) j=slen-t-1;
700       for(;j>=0;j--)
701       {
702         if(!((unneeded_reg[t+j]>>r)&1)) {
703           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
704           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
705         }
706         else rn=o;
707       }
708     }
709   }*/
710   if(rn<10) return 1;
711   (void)b;
712   return 0;
713 }
714
715 // Try to match register allocations at the end of a loop with those
716 // at the beginning
717 int loop_reg(int i, int r, int hr)
718 {
719   int j,k;
720   for(j=0;j<9;j++)
721   {
722     if(i+j>=slen) {
723       j=slen-i-1;
724       break;
725     }
726     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
727     {
728       // Don't go past an unconditional jump
729       j++;
730       break;
731     }
732   }
733   k=0;
734   if(i>0){
735     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
736       k--;
737   }
738   for(;k<j;k++)
739   {
740     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
741     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
742     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
743     {
744       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
745       {
746         int t=(ba[i+k]-start)>>2;
747         int reg=get_reg(regs[t].regmap_entry,r);
748         if(reg>=0) return reg;
749         //reg=get_reg(regs[t+1].regmap_entry,r);
750         //if(reg>=0) return reg;
751       }
752     }
753   }
754   return hr;
755 }
756
757
758 // Allocate every register, preserving source/target regs
759 void alloc_all(struct regstat *cur,int i)
760 {
761   int hr;
762
763   for(hr=0;hr<HOST_REGS;hr++) {
764     if(hr!=EXCLUDE_REG) {
765       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
766          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
767       {
768         cur->regmap[hr]=-1;
769         cur->dirty&=~(1<<hr);
770       }
771       // Don't need zeros
772       if((cur->regmap[hr]&63)==0)
773       {
774         cur->regmap[hr]=-1;
775         cur->dirty&=~(1<<hr);
776       }
777     }
778   }
779 }
780
781 #ifdef __i386__
782 #include "x86/assem_x86.c"
783 #endif
784 #ifdef __x86_64__
785 #include "x64/assem_x64.c"
786 #endif
787 #ifdef __arm__
788 #include "arm/assem_arm.c"
789 #endif
790
791 // Add virtual address mapping to linked list
792 void ll_add(struct ll_entry **head,int vaddr,void *addr)
793 {
794   struct ll_entry *new_entry;
795   new_entry=malloc(sizeof(struct ll_entry));
796   assert(new_entry!=NULL);
797   new_entry->vaddr=vaddr;
798   new_entry->reg_sv_flags=0;
799   new_entry->addr=addr;
800   new_entry->next=*head;
801   *head=new_entry;
802 }
803
804 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
805 {
806   ll_add(head,vaddr,addr);
807   (*head)->reg_sv_flags=reg_sv_flags;
808 }
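// Examples from this file: add_link() registers an outgoing jump with
//   ll_add(jump_out+page,vaddr,src);
// and clean_blocks() re-registers a verified block with
//   ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);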
809
810 // Check if an address is already compiled
811 // but don't return addresses which are about to expire from the cache
812 void *check_addr(u_int vaddr)
813 {
814   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
815   if(ht_bin[0]==vaddr) {
816     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
817       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
818   }
819   if(ht_bin[2]==vaddr) {
820     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
821       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
822   }
823   u_int page=get_page(vaddr);
824   struct ll_entry *head;
825   head=jump_in[page];
826   while(head!=NULL) {
827     if(head->vaddr==vaddr) {
828       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
829         // Update existing entry with current address
830         if(ht_bin[0]==vaddr) {
831           ht_bin[1]=(int)head->addr;
832           return head->addr;
833         }
834         if(ht_bin[2]==vaddr) {
835           ht_bin[3]=(int)head->addr;
836           return head->addr;
837         }
838         // Insert into hash table with low priority.
839         // Don't evict existing entries, as they are probably
840         // addresses that are being accessed frequently.
841         if(ht_bin[0]==-1) {
842           ht_bin[1]=(int)head->addr;
843           ht_bin[0]=vaddr;
844         }else if(ht_bin[2]==-1) {
845           ht_bin[3]=(int)head->addr;
846           ht_bin[2]=vaddr;
847         }
848         return head->addr;
849       }
850     }
851     head=head->next;
852   }
853   return 0;
854 }
855
856 void remove_hash(int vaddr)
857 {
858   //printf("remove hash: %x\n",vaddr);
859   u_int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
860   if(ht_bin[2]==vaddr) {
861     ht_bin[2]=ht_bin[3]=-1;
862   }
863   if(ht_bin[0]==vaddr) {
864     ht_bin[0]=ht_bin[2];
865     ht_bin[1]=ht_bin[3];
866     ht_bin[2]=ht_bin[3]=-1;
867   }
868 }
869
870 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
871 {
872   struct ll_entry *next;
873   while(*head) {
874     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
875        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
876     {
877       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
878       remove_hash((*head)->vaddr);
879       next=(*head)->next;
880       free(*head);
881       *head=next;
882     }
883     else
884     {
885       head=&((*head)->next);
886     }
887   }
888 }
889
890 // Remove all entries from linked list
891 void ll_clear(struct ll_entry **head)
892 {
893   struct ll_entry *cur;
894   struct ll_entry *next;
895   if((cur=*head)) {
896     *head=0;
897     while(cur) {
898       next=cur->next;
899       free(cur);
900       cur=next;
901     }
902   }
903 }
904
905 // Dereference the pointers and remove if it matches
906 static void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
907 {
908   while(head) {
909     int ptr=get_pointer(head->addr);
910     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
911     if(((ptr>>shift)==(addr>>shift)) ||
912        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
913     {
914       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
915       void *host_addr=find_extjump_insn(head->addr);
916       #ifdef __arm__
917         mark_clear_cache(host_addr);
918       #endif
919       set_jump_target((int)host_addr,(int)head->addr);
920     }
921     head=head->next;
922   }
923 }
924
925 // This is called when we write to a compiled block (see do_invstub)
926 void invalidate_page(u_int page)
927 {
928   struct ll_entry *head;
929   struct ll_entry *next;
930   head=jump_in[page];
931   jump_in[page]=0;
932   while(head!=NULL) {
933     inv_debug("INVALIDATE: %x\n",head->vaddr);
934     remove_hash(head->vaddr);
935     next=head->next;
936     free(head);
937     head=next;
938   }
939   head=jump_out[page];
940   jump_out[page]=0;
941   while(head!=NULL) {
942     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
943     void *host_addr=find_extjump_insn(head->addr);
944     #ifdef __arm__
945       mark_clear_cache(host_addr);
946     #endif
947     set_jump_target((int)host_addr,(int)head->addr);
948     next=head->next;
949     free(head);
950     head=next;
951   }
952 }
953
954 static void invalidate_block_range(u_int block, u_int first, u_int last)
955 {
956   u_int page=get_page(block<<12);
957   //printf("first=%d last=%d\n",first,last);
958   invalidate_page(page);
959   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
960   assert(last<page+5);
961   // Invalidate the adjacent pages if a block crosses a 4K boundary
962   while(first<page)
963   {
964     invalidate_page(first);
965     first++;
966   }
967   for(first=page+1;first<last;first++)
968   {
969     invalidate_page(first);
970   }
971
972 #ifdef __arm__
973   do_clear_cache();
974 #endif
975
976   // Don't trap writes
977   invalid_code[block]=1;
978
979 #ifdef USE_MINI_HT
980   memset(mini_ht,-1,sizeof(mini_ht));
981 #endif
982 }
983
984 void invalidate_block(u_int block)
985 {
986   u_int page=get_page(block<<12);
987   u_int vpage=get_vpage(block<<12);
988   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
989   u_int first,last;
990   first=last=page;
991   struct ll_entry *head;
992   head=jump_dirty[vpage];
993   //printf("page=%d vpage=%d\n",page,vpage);
994   while(head!=NULL)
995   {
996     u_int start,end;
997     if(vpage>2047||(head->vaddr>>12)==block)
998     { // Ignore vaddr hash collision
999       get_bounds((int)head->addr,&start,&end);
1000       //printf("start: %x end: %x\n",start,end);
1001       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE)
1002       {
1003         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page)
1004         {
1005           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1006           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1007         }
1008       }
1009     }
1010     head=head->next;
1011   }
1012   invalidate_block_range(block,first,last);
1013 }
1014
1015 void invalidate_addr(u_int addr)
1016 {
1017   //static int rhits;
1018   // this check is done by the caller
1019   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1020   u_int page=get_vpage(addr);
1021   if(page<2048) { // RAM
1022     struct ll_entry *head;
1023     u_int addr_min=~0, addr_max=0;
1024     u_int mask=RAM_SIZE-1;
1025     u_int addr_main=0x80000000|(addr&mask);
1026     int pg1;
1027     inv_code_start=addr_main&~0xfff;
1028     inv_code_end=addr_main|0xfff;
1029     pg1=page;
1030     if (pg1>0) {
1031       // must check previous page too because of spans..
1032       pg1--;
1033       inv_code_start-=0x1000;
1034     }
1035     for(;pg1<=page;pg1++) {
1036       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1037         u_int start,end;
1038         get_bounds((int)head->addr,&start,&end);
1039         if(ram_offset) {
1040           start-=ram_offset;
1041           end-=ram_offset;
1042         }
1043         if(start<=addr_main&&addr_main<end) {
1044           if(start<addr_min) addr_min=start;
1045           if(end>addr_max) addr_max=end;
1046         }
1047         else if(addr_main<start) {
1048           if(start<inv_code_end)
1049             inv_code_end=start-1;
1050         }
1051         else {
1052           if(end>inv_code_start)
1053             inv_code_start=end;
1054         }
1055       }
1056     }
1057     if (addr_min!=~0) {
1058       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1059       inv_code_start=inv_code_end=~0;
1060       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1061       return;
1062     }
1063     else {
1064       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1065       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1066       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1067       return;
1068     }
1069   }
1070   invalidate_block(addr>>12);
1071 }
1072
1073 // This is called when loading a save state.
1074 // Anything could have changed, so invalidate everything.
1075 void invalidate_all_pages(void)
1076 {
1077   u_int page;
1078   for(page=0;page<4096;page++)
1079     invalidate_page(page);
1080   for(page=0;page<1048576;page++)
1081   {
1082     if(!invalid_code[page])
1083     {
1084       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1085       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1086     }
1087   }
1088
1089 #ifdef USE_MINI_HT
1090   memset(mini_ht,-1,sizeof(mini_ht));
1091 #endif
1092 }
1093
1094 // Add an entry to jump_out after making a link
1095 void add_link(u_int vaddr,void *src)
1096 {
1097   u_int page=get_page(vaddr);
1098   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1099   int *ptr=(int *)(src+4);
1100   assert((*ptr&0x0fff0000)==0x059f0000);
1101   (void)ptr;
1102   ll_add(jump_out+page,vaddr,src);
1103   //int ptr=get_pointer(src);
1104   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1105 }
1106
1107 // If a code block was found to be unmodified (bit was set in
1108 // restore_candidate) and it remains unmodified (bit is clear
1109 // in invalid_code) then move the entries for that 4K page from
1110 // the dirty list to the clean list.
1111 void clean_blocks(u_int page)
1112 {
1113   struct ll_entry *head;
1114   inv_debug("INV: clean_blocks page=%d\n",page);
1115   head=jump_dirty[page];
1116   while(head!=NULL)
1117   {
1118     if(!invalid_code[head->vaddr>>12])
1119     {
1120       // Don't restore blocks which are about to expire from the cache
1121       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1122       {
1123         u_int start,end;
1124         if(verify_dirty(head->addr))
1125         {
1126           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1127           u_int i;
1128           u_int inv=0;
1129           get_bounds((int)head->addr,&start,&end);
1130           if(start-(u_int)rdram<RAM_SIZE)
1131           {
1132             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++)
1133             {
1134               inv|=invalid_code[i];
1135             }
1136           }
1137           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE)
1138           {
1139             inv=1;
1140           }
1141           if(!inv)
1142           {
1143             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1144             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1145             {
1146               u_int ppage=page;
1147               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1148               //printf("page=%x, addr=%x\n",page,head->vaddr);
1149               //assert(head->vaddr>>12==(page|0x80000));
1150               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1151               u_int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1152               if(ht_bin[0]==head->vaddr)
1153               {
1154                 ht_bin[1]=(u_int)clean_addr; // Replace existing entry
1155               }
1156               if(ht_bin[2]==head->vaddr)
1157               {
1158                 ht_bin[3]=(u_int)clean_addr; // Replace existing entry
1159               }
1160             }
1161           }
1162         }
1163       }
1164     }
1165     head=head->next;
1166   }
1167 }
1168
1169 static void mov_alloc(struct regstat *current,int i)
1170 {
1171   // Note: Don't need to actually alloc the source registers
1172   if((~current->is32>>rs1[i])&1)
1173   {
1174     //alloc_reg64(current,i,rs1[i]);
1175     alloc_reg64(current,i,rt1[i]);
1176     current->is32&=~(1LL<<rt1[i]);
1177   }
1178   else
1179   {
1180     //alloc_reg(current,i,rs1[i]);
1181     alloc_reg(current,i,rt1[i]);
1182     current->is32|=(1LL<<rt1[i]);
1183   }
1184   clear_const(current,rs1[i]);
1185   clear_const(current,rt1[i]);
1186   dirty_reg(current,rt1[i]);
1187 }
1188
1189 void shiftimm_alloc(struct regstat *current,int i)
1190 {
1191   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1192   {
1193     if(rt1[i]) {
1194       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1195       else lt1[i]=rs1[i];
1196       alloc_reg(current,i,rt1[i]);
1197       current->is32|=1LL<<rt1[i];
1198       dirty_reg(current,rt1[i]);
1199       if(is_const(current,rs1[i])) {
1200         int v=get_const(current,rs1[i]);
1201         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1202         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1203         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1204       }
1205       else clear_const(current,rt1[i]);
1206     }
1207   }
1208   else
1209   {
1210     clear_const(current,rs1[i]);
1211     clear_const(current,rt1[i]);
1212   }
1213
1214   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1215   {
1216     if(rt1[i]) {
1217       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1218       alloc_reg64(current,i,rt1[i]);
1219       current->is32&=~(1LL<<rt1[i]);
1220       dirty_reg(current,rt1[i]);
1221     }
1222   }
1223   if(opcode2[i]==0x3c) // DSLL32
1224   {
1225     if(rt1[i]) {
1226       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1227       alloc_reg64(current,i,rt1[i]);
1228       current->is32&=~(1LL<<rt1[i]);
1229       dirty_reg(current,rt1[i]);
1230     }
1231   }
1232   if(opcode2[i]==0x3e) // DSRL32
1233   {
1234     if(rt1[i]) {
1235       alloc_reg64(current,i,rs1[i]);
1236       if(imm[i]==32) {
1237         alloc_reg64(current,i,rt1[i]);
1238         current->is32&=~(1LL<<rt1[i]);
1239       } else {
1240         alloc_reg(current,i,rt1[i]);
1241         current->is32|=1LL<<rt1[i];
1242       }
1243       dirty_reg(current,rt1[i]);
1244     }
1245   }
1246   if(opcode2[i]==0x3f) // DSRA32
1247   {
1248     if(rt1[i]) {
1249       alloc_reg64(current,i,rs1[i]);
1250       alloc_reg(current,i,rt1[i]);
1251       current->is32|=1LL<<rt1[i];
1252       dirty_reg(current,rt1[i]);
1253     }
1254   }
1255 }
1256
1257 void shift_alloc(struct regstat *current,int i)
1258 {
1259   if(rt1[i]) {
1260     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1261     {
1262       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1263       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1264       alloc_reg(current,i,rt1[i]);
1265       if(rt1[i]==rs2[i]) {
1266         alloc_reg_temp(current,i,-1);
1267         minimum_free_regs[i]=1;
1268       }
1269       current->is32|=1LL<<rt1[i];
1270     } else { // DSLLV/DSRLV/DSRAV
1271       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1272       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1273       alloc_reg64(current,i,rt1[i]);
1274       current->is32&=~(1LL<<rt1[i]);
1275       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1276       {
1277         alloc_reg_temp(current,i,-1);
1278         minimum_free_regs[i]=1;
1279       }
1280     }
1281     clear_const(current,rs1[i]);
1282     clear_const(current,rs2[i]);
1283     clear_const(current,rt1[i]);
1284     dirty_reg(current,rt1[i]);
1285   }
1286 }
1287
1288 void alu_alloc(struct regstat *current,int i)
1289 {
1290   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1291     if(rt1[i]) {
1292       if(rs1[i]&&rs2[i]) {
1293         alloc_reg(current,i,rs1[i]);
1294         alloc_reg(current,i,rs2[i]);
1295       }
1296       else {
1297         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1298         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1299       }
1300       alloc_reg(current,i,rt1[i]);
1301     }
1302     current->is32|=1LL<<rt1[i];
1303   }
1304   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1305     if(rt1[i]) {
1306       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1307       {
1308         alloc_reg64(current,i,rs1[i]);
1309         alloc_reg64(current,i,rs2[i]);
1310         alloc_reg(current,i,rt1[i]);
1311       } else {
1312         alloc_reg(current,i,rs1[i]);
1313         alloc_reg(current,i,rs2[i]);
1314         alloc_reg(current,i,rt1[i]);
1315       }
1316     }
1317     current->is32|=1LL<<rt1[i];
1318   }
1319   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1320     if(rt1[i]) {
1321       if(rs1[i]&&rs2[i]) {
1322         alloc_reg(current,i,rs1[i]);
1323         alloc_reg(current,i,rs2[i]);
1324       }
1325       else
1326       {
1327         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1328         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1329       }
1330       alloc_reg(current,i,rt1[i]);
1331       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1332       {
1333         if(!((current->uu>>rt1[i])&1)) {
1334           alloc_reg64(current,i,rt1[i]);
1335         }
1336         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1337           if(rs1[i]&&rs2[i]) {
1338             alloc_reg64(current,i,rs1[i]);
1339             alloc_reg64(current,i,rs2[i]);
1340           }
1341           else
1342           {
1343             // Is it really worth it to keep 64-bit values in registers?
1344             #ifdef NATIVE_64BIT
1345             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1346             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1347             #endif
1348           }
1349         }
1350         current->is32&=~(1LL<<rt1[i]);
1351       } else {
1352         current->is32|=1LL<<rt1[i];
1353       }
1354     }
1355   }
1356   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1357     if(rt1[i]) {
1358       if(rs1[i]&&rs2[i]) {
1359         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1360           alloc_reg64(current,i,rs1[i]);
1361           alloc_reg64(current,i,rs2[i]);
1362           alloc_reg64(current,i,rt1[i]);
1363         } else {
1364           alloc_reg(current,i,rs1[i]);
1365           alloc_reg(current,i,rs2[i]);
1366           alloc_reg(current,i,rt1[i]);
1367         }
1368       }
1369       else {
1370         alloc_reg(current,i,rt1[i]);
1371         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1372           // DADD used as move, or zeroing
1373           // If we have a 64-bit source, then make the target 64 bits too
1374           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1375             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1376             alloc_reg64(current,i,rt1[i]);
1377           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1378             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1379             alloc_reg64(current,i,rt1[i]);
1380           }
1381           if(opcode2[i]>=0x2e&&rs2[i]) {
1382             // DSUB used as negation - 64-bit result
1383             // If we have a 32-bit register, extend it to 64 bits
1384             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1385             alloc_reg64(current,i,rt1[i]);
1386           }
1387         }
1388       }
1389       if(rs1[i]&&rs2[i]) {
1390         current->is32&=~(1LL<<rt1[i]);
1391       } else if(rs1[i]) {
1392         current->is32&=~(1LL<<rt1[i]);
1393         if((current->is32>>rs1[i])&1)
1394           current->is32|=1LL<<rt1[i];
1395       } else if(rs2[i]) {
1396         current->is32&=~(1LL<<rt1[i]);
1397         if((current->is32>>rs2[i])&1)
1398           current->is32|=1LL<<rt1[i];
1399       } else {
1400         current->is32|=1LL<<rt1[i];
1401       }
1402     }
1403   }
1404   clear_const(current,rs1[i]);
1405   clear_const(current,rs2[i]);
1406   clear_const(current,rt1[i]);
1407   dirty_reg(current,rt1[i]);
1408 }
1409
1410 void imm16_alloc(struct regstat *current,int i)
1411 {
1412   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1413   else lt1[i]=rs1[i];
1414   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1415   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1416     current->is32&=~(1LL<<rt1[i]);
1417     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1418       // TODO: Could preserve the 32-bit flag if the immediate is zero
1419       alloc_reg64(current,i,rt1[i]);
1420       alloc_reg64(current,i,rs1[i]);
1421     }
1422     clear_const(current,rs1[i]);
1423     clear_const(current,rt1[i]);
1424   }
1425   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1426     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1427     current->is32|=1LL<<rt1[i];
1428     clear_const(current,rs1[i]);
1429     clear_const(current,rt1[i]);
1430   }
1431   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1432     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1433       if(rs1[i]!=rt1[i]) {
1434         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1435         alloc_reg64(current,i,rt1[i]);
1436         current->is32&=~(1LL<<rt1[i]);
1437       }
1438     }
1439     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1440     if(is_const(current,rs1[i])) {
1441       int v=get_const(current,rs1[i]);
1442       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1443       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1444       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1445     }
1446     else clear_const(current,rt1[i]);
1447   }
1448   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1449     if(is_const(current,rs1[i])) {
1450       int v=get_const(current,rs1[i]);
1451       set_const(current,rt1[i],v+imm[i]);
1452     }
1453     else clear_const(current,rt1[i]);
1454     current->is32|=1LL<<rt1[i];
1455   }
1456   else {
1457     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1458     current->is32|=1LL<<rt1[i];
1459   }
1460   dirty_reg(current,rt1[i]);
1461 }
1462
1463 void load_alloc(struct regstat *current,int i)
1464 {
1465   clear_const(current,rt1[i]);
1466   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1467   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1468   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1469   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1470     alloc_reg(current,i,rt1[i]);
1471     assert(get_reg(current->regmap,rt1[i])>=0);
1472     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1473     {
1474       current->is32&=~(1LL<<rt1[i]);
1475       alloc_reg64(current,i,rt1[i]);
1476     }
1477     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1478     {
1479       current->is32&=~(1LL<<rt1[i]);
1480       alloc_reg64(current,i,rt1[i]);
1481       alloc_all(current,i);
1482       alloc_reg64(current,i,FTEMP);
1483       minimum_free_regs[i]=HOST_REGS;
1484     }
1485     else current->is32|=1LL<<rt1[i];
1486     dirty_reg(current,rt1[i]);
1487     // LWL/LWR need a temporary register for the old value
1488     if(opcode[i]==0x22||opcode[i]==0x26)
1489     {
1490       alloc_reg(current,i,FTEMP);
1491       alloc_reg_temp(current,i,-1);
1492       minimum_free_regs[i]=1;
1493     }
1494   }
1495   else
1496   {
1497     // Load to r0 or unneeded register (dummy load)
1498     // but we still need a register to calculate the address
1499     if(opcode[i]==0x22||opcode[i]==0x26)
1500     {
1501       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1502     }
1503     alloc_reg_temp(current,i,-1);
1504     minimum_free_regs[i]=1;
1505     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1506     {
1507       alloc_all(current,i);
1508       alloc_reg64(current,i,FTEMP);
1509       minimum_free_regs[i]=HOST_REGS;
1510     }
1511   }
1512 }
1513
1514 void store_alloc(struct regstat *current,int i)
1515 {
1516   clear_const(current,rs2[i]);
1517   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1518   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1519   alloc_reg(current,i,rs2[i]);
1520   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1521     alloc_reg64(current,i,rs2[i]);
1522     if(rs2[i]) alloc_reg(current,i,FTEMP);
1523   }
1524   #if defined(HOST_IMM8)
1525   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1526   else alloc_reg(current,i,INVCP);
1527   #endif
1528   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1529     alloc_reg(current,i,FTEMP);
1530   }
1531   // We need a temporary register for address generation
1532   alloc_reg_temp(current,i,-1);
1533   minimum_free_regs[i]=1;
1534 }
1535
1536 void c1ls_alloc(struct regstat *current,int i)
1537 {
1538   //clear_const(current,rs1[i]); // FIXME
1539   clear_const(current,rt1[i]);
1540   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1541   alloc_reg(current,i,CSREG); // Status
1542   alloc_reg(current,i,FTEMP);
1543   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1544     alloc_reg64(current,i,FTEMP);
1545   }
1546   #if defined(HOST_IMM8)
1547   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1548   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1549     alloc_reg(current,i,INVCP);
1550   #endif
1551   // We need a temporary register for address generation
1552   alloc_reg_temp(current,i,-1);
1553 }
1554
1555 void c2ls_alloc(struct regstat *current,int i)
1556 {
1557   clear_const(current,rt1[i]);
1558   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1559   alloc_reg(current,i,FTEMP);
1560   #if defined(HOST_IMM8)
1561   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1562   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1563     alloc_reg(current,i,INVCP);
1564   #endif
1565   // We need a temporary register for address generation
1566   alloc_reg_temp(current,i,-1);
1567   minimum_free_regs[i]=1;
1568 }
1569
1570 #ifndef multdiv_alloc
1571 void multdiv_alloc(struct regstat *current,int i)
1572 {
1573   //  case 0x18: MULT
1574   //  case 0x19: MULTU
1575   //  case 0x1A: DIV
1576   //  case 0x1B: DIVU
1577   //  case 0x1C: DMULT
1578   //  case 0x1D: DMULTU
1579   //  case 0x1E: DDIV
1580   //  case 0x1F: DDIVU
1581   clear_const(current,rs1[i]);
1582   clear_const(current,rs2[i]);
1583   if(rs1[i]&&rs2[i])
1584   {
1585     if((opcode2[i]&4)==0) // 32-bit
1586     {
1587       current->u&=~(1LL<<HIREG);
1588       current->u&=~(1LL<<LOREG);
1589       alloc_reg(current,i,HIREG);
1590       alloc_reg(current,i,LOREG);
1591       alloc_reg(current,i,rs1[i]);
1592       alloc_reg(current,i,rs2[i]);
1593       current->is32|=1LL<<HIREG;
1594       current->is32|=1LL<<LOREG;
1595       dirty_reg(current,HIREG);
1596       dirty_reg(current,LOREG);
1597     }
1598     else // 64-bit
1599     {
1600       current->u&=~(1LL<<HIREG);
1601       current->u&=~(1LL<<LOREG);
1602       current->uu&=~(1LL<<HIREG);
1603       current->uu&=~(1LL<<LOREG);
1604       alloc_reg64(current,i,HIREG);
1605       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1606       alloc_reg64(current,i,rs1[i]);
1607       alloc_reg64(current,i,rs2[i]);
1608       alloc_all(current,i);
1609       current->is32&=~(1LL<<HIREG);
1610       current->is32&=~(1LL<<LOREG);
1611       dirty_reg(current,HIREG);
1612       dirty_reg(current,LOREG);
1613       minimum_free_regs[i]=HOST_REGS;
1614     }
1615   }
1616   else
1617   {
1618     // Multiply by zero is zero.
1619     // MIPS does not have a divide by zero exception.
1620     // The result is undefined, we return zero.
1621     alloc_reg(current,i,HIREG);
1622     alloc_reg(current,i,LOREG);
1623     current->is32|=1LL<<HIREG;
1624     current->is32|=1LL<<LOREG;
1625     dirty_reg(current,HIREG);
1626     dirty_reg(current,LOREG);
1627   }
1628 }
1629 #endif
1630
1631 void cop0_alloc(struct regstat *current,int i)
1632 {
1633   if(opcode2[i]==0) // MFC0
1634   {
1635     if(rt1[i]) {
1636       clear_const(current,rt1[i]);
1637       alloc_all(current,i);
1638       alloc_reg(current,i,rt1[i]);
1639       current->is32|=1LL<<rt1[i];
1640       dirty_reg(current,rt1[i]);
1641     }
1642   }
1643   else if(opcode2[i]==4) // MTC0
1644   {
1645     if(rs1[i]){
1646       clear_const(current,rs1[i]);
1647       alloc_reg(current,i,rs1[i]);
1648       alloc_all(current,i);
1649     }
1650     else {
1651       alloc_all(current,i); // FIXME: Keep r0
1652       current->u&=~1LL;
1653       alloc_reg(current,i,0);
1654     }
1655   }
1656   else
1657   {
1658     // TLBR/TLBWI/TLBWR/TLBP/ERET
1659     assert(opcode2[i]==0x10);
1660     alloc_all(current,i);
1661   }
1662   minimum_free_regs[i]=HOST_REGS;
1663 }
1664
1665 void cop1_alloc(struct regstat *current,int i)
1666 {
1667   alloc_reg(current,i,CSREG); // Load status
1668   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1669   {
1670     if(rt1[i]){
1671       clear_const(current,rt1[i]);
1672       if(opcode2[i]==1) {
1673         alloc_reg64(current,i,rt1[i]); // DMFC1
1674         current->is32&=~(1LL<<rt1[i]);
1675       }else{
1676         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1677         current->is32|=1LL<<rt1[i];
1678       }
1679       dirty_reg(current,rt1[i]);
1680     }
1681     alloc_reg_temp(current,i,-1);
1682   }
1683   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1684   {
1685     if(rs1[i]){
1686       clear_const(current,rs1[i]);
1687       if(opcode2[i]==5)
1688         alloc_reg64(current,i,rs1[i]); // DMTC1
1689       else
1690         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1691       alloc_reg_temp(current,i,-1);
1692     }
1693     else {
1694       current->u&=~1LL;
1695       alloc_reg(current,i,0);
1696       alloc_reg_temp(current,i,-1);
1697     }
1698   }
1699   minimum_free_regs[i]=1;
1700 }
1701 void fconv_alloc(struct regstat *current,int i)
1702 {
1703   alloc_reg(current,i,CSREG); // Load status
1704   alloc_reg_temp(current,i,-1);
1705   minimum_free_regs[i]=1;
1706 }
1707 void float_alloc(struct regstat *current,int i)
1708 {
1709   alloc_reg(current,i,CSREG); // Load status
1710   alloc_reg_temp(current,i,-1);
1711   minimum_free_regs[i]=1;
1712 }
1713 void c2op_alloc(struct regstat *current,int i)
1714 {
1715   alloc_reg_temp(current,i,-1);
1716 }
1717 void fcomp_alloc(struct regstat *current,int i)
1718 {
1719   alloc_reg(current,i,CSREG); // Load status
1720   alloc_reg(current,i,FSREG); // Load flags
1721   dirty_reg(current,FSREG); // Flag will be modified
1722   alloc_reg_temp(current,i,-1);
1723   minimum_free_regs[i]=1;
1724 }
1725
1726 void syscall_alloc(struct regstat *current,int i)
1727 {
1728   alloc_cc(current,i);
1729   dirty_reg(current,CCREG);
1730   alloc_all(current,i);
1731   minimum_free_regs[i]=HOST_REGS;
1732   current->isconst=0;
1733 }
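/* Note: a syscall exits the translated block, so the cycle counter (CCREG)
   is made live and dirty and every host register is flushed via alloc_all();
   minimum_free_regs=HOST_REGS records that nothing may stay cached across
   this point. */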
1734
1735 void delayslot_alloc(struct regstat *current,int i)
1736 {
1737   switch(itype[i])
1738   {
1739     case UJUMP:
1740     case CJUMP:
1741     case SJUMP:
1742     case RJUMP:
1743     case FJUMP:
1744     case SYSCALL:
1745     case HLECALL:
1746     case SPAN:
1747       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1748       SysPrintf("Disabled speculative precompilation\n");
1749       stop_after_jal=1;
1750       break;
1751     case IMM16:
1752       imm16_alloc(current,i);
1753       break;
1754     case LOAD:
1755     case LOADLR:
1756       load_alloc(current,i);
1757       break;
1758     case STORE:
1759     case STORELR:
1760       store_alloc(current,i);
1761       break;
1762     case ALU:
1763       alu_alloc(current,i);
1764       break;
1765     case SHIFT:
1766       shift_alloc(current,i);
1767       break;
1768     case MULTDIV:
1769       multdiv_alloc(current,i);
1770       break;
1771     case SHIFTIMM:
1772       shiftimm_alloc(current,i);
1773       break;
1774     case MOV:
1775       mov_alloc(current,i);
1776       break;
1777     case COP0:
1778       cop0_alloc(current,i);
1779       break;
1780     case COP1:
1781     case COP2:
1782       cop1_alloc(current,i);
1783       break;
1784     case C1LS:
1785       c1ls_alloc(current,i);
1786       break;
1787     case C2LS:
1788       c2ls_alloc(current,i);
1789       break;
1790     case FCONV:
1791       fconv_alloc(current,i);
1792       break;
1793     case FLOAT:
1794       float_alloc(current,i);
1795       break;
1796     case FCOMP:
1797       fcomp_alloc(current,i);
1798       break;
1799     case C2OP:
1800       c2op_alloc(current,i);
1801       break;
1802   }
1803 }
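/* Note: delayslot_alloc() just re-dispatches the per-itype allocators for
   the instruction sitting in a branch delay slot.  A branch (or syscall/HLE
   call) in a delay slot is not supported; when one is seen, speculative
   precompilation is disabled via stop_after_jal and the message above is
   printed instead of mistranslating the pair. */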
1804
1805 // Special case where a branch and delay slot span two pages in virtual memory
1806 static void pagespan_alloc(struct regstat *current,int i)
1807 {
1808   current->isconst=0;
1809   current->wasconst=0;
1810   regs[i].wasconst=0;
1811   minimum_free_regs[i]=HOST_REGS;
1812   alloc_all(current,i);
1813   alloc_cc(current,i);
1814   dirty_reg(current,CCREG);
1815   if(opcode[i]==3) // JAL
1816   {
1817     alloc_reg(current,i,31);
1818     dirty_reg(current,31);
1819   }
1820   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1821   {
1822     alloc_reg(current,i,rs1[i]);
1823     if (rt1[i]!=0) {
1824       alloc_reg(current,i,rt1[i]);
1825       dirty_reg(current,rt1[i]);
1826     }
1827   }
1828   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1829   {
1830     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1831     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1832     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1833     {
1834       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1835       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1836     }
1837   }
1838   else
1839   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1840   {
1841     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1842     if(!((current->is32>>rs1[i])&1))
1843     {
1844       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1845     }
1846   }
1847   else
1848   if(opcode[i]==0x11) // BC1
1849   {
1850     alloc_reg(current,i,FSREG);
1851     alloc_reg(current,i,CSREG);
1852   }
1853   //else ...
1854 }
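/* Note: when a branch and its delay slot fall on different virtual pages,
   nothing is carried across the pair in host registers: constants are
   dropped, alloc_all() frees every register, and only the cycle counter plus
   the branch's own source/target registers are (re)allocated for the
   compare. */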
1855
1856 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1857 {
1858   stubs[stubcount][0]=type;
1859   stubs[stubcount][1]=addr;
1860   stubs[stubcount][2]=retaddr;
1861   stubs[stubcount][3]=a;
1862   stubs[stubcount][4]=b;
1863   stubs[stubcount][5]=c;
1864   stubs[stubcount][6]=d;
1865   stubs[stubcount][7]=e;
1866   stubcount++;
1867 }
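/* Note: each stubs[] record describes a piece of out-of-line code emitted
   after the main block; the field meanings below are inferred from the call
   sites in this file:
     [0] = stub type (LOADx_STUB, STOREx_STUB, INVCODE_STUB, ...)
     [1] = location of the branch in the output buffer that jumps to the stub
     [2] = return address in the output buffer (usually the current 'out')
     [3]..[7] = type-specific arguments; for the load/store stubs these are
                the instruction index, address register, regstat pointer,
                cycle adjustment (ccadj) and live host-register list, while
                INVCODE_STUB packs the register list and address instead. */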
1868
1869 // Write out a single register
1870 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1871 {
1872   int hr;
1873   for(hr=0;hr<HOST_REGS;hr++) {
1874     if(hr!=EXCLUDE_REG) {
1875       if((regmap[hr]&63)==r) {
1876         if((dirty>>hr)&1) {
1877           if(regmap[hr]<64) {
1878             emit_storereg(r,hr);
1879           }else{
1880             emit_storereg(r|64,hr);
1881           }
1882         }
1883       }
1884     }
1885   }
1886 }
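/* Note: throughout this file a guest register number with bit 6 set (r|64)
   refers to the upper 32 bits of a 64-bit guest register, which is why host
   register mappings are compared with (regmap[hr]&63) and written back as
   r|64 when regmap[hr]>=64. */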
1887
1888 #if 0
1889 static int mchecksum(void)
1890 {
1891   //if(!tracedebug) return 0;
1892   int i;
1893   int sum=0;
1894   for(i=0;i<2097152;i++) {
1895     unsigned int temp=sum;
1896     sum<<=1;
1897     sum|=(~temp)>>31;
1898     sum^=((u_int *)rdram)[i];
1899   }
1900   return sum;
1901 }
1902
1903 static int rchecksum(void)
1904 {
1905   int i;
1906   int sum=0;
1907   for(i=0;i<64;i++)
1908     sum^=((u_int *)reg)[i];
1909   return sum;
1910 }
1911
1912 static void rlist(void)
1913 {
1914   int i;
1915   printf("TRACE: ");
1916   for(i=0;i<32;i++)
1917     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1918   printf("\n");
1919 }
1920
1921 static void enabletrace(void)
1922 {
1923   tracedebug=1;
1924 }
1925
1926 static void memdebug(int i)
1927 {
1928   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1929   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1930   //rlist();
1931   //if(tracedebug) {
1932   //if(Count>=-2084597794) {
1933   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
1934   //if(0) {
1935     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
1936     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
1937     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
1938     rlist();
1939     #ifdef __i386__
1940     printf("TRACE: %x\n",(&i)[-1]);
1941     #endif
1942     #ifdef __arm__
1943     int j;
1944     printf("TRACE: %x \n",(&j)[10]);
1945     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
1946     #endif
1947     //fflush(stdout);
1948   }
1949   //printf("TRACE: %x\n",(&i)[-1]);
1950 }
1951 #endif
1952
1953 void alu_assemble(int i,struct regstat *i_regs)
1954 {
1955   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1956     if(rt1[i]) {
1957       signed char s1,s2,t;
1958       t=get_reg(i_regs->regmap,rt1[i]);
1959       if(t>=0) {
1960         s1=get_reg(i_regs->regmap,rs1[i]);
1961         s2=get_reg(i_regs->regmap,rs2[i]);
1962         if(rs1[i]&&rs2[i]) {
1963           assert(s1>=0);
1964           assert(s2>=0);
1965           if(opcode2[i]&2) emit_sub(s1,s2,t);
1966           else emit_add(s1,s2,t);
1967         }
1968         else if(rs1[i]) {
1969           if(s1>=0) emit_mov(s1,t);
1970           else emit_loadreg(rs1[i],t);
1971         }
1972         else if(rs2[i]) {
1973           if(s2>=0) {
1974             if(opcode2[i]&2) emit_neg(s2,t);
1975             else emit_mov(s2,t);
1976           }
1977           else {
1978             emit_loadreg(rs2[i],t);
1979             if(opcode2[i]&2) emit_neg(t,t);
1980           }
1981         }
1982         else emit_zeroreg(t);
1983       }
1984     }
1985   }
1986   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1987     if(rt1[i]) {
1988       signed char s1l,s2l,s1h,s2h,tl,th;
1989       tl=get_reg(i_regs->regmap,rt1[i]);
1990       th=get_reg(i_regs->regmap,rt1[i]|64);
1991       if(tl>=0) {
1992         s1l=get_reg(i_regs->regmap,rs1[i]);
1993         s2l=get_reg(i_regs->regmap,rs2[i]);
1994         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1995         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1996         if(rs1[i]&&rs2[i]) {
1997           assert(s1l>=0);
1998           assert(s2l>=0);
1999           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2000           else emit_adds(s1l,s2l,tl);
2001           if(th>=0) {
2002             #ifdef INVERTED_CARRY
2003             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2004             #else
2005             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2006             #endif
2007             else emit_add(s1h,s2h,th);
2008           }
2009         }
2010         else if(rs1[i]) {
2011           if(s1l>=0) emit_mov(s1l,tl);
2012           else emit_loadreg(rs1[i],tl);
2013           if(th>=0) {
2014             if(s1h>=0) emit_mov(s1h,th);
2015             else emit_loadreg(rs1[i]|64,th);
2016           }
2017         }
2018         else if(rs2[i]) {
2019           if(s2l>=0) {
2020             if(opcode2[i]&2) emit_negs(s2l,tl);
2021             else emit_mov(s2l,tl);
2022           }
2023           else {
2024             emit_loadreg(rs2[i],tl);
2025             if(opcode2[i]&2) emit_negs(tl,tl);
2026           }
2027           if(th>=0) {
2028             #ifdef INVERTED_CARRY
2029             if(s2h>=0) emit_mov(s2h,th);
2030             else emit_loadreg(rs2[i]|64,th);
2031             if(opcode2[i]&2) {
2032               emit_adcimm(-1,th); // x86 has inverted carry flag
2033               emit_not(th,th);
2034             }
2035             #else
2036             if(opcode2[i]&2) {
2037               if(s2h>=0) emit_rscimm(s2h,0,th);
2038               else {
2039                 emit_loadreg(rs2[i]|64,th);
2040                 emit_rscimm(th,0,th);
2041               }
2042             }else{
2043               if(s2h>=0) emit_mov(s2h,th);
2044               else emit_loadreg(rs2[i]|64,th);
2045             }
2046             #endif
2047           }
2048         }
2049         else {
2050           emit_zeroreg(tl);
2051           if(th>=0) emit_zeroreg(th);
2052         }
2053       }
2054     }
2055   }
2056   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2057     if(rt1[i]) {
2058       signed char s1l,s1h,s2l,s2h,t;
2059       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2060       {
2061         t=get_reg(i_regs->regmap,rt1[i]);
2062         //assert(t>=0);
2063         if(t>=0) {
2064           s1l=get_reg(i_regs->regmap,rs1[i]);
2065           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2066           s2l=get_reg(i_regs->regmap,rs2[i]);
2067           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2068           if(rs2[i]==0) // rx<r0
2069           {
2070             assert(s1h>=0);
2071             if(opcode2[i]==0x2a) // SLT
2072               emit_shrimm(s1h,31,t);
2073             else // SLTU (unsigned cannot be less than zero)
2073             else // SLTU (unsigned cannot be less than zero)
2074               emit_zeroreg(t);
2075           }
2076           else if(rs1[i]==0) // r0<rx
2077           {
2078             assert(s2h>=0);
2079             if(opcode2[i]==0x2a) // SLT
2080               emit_set_gz64_32(s2h,s2l,t);
2081             else // SLTU (set if not zero)
2082               emit_set_nz64_32(s2h,s2l,t);
2083           }
2084           else {
2085             assert(s1l>=0);assert(s1h>=0);
2086             assert(s2l>=0);assert(s2h>=0);
2087             if(opcode2[i]==0x2a) // SLT
2088               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2089             else // SLTU
2090               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2091           }
2092         }
2093       } else {
2094         t=get_reg(i_regs->regmap,rt1[i]);
2095         //assert(t>=0);
2096         if(t>=0) {
2097           s1l=get_reg(i_regs->regmap,rs1[i]);
2098           s2l=get_reg(i_regs->regmap,rs2[i]);
2099           if(rs2[i]==0) // rx<r0
2100           {
2101             assert(s1l>=0);
2102             if(opcode2[i]==0x2a) // SLT
2103               emit_shrimm(s1l,31,t);
2104             else // SLTU (unsigned cannot be less than zero)
2105               emit_zeroreg(t);
2106           }
2107           else if(rs1[i]==0) // r0<rx
2108           {
2109             assert(s2l>=0);
2110             if(opcode2[i]==0x2a) // SLT
2111               emit_set_gz32(s2l,t);
2112             else // SLTU (set if not zero)
2113               emit_set_nz32(s2l,t);
2114           }
2115           else{
2116             assert(s1l>=0);assert(s2l>=0);
2117             if(opcode2[i]==0x2a) // SLT
2118               emit_set_if_less32(s1l,s2l,t);
2119             else // SLTU
2120               emit_set_if_carry32(s1l,s2l,t);
2121           }
2122         }
2123       }
2124     }
2125   }
2126   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2127     if(rt1[i]) {
2128       signed char s1l,s1h,s2l,s2h,th,tl;
2129       tl=get_reg(i_regs->regmap,rt1[i]);
2130       th=get_reg(i_regs->regmap,rt1[i]|64);
2131       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2132       {
2133         assert(tl>=0);
2134         if(tl>=0) {
2135           s1l=get_reg(i_regs->regmap,rs1[i]);
2136           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2137           s2l=get_reg(i_regs->regmap,rs2[i]);
2138           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2139           if(rs1[i]&&rs2[i]) {
2140             assert(s1l>=0);assert(s1h>=0);
2141             assert(s2l>=0);assert(s2h>=0);
2142             if(opcode2[i]==0x24) { // AND
2143               emit_and(s1l,s2l,tl);
2144               emit_and(s1h,s2h,th);
2145             } else
2146             if(opcode2[i]==0x25) { // OR
2147               emit_or(s1l,s2l,tl);
2148               emit_or(s1h,s2h,th);
2149             } else
2150             if(opcode2[i]==0x26) { // XOR
2151               emit_xor(s1l,s2l,tl);
2152               emit_xor(s1h,s2h,th);
2153             } else
2154             if(opcode2[i]==0x27) { // NOR
2155               emit_or(s1l,s2l,tl);
2156               emit_or(s1h,s2h,th);
2157               emit_not(tl,tl);
2158               emit_not(th,th);
2159             }
2160           }
2161           else
2162           {
2163             if(opcode2[i]==0x24) { // AND
2164               emit_zeroreg(tl);
2165               emit_zeroreg(th);
2166             } else
2167             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2168               if(rs1[i]){
2169                 if(s1l>=0) emit_mov(s1l,tl);
2170                 else emit_loadreg(rs1[i],tl);
2171                 if(s1h>=0) emit_mov(s1h,th);
2172                 else emit_loadreg(rs1[i]|64,th);
2173               }
2174               else
2175               if(rs2[i]){
2176                 if(s2l>=0) emit_mov(s2l,tl);
2177                 else emit_loadreg(rs2[i],tl);
2178                 if(s2h>=0) emit_mov(s2h,th);
2179                 else emit_loadreg(rs2[i]|64,th);
2180               }
2181               else{
2182                 emit_zeroreg(tl);
2183                 emit_zeroreg(th);
2184               }
2185             } else
2186             if(opcode2[i]==0x27) { // NOR
2187               if(rs1[i]){
2188                 if(s1l>=0) emit_not(s1l,tl);
2189                 else{
2190                   emit_loadreg(rs1[i],tl);
2191                   emit_not(tl,tl);
2192                 }
2193                 if(s1h>=0) emit_not(s1h,th);
2194                 else{
2195                   emit_loadreg(rs1[i]|64,th);
2196                   emit_not(th,th);
2197                 }
2198               }
2199               else
2200               if(rs2[i]){
2201                 if(s2l>=0) emit_not(s2l,tl);
2202                 else{
2203                   emit_loadreg(rs2[i],tl);
2204                   emit_not(tl,tl);
2205                 }
2206                 if(s2h>=0) emit_not(s2h,th);
2207                 else{
2208                   emit_loadreg(rs2[i]|64,th);
2209                   emit_not(th,th);
2210                 }
2211               }
2212               else {
2213                 emit_movimm(-1,tl);
2214                 emit_movimm(-1,th);
2215               }
2216             }
2217           }
2218         }
2219       }
2220       else
2221       {
2222         // 32 bit
2223         if(tl>=0) {
2224           s1l=get_reg(i_regs->regmap,rs1[i]);
2225           s2l=get_reg(i_regs->regmap,rs2[i]);
2226           if(rs1[i]&&rs2[i]) {
2227             assert(s1l>=0);
2228             assert(s2l>=0);
2229             if(opcode2[i]==0x24) { // AND
2230               emit_and(s1l,s2l,tl);
2231             } else
2232             if(opcode2[i]==0x25) { // OR
2233               emit_or(s1l,s2l,tl);
2234             } else
2235             if(opcode2[i]==0x26) { // XOR
2236               emit_xor(s1l,s2l,tl);
2237             } else
2238             if(opcode2[i]==0x27) { // NOR
2239               emit_or(s1l,s2l,tl);
2240               emit_not(tl,tl);
2241             }
2242           }
2243           else
2244           {
2245             if(opcode2[i]==0x24) { // AND
2246               emit_zeroreg(tl);
2247             } else
2248             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2249               if(rs1[i]){
2250                 if(s1l>=0) emit_mov(s1l,tl);
2251                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2252               }
2253               else
2254               if(rs2[i]){
2255                 if(s2l>=0) emit_mov(s2l,tl);
2256                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2257               }
2258               else emit_zeroreg(tl);
2259             } else
2260             if(opcode2[i]==0x27) { // NOR
2261               if(rs1[i]){
2262                 if(s1l>=0) emit_not(s1l,tl);
2263                 else {
2264                   emit_loadreg(rs1[i],tl);
2265                   emit_not(tl,tl);
2266                 }
2267               }
2268               else
2269               if(rs2[i]){
2270                 if(s2l>=0) emit_not(s2l,tl);
2271                 else {
2272                   emit_loadreg(rs2[i],tl);
2273                   emit_not(tl,tl);
2274                 }
2275               }
2276               else emit_movimm(-1,tl);
2277             }
2278           }
2279         }
2280       }
2281     }
2282   }
2283 }
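/* The 64-bit SLT/SLTU cases above compare a value split across a high and a
   low host register.  The sketch below (not compiled) shows the comparison
   the emit_set_if_*64_32 helpers are expected to implement; the ref_* names
   are illustrative only, and the fixed-width types come from <stdint.h>,
   which is already included at the top of this file. */
#if 0
static int ref_slt64(int32_t s1h, uint32_t s1l, int32_t s2h, uint32_t s2l)
{
  /* signed 64-bit less-than from 32-bit halves */
  return s1h < s2h || (s1h == s2h && s1l < s2l);
}
static int ref_sltu64(uint32_t s1h, uint32_t s1l, uint32_t s2h, uint32_t s2l)
{
  /* unsigned 64-bit less-than from 32-bit halves */
  return s1h < s2h || (s1h == s2h && s1l < s2l);
}
#endif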
2284
2285 void imm16_assemble(int i,struct regstat *i_regs)
2286 {
2287   if (opcode[i]==0x0f) { // LUI
2288     if(rt1[i]) {
2289       signed char t;
2290       t=get_reg(i_regs->regmap,rt1[i]);
2291       //assert(t>=0);
2292       if(t>=0) {
2293         if(!((i_regs->isconst>>t)&1))
2294           emit_movimm(imm[i]<<16,t);
2295       }
2296     }
2297   }
2298   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2299     if(rt1[i]) {
2300       signed char s,t;
2301       t=get_reg(i_regs->regmap,rt1[i]);
2302       s=get_reg(i_regs->regmap,rs1[i]);
2303       if(rs1[i]) {
2304         //assert(t>=0);
2305         //assert(s>=0);
2306         if(t>=0) {
2307           if(!((i_regs->isconst>>t)&1)) {
2308             if(s<0) {
2309               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2310               emit_addimm(t,imm[i],t);
2311             }else{
2312               if(!((i_regs->wasconst>>s)&1))
2313                 emit_addimm(s,imm[i],t);
2314               else
2315                 emit_movimm(constmap[i][s]+imm[i],t);
2316             }
2317           }
2318         }
2319       } else {
2320         if(t>=0) {
2321           if(!((i_regs->isconst>>t)&1))
2322             emit_movimm(imm[i],t);
2323         }
2324       }
2325     }
2326   }
2327   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2328     if(rt1[i]) {
2329       signed char sh,sl,th,tl;
2330       th=get_reg(i_regs->regmap,rt1[i]|64);
2331       tl=get_reg(i_regs->regmap,rt1[i]);
2332       sh=get_reg(i_regs->regmap,rs1[i]|64);
2333       sl=get_reg(i_regs->regmap,rs1[i]);
2334       if(tl>=0) {
2335         if(rs1[i]) {
2336           assert(sh>=0);
2337           assert(sl>=0);
2338           if(th>=0) {
2339             emit_addimm64_32(sh,sl,imm[i],th,tl);
2340           }
2341           else {
2342             emit_addimm(sl,imm[i],tl);
2343           }
2344         } else {
2345           emit_movimm(imm[i],tl);
2346           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2347         }
2348       }
2349     }
2350   }
2351   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2352     if(rt1[i]) {
2353       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2354       signed char sh,sl,t;
2355       t=get_reg(i_regs->regmap,rt1[i]);
2356       sh=get_reg(i_regs->regmap,rs1[i]|64);
2357       sl=get_reg(i_regs->regmap,rs1[i]);
2358       //assert(t>=0);
2359       if(t>=0) {
2360         if(rs1[i]>0) {
2361           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2362           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2363             if(opcode[i]==0x0a) { // SLTI
2364               if(sl<0) {
2365                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2366                 emit_slti32(t,imm[i],t);
2367               }else{
2368                 emit_slti32(sl,imm[i],t);
2369               }
2370             }
2371             else { // SLTIU
2372               if(sl<0) {
2373                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2374                 emit_sltiu32(t,imm[i],t);
2375               }else{
2376                 emit_sltiu32(sl,imm[i],t);
2377               }
2378             }
2379           }else{ // 64-bit
2380             assert(sl>=0);
2381             if(opcode[i]==0x0a) // SLTI
2382               emit_slti64_32(sh,sl,imm[i],t);
2383             else // SLTIU
2384               emit_sltiu64_32(sh,sl,imm[i],t);
2385           }
2386         }else{
2387           // SLTI(U) with r0 is just stupid,
2388           // nonetheless examples can be found
2389           if(opcode[i]==0x0a) // SLTI
2390             if(0<imm[i]) emit_movimm(1,t);
2391             else emit_zeroreg(t);
2392           else // SLTIU
2393           {
2394             if(imm[i]) emit_movimm(1,t);
2395             else emit_zeroreg(t);
2396           }
2397         }
2398       }
2399     }
2400   }
2401   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2402     if(rt1[i]) {
2403       signed char sh,sl,th,tl;
2404       th=get_reg(i_regs->regmap,rt1[i]|64);
2405       tl=get_reg(i_regs->regmap,rt1[i]);
2406       sh=get_reg(i_regs->regmap,rs1[i]|64);
2407       sl=get_reg(i_regs->regmap,rs1[i]);
2408       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2409         if(opcode[i]==0x0c) //ANDI
2410         {
2411           if(rs1[i]) {
2412             if(sl<0) {
2413               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2414               emit_andimm(tl,imm[i],tl);
2415             }else{
2416               if(!((i_regs->wasconst>>sl)&1))
2417                 emit_andimm(sl,imm[i],tl);
2418               else
2419                 emit_movimm(constmap[i][sl]&imm[i],tl);
2420             }
2421           }
2422           else
2423             emit_zeroreg(tl);
2424           if(th>=0) emit_zeroreg(th);
2425         }
2426         else
2427         {
2428           if(rs1[i]) {
2429             if(sl<0) {
2430               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2431             }
2432             if(th>=0) {
2433               if(sh<0) {
2434                 emit_loadreg(rs1[i]|64,th);
2435               }else{
2436                 emit_mov(sh,th);
2437               }
2438             }
2439             if(opcode[i]==0x0d) { // ORI
2440               if(sl<0) {
2441                 emit_orimm(tl,imm[i],tl);
2442               }else{
2443                 if(!((i_regs->wasconst>>sl)&1))
2444                   emit_orimm(sl,imm[i],tl);
2445                 else
2446                   emit_movimm(constmap[i][sl]|imm[i],tl);
2447               }
2448             }
2449             if(opcode[i]==0x0e) { // XORI
2450               if(sl<0) {
2451                 emit_xorimm(tl,imm[i],tl);
2452               }else{
2453                 if(!((i_regs->wasconst>>sl)&1))
2454                   emit_xorimm(sl,imm[i],tl);
2455                 else
2456                   emit_movimm(constmap[i][sl]^imm[i],tl);
2457               }
2458             }
2459           }
2460           else {
2461             emit_movimm(imm[i],tl);
2462             if(th>=0) emit_zeroreg(th);
2463           }
2464         }
2465       }
2466     }
2467   }
2468 }
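/* Note: the wasconst/constmap checks above implement simple constant
   folding: when the source register's value is known at compile time,
   ADDI/ANDI/ORI/XORI are collapsed into a single emit_movimm() of the
   precomputed result, and when the destination is itself flagged isconst
   nothing is emitted at all because the constant-loading pass materialises
   it elsewhere. */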
2469
2470 void shiftimm_assemble(int i,struct regstat *i_regs)
2471 {
2472   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2473   {
2474     if(rt1[i]) {
2475       signed char s,t;
2476       t=get_reg(i_regs->regmap,rt1[i]);
2477       s=get_reg(i_regs->regmap,rs1[i]);
2478       //assert(t>=0);
2479       if(t>=0&&!((i_regs->isconst>>t)&1)){
2480         if(rs1[i]==0)
2481         {
2482           emit_zeroreg(t);
2483         }
2484         else
2485         {
2486           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2487           if(imm[i]) {
2488             if(opcode2[i]==0) // SLL
2489             {
2490               emit_shlimm(s<0?t:s,imm[i],t);
2491             }
2492             if(opcode2[i]==2) // SRL
2493             {
2494               emit_shrimm(s<0?t:s,imm[i],t);
2495             }
2496             if(opcode2[i]==3) // SRA
2497             {
2498               emit_sarimm(s<0?t:s,imm[i],t);
2499             }
2500           }else{
2501             // Shift by zero
2502             if(s>=0 && s!=t) emit_mov(s,t);
2503           }
2504         }
2505       }
2506       //emit_storereg(rt1[i],t); //DEBUG
2507     }
2508   }
2509   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2510   {
2511     if(rt1[i]) {
2512       signed char sh,sl,th,tl;
2513       th=get_reg(i_regs->regmap,rt1[i]|64);
2514       tl=get_reg(i_regs->regmap,rt1[i]);
2515       sh=get_reg(i_regs->regmap,rs1[i]|64);
2516       sl=get_reg(i_regs->regmap,rs1[i]);
2517       if(tl>=0) {
2518         if(rs1[i]==0)
2519         {
2520           emit_zeroreg(tl);
2521           if(th>=0) emit_zeroreg(th);
2522         }
2523         else
2524         {
2525           assert(sl>=0);
2526           assert(sh>=0);
2527           if(imm[i]) {
2528             if(opcode2[i]==0x38) // DSLL
2529             {
2530               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2531               emit_shlimm(sl,imm[i],tl);
2532             }
2533             if(opcode2[i]==0x3a) // DSRL
2534             {
2535               emit_shrdimm(sl,sh,imm[i],tl);
2536               if(th>=0) emit_shrimm(sh,imm[i],th);
2537             }
2538             if(opcode2[i]==0x3b) // DSRA
2539             {
2540               emit_shrdimm(sl,sh,imm[i],tl);
2541               if(th>=0) emit_sarimm(sh,imm[i],th);
2542             }
2543           }else{
2544             // Shift by zero
2545             if(sl!=tl) emit_mov(sl,tl);
2546             if(th>=0&&sh!=th) emit_mov(sh,th);
2547           }
2548         }
2549       }
2550     }
2551   }
2552   if(opcode2[i]==0x3c) // DSLL32
2553   {
2554     if(rt1[i]) {
2555       signed char sl,tl,th;
2556       tl=get_reg(i_regs->regmap,rt1[i]);
2557       th=get_reg(i_regs->regmap,rt1[i]|64);
2558       sl=get_reg(i_regs->regmap,rs1[i]);
2559       if(th>=0||tl>=0){
2560         assert(tl>=0);
2561         assert(th>=0);
2562         assert(sl>=0);
2563         emit_mov(sl,th);
2564         emit_zeroreg(tl);
2565         if(imm[i]>32)
2566         {
2567           emit_shlimm(th,imm[i]&31,th);
2568         }
2569       }
2570     }
2571   }
2572   if(opcode2[i]==0x3e) // DSRL32
2573   {
2574     if(rt1[i]) {
2575       signed char sh,tl,th;
2576       tl=get_reg(i_regs->regmap,rt1[i]);
2577       th=get_reg(i_regs->regmap,rt1[i]|64);
2578       sh=get_reg(i_regs->regmap,rs1[i]|64);
2579       if(tl>=0){
2580         assert(sh>=0);
2581         emit_mov(sh,tl);
2582         if(th>=0) emit_zeroreg(th);
2583         if(imm[i]>32)
2584         {
2585           emit_shrimm(tl,imm[i]&31,tl);
2586         }
2587       }
2588     }
2589   }
2590   if(opcode2[i]==0x3f) // DSRA32
2591   {
2592     if(rt1[i]) {
2593       signed char sh,tl;
2594       tl=get_reg(i_regs->regmap,rt1[i]);
2595       sh=get_reg(i_regs->regmap,rs1[i]|64);
2596       if(tl>=0){
2597         assert(sh>=0);
2598         emit_mov(sh,tl);
2599         if(imm[i]>32)
2600         {
2601           emit_sarimm(tl,imm[i]&31,tl);
2602         }
2603       }
2604     }
2605   }
2606 }
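/* In the DSLL32/DSRL32/DSRA32 cases above, imm[i] appears to hold the full
   64-bit shift amount (32..63); one 32-bit word is first moved across
   register halves and only the remaining imm[i]&31 bits are shifted.  A
   plain-C sketch of the intended result follows (not compiled; the ref_*
   names are illustrative, and the signed case assumes an arithmetic right
   shift, as on the targeted compilers). */
#if 0
static uint64_t ref_dsll32(uint64_t rs, u_int sa) { return rs << sa; } /* sa in 32..63 */
static uint64_t ref_dsrl32(uint64_t rs, u_int sa) { return rs >> sa; } /* sa in 32..63 */
static int64_t  ref_dsra32(int64_t  rs, u_int sa) { return rs >> sa; } /* sa in 32..63 */
#endif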
2607
2608 #ifndef shift_assemble
2609 void shift_assemble(int i,struct regstat *i_regs)
2610 {
2611   printf("Need shift_assemble for this architecture.\n");
2612   exit(1);
2613 }
2614 #endif
2615
2616 void load_assemble(int i,struct regstat *i_regs)
2617 {
2618   int s,th,tl,addr,map=-1;
2619   int offset;
2620   int jaddr=0;
2621   int memtarget=0,c=0;
2622   int fastload_reg_override=0;
2623   u_int hr,reglist=0;
2624   th=get_reg(i_regs->regmap,rt1[i]|64);
2625   tl=get_reg(i_regs->regmap,rt1[i]);
2626   s=get_reg(i_regs->regmap,rs1[i]);
2627   offset=imm[i];
2628   for(hr=0;hr<HOST_REGS;hr++) {
2629     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2630   }
2631   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2632   if(s>=0) {
2633     c=(i_regs->wasconst>>s)&1;
2634     if (c) {
2635       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2636     }
2637   }
2638   //printf("load_assemble: c=%d\n",c);
2639   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2640   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2641   if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
2642     ||rt1[i]==0) {
2643       // could be FIFO, must perform the read
2644       // (or a dummy read when rt1 is r0)
2645       assem_debug("(forced read)\n");
2646       tl=get_reg(i_regs->regmap,-1);
2647       assert(tl>=0);
2648   }
2649   if(offset||s<0||c) addr=tl;
2650   else addr=s;
2651   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2652  if(tl>=0) {
2653   //printf("load_assemble: c=%d\n",c);
2654   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2655   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2656   reglist&=~(1<<tl);
2657   if(th>=0) reglist&=~(1<<th);
2658   if(!c) {
2659     #ifdef RAM_OFFSET
2660     map=get_reg(i_regs->regmap,ROREG);
2661     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2662     #endif
2663     #ifdef R29_HACK
2664     // Strmnnrmn's speed hack
2665     if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2666     #endif
2667     {
2668       jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2669     }
2670   }
2671   else if(ram_offset&&memtarget) {
2672     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2673     fastload_reg_override=HOST_TEMPREG;
2674   }
2675   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2676   if (opcode[i]==0x20) { // LB
2677     if(!c||memtarget) {
2678       if(!dummy) {
2679         #ifdef HOST_IMM_ADDR32
2680         if(c)
2681           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2682         else
2683         #endif
2684         {
2685           //emit_xorimm(addr,3,tl);
2686           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2687           int x=0,a=tl;
2688 #ifdef BIG_ENDIAN_MIPS
2689           if(!c) emit_xorimm(addr,3,tl);
2690           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2691 #else
2692           if(!c) a=addr;
2693 #endif
2694           if(fastload_reg_override) a=fastload_reg_override;
2695
2696           emit_movsbl_indexed_tlb(x,a,map,tl);
2697         }
2698       }
2699       if(jaddr)
2700         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2701     }
2702     else
2703       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2704   }
2705   if (opcode[i]==0x21) { // LH
2706     if(!c||memtarget) {
2707       if(!dummy) {
2708         #ifdef HOST_IMM_ADDR32
2709         if(c)
2710           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2711         else
2712         #endif
2713         {
2714           int x=0,a=tl;
2715 #ifdef BIG_ENDIAN_MIPS
2716           if(!c) emit_xorimm(addr,2,tl);
2717           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2718 #else
2719           if(!c) a=addr;
2720 #endif
2721           if(fastload_reg_override) a=fastload_reg_override;
2722           //#ifdef
2723           //emit_movswl_indexed_tlb(x,tl,map,tl);
2724           //else
2725           if(map>=0) {
2726             emit_movswl_indexed(x,a,tl);
2727           }else{
2728             #if 1 //def RAM_OFFSET
2729             emit_movswl_indexed(x,a,tl);
2730             #else
2731             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2732             #endif
2733           }
2734         }
2735       }
2736       if(jaddr)
2737         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2738     }
2739     else
2740       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2741   }
2742   if (opcode[i]==0x23) { // LW
2743     if(!c||memtarget) {
2744       if(!dummy) {
2745         int a=addr;
2746         if(fastload_reg_override) a=fastload_reg_override;
2747         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2748         #ifdef HOST_IMM_ADDR32
2749         if(c)
2750           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2751         else
2752         #endif
2753         emit_readword_indexed_tlb(0,a,map,tl);
2754       }
2755       if(jaddr)
2756         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2757     }
2758     else
2759       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2760   }
2761   if (opcode[i]==0x24) { // LBU
2762     if(!c||memtarget) {
2763       if(!dummy) {
2764         #ifdef HOST_IMM_ADDR32
2765         if(c)
2766           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2767         else
2768         #endif
2769         {
2770           //emit_xorimm(addr,3,tl);
2771           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2772           int x=0,a=tl;
2773 #ifdef BIG_ENDIAN_MIPS
2774           if(!c) emit_xorimm(addr,3,tl);
2775           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2776 #else
2777           if(!c) a=addr;
2778 #endif
2779           if(fastload_reg_override) a=fastload_reg_override;
2780
2781           emit_movzbl_indexed_tlb(x,a,map,tl);
2782         }
2783       }
2784       if(jaddr)
2785         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2786     }
2787     else
2788       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2789   }
2790   if (opcode[i]==0x25) { // LHU
2791     if(!c||memtarget) {
2792       if(!dummy) {
2793         #ifdef HOST_IMM_ADDR32
2794         if(c)
2795           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2796         else
2797         #endif
2798         {
2799           int x=0,a=tl;
2800 #ifdef BIG_ENDIAN_MIPS
2801           if(!c) emit_xorimm(addr,2,tl);
2802           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2803 #else
2804           if(!c) a=addr;
2805 #endif
2806           if(fastload_reg_override) a=fastload_reg_override;
2807           //#ifdef
2808           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2809           //#else
2810           if(map>=0) {
2811             emit_movzwl_indexed(x,a,tl);
2812           }else{
2813             #if 1 //def RAM_OFFSET
2814             emit_movzwl_indexed(x,a,tl);
2815             #else
2816             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
2817             #endif
2818           }
2819         }
2820       }
2821       if(jaddr)
2822         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2823     }
2824     else
2825       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2826   }
2827   if (opcode[i]==0x27) { // LWU
2828     assert(th>=0);
2829     if(!c||memtarget) {
2830       if(!dummy) {
2831         int a=addr;
2832         if(fastload_reg_override) a=fastload_reg_override;
2833         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2834         #ifdef HOST_IMM_ADDR32
2835         if(c)
2836           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2837         else
2838         #endif
2839         emit_readword_indexed_tlb(0,a,map,tl);
2840       }
2841       if(jaddr)
2842         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2843     }
2844     else {
2845       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2846     }
2847     emit_zeroreg(th);
2848   }
2849   if (opcode[i]==0x37) { // LD
2850     if(!c||memtarget) {
2851       if(!dummy) {
2852         int a=addr;
2853         if(fastload_reg_override) a=fastload_reg_override;
2854         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2855         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2856         #ifdef HOST_IMM_ADDR32
2857         if(c)
2858           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2859         else
2860         #endif
2861         emit_readdword_indexed_tlb(0,a,map,th,tl);
2862       }
2863       if(jaddr)
2864         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2865     }
2866     else
2867       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2868   }
2869  }
2870   //emit_storereg(rt1[i],tl); // DEBUG
2871   //if(opcode[i]==0x23)
2872   //if(opcode[i]==0x24)
2873   //if(opcode[i]==0x23||opcode[i]==0x24)
2874   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2875   {
2876     //emit_pusha();
2877     save_regs(0x100f);
2878         emit_readword((int)&last_count,ECX);
2879         #ifdef __i386__
2880         if(get_reg(i_regs->regmap,CCREG)<0)
2881           emit_loadreg(CCREG,HOST_CCREG);
2882         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2883         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2884         emit_writeword(HOST_CCREG,(int)&Count);
2885         #endif
2886         #ifdef __arm__
2887         if(get_reg(i_regs->regmap,CCREG)<0)
2888           emit_loadreg(CCREG,0);
2889         else
2890           emit_mov(HOST_CCREG,0);
2891         emit_add(0,ECX,0);
2892         emit_addimm(0,2*ccadj[i],0);
2893         emit_writeword(0,(int)&Count);
2894         #endif
2895     emit_call((int)memdebug);
2896     //emit_popa();
2897     restore_regs(0x100f);
2898   }*/
2899 }
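/* Rough shape of what load_assemble() emits for a non-constant address (the
   exact compare/branch forms depend on the host backend):
     - emit_fastpath_cmp_jump() tests whether the address targets RAM and
       records a branch that add_stub() later binds to an out-of-line stub;
     - the inline fast path then reads straight from rdram (plus ram_offset
       when that is in use);
     - the LOADx_STUB handles everything outside RAM (I/O etc.) through the
       C memory handlers and branches back past the inline load.
   Constant addresses known not to hit RAM skip the fast path entirely and go
   through inline_readstub(). */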
2900
2901 #ifndef loadlr_assemble
2902 void loadlr_assemble(int i,struct regstat *i_regs)
2903 {
2904   printf("Need loadlr_assemble for this architecture.\n");
2905   exit(1);
2906 }
2907 #endif
2908
2909 void store_assemble(int i,struct regstat *i_regs)
2910 {
2911   int s,th,tl,map=-1;
2912   int addr,temp;
2913   int offset;
2914   int jaddr=0,type;
2915   int memtarget=0,c=0;
2916   int agr=AGEN1+(i&1);
2917   int faststore_reg_override=0;
2918   u_int hr,reglist=0;
2919   th=get_reg(i_regs->regmap,rs2[i]|64);
2920   tl=get_reg(i_regs->regmap,rs2[i]);
2921   s=get_reg(i_regs->regmap,rs1[i]);
2922   temp=get_reg(i_regs->regmap,agr);
2923   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2924   offset=imm[i];
2925   if(s>=0) {
2926     c=(i_regs->wasconst>>s)&1;
2927     if(c) {
2928       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2929     }
2930   }
2931   assert(tl>=0);
2932   assert(temp>=0);
2933   for(hr=0;hr<HOST_REGS;hr++) {
2934     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2935   }
2936   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2937   if(offset||s<0||c) addr=temp;
2938   else addr=s;
2939   if(!c) {
2940     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2941   }
2942   else if(ram_offset&&memtarget) {
2943     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2944     faststore_reg_override=HOST_TEMPREG;
2945   }
2946
2947   if (opcode[i]==0x28) { // SB
2948     if(!c||memtarget) {
2949       int x=0,a=temp;
2950 #ifdef BIG_ENDIAN_MIPS
2951       if(!c) emit_xorimm(addr,3,temp);
2952       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2953 #else
2954       if(!c) a=addr;
2955 #endif
2956       if(faststore_reg_override) a=faststore_reg_override;
2957       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2958       emit_writebyte_indexed_tlb(tl,x,a,map,a);
2959     }
2960     type=STOREB_STUB;
2961   }
2962   if (opcode[i]==0x29) { // SH
2963     if(!c||memtarget) {
2964       int x=0,a=temp;
2965 #ifdef BIG_ENDIAN_MIPS
2966       if(!c) emit_xorimm(addr,2,temp);
2967       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2968 #else
2969       if(!c) a=addr;
2970 #endif
2971       if(faststore_reg_override) a=faststore_reg_override;
2972       //#ifdef
2973       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2974       //#else
2975       if(map>=0) {
2976         emit_writehword_indexed(tl,x,a);
2977       }else
2978         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
2979         emit_writehword_indexed(tl,x,a);
2980     }
2981     type=STOREH_STUB;
2982   }
2983   if (opcode[i]==0x2B) { // SW
2984     if(!c||memtarget) {
2985       int a=addr;
2986       if(faststore_reg_override) a=faststore_reg_override;
2987       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
2988       emit_writeword_indexed_tlb(tl,0,a,map,temp);
2989     }
2990     type=STOREW_STUB;
2991   }
2992   if (opcode[i]==0x3F) { // SD
2993     if(!c||memtarget) {
2994       int a=addr;
2995       if(faststore_reg_override) a=faststore_reg_override;
2996       if(rs2[i]) {
2997         assert(th>=0);
2998         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
2999         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3000         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3001       }else{
3002         // Store zero
3003         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3004         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3005         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3006       }
3007     }
3008     type=STORED_STUB;
3009   }
3010   if(jaddr) {
3011     // PCSX store handlers don't check invcode again
3012     reglist|=1<<addr;
3013     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3014     jaddr=0;
3015   }
3016   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3017     if(!c||memtarget) {
3018       #ifdef DESTRUCTIVE_SHIFT
3019       // The x86 shift operation is 'destructive'; it overwrites the
3020       // source register, so we need to make a copy first and use that.
3021       addr=temp;
3022       #endif
3023       #if defined(HOST_IMM8)
3024       int ir=get_reg(i_regs->regmap,INVCP);
3025       assert(ir>=0);
3026       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3027       #else
3028       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3029       #endif
3030       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3031       emit_callne(invalidate_addr_reg[addr]);
3032       #else
3033       int jaddr2=(int)out;
3034       emit_jne(0);
3035       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3036       #endif
3037     }
3038   }
3039   u_int addr_val=c?constmap[i][s]+offset:0; // only meaningful when the address is a known constant (s>=0)
3040   if(jaddr) {
3041     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3042   } else if(c&&!memtarget) {
3043     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
3044   }
3045   // basic current block modification detection..
3046   // not looking back as that should be in mips cache already
3047   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
3048     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
3049     assert(i_regs->regmap==regs[i].regmap); // not delay slot
3050     if(i_regs->regmap==regs[i].regmap) {
3051       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
3052       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
3053       emit_movimm(start+i*4+4,0);
3054       emit_writeword(0,(int)&pcaddr);
3055       emit_jmp((int)do_interrupt);
3056     }
3057   }
3058   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3059   //if(opcode[i]==0x2B || opcode[i]==0x28)
3060   //if(opcode[i]==0x2B || opcode[i]==0x29)
3061   //if(opcode[i]==0x2B)
3062   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3063   {
3064     #ifdef __i386__
3065     emit_pusha();
3066     #endif
3067     #ifdef __arm__
3068     save_regs(0x100f);
3069     #endif
3070         emit_readword((int)&last_count,ECX);
3071         #ifdef __i386__
3072         if(get_reg(i_regs->regmap,CCREG)<0)
3073           emit_loadreg(CCREG,HOST_CCREG);
3074         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3075         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3076         emit_writeword(HOST_CCREG,(int)&Count);
3077         #endif
3078         #ifdef __arm__
3079         if(get_reg(i_regs->regmap,CCREG)<0)
3080           emit_loadreg(CCREG,0);
3081         else
3082           emit_mov(HOST_CCREG,0);
3083         emit_add(0,ECX,0);
3084         emit_addimm(0,2*ccadj[i],0);
3085         emit_writeword(0,(int)&Count);
3086         #endif
3087     emit_call((int)memdebug);
3088     #ifdef __i386__
3089     emit_popa();
3090     #endif
3091     #ifdef __arm__
3092     restore_regs(0x100f);
3093     #endif
3094   }*/
3095 }
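/* The INVCP/invalid_code block above is the self-modifying-code check:
   invalid_code[] is indexed by the page of the store address (addr>>12), and
   only when that page is flagged as containing compiled code does the
   emitted code branch to INVCODE_STUB (or call invalidate_addr_reg[]) to
   throw the affected blocks away.  A very rough C equivalent of the check
   (not compiled; the helper call is an assumption, the real path goes
   through the stub machinery): */
#if 0
static void ref_smc_check(u_int vaddr)
{
  if (invalid_code[vaddr>>12] != 1)   /* 1 = page has no compiled blocks */
    invalidate_block(vaddr>>12);      /* assumed helper; see the invalidate_* users elsewhere */
}
#endif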
3096
3097 void storelr_assemble(int i,struct regstat *i_regs)
3098 {
3099   int s,th,tl;
3100   int temp;
3101   int temp2=-1;
3102   int offset;
3103   int jaddr=0;
3104   int case1,case2,case3;
3105   int done0,done1,done2;
3106   int memtarget=0,c=0;
3107   int agr=AGEN1+(i&1);
3108   u_int hr,reglist=0;
3109   th=get_reg(i_regs->regmap,rs2[i]|64);
3110   tl=get_reg(i_regs->regmap,rs2[i]);
3111   s=get_reg(i_regs->regmap,rs1[i]);
3112   temp=get_reg(i_regs->regmap,agr);
3113   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3114   offset=imm[i];
3115   if(s>=0) {
3116     c=(i_regs->isconst>>s)&1;
3117     if(c) {
3118       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3119     }
3120   }
3121   assert(tl>=0);
3122   for(hr=0;hr<HOST_REGS;hr++) {
3123     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3124   }
3125   assert(temp>=0);
3126   if(!c) {
3127     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3128     if(!offset&&s!=temp) emit_mov(s,temp);
3129     jaddr=(int)out;
3130     emit_jno(0);
3131   }
3132   else
3133   {
3134     if(!memtarget||!rs1[i]) {
3135       jaddr=(int)out;
3136       emit_jmp(0);
3137     }
3138   }
3139   #ifdef RAM_OFFSET
3140   int map=get_reg(i_regs->regmap,ROREG);
3141   if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3142   #else
3143   if((u_int)rdram!=0x80000000)
3144     emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3145   #endif
3146
3147   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3148     temp2=get_reg(i_regs->regmap,FTEMP);
3149     if(!rs2[i]) temp2=th=tl;
3150   }
3151
3152 #ifndef BIG_ENDIAN_MIPS
3153     emit_xorimm(temp,3,temp);
3154 #endif
3155   emit_testimm(temp,2);
3156   case2=(int)out;
3157   emit_jne(0);
3158   emit_testimm(temp,1);
3159   case1=(int)out;
3160   emit_jne(0);
3161   // 0
3162   if (opcode[i]==0x2A) { // SWL
3163     emit_writeword_indexed(tl,0,temp);
3164   }
3165   if (opcode[i]==0x2E) { // SWR
3166     emit_writebyte_indexed(tl,3,temp);
3167   }
3168   if (opcode[i]==0x2C) { // SDL
3169     emit_writeword_indexed(th,0,temp);
3170     if(rs2[i]) emit_mov(tl,temp2);
3171   }
3172   if (opcode[i]==0x2D) { // SDR
3173     emit_writebyte_indexed(tl,3,temp);
3174     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3175   }
3176   done0=(int)out;
3177   emit_jmp(0);
3178   // 1
3179   set_jump_target(case1,(int)out);
3180   if (opcode[i]==0x2A) { // SWL
3181     // Write 3 msb into three least significant bytes
3182     if(rs2[i]) emit_rorimm(tl,8,tl);
3183     emit_writehword_indexed(tl,-1,temp);
3184     if(rs2[i]) emit_rorimm(tl,16,tl);
3185     emit_writebyte_indexed(tl,1,temp);
3186     if(rs2[i]) emit_rorimm(tl,8,tl);
3187   }
3188   if (opcode[i]==0x2E) { // SWR
3189     // Write two lsb into two most significant bytes
3190     emit_writehword_indexed(tl,1,temp);
3191   }
3192   if (opcode[i]==0x2C) { // SDL
3193     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3194     // Write 3 msb into three least significant bytes
3195     if(rs2[i]) emit_rorimm(th,8,th);
3196     emit_writehword_indexed(th,-1,temp);
3197     if(rs2[i]) emit_rorimm(th,16,th);
3198     emit_writebyte_indexed(th,1,temp);
3199     if(rs2[i]) emit_rorimm(th,8,th);
3200   }
3201   if (opcode[i]==0x2D) { // SDR
3202     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3203     // Write two lsb into two most significant bytes
3204     emit_writehword_indexed(tl,1,temp);
3205   }
3206   done1=(int)out;
3207   emit_jmp(0);
3208   // 2
3209   set_jump_target(case2,(int)out);
3210   emit_testimm(temp,1);
3211   case3=(int)out;
3212   emit_jne(0);
3213   if (opcode[i]==0x2A) { // SWL
3214     // Write two msb into two least significant bytes
3215     if(rs2[i]) emit_rorimm(tl,16,tl);
3216     emit_writehword_indexed(tl,-2,temp);
3217     if(rs2[i]) emit_rorimm(tl,16,tl);
3218   }
3219   if (opcode[i]==0x2E) { // SWR
3220     // Write 3 lsb into three most significant bytes
3221     emit_writebyte_indexed(tl,-1,temp);
3222     if(rs2[i]) emit_rorimm(tl,8,tl);
3223     emit_writehword_indexed(tl,0,temp);
3224     if(rs2[i]) emit_rorimm(tl,24,tl);
3225   }
3226   if (opcode[i]==0x2C) { // SDL
3227     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3228     // Write two msb into two least significant bytes
3229     if(rs2[i]) emit_rorimm(th,16,th);
3230     emit_writehword_indexed(th,-2,temp);
3231     if(rs2[i]) emit_rorimm(th,16,th);
3232   }
3233   if (opcode[i]==0x2D) { // SDR
3234     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3235     // Write 3 lsb into three most significant bytes
3236     emit_writebyte_indexed(tl,-1,temp);
3237     if(rs2[i]) emit_rorimm(tl,8,tl);
3238     emit_writehword_indexed(tl,0,temp);
3239     if(rs2[i]) emit_rorimm(tl,24,tl);
3240   }
3241   done2=(int)out;
3242   emit_jmp(0);
3243   // 3
3244   set_jump_target(case3,(int)out);
3245   if (opcode[i]==0x2A) { // SWL
3246     // Write msb into least significant byte
3247     if(rs2[i]) emit_rorimm(tl,24,tl);
3248     emit_writebyte_indexed(tl,-3,temp);
3249     if(rs2[i]) emit_rorimm(tl,8,tl);
3250   }
3251   if (opcode[i]==0x2E) { // SWR
3252     // Write entire word
3253     emit_writeword_indexed(tl,-3,temp);
3254   }
3255   if (opcode[i]==0x2C) { // SDL
3256     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3257     // Write msb into least significant byte
3258     if(rs2[i]) emit_rorimm(th,24,th);
3259     emit_writebyte_indexed(th,-3,temp);
3260     if(rs2[i]) emit_rorimm(th,8,th);
3261   }
3262   if (opcode[i]==0x2D) { // SDR
3263     if(rs2[i]) emit_mov(th,temp2);
3264     // Write entire word
3265     emit_writeword_indexed(tl,-3,temp);
3266   }
3267   set_jump_target(done0,(int)out);
3268   set_jump_target(done1,(int)out);
3269   set_jump_target(done2,(int)out);
3270   if (opcode[i]==0x2C) { // SDL
3271     emit_testimm(temp,4);
3272     done0=(int)out;
3273     emit_jne(0);
3274     emit_andimm(temp,~3,temp);
3275     emit_writeword_indexed(temp2,4,temp);
3276     set_jump_target(done0,(int)out);
3277   }
3278   if (opcode[i]==0x2D) { // SDR
3279     emit_testimm(temp,4);
3280     done0=(int)out;
3281     emit_jeq(0);
3282     emit_andimm(temp,~3,temp);
3283     emit_writeword_indexed(temp2,-4,temp);
3284     set_jump_target(done0,(int)out);
3285   }
3286   if(!c||!memtarget)
3287     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3288   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3289     #ifdef RAM_OFFSET
3290     int map=get_reg(i_regs->regmap,ROREG);
3291     if(map<0) map=HOST_TEMPREG;
3292     gen_orig_addr_w(temp,map);
3293     #else
3294     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3295     #endif
3296     #if defined(HOST_IMM8)
3297     int ir=get_reg(i_regs->regmap,INVCP);
3298     assert(ir>=0);
3299     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3300     #else
3301     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3302     #endif
3303     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3304     emit_callne(invalidate_addr_reg[temp]);
3305     #else
3306     int jaddr2=(int)out;
3307     emit_jne(0);
3308     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3309     #endif
3310   }
3311   /*
3312     emit_pusha();
3313     //save_regs(0x100f);
3314         emit_readword((int)&last_count,ECX);
3315         if(get_reg(i_regs->regmap,CCREG)<0)
3316           emit_loadreg(CCREG,HOST_CCREG);
3317         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3318         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3319         emit_writeword(HOST_CCREG,(int)&Count);
3320     emit_call((int)memdebug);
3321     emit_popa();
3322     //restore_regs(0x100f);
3323   */
3324 }
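/* The case0..case3 ladder above implements the unaligned SWL/SWR (and 64-bit
   SDL/SDR) merges; on a little-endian target the address is first XORed with
   3, so the case labels follow big-endian byte numbering.  As a
   self-contained reference (not compiled), the little-endian MIPS semantics
   for the 32-bit forms are sketched below; 'mem' is the aligned word
   containing vaddr and the ref_* names are illustrative only. */
#if 0
static uint32_t ref_swl_le(uint32_t mem, uint32_t rt, u_int vaddr)
{
  u_int n = vaddr & 3;
  uint32_t keep = (n == 3) ? 0 : 0xffffffffu << (8*(n+1)); /* bytes above the ones written */
  return (mem & keep) | (rt >> (8*(3-n)));                 /* high bytes of rt, right-justified */
}
static uint32_t ref_swr_le(uint32_t mem, uint32_t rt, u_int vaddr)
{
  u_int n = vaddr & 3;
  uint32_t keep = (1u << (8*n)) - 1;                       /* bytes below the ones written */
  return (mem & keep) | (rt << (8*n));                     /* low bytes of rt, left-shifted */
}
#endif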
3325
3326 void c1ls_assemble(int i,struct regstat *i_regs)
3327 {
3328   cop1_unusable(i, i_regs);
3329 }
3330
3331 void c2ls_assemble(int i,struct regstat *i_regs)
3332 {
3333   int s,tl;
3334   int ar;
3335   int offset;
3336   int memtarget=0,c=0;
3337   int jaddr2=0,type;
3338   int agr=AGEN1+(i&1);
3339   int fastio_reg_override=0;
3340   u_int hr,reglist=0;
3341   u_int copr=(source[i]>>16)&0x1f;
3342   s=get_reg(i_regs->regmap,rs1[i]);
3343   tl=get_reg(i_regs->regmap,FTEMP);
3344   offset=imm[i];
3345   assert(rs1[i]>0);
3346   assert(tl>=0);
3347
3348   for(hr=0;hr<HOST_REGS;hr++) {
3349     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3350   }
3351   if(i_regs->regmap[HOST_CCREG]==CCREG)
3352     reglist&=~(1<<HOST_CCREG);
3353
3354   // get the address
3355   if (opcode[i]==0x3a) { // SWC2
3356     ar=get_reg(i_regs->regmap,agr);
3357     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3358     reglist|=1<<ar;
3359   } else { // LWC2
3360     ar=tl;
3361   }
3362   if(s>=0) c=(i_regs->wasconst>>s)&1;
3363   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3364   if (!offset&&!c&&s>=0) ar=s;
3365   assert(ar>=0);
3366
3367   if (opcode[i]==0x3a) { // SWC2
3368     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3369     type=STOREW_STUB;
3370   }
3371   else
3372     type=LOADW_STUB;
3373
3374   if(c&&!memtarget) {
3375     jaddr2=(int)out;
3376     emit_jmp(0); // inline_readstub/inline_writestub?
3377   }
3378   else {
3379     if(!c) {
3380       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3381     }
3382     else if(ram_offset&&memtarget) {
3383       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3384       fastio_reg_override=HOST_TEMPREG;
3385     }
3386     if (opcode[i]==0x32) { // LWC2
3387       #ifdef HOST_IMM_ADDR32
3388       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3389       else
3390       #endif
3391       { int a=ar; // braces: a bare declaration can't be the body of the #ifdef'd 'else' above
3392         if(fastio_reg_override) a=fastio_reg_override;
3393         emit_readword_indexed(0,a,tl); }
3394     }
3395     if (opcode[i]==0x3a) { // SWC2
3396       #ifdef DESTRUCTIVE_SHIFT
3397       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3398       #endif
3399       int a=ar;
3400       if(fastio_reg_override) a=fastio_reg_override;
3401       emit_writeword_indexed(tl,0,a);
3402     }
3403   }
3404   if(jaddr2)
3405     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3406   if(opcode[i]==0x3a) // SWC2
3407   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3408 #if defined(HOST_IMM8)
3409     int ir=get_reg(i_regs->regmap,INVCP);
3410     assert(ir>=0);
3411     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3412 #else
3413     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3414 #endif
3415     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3416     emit_callne(invalidate_addr_reg[ar]);
3417     #else
3418     int jaddr3=(int)out;
3419     emit_jne(0);
3420     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3421     #endif
3422   }
3423   if (opcode[i]==0x32) { // LWC2
3424     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3425   }
3426 }
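/* Note: LWC2/SWC2 move a word between memory and a GTE (COP2) data register.
   The value travels through the FTEMP host register: cop2_get_dreg() copies
   the GTE register into FTEMP before an SWC2 store, and cop2_put_dreg()
   writes FTEMP back after an LWC2 load; the memory access itself reuses the
   same fast-path/stub mechanism as ordinary loads and stores. */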
3427
3428 #ifndef multdiv_assemble
3429 void multdiv_assemble(int i,struct regstat *i_regs)
3430 {
3431   printf("Need multdiv_assemble for this architecture.\n");
3432   exit(1);
3433 }
3434 #endif
3435
3436 void mov_assemble(int i,struct regstat *i_regs)
3437 {
3438   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3439   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3440   if(rt1[i]) {
3441     signed char sh,sl,th,tl;
3442     th=get_reg(i_regs->regmap,rt1[i]|64);
3443     tl=get_reg(i_regs->regmap,rt1[i]);
3444     //assert(tl>=0);
3445     if(tl>=0) {
3446       sh=get_reg(i_regs->regmap,rs1[i]|64);
3447       sl=get_reg(i_regs->regmap,rs1[i]);
3448       if(sl>=0) emit_mov(sl,tl);
3449       else emit_loadreg(rs1[i],tl);
3450       if(th>=0) {
3451         if(sh>=0) emit_mov(sh,th);
3452         else emit_loadreg(rs1[i]|64,th);
3453       }
3454     }
3455   }
3456 }
3457
3458 #ifndef fconv_assemble
3459 void fconv_assemble(int i,struct regstat *i_regs)
3460 {
3461   printf("Need fconv_assemble for this architecture.\n");
3462   exit(1);
3463 }
3464 #endif
3465
3466 #if 0
3467 void float_assemble(int i,struct regstat *i_regs)
3468 {
3469   printf("Need float_assemble for this architecture.\n");
3470   exit(1);
3471 }
3472 #endif
3473
3474 void syscall_assemble(int i,struct regstat *i_regs)
3475 {
3476   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3477   assert(ccreg==HOST_CCREG);
3478   assert(!is_delayslot);
3479   (void)ccreg;
3480   emit_movimm(start+i*4,EAX); // Get PC
3481   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3482   emit_jmp((int)jump_syscall_hle); // XXX
3483 }
3484
3485 void hlecall_assemble(int i,struct regstat *i_regs)
3486 {
3487   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3488   assert(ccreg==HOST_CCREG);
3489   assert(!is_delayslot);
3490   (void)ccreg;
3491   emit_movimm(start+i*4+4,0); // Get PC
3492   emit_movimm((int)psxHLEt[source[i]&7],1);
3493   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3494   emit_jmp((int)jump_hlecall);
3495 }
3496
3497 void intcall_assemble(int i,struct regstat *i_regs)
3498 {
3499   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3500   assert(ccreg==HOST_CCREG);
3501   assert(!is_delayslot);
3502   (void)ccreg;
3503   emit_movimm(start+i*4,0); // Get PC
3504   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3505   emit_jmp((int)jump_intcall);
3506 }
3507
3508 void ds_assemble(int i,struct regstat *i_regs)
3509 {
3510   speculate_register_values(i);
3511   is_delayslot=1;
3512   switch(itype[i]) {
3513     case ALU:
3514       alu_assemble(i,i_regs);break;
3515     case IMM16:
3516       imm16_assemble(i,i_regs);break;
3517     case SHIFT:
3518       shift_assemble(i,i_regs);break;
3519     case SHIFTIMM:
3520       shiftimm_assemble(i,i_regs);break;
3521     case LOAD:
3522       load_assemble(i,i_regs);break;
3523     case LOADLR:
3524       loadlr_assemble(i,i_regs);break;
3525     case STORE:
3526       store_assemble(i,i_regs);break;
3527     case STORELR:
3528       storelr_assemble(i,i_regs);break;
3529     case COP0:
3530       cop0_assemble(i,i_regs);break;
3531     case COP1:
3532       cop1_assemble(i,i_regs);break;
3533     case C1LS:
3534       c1ls_assemble(i,i_regs);break;
3535     case COP2:
3536       cop2_assemble(i,i_regs);break;
3537     case C2LS:
3538       c2ls_assemble(i,i_regs);break;
3539     case C2OP:
3540       c2op_assemble(i,i_regs);break;
3541     case FCONV:
3542       fconv_assemble(i,i_regs);break;
3543     case FLOAT:
3544       float_assemble(i,i_regs);break;
3545     case FCOMP:
3546       fcomp_assemble(i,i_regs);break;
3547     case MULTDIV:
3548       multdiv_assemble(i,i_regs);break;
3549     case MOV:
3550       mov_assemble(i,i_regs);break;
3551     case SYSCALL:
3552     case HLECALL:
3553     case INTCALL:
3554     case SPAN:
3555     case UJUMP:
3556     case RJUMP:
3557     case CJUMP:
3558     case SJUMP:
3559     case FJUMP:
3560       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
3561   }
3562   is_delayslot=0;
3563 }
3564
3565 // Is the branch target a valid internal jump?
3566 int internal_branch(uint64_t i_is32,int addr)
3567 {
3568   if(addr&1) return 0; // Indirect (register) jump
3569   if(addr>=start && addr<start+slen*4-4)
3570   {
3571     //int t=(addr-start)>>2;
3572     // Delay slots are not valid branch targets
3573     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3574     // 64 -> 32 bit transition requires a recompile
3575     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3576     {
3577       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3578       else printf("optimizable: yes\n");
3579     }*/
3580     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3581     return 1;
3582   }
3583   return 0;
3584 }
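/* A standalone sketch of the same test, with the block bounds passed
   explicitly (start/slen are file-scope in the real code).  Odd addresses are
   rejected because bit 0 marks register-indirect targets whose destination is
   unknown at compile time: */
#if 0
static int is_internal_target(u_int block_start, int block_insns, u_int target)
{
  if (target & 1)
    return 0; /* indirect (register) jump */
  return target >= block_start && target < block_start + block_insns*4 - 4;
}
#endif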
3585
3586 #ifndef wb_invalidate
3587 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3588   uint64_t u,uint64_t uu)
3589 {
3590   int hr;
3591   for(hr=0;hr<HOST_REGS;hr++) {
3592     if(hr!=EXCLUDE_REG) {
3593       if(pre[hr]!=entry[hr]) {
3594         if(pre[hr]>=0) {
3595           if((dirty>>hr)&1) {
3596             if(get_reg(entry,pre[hr])<0) {
3597               if(pre[hr]<64) {
3598                 if(!((u>>pre[hr])&1)) {
3599                   emit_storereg(pre[hr],hr);
3600                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3601                     emit_sarimm(hr,31,hr);
3602                     emit_storereg(pre[hr]|64,hr);
3603                   }
3604                 }
3605               }else{
3606                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3607                   emit_storereg(pre[hr],hr);
3608                 }
3609               }
3610             }
3611           }
3612         }
3613       }
3614     }
3615   }
3616   // Move from one register to another (no writeback)
3617   for(hr=0;hr<HOST_REGS;hr++) {
3618     if(hr!=EXCLUDE_REG) {
3619       if(pre[hr]!=entry[hr]) {
3620         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3621           int nr;
3622           if((nr=get_reg(entry,pre[hr]))>=0) {
3623             emit_mov(hr,nr);
3624           }
3625         }
3626       }
3627     }
3628   }
3629 }
3630 #endif
3631
3632 // Load the specified registers
3633 // This only loads the registers given as arguments because
3634 // we don't want to load things that will be overwritten
3635 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3636 {
3637   int hr;
3638   // Load 32-bit regs
3639   for(hr=0;hr<HOST_REGS;hr++) {
3640     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3641       if(entry[hr]!=regmap[hr]) {
3642         if(regmap[hr]==rs1||regmap[hr]==rs2)
3643         {
3644           if(regmap[hr]==0) {
3645             emit_zeroreg(hr);
3646           }
3647           else
3648           {
3649             emit_loadreg(regmap[hr],hr);
3650           }
3651         }
3652       }
3653     }
3654   }
3655   // Load 64-bit regs
3656   for(hr=0;hr<HOST_REGS;hr++) {
3657     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3658       if(entry[hr]!=regmap[hr]) {
3659         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3660         {
3661           assert(regmap[hr]!=64);
3662           if((is32>>(regmap[hr]&63))&1) {
3663             int lr=get_reg(regmap,regmap[hr]-64);
3664             if(lr>=0)
3665               emit_sarimm(lr,31,hr);
3666             else
3667               emit_loadreg(regmap[hr],hr);
3668           }
3669           else
3670           {
3671             emit_loadreg(regmap[hr],hr);
3672           }
3673         }
3674       }
3675     }
3676   }
3677 }
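/* Note on the is32 bitmap used above: a set bit means the guest 64-bit
   register currently holds a sign-extended 32-bit value, so its upper half
   can be recreated from the lower half with an arithmetic shift
   (emit_sarimm(lr,31,hr)) instead of a load from the register file. */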
3678
3679 // Load registers prior to the start of a loop
3680 // so that they are not loaded within the loop
3681 static void loop_preload(signed char pre[],signed char entry[])
3682 {
3683   int hr;
3684   for(hr=0;hr<HOST_REGS;hr++) {
3685     if(hr!=EXCLUDE_REG) {
3686       if(pre[hr]!=entry[hr]) {
3687         if(entry[hr]>=0) {
3688           if(get_reg(pre,entry[hr])<0) {
3689             assem_debug("loop preload:\n");
3690             //printf("loop preload: %d\n",hr);
3691             if(entry[hr]==0) {
3692               emit_zeroreg(hr);
3693             }
3694             else if(entry[hr]<TEMPREG)
3695             {
3696               emit_loadreg(entry[hr],hr);
3697             }
3698             else if(entry[hr]-64<TEMPREG)
3699             {
3700               emit_loadreg(entry[hr],hr);
3701             }
3702           }
3703         }
3704       }
3705     }
3706   }
3707 }
3708
3709 // Generate address for load/store instruction
3710 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
3711 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3712 {
3713   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
3714     int ra=-1;
3715     int agr=AGEN1+(i&1);
3716     if(itype[i]==LOAD) {
3717       ra=get_reg(i_regs->regmap,rt1[i]);
3718       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3719       assert(ra>=0);
3720     }
3721     if(itype[i]==LOADLR) {
3722       ra=get_reg(i_regs->regmap,FTEMP);
3723     }
3724     if(itype[i]==STORE||itype[i]==STORELR) {
3725       ra=get_reg(i_regs->regmap,agr);
3726       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3727     }
3728     if(itype[i]==C1LS||itype[i]==C2LS) {
3729       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
3730         ra=get_reg(i_regs->regmap,FTEMP);
3731       else { // SWC1/SDC1/SWC2/SDC2
3732         ra=get_reg(i_regs->regmap,agr);
3733         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3734       }
3735     }
3736     int rs=get_reg(i_regs->regmap,rs1[i]);
3737     if(ra>=0) {
3738       int offset=imm[i];
3739       int c=(i_regs->wasconst>>rs)&1;
3740       if(rs1[i]==0) {
3741         // Using r0 as a base address
3742         if(!entry||entry[ra]!=agr) {
3743           if (opcode[i]==0x22||opcode[i]==0x26) {
3744             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3745           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3746             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3747           }else{
3748             emit_movimm(offset,ra);
3749           }
3750         } // else did it in the previous cycle
3751       }
3752       else if(rs<0) {
3753         if(!entry||entry[ra]!=rs1[i])
3754           emit_loadreg(rs1[i],ra);
3755         //if(!entry||entry[ra]!=rs1[i])
3756         //  printf("poor load scheduling!\n");
3757       }
3758       else if(c) {
3759         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3760           if(!entry||entry[ra]!=agr) {
3761             if (opcode[i]==0x22||opcode[i]==0x26) {
3762               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3763             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3764               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3765             }else{
3766               #ifdef HOST_IMM_ADDR32
3767               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3768               #endif
3769               emit_movimm(constmap[i][rs]+offset,ra);
3770               regs[i].loadedconst|=1<<ra;
3771             }
3772           } // else did it in the previous cycle
3773         } // else load_consts already did it
3774       }
3775       if(offset&&!c&&rs1[i]) {
3776         if(rs>=0) {
3777           emit_addimm(rs,offset,ra);
3778         }else{
3779           emit_addimm(ra,offset,ra);
3780         }
3781       }
3782     }
3783   }
3784   // Preload constants for next instruction
3785   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
3786     int agr,ra;
3787     // Actual address
3788     agr=AGEN1+((i+1)&1);
3789     ra=get_reg(i_regs->regmap,agr);
3790     if(ra>=0) {
3791       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3792       int offset=imm[i+1];
3793       int c=(regs[i+1].wasconst>>rs)&1;
3794       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3795         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3796           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3797         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3798           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3799         }else{
3800           #ifdef HOST_IMM_ADDR32
3801           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3802           #endif
3803           emit_movimm(constmap[i+1][rs]+offset,ra);
3804           regs[i+1].loadedconst|=1<<ra;
3805         }
3806       }
3807       else if(rs1[i+1]==0) {
3808         // Using r0 as a base address
3809         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3810           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3811         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3812           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3813         }else{
3814           emit_movimm(offset,ra);
3815         }
3816       }
3817     }
3818   }
3819 }
3820
3821 static int get_final_value(int hr, int i, int *value)
3822 {
3823   int reg=regs[i].regmap[hr];
3824   while(i<slen-1) {
3825     if(regs[i+1].regmap[hr]!=reg) break;
3826     if(!((regs[i+1].isconst>>hr)&1)) break;
3827     if(bt[i+1]) break;
3828     i++;
3829   }
3830   if(i<slen-1) {
3831     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3832       *value=constmap[i][hr];
3833       return 1;
3834     }
3835     if(!bt[i+1]) {
3836       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3837         // Load in delay slot, out-of-order execution
3838         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3839         {
3840           // Precompute load address
3841           *value=constmap[i][hr]+imm[i+2];
3842           return 1;
3843         }
3844       }
3845       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3846       {
3847         // Precompute load address
3848         *value=constmap[i][hr]+imm[i+1];
3849         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3850         return 1;
3851       }
3852     }
3853   }
3854   *value=constmap[i][hr];
3855   //printf("c=%x\n",(int)constmap[i][hr]);
3856   if(i==slen-1) return 1;
3857   if(reg<64) {
3858     return !((unneeded_reg[i+1]>>reg)&1);
3859   }else{
3860     return !((unneeded_reg_upper[i+1]>>reg)&1);
3861   }
3862 }
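/* Note: get_final_value() scans ahead while the same host register keeps
   holding a compile-time constant and no branch target intervenes, so only
   the last value in the chain needs to be materialized; the i+1/i+2 cases
   additionally fold the constant base into the address of an immediately
   following load. */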
3863
3864 // Load registers with known constants
3865 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3866 {
3867   int hr,hr2;
3868   // propagate loaded constant flags
3869   if(i==0||bt[i])
3870     regs[i].loadedconst=0;
3871   else {
3872     for(hr=0;hr<HOST_REGS;hr++) {
3873       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
3874          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
3875       {
3876         regs[i].loadedconst|=1<<hr;
3877       }
3878     }
3879   }
3880   // Load 32-bit regs
3881   for(hr=0;hr<HOST_REGS;hr++) {
3882     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3883       //if(entry[hr]!=regmap[hr]) {
3884       if(!((regs[i].loadedconst>>hr)&1)) {
3885         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3886           int value,similar=0;
3887           if(get_final_value(hr,i,&value)) {
3888             // see if some other register has similar value
3889             for(hr2=0;hr2<HOST_REGS;hr2++) {
3890               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
3891                 if(is_similar_value(value,constmap[i][hr2])) {
3892                   similar=1;
3893                   break;
3894                 }
3895               }
3896             }
3897             if(similar) {
3898               int value2;
3899               if(get_final_value(hr2,i,&value2)) // is this needed?
3900                 emit_movimm_from(value2,hr2,value,hr);
3901               else
3902                 emit_movimm(value,hr);
3903             }
3904             else if(value==0) {
3905               emit_zeroreg(hr);
3906             }
3907             else {
3908               emit_movimm(value,hr);
3909             }
3910           }
3911           regs[i].loadedconst|=1<<hr;
3912         }
3913       }
3914     }
3915   }
3916   // Load 64-bit regs
3917   for(hr=0;hr<HOST_REGS;hr++) {
3918     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3919       //if(entry[hr]!=regmap[hr]) {
3920       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3921         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3922           if((is32>>(regmap[hr]&63))&1) {
3923             int lr=get_reg(regmap,regmap[hr]-64);
3924             assert(lr>=0);
3925             emit_sarimm(lr,31,hr);
3926           }
3927           else
3928           {
3929             int value;
3930             if(get_final_value(hr,i,&value)) {
3931               if(value==0) {
3932                 emit_zeroreg(hr);
3933               }
3934               else {
3935                 emit_movimm(value,hr);
3936               }
3937             }
3938           }
3939         }
3940       }
3941     }
3942   }
3943 }
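/* Sketch of the "similar value" idea above (an assumption about intent;
   is_similar_value() and emit_movimm_from() are defined elsewhere): when a
   host register already holds a nearby constant, deriving the new one from it
   is cheaper than a full 32-bit immediate load.  A minimal, host-independent
   version of such a test might look like: */
#if 0
static int values_are_similar(int a, int b)
{
  int diff = a - b;
  return diff > -256 && diff < 256; /* close enough for a single add/sub */
}
#endif
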
3944 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3945 {
3946   int hr;
3947   // Load 32-bit regs
3948   for(hr=0;hr<HOST_REGS;hr++) {
3949     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3950       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3951         int value=constmap[i][hr];
3952         if(value==0) {
3953           emit_zeroreg(hr);
3954         }
3955         else {
3956           emit_movimm(value,hr);
3957         }
3958       }
3959     }
3960   }
3961   // Load 64-bit regs
3962   for(hr=0;hr<HOST_REGS;hr++) {
3963     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3964       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3965         if((is32>>(regmap[hr]&63))&1) {
3966           int lr=get_reg(regmap,regmap[hr]-64);
3967           assert(lr>=0);
3968           emit_sarimm(lr,31,hr);
3969         }
3970         else
3971         {
3972           int value=constmap[i][hr];
3973           if(value==0) {
3974             emit_zeroreg(hr);
3975           }
3976           else {
3977             emit_movimm(value,hr);
3978           }
3979         }
3980       }
3981     }
3982   }
3983 }
3984
3985 // Write out all dirty registers (except cycle count)
3986 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3987 {
3988   int hr;
3989   for(hr=0;hr<HOST_REGS;hr++) {
3990     if(hr!=EXCLUDE_REG) {
3991       if(i_regmap[hr]>0) {
3992         if(i_regmap[hr]!=CCREG) {
3993           if((i_dirty>>hr)&1) {
3994             if(i_regmap[hr]<64) {
3995               emit_storereg(i_regmap[hr],hr);
3996             }else{
3997               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3998                 emit_storereg(i_regmap[hr],hr);
3999               }
4000             }
4001           }
4002         }
4003       }
4004     }
4005   }
4006 }
4007 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4008 // This writes the registers not written by store_regs_bt
4009 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4010 {
4011   int hr;
4012   int t=(addr-start)>>2;
4013   for(hr=0;hr<HOST_REGS;hr++) {
4014     if(hr!=EXCLUDE_REG) {
4015       if(i_regmap[hr]>0) {
4016         if(i_regmap[hr]!=CCREG) {
4017           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4018             if((i_dirty>>hr)&1) {
4019               if(i_regmap[hr]<64) {
4020                 emit_storereg(i_regmap[hr],hr);
4021               }else{
4022                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4023                   emit_storereg(i_regmap[hr],hr);
4024                 }
4025               }
4026             }
4027           }
4028         }
4029       }
4030     }
4031   }
4032 }
4033
4034 // Load all registers (except cycle count)
4035 void load_all_regs(signed char i_regmap[])
4036 {
4037   int hr;
4038   for(hr=0;hr<HOST_REGS;hr++) {
4039     if(hr!=EXCLUDE_REG) {
4040       if(i_regmap[hr]==0) {
4041         emit_zeroreg(hr);
4042       }
4043       else
4044       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4045       {
4046         emit_loadreg(i_regmap[hr],hr);
4047       }
4048     }
4049   }
4050 }
4051
4052 // Load all current registers also needed by next instruction
4053 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4054 {
4055   int hr;
4056   for(hr=0;hr<HOST_REGS;hr++) {
4057     if(hr!=EXCLUDE_REG) {
4058       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4059         if(i_regmap[hr]==0) {
4060           emit_zeroreg(hr);
4061         }
4062         else
4063         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4064         {
4065           emit_loadreg(i_regmap[hr],hr);
4066         }
4067       }
4068     }
4069   }
4070 }
4071
4072 // Load all regs, storing cycle count if necessary
4073 void load_regs_entry(int t)
4074 {
4075   int hr;
4076   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4077   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4078   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4079     emit_storereg(CCREG,HOST_CCREG);
4080   }
4081   // Load 32-bit regs
4082   for(hr=0;hr<HOST_REGS;hr++) {
4083     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4084       if(regs[t].regmap_entry[hr]==0) {
4085         emit_zeroreg(hr);
4086       }
4087       else if(regs[t].regmap_entry[hr]!=CCREG)
4088       {
4089         emit_loadreg(regs[t].regmap_entry[hr],hr);
4090       }
4091     }
4092   }
4093   // Load 64-bit regs
4094   for(hr=0;hr<HOST_REGS;hr++) {
4095     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4096       assert(regs[t].regmap_entry[hr]!=64);
4097       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4098         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4099         if(lr<0) {
4100           emit_loadreg(regs[t].regmap_entry[hr],hr);
4101         }
4102         else
4103         {
4104           emit_sarimm(lr,31,hr);
4105         }
4106       }
4107       else
4108       {
4109         emit_loadreg(regs[t].regmap_entry[hr],hr);
4110       }
4111     }
4112   }
4113 }
4114
4115 // Store dirty registers prior to branch
4116 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4117 {
4118   if(internal_branch(i_is32,addr))
4119   {
4120     int t=(addr-start)>>2;
4121     int hr;
4122     for(hr=0;hr<HOST_REGS;hr++) {
4123       if(hr!=EXCLUDE_REG) {
4124         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4125           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4126             if((i_dirty>>hr)&1) {
4127               if(i_regmap[hr]<64) {
4128                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4129                   emit_storereg(i_regmap[hr],hr);
4130                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4131                     #ifdef DESTRUCTIVE_WRITEBACK
4132                     emit_sarimm(hr,31,hr);
4133                     emit_storereg(i_regmap[hr]|64,hr);
4134                     #else
4135                     emit_sarimm(hr,31,HOST_TEMPREG);
4136                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4137                     #endif
4138                   }
4139                 }
4140               }else{
4141                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4142                   emit_storereg(i_regmap[hr],hr);
4143                 }
4144               }
4145             }
4146           }
4147         }
4148       }
4149     }
4150   }
4151   else
4152   {
4153     // Branch out of this block, write out all dirty regs
4154     wb_dirtys(i_regmap,i_is32,i_dirty);
4155   }
4156 }
4157
4158 // Load all needed registers for branch target
4159 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4160 {
4161   //if(addr>=start && addr<(start+slen*4))
4162   if(internal_branch(i_is32,addr))
4163   {
4164     int t=(addr-start)>>2;
4165     int hr;
4166     // Store the cycle count before loading something else
4167     if(i_regmap[HOST_CCREG]!=CCREG) {
4168       assert(i_regmap[HOST_CCREG]==-1);
4169     }
4170     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4171       emit_storereg(CCREG,HOST_CCREG);
4172     }
4173     // Load 32-bit regs
4174     for(hr=0;hr<HOST_REGS;hr++) {
4175       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4176         #ifdef DESTRUCTIVE_WRITEBACK
4177         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4178         #else
4179         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4180         #endif
4181           if(regs[t].regmap_entry[hr]==0) {
4182             emit_zeroreg(hr);
4183           }
4184           else if(regs[t].regmap_entry[hr]!=CCREG)
4185           {
4186             emit_loadreg(regs[t].regmap_entry[hr],hr);
4187           }
4188         }
4189       }
4190     }
4191     // Load 64-bit regs
4192     for(hr=0;hr<HOST_REGS;hr++) {
4193       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4194         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4195           assert(regs[t].regmap_entry[hr]!=64);
4196           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4197             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4198             if(lr<0) {
4199               emit_loadreg(regs[t].regmap_entry[hr],hr);
4200             }
4201             else
4202             {
4203               emit_sarimm(lr,31,hr);
4204             }
4205           }
4206           else
4207           {
4208             emit_loadreg(regs[t].regmap_entry[hr],hr);
4209           }
4210         }
4211         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4212           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4213           assert(lr>=0);
4214           emit_sarimm(lr,31,hr);
4215         }
4216       }
4217     }
4218   }
4219 }
4220
4221 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4222 {
4223   if(addr>=start && addr<start+slen*4-4)
4224   {
4225     int t=(addr-start)>>2;
4226     int hr;
4227     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4228     for(hr=0;hr<HOST_REGS;hr++)
4229     {
4230       if(hr!=EXCLUDE_REG)
4231       {
4232         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4233         {
4234           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4235           {
4236             return 0;
4237           }
4238           else
4239           if((i_dirty>>hr)&1)
4240           {
4241             if(i_regmap[hr]<TEMPREG)
4242             {
4243               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4244                 return 0;
4245             }
4246             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4247             {
4248               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4249                 return 0;
4250             }
4251           }
4252         }
4253         else // Same register but is it 32-bit or dirty?
4254         if(i_regmap[hr]>=0)
4255         {
4256           if(!((regs[t].dirty>>hr)&1))
4257           {
4258             if((i_dirty>>hr)&1)
4259             {
4260               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4261               {
4262                 //printf("%x: dirty no match\n",addr);
4263                 return 0;
4264               }
4265             }
4266           }
4267           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4268           {
4269             //printf("%x: is32 no match\n",addr);
4270             return 0;
4271           }
4272         }
4273       }
4274     }
4275     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4276     // Delay slots are not valid branch targets
4277     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4278     // Delay slots require additional processing, so do not match
4279     if(is_ds[t]) return 0;
4280   }
4281   else
4282   {
4283     int hr;
4284     for(hr=0;hr<HOST_REGS;hr++)
4285     {
4286       if(hr!=EXCLUDE_REG)
4287       {
4288         if(i_regmap[hr]>=0)
4289         {
4290           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4291           {
4292             if((i_dirty>>hr)&1)
4293             {
4294               return 0;
4295             }
4296           }
4297         }
4298       }
4299     }
4300   }
4301   return 1;
4302 }
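/* Note: match_bt() answers "can we jump straight to the already-compiled
   entry at addr without any fix-up code?".  It fails when the target expects
   a value in a different host register, or expects a clean/32-bit state that
   the current dirty/is32 bits do not satisfy; callers treat a zero result by
   writing back and reloading around the branch (or inverting it). */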
4303
4304 // Used when a branch jumps into the delay slot of another branch
4305 void ds_assemble_entry(int i)
4306 {
4307   int t=(ba[i]-start)>>2;
4308   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4309   assem_debug("Assemble delay slot at %x\n",ba[i]);
4310   assem_debug("<->\n");
4311   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4312     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4313   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4314   address_generation(t,&regs[t],regs[t].regmap_entry);
4315   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4316     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4317   cop1_usable=0;
4318   is_delayslot=0;
4319   switch(itype[t]) {
4320     case ALU:
4321       alu_assemble(t,&regs[t]);break;
4322     case IMM16:
4323       imm16_assemble(t,&regs[t]);break;
4324     case SHIFT:
4325       shift_assemble(t,&regs[t]);break;
4326     case SHIFTIMM:
4327       shiftimm_assemble(t,&regs[t]);break;
4328     case LOAD:
4329       load_assemble(t,&regs[t]);break;
4330     case LOADLR:
4331       loadlr_assemble(t,&regs[t]);break;
4332     case STORE:
4333       store_assemble(t,&regs[t]);break;
4334     case STORELR:
4335       storelr_assemble(t,&regs[t]);break;
4336     case COP0:
4337       cop0_assemble(t,&regs[t]);break;
4338     case COP1:
4339       cop1_assemble(t,&regs[t]);break;
4340     case C1LS:
4341       c1ls_assemble(t,&regs[t]);break;
4342     case COP2:
4343       cop2_assemble(t,&regs[t]);break;
4344     case C2LS:
4345       c2ls_assemble(t,&regs[t]);break;
4346     case C2OP:
4347       c2op_assemble(t,&regs[t]);break;
4348     case FCONV:
4349       fconv_assemble(t,&regs[t]);break;
4350     case FLOAT:
4351       float_assemble(t,&regs[t]);break;
4352     case FCOMP:
4353       fcomp_assemble(t,&regs[t]);break;
4354     case MULTDIV:
4355       multdiv_assemble(t,&regs[t]);break;
4356     case MOV:
4357       mov_assemble(t,&regs[t]);break;
4358     case SYSCALL:
4359     case HLECALL:
4360     case INTCALL:
4361     case SPAN:
4362     case UJUMP:
4363     case RJUMP:
4364     case CJUMP:
4365     case SJUMP:
4366     case FJUMP:
4367       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4368   }
4369   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4370   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4371   if(internal_branch(regs[t].is32,ba[i]+4))
4372     assem_debug("branch: internal\n");
4373   else
4374     assem_debug("branch: external\n");
4375   assert(internal_branch(regs[t].is32,ba[i]+4));
4376   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4377   emit_jmp(0);
4378 }
4379
4380 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4381 {
4382   int count;
4383   int jaddr;
4384   int idle=0;
4385   int t=0;
4386   if(itype[i]==RJUMP)
4387   {
4388     *adj=0;
4389   }
4390   //if(ba[i]>=start && ba[i]<(start+slen*4))
4391   if(internal_branch(branch_regs[i].is32,ba[i]))
4392   {
4393     t=(ba[i]-start)>>2;
4394     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4395     else *adj=ccadj[t];
4396   }
4397   else
4398   {
4399     *adj=0;
4400   }
4401   count=ccadj[i];
4402   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4403     // Idle loop
4404     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4405     idle=(int)out;
4406     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4407     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4408     jaddr=(int)out;
4409     emit_jmp(0);
4410   }
4411   else if(*adj==0||invert) {
4412     int cycles=CLOCK_ADJUST(count+2);
4413     // faster loop HACK
4414     if (t&&*adj) {
4415       int rel=t-i;
4416       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4417         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4418     }
4419     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4420     jaddr=(int)out;
4421     emit_jns(0);
4422   }
4423   else
4424   {
4425     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4426     jaddr=(int)out;
4427     emit_jns(0);
4428   }
4429   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4430 }
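/* Cycle-accounting convention assumed by do_cc() (a sketch, not part of the
   original source): HOST_CCREG counts up towards zero, i.e. it holds cycles
   executed minus cycles until the next scheduled event, so once it becomes
   non-negative an event is due and the emitted jns diverts into the CC_STUB. */
#if 0
static int event_due(int cc, int block_cycles)
{
  cc += block_cycles; /* what emit_addimm_and_set_flags() does */
  return cc >= 0;     /* sign clear -> take the jns into the CC_STUB */
}
#endif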
4431
4432 void do_ccstub(int n)
4433 {
4434   literal_pool(256);
4435   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4436   set_jump_target(stubs[n][1],(int)out);
4437   int i=stubs[n][4];
4438   if(stubs[n][6]==NULLDS) {
4439     // Delay slot instruction is nullified ("likely" branch)
4440     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4441   }
4442   else if(stubs[n][6]!=TAKEN) {
4443     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4444   }
4445   else {
4446     if(internal_branch(branch_regs[i].is32,ba[i]))
4447       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4448   }
4449   if(stubs[n][5]!=-1)
4450   {
4451     // Save PC as return address
4452     emit_movimm(stubs[n][5],EAX);
4453     emit_writeword(EAX,(int)&pcaddr);
4454   }
4455   else
4456   {
4457     // Return address depends on which way the branch goes
4458     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4459     {
4460       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4461       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4462       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4463       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4464       if(rs1[i]==0)
4465       {
4466         s1l=s2l;s1h=s2h;
4467         s2l=s2h=-1;
4468       }
4469       else if(rs2[i]==0)
4470       {
4471         s2l=s2h=-1;
4472       }
4473       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4474         s1h=s2h=-1;
4475       }
4476       assert(s1l>=0);
4477       #ifdef DESTRUCTIVE_WRITEBACK
4478       if(rs1[i]) {
4479         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4480           emit_loadreg(rs1[i],s1l);
4481       }
4482       else {
4483         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4484           emit_loadreg(rs2[i],s1l);
4485       }
4486       if(s2l>=0)
4487         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4488           emit_loadreg(rs2[i],s2l);
4489       #endif
4490       int hr=0;
4491       int addr=-1,alt=-1,ntaddr=-1;
4492       while(hr<HOST_REGS)
4493       {
4494         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4495            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4496            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4497         {
4498           addr=hr++;break;
4499         }
4500         hr++;
4501       }
4502       while(hr<HOST_REGS)
4503       {
4504         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4505            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4506            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4507         {
4508           alt=hr++;break;
4509         }
4510         hr++;
4511       }
4512       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4513       {
4514         while(hr<HOST_REGS)
4515         {
4516           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4517              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4518              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4519           {
4520             ntaddr=hr;break;
4521           }
4522           hr++;
4523         }
4524         assert(hr<HOST_REGS);
4525       }
4526       if((opcode[i]&0x2f)==4) // BEQ
4527       {
4528         #ifdef HAVE_CMOV_IMM
4529         if(s1h<0) {
4530           if(s2l>=0) emit_cmp(s1l,s2l);
4531           else emit_test(s1l,s1l);
4532           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4533         }
4534         else
4535         #endif
4536         {
4537           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4538           if(s1h>=0) {
4539             if(s2h>=0) emit_cmp(s1h,s2h);
4540             else emit_test(s1h,s1h);
4541             emit_cmovne_reg(alt,addr);
4542           }
4543           if(s2l>=0) emit_cmp(s1l,s2l);
4544           else emit_test(s1l,s1l);
4545           emit_cmovne_reg(alt,addr);
4546         }
4547       }
4548       if((opcode[i]&0x2f)==5) // BNE
4549       {
4550         #ifdef HAVE_CMOV_IMM
4551         if(s1h<0) {
4552           if(s2l>=0) emit_cmp(s1l,s2l);
4553           else emit_test(s1l,s1l);
4554           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4555         }
4556         else
4557         #endif
4558         {
4559           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4560           if(s1h>=0) {
4561             if(s2h>=0) emit_cmp(s1h,s2h);
4562             else emit_test(s1h,s1h);
4563             emit_cmovne_reg(alt,addr);
4564           }
4565           if(s2l>=0) emit_cmp(s1l,s2l);
4566           else emit_test(s1l,s1l);
4567           emit_cmovne_reg(alt,addr);
4568         }
4569       }
4570       if((opcode[i]&0x2f)==6) // BLEZ
4571       {
4572         //emit_movimm(ba[i],alt);
4573         //emit_movimm(start+i*4+8,addr);
4574         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4575         emit_cmpimm(s1l,1);
4576         if(s1h>=0) emit_mov(addr,ntaddr);
4577         emit_cmovl_reg(alt,addr);
4578         if(s1h>=0) {
4579           emit_test(s1h,s1h);
4580           emit_cmovne_reg(ntaddr,addr);
4581           emit_cmovs_reg(alt,addr);
4582         }
4583       }
4584       if((opcode[i]&0x2f)==7) // BGTZ
4585       {
4586         //emit_movimm(ba[i],addr);
4587         //emit_movimm(start+i*4+8,ntaddr);
4588         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4589         emit_cmpimm(s1l,1);
4590         if(s1h>=0) emit_mov(addr,alt);
4591         emit_cmovl_reg(ntaddr,addr);
4592         if(s1h>=0) {
4593           emit_test(s1h,s1h);
4594           emit_cmovne_reg(alt,addr);
4595           emit_cmovs_reg(ntaddr,addr);
4596         }
4597       }
4598       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4599       {
4600         //emit_movimm(ba[i],alt);
4601         //emit_movimm(start+i*4+8,addr);
4602         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4603         if(s1h>=0) emit_test(s1h,s1h);
4604         else emit_test(s1l,s1l);
4605         emit_cmovs_reg(alt,addr);
4606       }
4607       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4608       {
4609         //emit_movimm(ba[i],addr);
4610         //emit_movimm(start+i*4+8,alt);
4611         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4612         if(s1h>=0) emit_test(s1h,s1h);
4613         else emit_test(s1l,s1l);
4614         emit_cmovs_reg(alt,addr);
4615       }
4616       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4617         if(source[i]&0x10000) // BC1T
4618         {
4619           //emit_movimm(ba[i],alt);
4620           //emit_movimm(start+i*4+8,addr);
4621           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4622           emit_testimm(s1l,0x800000);
4623           emit_cmovne_reg(alt,addr);
4624         }
4625         else // BC1F
4626         {
4627           //emit_movimm(ba[i],addr);
4628           //emit_movimm(start+i*4+8,alt);
4629           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4630           emit_testimm(s1l,0x800000);
4631           emit_cmovne_reg(alt,addr);
4632         }
4633       }
4634       emit_writeword(addr,(int)&pcaddr);
4635     }
4636     else
4637     if(itype[i]==RJUMP)
4638     {
4639       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4640       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4641         r=get_reg(branch_regs[i].regmap,RTEMP);
4642       }
4643       emit_writeword(r,(int)&pcaddr);
4644     }
4645     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
4646   }
4647   // Update cycle count
4648   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4649   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4650   emit_call((int)cc_interrupt);
4651   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4652   if(stubs[n][6]==TAKEN) {
4653     if(internal_branch(branch_regs[i].is32,ba[i]))
4654       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4655     else if(itype[i]==RJUMP) {
4656       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4657         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4658       else
4659         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4660     }
4661   }else if(stubs[n][6]==NOTTAKEN) {
4662     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4663     else load_all_regs(branch_regs[i].regmap);
4664   }else if(stubs[n][6]==NULLDS) {
4665     // Delay slot instruction is nullified ("likely" branch)
4666     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4667     else load_all_regs(regs[i].regmap);
4668   }else{
4669     load_all_regs(branch_regs[i].regmap);
4670   }
4671   emit_jmp(stubs[n][2]); // return address
4672
4673   /* This works but uses a lot of memory...
4674   emit_readword((int)&last_count,ECX);
4675   emit_add(HOST_CCREG,ECX,EAX);
4676   emit_writeword(EAX,(int)&Count);
4677   emit_call((int)gen_interupt);
4678   emit_readword((int)&Count,HOST_CCREG);
4679   emit_readword((int)&next_interupt,EAX);
4680   emit_readword((int)&pending_exception,EBX);
4681   emit_writeword(EAX,(int)&last_count);
4682   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4683   emit_test(EBX,EBX);
4684   int jne_instr=(int)out;
4685   emit_jne(0);
4686   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4687   load_all_regs(branch_regs[i].regmap);
4688   emit_jmp(stubs[n][2]); // return address
4689   set_jump_target(jne_instr,(int)out);
4690   emit_readword((int)&pcaddr,EAX);
4691   // Call get_addr_ht instead of doing the hash table here.
4692   // This code is executed infrequently and takes up a lot of space
4693   // so smaller is better.
4694   emit_storereg(CCREG,HOST_CCREG);
4695   emit_pushreg(EAX);
4696   emit_call((int)get_addr_ht);
4697   emit_loadreg(CCREG,HOST_CCREG);
4698   emit_addimm(ESP,4,ESP);
4699   emit_jmpreg(EAX);*/
4700 }
4701
4702 static void add_to_linker(int addr,int target,int ext)
4703 {
4704   link_addr[linkcount][0]=addr;
4705   link_addr[linkcount][1]=target;
4706   link_addr[linkcount][2]=ext;
4707   linkcount++;
4708 }
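/* Note: these records are consumed when the block is finalized - the jump
   emitted at link_addr[n][0] is patched to point at compiled code for guest
   address link_addr[n][1].  The third field is declared "ext", but the call
   sites here pass the internal_branch() result (or internal*2 for the
   Cortex-A8 hack), so it carries the internal/external classification of the
   target. */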
4709
4710 static void ujump_assemble_write_ra(int i)
4711 {
4712   int rt;
4713   unsigned int return_address;
4714   rt=get_reg(branch_regs[i].regmap,31);
4715   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4716   //assert(rt>=0);
4717   return_address=start+i*4+8;
4718   if(rt>=0) {
4719     #ifdef USE_MINI_HT
4720     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
4721       int temp=-1; // note: must be ds-safe
4722       #ifdef HOST_TEMPREG
4723       temp=HOST_TEMPREG;
4724       #endif
4725       if(temp>=0) do_miniht_insert(return_address,rt,temp);
4726       else emit_movimm(return_address,rt);
4727     }
4728     else
4729     #endif
4730     {
4731       #ifdef REG_PREFETCH
4732       if(temp>=0)
4733       {
4734         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4735       }
4736       #endif
4737       emit_movimm(return_address,rt); // PC into link register
4738       #ifdef IMM_PREFETCH
4739       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4740       #endif
4741     }
4742   }
4743 }
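/* Note: the link value is start+i*4+8 (PC+8) because a MIPS jal/jalr returns
   to the instruction after the jump's delay slot. */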
4744
4745 void ujump_assemble(int i,struct regstat *i_regs)
4746 {
4747   int ra_done=0;
4748   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4749   address_generation(i+1,i_regs,regs[i].regmap_entry);
4750   #ifdef REG_PREFETCH
4751   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4752   if(rt1[i]==31&&temp>=0)
4753   {
4754     signed char *i_regmap=i_regs->regmap;
4755     int return_address=start+i*4+8;
4756     if(get_reg(branch_regs[i].regmap,31)>0)
4757     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4758   }
4759   #endif
4760   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4761     ujump_assemble_write_ra(i); // writeback ra for DS
4762     ra_done=1;
4763   }
4764   ds_assemble(i+1,i_regs);
4765   uint64_t bc_unneeded=branch_regs[i].u;
4766   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4767   bc_unneeded|=1|(1LL<<rt1[i]);
4768   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4769   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4770                 bc_unneeded,bc_unneeded_upper);
4771   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4772   if(!ra_done&&rt1[i]==31)
4773     ujump_assemble_write_ra(i);
4774   int cc,adj;
4775   cc=get_reg(branch_regs[i].regmap,CCREG);
4776   assert(cc==HOST_CCREG);
4777   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4778   #ifdef REG_PREFETCH
4779   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4780   #endif
4781   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4782   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4783   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4784   if(internal_branch(branch_regs[i].is32,ba[i]))
4785     assem_debug("branch: internal\n");
4786   else
4787     assem_debug("branch: external\n");
4788   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4789     ds_assemble_entry(i);
4790   }
4791   else {
4792     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4793     emit_jmp(0);
4794   }
4795 }
4796
4797 static void rjump_assemble_write_ra(int i)
4798 {
4799   int rt,return_address;
4800   assert(rt1[i+1]!=rt1[i]);
4801   assert(rt2[i+1]!=rt1[i]);
4802   rt=get_reg(branch_regs[i].regmap,rt1[i]);
4803   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4804   assert(rt>=0);
4805   return_address=start+i*4+8;
4806   #ifdef REG_PREFETCH
4807   if(temp>=0)
4808   {
4809     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4810   }
4811   #endif
4812   emit_movimm(return_address,rt); // PC into link register
4813   #ifdef IMM_PREFETCH
4814   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4815   #endif
4816 }
4817
4818 void rjump_assemble(int i,struct regstat *i_regs)
4819 {
4820   int temp;
4821   int rs,cc;
4822   int ra_done=0;
4823   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4824   assert(rs>=0);
4825   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4826     // Delay slot abuse, make a copy of the branch address register
4827     temp=get_reg(branch_regs[i].regmap,RTEMP);
4828     assert(temp>=0);
4829     assert(regs[i].regmap[temp]==RTEMP);
4830     emit_mov(rs,temp);
4831     rs=temp;
4832   }
4833   address_generation(i+1,i_regs,regs[i].regmap_entry);
4834   #ifdef REG_PREFETCH
4835   if(rt1[i]==31)
4836   {
4837     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4838       signed char *i_regmap=i_regs->regmap;
4839       int return_address=start+i*4+8;
4840       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4841     }
4842   }
4843   #endif
4844   #ifdef USE_MINI_HT
4845   if(rs1[i]==31) {
4846     int rh=get_reg(regs[i].regmap,RHASH);
4847     if(rh>=0) do_preload_rhash(rh);
4848   }
4849   #endif
4850   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4851     rjump_assemble_write_ra(i);
4852     ra_done=1;
4853   }
4854   ds_assemble(i+1,i_regs);
4855   uint64_t bc_unneeded=branch_regs[i].u;
4856   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4857   bc_unneeded|=1|(1LL<<rt1[i]);
4858   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4859   bc_unneeded&=~(1LL<<rs1[i]);
4860   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4861                 bc_unneeded,bc_unneeded_upper);
4862   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4863   if(!ra_done&&rt1[i]!=0)
4864     rjump_assemble_write_ra(i);
4865   cc=get_reg(branch_regs[i].regmap,CCREG);
4866   assert(cc==HOST_CCREG);
4867   (void)cc;
4868   #ifdef USE_MINI_HT
4869   int rh=get_reg(branch_regs[i].regmap,RHASH);
4870   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4871   if(rs1[i]==31) {
4872     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4873     do_preload_rhtbl(ht);
4874     do_rhash(rs,rh);
4875   }
4876   #endif
4877   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4878   #ifdef DESTRUCTIVE_WRITEBACK
4879   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4880     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4881       emit_loadreg(rs1[i],rs);
4882     }
4883   }
4884   #endif
4885   #ifdef REG_PREFETCH
4886   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4887   #endif
4888   #ifdef USE_MINI_HT
4889   if(rs1[i]==31) {
4890     do_miniht_load(ht,rh);
4891   }
4892   #endif
4893   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4894   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4895   //assert(adj==0);
4896   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4897   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4898   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
4899     // special case for RFE
4900     emit_jmp(0);
4901   else
4902     emit_jns(0);
4903   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4904   #ifdef USE_MINI_HT
4905   if(rs1[i]==31) {
4906     do_miniht_jump(rs,rh,ht);
4907   }
4908   else
4909   #endif
4910   {
4911     //if(rs!=EAX) emit_mov(rs,EAX);
4912     //emit_jmp((int)jump_vaddr_eax);
4913     emit_jmp(jump_vaddr_reg[rs]);
4914   }
4915   /* Check hash table
4916   temp=!rs;
4917   emit_mov(rs,temp);
4918   emit_shrimm(rs,16,rs);
4919   emit_xor(temp,rs,rs);
4920   emit_movzwl_reg(rs,rs);
4921   emit_shlimm(rs,4,rs);
4922   emit_cmpmem_indexed((int)hash_table,rs,temp);
4923   emit_jne((int)out+14);
4924   emit_readword_indexed((int)hash_table+4,rs,rs);
4925   emit_jmpreg(rs);
4926   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
4927   emit_addimm_no_flags(8,rs);
4928   emit_jeq((int)out-17);
4929   // No hit on hash table, call compiler
4930   emit_pushreg(temp);
4931 //DEBUG >
4932 #ifdef DEBUG_CYCLE_COUNT
4933   emit_readword((int)&last_count,ECX);
4934   emit_add(HOST_CCREG,ECX,HOST_CCREG);
4935   emit_readword((int)&next_interupt,ECX);
4936   emit_writeword(HOST_CCREG,(int)&Count);
4937   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
4938   emit_writeword(ECX,(int)&last_count);
4939 #endif
4940 //DEBUG <
4941   emit_storereg(CCREG,HOST_CCREG);
4942   emit_call((int)get_addr);
4943   emit_loadreg(CCREG,HOST_CCREG);
4944   emit_addimm(ESP,4,ESP);
4945   emit_jmpreg(EAX);*/
4946   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4947   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4948   #endif
4949 }
4950
4951 void cjump_assemble(int i,struct regstat *i_regs)
4952 {
4953   signed char *i_regmap=i_regs->regmap;
4954   int cc;
4955   int match;
4956   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4957   assem_debug("match=%d\n",match);
4958   int s1h,s1l,s2h,s2l;
4959   int prev_cop1_usable=cop1_usable;
4960   int unconditional=0,nop=0;
4961   int only32=0;
4962   int invert=0;
4963   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4964   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4965   if(!match) invert=1;
4966   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4967   if(i>(ba[i]-start)>>2) invert=1;
4968   #endif
4969
4970   if(ooo[i]) {
4971     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4972     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4973     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4974     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4975   }
4976   else {
4977     s1l=get_reg(i_regmap,rs1[i]);
4978     s1h=get_reg(i_regmap,rs1[i]|64);
4979     s2l=get_reg(i_regmap,rs2[i]);
4980     s2h=get_reg(i_regmap,rs2[i]|64);
4981   }
4982   if(rs1[i]==0&&rs2[i]==0)
4983   {
4984     if(opcode[i]&1) nop=1;
4985     else unconditional=1;
4986     //assert(opcode[i]!=5);
4987     //assert(opcode[i]!=7);
4988     //assert(opcode[i]!=0x15);
4989     //assert(opcode[i]!=0x17);
4990   }
4991   else if(rs1[i]==0)
4992   {
4993     s1l=s2l;s1h=s2h;
4994     s2l=s2h=-1;
4995     only32=(regs[i].was32>>rs2[i])&1;
4996   }
4997   else if(rs2[i]==0)
4998   {
4999     s2l=s2h=-1;
5000     only32=(regs[i].was32>>rs1[i])&1;
5001   }
5002   else {
5003     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5004   }
5005
5006   if(ooo[i]) {
5007     // Out of order execution (delay slot first)
5008     //printf("OOOE\n");
5009     address_generation(i+1,i_regs,regs[i].regmap_entry);
5010     ds_assemble(i+1,i_regs);
5011     int adj;
5012     uint64_t bc_unneeded=branch_regs[i].u;
5013     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5014     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5015     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5016     bc_unneeded|=1;
5017     bc_unneeded_upper|=1;
5018     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5019                   bc_unneeded,bc_unneeded_upper);
5020     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5021     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5022     cc=get_reg(branch_regs[i].regmap,CCREG);
5023     assert(cc==HOST_CCREG);
5024     if(unconditional)
5025       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5026     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5027     //assem_debug("cycle count (adj)\n");
5028     if(unconditional) {
5029       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5030       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5031         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5032         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5033         if(internal)
5034           assem_debug("branch: internal\n");
5035         else
5036           assem_debug("branch: external\n");
5037         if(internal&&is_ds[(ba[i]-start)>>2]) {
5038           ds_assemble_entry(i);
5039         }
5040         else {
5041           add_to_linker((int)out,ba[i],internal);
5042           emit_jmp(0);
5043         }
5044         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5045         if(((u_int)out)&7) emit_addnop(0);
5046         #endif
5047       }
5048     }
5049     else if(nop) {
5050       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5051       int jaddr=(int)out;
5052       emit_jns(0);
5053       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5054     }
5055     else {
5056       int taken=0,nottaken=0,nottaken1=0;
5057       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5058       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5059       if(!only32)
5060       {
5061         assert(s1h>=0);
5062         if(opcode[i]==4) // BEQ
5063         {
5064           if(s2h>=0) emit_cmp(s1h,s2h);
5065           else emit_test(s1h,s1h);
5066           nottaken1=(int)out;
5067           emit_jne(1);
5068         }
5069         if(opcode[i]==5) // BNE
5070         {
5071           if(s2h>=0) emit_cmp(s1h,s2h);
5072           else emit_test(s1h,s1h);
5073           if(invert) taken=(int)out;
5074           else add_to_linker((int)out,ba[i],internal);
5075           emit_jne(0);
5076         }
5077         if(opcode[i]==6) // BLEZ
5078         {
5079           emit_test(s1h,s1h);
5080           if(invert) taken=(int)out;
5081           else add_to_linker((int)out,ba[i],internal);
5082           emit_js(0);
5083           nottaken1=(int)out;
5084           emit_jne(1);
5085         }
5086         if(opcode[i]==7) // BGTZ
5087         {
5088           emit_test(s1h,s1h);
5089           nottaken1=(int)out;
5090           emit_js(1);
5091           if(invert) taken=(int)out;
5092           else add_to_linker((int)out,ba[i],internal);
5093           emit_jne(0);
5094         }
5095       } // if(!only32)
5096
5097       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5098       assert(s1l>=0);
5099       if(opcode[i]==4) // BEQ
5100       {
5101         if(s2l>=0) emit_cmp(s1l,s2l);
5102         else emit_test(s1l,s1l);
5103         if(invert){
5104           nottaken=(int)out;
5105           emit_jne(1);
5106         }else{
5107           add_to_linker((int)out,ba[i],internal);
5108           emit_jeq(0);
5109         }
5110       }
5111       if(opcode[i]==5) // BNE
5112       {
5113         if(s2l>=0) emit_cmp(s1l,s2l);
5114         else emit_test(s1l,s1l);
5115         if(invert){
5116           nottaken=(int)out;
5117           emit_jeq(1);
5118         }else{
5119           add_to_linker((int)out,ba[i],internal);
5120           emit_jne(0);
5121         }
5122       }
5123       if(opcode[i]==6) // BLEZ
5124       {
5125         emit_cmpimm(s1l,1);
5126         if(invert){
5127           nottaken=(int)out;
5128           emit_jge(1);
5129         }else{
5130           add_to_linker((int)out,ba[i],internal);
5131           emit_jl(0);
5132         }
5133       }
5134       if(opcode[i]==7) // BGTZ
5135       {
5136         emit_cmpimm(s1l,1);
5137         if(invert){
5138           nottaken=(int)out;
5139           emit_jl(1);
5140         }else{
5141           add_to_linker((int)out,ba[i],internal);
5142           emit_jge(0);
5143         }
5144       }
5145       if(invert) {
5146         if(taken) set_jump_target(taken,(int)out);
5147         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5148         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5149           if(adj) {
5150             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5151             add_to_linker((int)out,ba[i],internal);
5152           }else{
5153             emit_addnop(13);
5154             add_to_linker((int)out,ba[i],internal*2);
5155           }
5156           emit_jmp(0);
5157         }else
5158         #endif
5159         {
5160           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5161           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5162           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5163           if(internal)
5164             assem_debug("branch: internal\n");
5165           else
5166             assem_debug("branch: external\n");
5167           if(internal&&is_ds[(ba[i]-start)>>2]) {
5168             ds_assemble_entry(i);
5169           }
5170           else {
5171             add_to_linker((int)out,ba[i],internal);
5172             emit_jmp(0);
5173           }
5174         }
5175         set_jump_target(nottaken,(int)out);
5176       }
5177
5178       if(nottaken1) set_jump_target(nottaken1,(int)out);
5179       if(adj) {
5180         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5181       }
5182     } // (!unconditional)
5183   } // if(ooo)
5184   else
5185   {
5186     // In-order execution (branch first)
5187     //if(likely[i]) printf("IOL\n");
5188     //else
5189     //printf("IOE\n");
5190     int taken=0,nottaken=0,nottaken1=0;
5191     if(!unconditional&&!nop) {
5192       if(!only32)
5193       {
5194         assert(s1h>=0);
5195         if((opcode[i]&0x2f)==4) // BEQ
5196         {
5197           if(s2h>=0) emit_cmp(s1h,s2h);
5198           else emit_test(s1h,s1h);
5199           nottaken1=(int)out;
5200           emit_jne(2);
5201         }
5202         if((opcode[i]&0x2f)==5) // BNE
5203         {
5204           if(s2h>=0) emit_cmp(s1h,s2h);
5205           else emit_test(s1h,s1h);
5206           taken=(int)out;
5207           emit_jne(1);
5208         }
5209         if((opcode[i]&0x2f)==6) // BLEZ
5210         {
5211           emit_test(s1h,s1h);
5212           taken=(int)out;
5213           emit_js(1);
5214           nottaken1=(int)out;
5215           emit_jne(2);
5216         }
5217         if((opcode[i]&0x2f)==7) // BGTZ
5218         {
5219           emit_test(s1h,s1h);
5220           nottaken1=(int)out;
5221           emit_js(2);
5222           taken=(int)out;
5223           emit_jne(1);
5224         }
5225       } // if(!only32)
5226
5227       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5228       assert(s1l>=0);
5229       if((opcode[i]&0x2f)==4) // BEQ
5230       {
5231         if(s2l>=0) emit_cmp(s1l,s2l);
5232         else emit_test(s1l,s1l);
5233         nottaken=(int)out;
5234         emit_jne(2);
5235       }
5236       if((opcode[i]&0x2f)==5) // BNE
5237       {
5238         if(s2l>=0) emit_cmp(s1l,s2l);
5239         else emit_test(s1l,s1l);
5240         nottaken=(int)out;
5241         emit_jeq(2);
5242       }
5243       if((opcode[i]&0x2f)==6) // BLEZ
5244       {
5245         emit_cmpimm(s1l,1);
5246         nottaken=(int)out;
5247         emit_jge(2);
5248       }
5249       if((opcode[i]&0x2f)==7) // BGTZ
5250       {
5251         emit_cmpimm(s1l,1);
5252         nottaken=(int)out;
5253         emit_jl(2);
5254       }
5255     } // if(!unconditional)
5256     int adj;
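    // ds_unneeded is a bitmap over MIPS registers (bit r == reg r): a set bit
    // means the value is not needed by the delay slot or beyond, so it need
    // not be written back before entering it.  The delay slot's own sources
    // are cleared (kept live) below, and bit 0 stays set since $zero never
    // needs storing.  Hypothetical example: for a delay slot "addiu a0,a0,1",
    // rs1[i+1]==4 (a0), so bit 4 is cleared and a0 is preserved.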
5257     uint64_t ds_unneeded=branch_regs[i].u;
5258     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5259     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5260     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5261     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5262     ds_unneeded|=1;
5263     ds_unneeded_upper|=1;
5264     // branch taken
5265     if(!nop) {
5266       if(taken) set_jump_target(taken,(int)out);
5267       assem_debug("1:\n");
5268       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5269                     ds_unneeded,ds_unneeded_upper);
5270       // load regs
5271       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5272       address_generation(i+1,&branch_regs[i],0);
5273       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5274       ds_assemble(i+1,&branch_regs[i]);
5275       cc=get_reg(branch_regs[i].regmap,CCREG);
5276       if(cc==-1) {
5277         emit_loadreg(CCREG,cc=HOST_CCREG);
5278         // CHECK: Is the following instruction (fall thru) allocated ok?
5279       }
5280       assert(cc==HOST_CCREG);
5281       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5282       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5283       assem_debug("cycle count (adj)\n");
5284       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5285       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5286       if(internal)
5287         assem_debug("branch: internal\n");
5288       else
5289         assem_debug("branch: external\n");
5290       if(internal&&is_ds[(ba[i]-start)>>2]) {
5291         ds_assemble_entry(i);
5292       }
5293       else {
5294         add_to_linker((int)out,ba[i],internal);
5295         emit_jmp(0);
5296       }
5297     }
5298     // branch not taken
5299     cop1_usable=prev_cop1_usable;
5300     if(!unconditional) {
5301       if(nottaken1) set_jump_target(nottaken1,(int)out);
5302       set_jump_target(nottaken,(int)out);
5303       assem_debug("2:\n");
5304       if(!likely[i]) {
5305         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5306                       ds_unneeded,ds_unneeded_upper);
5307         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5308         address_generation(i+1,&branch_regs[i],0);
5309         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5310         ds_assemble(i+1,&branch_regs[i]);
5311       }
5312       cc=get_reg(branch_regs[i].regmap,CCREG);
5313       if(cc==-1&&!likely[i]) {
5314         // Cycle count isn't in a register, temporarily load it then write it out
5315         emit_loadreg(CCREG,HOST_CCREG);
5316         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5317         int jaddr=(int)out;
5318         emit_jns(0);
5319         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5320         emit_storereg(CCREG,HOST_CCREG);
5321       }
5322       else{
5323         cc=get_reg(i_regmap,CCREG);
5324         assert(cc==HOST_CCREG);
5325         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5326         int jaddr=(int)out;
5327         emit_jns(0);
5328         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5329       }
5330     }
5331   }
5332 }
5333
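// Assemble a REGIMM branch (BLTZ/BGEZ/BLTZAL/BGEZAL and their "likely"
// forms).  opcode2[] holds the rt field selecting the variant; the ...AL
// variants have rt1[]==31 and write the return address to $ra.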
5334 void sjump_assemble(int i,struct regstat *i_regs)
5335 {
5336   signed char *i_regmap=i_regs->regmap;
5337   int cc;
5338   int match;
5339   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5340   assem_debug("smatch=%d\n",match);
5341   int s1h,s1l;
5342   int prev_cop1_usable=cop1_usable;
5343   int unconditional=0,nevertaken=0;
5344   int only32=0;
5345   int invert=0;
5346   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5347   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5348   if(!match) invert=1;
5349   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5350   if(i>(ba[i]-start)>>2) invert=1;
5351   #endif
5352
5353   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5354   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5355
5356   if(ooo[i]) {
5357     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5358     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5359   }
5360   else {
5361     s1l=get_reg(i_regmap,rs1[i]);
5362     s1h=get_reg(i_regmap,rs1[i]|64);
5363   }
5364   if(rs1[i]==0)
5365   {
5366     if(opcode2[i]&1) unconditional=1;
5367     else nevertaken=1;
5368     // These are never taken (r0 is never less than zero)
5369     //assert(opcode2[i]!=0);
5370     //assert(opcode2[i]!=2);
5371     //assert(opcode2[i]!=0x10);
5372     //assert(opcode2[i]!=0x12);
5373   }
5374   else {
5375     only32=(regs[i].was32>>rs1[i])&1;
5376   }
5377
5378   if(ooo[i]) {
5379     // Out of order execution (delay slot first)
5380     //printf("OOOE\n");
5381     address_generation(i+1,i_regs,regs[i].regmap_entry);
5382     ds_assemble(i+1,i_regs);
5383     int adj;
5384     uint64_t bc_unneeded=branch_regs[i].u;
5385     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5386     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5387     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5388     bc_unneeded|=1;
5389     bc_unneeded_upper|=1;
5390     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5391                   bc_unneeded,bc_unneeded_upper);
5392     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5393     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5394     if(rt1[i]==31) {
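      // BLTZAL/BGEZAL: the return address (branch PC + 8, i.e. the
      // instruction after the delay slot) goes to $ra whether or not the
      // branch is taken, matching MIPS semantics.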
5395       int rt,return_address;
5396       rt=get_reg(branch_regs[i].regmap,31);
5397       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5398       if(rt>=0) {
5399         // Save the PC even if the branch is not taken
5400         return_address=start+i*4+8;
5401         emit_movimm(return_address,rt); // PC into link register
5402         #ifdef IMM_PREFETCH
5403         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5404         #endif
5405       }
5406     }
5407     cc=get_reg(branch_regs[i].regmap,CCREG);
5408     assert(cc==HOST_CCREG);
5409     if(unconditional)
5410       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5411     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5412     assem_debug("cycle count (adj)\n");
5413     if(unconditional) {
5414       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5415       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5416         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5417         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5418         if(internal)
5419           assem_debug("branch: internal\n");
5420         else
5421           assem_debug("branch: external\n");
5422         if(internal&&is_ds[(ba[i]-start)>>2]) {
5423           ds_assemble_entry(i);
5424         }
5425         else {
5426           add_to_linker((int)out,ba[i],internal);
5427           emit_jmp(0);
5428         }
5429         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5430         if(((u_int)out)&7) emit_addnop(0);
5431         #endif
5432       }
5433     }
5434     else if(nevertaken) {
5435       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5436       int jaddr=(int)out;
5437       emit_jns(0);
5438       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5439     }
5440     else {
5441       int nottaken=0;
5442       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5443       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5444       if(!only32)
5445       {
5446         assert(s1h>=0);
5447         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5448         {
5449           emit_test(s1h,s1h);
5450           if(invert){
5451             nottaken=(int)out;
5452             emit_jns(1);
5453           }else{
5454             add_to_linker((int)out,ba[i],internal);
5455             emit_js(0);
5456           }
5457         }
5458         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5459         {
5460           emit_test(s1h,s1h);
5461           if(invert){
5462             nottaken=(int)out;
5463             emit_js(1);
5464           }else{
5465             add_to_linker((int)out,ba[i],internal);
5466             emit_jns(0);
5467           }
5468         }
5469       } // if(!only32)
5470       else
5471       {
5472         assert(s1l>=0);
5473         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5474         {
5475           emit_test(s1l,s1l);
5476           if(invert){
5477             nottaken=(int)out;
5478             emit_jns(1);
5479           }else{
5480             add_to_linker((int)out,ba[i],internal);
5481             emit_js(0);
5482           }
5483         }
5484         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5485         {
5486           emit_test(s1l,s1l);
5487           if(invert){
5488             nottaken=(int)out;
5489             emit_js(1);
5490           }else{
5491             add_to_linker((int)out,ba[i],internal);
5492             emit_jns(0);
5493           }
5494         }
5495       } // else (only32)
5496
5497       if(invert) {
5498         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5499         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5500           if(adj) {
5501             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5502             add_to_linker((int)out,ba[i],internal);
5503           }else{
5504             emit_addnop(13);
5505             add_to_linker((int)out,ba[i],internal*2);
5506           }
5507           emit_jmp(0);
5508         }else
5509         #endif
5510         {
5511           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5512           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5513           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5514           if(internal)
5515             assem_debug("branch: internal\n");
5516           else
5517             assem_debug("branch: external\n");
5518           if(internal&&is_ds[(ba[i]-start)>>2]) {
5519             ds_assemble_entry(i);
5520           }
5521           else {
5522             add_to_linker((int)out,ba[i],internal);
5523             emit_jmp(0);
5524           }
5525         }
5526         set_jump_target(nottaken,(int)out);
5527       }
5528
5529       if(adj) {
5530         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5531       }
5532     } // (!unconditional)
5533   } // if(ooo)
5534   else
5535   {
5536     // In-order execution (branch first)
5537     //printf("IOE\n");
5538     int nottaken=0;
5539     if(rt1[i]==31) {
5540       int rt,return_address;
5541       rt=get_reg(branch_regs[i].regmap,31);
5542       if(rt>=0) {
5543         // Save the PC even if the branch is not taken
5544         return_address=start+i*4+8;
5545         emit_movimm(return_address,rt); // PC into link register
5546         #ifdef IMM_PREFETCH
5547         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5548         #endif
5549       }
5550     }
5551     if(!unconditional) {
5552       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5553       if(!only32)
5554       {
5555         assert(s1h>=0);
5556         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5557         {
5558           emit_test(s1h,s1h);
5559           nottaken=(int)out;
5560           emit_jns(1);
5561         }
5562         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5563         {
5564           emit_test(s1h,s1h);
5565           nottaken=(int)out;
5566           emit_js(1);
5567         }
5568       } // if(!only32)
5569       else
5570       {
5571         assert(s1l>=0);
5572         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5573         {
5574           emit_test(s1l,s1l);
5575           nottaken=(int)out;
5576           emit_jns(1);
5577         }
5578         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5579         {
5580           emit_test(s1l,s1l);
5581           nottaken=(int)out;
5582           emit_js(1);
5583         }
5584       }
5585     } // if(!unconditional)
5586     int adj;
5587     uint64_t ds_unneeded=branch_regs[i].u;
5588     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5589     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5590     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5591     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5592     ds_unneeded|=1;
5593     ds_unneeded_upper|=1;
5594     // branch taken
5595     if(!nevertaken) {
5596       //assem_debug("1:\n");
5597       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5598                     ds_unneeded,ds_unneeded_upper);
5599       // load regs
5600       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5601       address_generation(i+1,&branch_regs[i],0);
5602       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5603       ds_assemble(i+1,&branch_regs[i]);
5604       cc=get_reg(branch_regs[i].regmap,CCREG);
5605       if(cc==-1) {
5606         emit_loadreg(CCREG,cc=HOST_CCREG);
5607         // CHECK: Is the following instruction (fall thru) allocated ok?
5608       }
5609       assert(cc==HOST_CCREG);
5610       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5611       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5612       assem_debug("cycle count (adj)\n");
5613       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5614       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5615       if(internal)
5616         assem_debug("branch: internal\n");
5617       else
5618         assem_debug("branch: external\n");
5619       if(internal&&is_ds[(ba[i]-start)>>2]) {
5620         ds_assemble_entry(i);
5621       }
5622       else {
5623         add_to_linker((int)out,ba[i],internal);
5624         emit_jmp(0);
5625       }
5626     }
5627     // branch not taken
5628     cop1_usable=prev_cop1_usable;
5629     if(!unconditional) {
5630       set_jump_target(nottaken,(int)out);
5631       assem_debug("1:\n");
5632       if(!likely[i]) {
5633         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5634                       ds_unneeded,ds_unneeded_upper);
5635         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5636         address_generation(i+1,&branch_regs[i],0);
5637         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5638         ds_assemble(i+1,&branch_regs[i]);
5639       }
5640       cc=get_reg(branch_regs[i].regmap,CCREG);
5641       if(cc==-1&&!likely[i]) {
5642         // Cycle count isn't in a register, temporarily load it then write it out
5643         emit_loadreg(CCREG,HOST_CCREG);
5644         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5645         int jaddr=(int)out;
5646         emit_jns(0);
5647         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5648         emit_storereg(CCREG,HOST_CCREG);
5649       }
5650       else{
5651         cc=get_reg(i_regmap,CCREG);
5652         assert(cc==HOST_CCREG);
5653         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5654         int jaddr=(int)out;
5655         emit_jns(0);
5656         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5657       }
5658     }
5659   }
5660 }
5661
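// Assemble a COP1 condition branch (BC1F/BC1T).  Bit 16 of the instruction
// word distinguishes BC1T from BC1F, and the FP condition flag is tested as
// bit 23 (0x800000) of FSREG.  The PSX has no FPU, so this path is inherited
// from the N64 dynarec and should rarely, if ever, be reached.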
5662 void fjump_assemble(int i,struct regstat *i_regs)
5663 {
5664   signed char *i_regmap=i_regs->regmap;
5665   int cc;
5666   int match;
5667   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5668   assem_debug("fmatch=%d\n",match);
5669   int fs,cs;
5670   int eaddr;
5671   int invert=0;
5672   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5673   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5674   if(!match) invert=1;
5675   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5676   if(i>(ba[i]-start)>>2) invert=1;
5677   #endif
5678
5679   if(ooo[i]) {
5680     fs=get_reg(branch_regs[i].regmap,FSREG);
5681     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5682   }
5683   else {
5684     fs=get_reg(i_regmap,FSREG);
5685   }
5686
5687   // Check cop1 unusable
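  // CSREG caches the COP0 Status register; bit 29 (0x20000000) is the CU1
  // "coprocessor 1 usable" flag.  If it is clear, the FP_STUB raises the
  // coprocessor-unusable exception.  cop1_usable records that this check has
  // already been emitted for the current block.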
5688   if(!cop1_usable) {
5689     cs=get_reg(i_regmap,CSREG);
5690     assert(cs>=0);
5691     emit_testimm(cs,0x20000000);
5692     eaddr=(int)out;
5693     emit_jeq(0);
5694     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5695     cop1_usable=1;
5696   }
5697
5698   if(ooo[i]) {
5699     // Out of order execution (delay slot first)
5700     //printf("OOOE\n");
5701     ds_assemble(i+1,i_regs);
5702     int adj;
5703     uint64_t bc_unneeded=branch_regs[i].u;
5704     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5705     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5706     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5707     bc_unneeded|=1;
5708     bc_unneeded_upper|=1;
5709     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5710                   bc_unneeded,bc_unneeded_upper);
5711     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5712     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5713     cc=get_reg(branch_regs[i].regmap,CCREG);
5714     assert(cc==HOST_CCREG);
5715     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5716     assem_debug("cycle count (adj)\n");
5717     if(1) {
5718       int nottaken=0;
5719       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5720       if(1) {
5721         assert(fs>=0);
5722         emit_testimm(fs,0x800000);
5723         if(source[i]&0x10000) // BC1T
5724         {
5725           if(invert){
5726             nottaken=(int)out;
5727             emit_jeq(1);
5728           }else{
5729             add_to_linker((int)out,ba[i],internal);
5730             emit_jne(0);
5731           }
5732         }
5733         else // BC1F
5734           if(invert){
5735             nottaken=(int)out;
5736             emit_jne(1);
5737           }else{
5738             add_to_linker((int)out,ba[i],internal);
5739             emit_jeq(0);
5740           }
5743       } // if(1)
5744
5745       if(invert) {
5746         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5747         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5748         else if(match) emit_addnop(13);
5749         #endif
5750         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5751         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5752         if(internal)
5753           assem_debug("branch: internal\n");
5754         else
5755           assem_debug("branch: external\n");
5756         if(internal&&is_ds[(ba[i]-start)>>2]) {
5757           ds_assemble_entry(i);
5758         }
5759         else {
5760           add_to_linker((int)out,ba[i],internal);
5761           emit_jmp(0);
5762         }
5763         set_jump_target(nottaken,(int)out);
5764       }
5765
5766       if(adj) {
5767         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5768       }
5769     } // (!unconditional)
5770   } // if(ooo)
5771   else
5772   {
5773     // In-order execution (branch first)
5774     //printf("IOE\n");
5775     int nottaken=0;
5776     if(1) {
5777       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5778       if(1) {
5779         assert(fs>=0);
5780         emit_testimm(fs,0x800000);
5781         if(source[i]&0x10000) // BC1T
5782         {
5783           nottaken=(int)out;
5784           emit_jeq(1);
5785         }
5786         else // BC1F
5787         {
5788           nottaken=(int)out;
5789           emit_jne(1);
5790         }
5791       }
5792     } // if(!unconditional)
5793     int adj;
5794     uint64_t ds_unneeded=branch_regs[i].u;
5795     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5796     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5797     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5798     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5799     ds_unneeded|=1;
5800     ds_unneeded_upper|=1;
5801     // branch taken
5802     //assem_debug("1:\n");
5803     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5804                   ds_unneeded,ds_unneeded_upper);
5805     // load regs
5806     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5807     address_generation(i+1,&branch_regs[i],0);
5808     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5809     ds_assemble(i+1,&branch_regs[i]);
5810     cc=get_reg(branch_regs[i].regmap,CCREG);
5811     if(cc==-1) {
5812       emit_loadreg(CCREG,cc=HOST_CCREG);
5813       // CHECK: Is the following instruction (fall thru) allocated ok?
5814     }
5815     assert(cc==HOST_CCREG);
5816     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5817     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5818     assem_debug("cycle count (adj)\n");
5819     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5820     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5821     if(internal)
5822       assem_debug("branch: internal\n");
5823     else
5824       assem_debug("branch: external\n");
5825     if(internal&&is_ds[(ba[i]-start)>>2]) {
5826       ds_assemble_entry(i);
5827     }
5828     else {
5829       add_to_linker((int)out,ba[i],internal);
5830       emit_jmp(0);
5831     }
5832
5833     // branch not taken
5834     if(1) { // <- FIXME (don't need this)
5835       set_jump_target(nottaken,(int)out);
5836       assem_debug("1:\n");
5837       if(!likely[i]) {
5838         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5839                       ds_unneeded,ds_unneeded_upper);
5840         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5841         address_generation(i+1,&branch_regs[i],0);
5842         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5843         ds_assemble(i+1,&branch_regs[i]);
5844       }
5845       cc=get_reg(branch_regs[i].regmap,CCREG);
5846       if(cc==-1&&!likely[i]) {
5847         // Cycle count isn't in a register, temporarily load it then write it out
5848         emit_loadreg(CCREG,HOST_CCREG);
5849         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5850         int jaddr=(int)out;
5851         emit_jns(0);
5852         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5853         emit_storereg(CCREG,HOST_CCREG);
5854       }
5855       else{
5856         cc=get_reg(i_regmap,CCREG);
5857         assert(cc==HOST_CCREG);
5858         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5859         int jaddr=(int)out;
5860         emit_jns(0);
5861         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5862       }
5863     }
5864   }
5865 }
5866
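// Assemble a branch whose delay slot falls in the next page and therefore in
// the next compiled block.  Instead of branching directly, the chosen target
// is computed into HOST_BTREG and control is handed to an external jump
// stub; pagespan_ds() below emits the matching delay-slot entry for the
// following block.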
5867 static void pagespan_assemble(int i,struct regstat *i_regs)
5868 {
5869   int s1l=get_reg(i_regs->regmap,rs1[i]);
5870   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5871   int s2l=get_reg(i_regs->regmap,rs2[i]);
5872   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5873   int taken=0;
5874   int nottaken=0;
5875   int unconditional=0;
5876   if(rs1[i]==0)
5877   {
5878     s1l=s2l;s1h=s2h;
5879     s2l=s2h=-1;
5880   }
5881   else if(rs2[i]==0)
5882   {
5883     s2l=s2h=-1;
5884   }
5885   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5886     s1h=s2h=-1;
5887   }
5888   int hr=0;
5889   int addr=-1,alt=-1,ntaddr=-1;
5890   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5891   else {
5892     while(hr<HOST_REGS)
5893     {
5894       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5895          (i_regs->regmap[hr]&63)!=rs1[i] &&
5896          (i_regs->regmap[hr]&63)!=rs2[i] )
5897       {
5898         addr=hr++;break;
5899       }
5900       hr++;
5901     }
5902   }
5903   while(hr<HOST_REGS)
5904   {
5905     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5906        (i_regs->regmap[hr]&63)!=rs1[i] &&
5907        (i_regs->regmap[hr]&63)!=rs2[i] )
5908     {
5909       alt=hr++;break;
5910     }
5911     hr++;
5912   }
5913   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5914   {
5915     while(hr<HOST_REGS)
5916     {
5917       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5918          (i_regs->regmap[hr]&63)!=rs1[i] &&
5919          (i_regs->regmap[hr]&63)!=rs2[i] )
5920       {
5921         ntaddr=hr;break;
5922       }
5923       hr++;
5924     }
5925   }
5926   assert(hr<HOST_REGS);
5927   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
5928     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
5929   }
5930   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5931   if(opcode[i]==2) // J
5932   {
5933     unconditional=1;
5934   }
5935   if(opcode[i]==3) // JAL
5936   {
5937     // TODO: mini_ht
5938     int rt=get_reg(i_regs->regmap,31);
5939     emit_movimm(start+i*4+8,rt);
5940     unconditional=1;
5941   }
5942   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
5943   {
5944     emit_mov(s1l,addr);
5945     if(opcode2[i]==9) // JALR
5946     {
5947       int rt=get_reg(i_regs->regmap,rt1[i]);
5948       emit_movimm(start+i*4+8,rt);
5949     }
5950   }
5951   if((opcode[i]&0x3f)==4) // BEQ
5952   {
5953     if(rs1[i]==rs2[i])
5954     {
5955       unconditional=1;
5956     }
5957     else
5958     #ifdef HAVE_CMOV_IMM
5959     if(s1h<0) {
5960       if(s2l>=0) emit_cmp(s1l,s2l);
5961       else emit_test(s1l,s1l);
5962       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5963     }
5964     else
5965     #endif
5966     {
5967       assert(s1l>=0);
5968       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5969       if(s1h>=0) {
5970         if(s2h>=0) emit_cmp(s1h,s2h);
5971         else emit_test(s1h,s1h);
5972         emit_cmovne_reg(alt,addr);
5973       }
5974       if(s2l>=0) emit_cmp(s1l,s2l);
5975       else emit_test(s1l,s1l);
5976       emit_cmovne_reg(alt,addr);
5977     }
5978   }
5979   if((opcode[i]&0x3f)==5) // BNE
5980   {
5981     #ifdef HAVE_CMOV_IMM
5982     if(s1h<0) {
5983       if(s2l>=0) emit_cmp(s1l,s2l);
5984       else emit_test(s1l,s1l);
5985       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5986     }
5987     else
5988     #endif
5989     {
5990       assert(s1l>=0);
5991       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5992       if(s1h>=0) {
5993         if(s2h>=0) emit_cmp(s1h,s2h);
5994         else emit_test(s1h,s1h);
5995         emit_cmovne_reg(alt,addr);
5996       }
5997       if(s2l>=0) emit_cmp(s1l,s2l);
5998       else emit_test(s1l,s1l);
5999       emit_cmovne_reg(alt,addr);
6000     }
6001   }
6002   if((opcode[i]&0x3f)==0x14) // BEQL
6003   {
6004     if(s1h>=0) {
6005       if(s2h>=0) emit_cmp(s1h,s2h);
6006       else emit_test(s1h,s1h);
6007       nottaken=(int)out;
6008       emit_jne(0);
6009     }
6010     if(s2l>=0) emit_cmp(s1l,s2l);
6011     else emit_test(s1l,s1l);
6012     if(nottaken) set_jump_target(nottaken,(int)out);
6013     nottaken=(int)out;
6014     emit_jne(0);
6015   }
6016   if((opcode[i]&0x3f)==0x15) // BNEL
6017   {
6018     if(s1h>=0) {
6019       if(s2h>=0) emit_cmp(s1h,s2h);
6020       else emit_test(s1h,s1h);
6021       taken=(int)out;
6022       emit_jne(0);
6023     }
6024     if(s2l>=0) emit_cmp(s1l,s2l);
6025     else emit_test(s1l,s1l);
6026     nottaken=(int)out;
6027     emit_jeq(0);
6028     if(taken) set_jump_target(taken,(int)out);
6029   }
6030   if((opcode[i]&0x3f)==6) // BLEZ
6031   {
6032     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6033     emit_cmpimm(s1l,1);
6034     if(s1h>=0) emit_mov(addr,ntaddr);
6035     emit_cmovl_reg(alt,addr);
6036     if(s1h>=0) {
6037       emit_test(s1h,s1h);
6038       emit_cmovne_reg(ntaddr,addr);
6039       emit_cmovs_reg(alt,addr);
6040     }
6041   }
6042   if((opcode[i]&0x3f)==7) // BGTZ
6043   {
6044     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6045     emit_cmpimm(s1l,1);
6046     if(s1h>=0) emit_mov(addr,alt);
6047     emit_cmovl_reg(ntaddr,addr);
6048     if(s1h>=0) {
6049       emit_test(s1h,s1h);
6050       emit_cmovne_reg(alt,addr);
6051       emit_cmovs_reg(ntaddr,addr);
6052     }
6053   }
6054   if((opcode[i]&0x3f)==0x16) // BLEZL
6055   {
6056     assert((opcode[i]&0x3f)!=0x16);
6057   }
6058   if((opcode[i]&0x3f)==0x17) // BGTZL
6059   {
6060     assert((opcode[i]&0x3f)!=0x17);
6061   }
6062   assert(opcode[i]!=1); // BLTZ/BGEZ
6063
6064   //FIXME: Check CSREG
6065   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6066     if((source[i]&0x30000)==0) // BC1F
6067     {
6068       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6069       emit_testimm(s1l,0x800000);
6070       emit_cmovne_reg(alt,addr);
6071     }
6072     if((source[i]&0x30000)==0x10000) // BC1T
6073     {
6074       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6075       emit_testimm(s1l,0x800000);
6076       emit_cmovne_reg(alt,addr);
6077     }
6078     if((source[i]&0x30000)==0x20000) // BC1FL
6079     {
6080       emit_testimm(s1l,0x800000);
6081       nottaken=(int)out;
6082       emit_jne(0);
6083     }
6084     if((source[i]&0x30000)==0x30000) // BC1TL
6085     {
6086       emit_testimm(s1l,0x800000);
6087       nottaken=(int)out;
6088       emit_jeq(0);
6089     }
6090   }
6091
6092   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6093   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6094   if(likely[i]||unconditional)
6095   {
6096     emit_movimm(ba[i],HOST_BTREG);
6097   }
6098   else if(addr!=HOST_BTREG)
6099   {
6100     emit_mov(addr,HOST_BTREG);
6101   }
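  // Emit a placeholder jump to the delay-slot entry of the next block
  // (start+i*4+5, which pagespan_ds() registers as start+1 of that block).
  // If that entry is already compiled, patch the jump directly and record a
  // link so it can be undone on invalidation; otherwise point the jump at
  // the ext-jump stub, which resolves (or compiles) the target at runtime.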
6102   void *branch_addr=out;
6103   emit_jmp(0);
6104   int target_addr=start+i*4+5;
6105   void *stub=out;
6106   void *compiled_target_addr=check_addr(target_addr);
6107   emit_extjump_ds((int)branch_addr,target_addr);
6108   if(compiled_target_addr) {
6109     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6110     add_link(target_addr,stub);
6111   }
6112   else set_jump_target((int)branch_addr,(int)stub);
6113   if(likely[i]) {
6114     // Not-taken path
6115     set_jump_target((int)nottaken,(int)out);
6116     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6117     void *branch_addr=out;
6118     emit_jmp(0);
6119     int target_addr=start+i*4+8;
6120     void *stub=out;
6121     void *compiled_target_addr=check_addr(target_addr);
6122     emit_extjump_ds((int)branch_addr,target_addr);
6123     if(compiled_target_addr) {
6124       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6125       add_link(target_addr,stub);
6126     }
6127     else set_jump_target((int)branch_addr,(int)stub);
6128   }
6129 }
6130
6131 // Assemble the delay slot for the above
6132 static void pagespan_ds()
6133 {
6134   assem_debug("initial delay slot:\n");
6135   u_int vaddr=start+1;
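  // The low bit set in vaddr tags this as the page-spanning delay-slot entry
  // (the start+i*4+5 target used by pagespan_assemble() above), keeping it
  // distinct from the block's normal entry at start.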
6136   u_int page=get_page(vaddr);
6137   u_int vpage=get_vpage(vaddr);
6138   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6139   do_dirty_stub_ds();
6140   ll_add(jump_in+page,vaddr,(void *)out);
6141   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6142   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6143     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6144   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6145     emit_writeword(HOST_BTREG,(int)&branch_target);
6146   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6147   address_generation(0,&regs[0],regs[0].regmap_entry);
6148   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6149     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6150   cop1_usable=0;
6151   is_delayslot=0;
6152   switch(itype[0]) {
6153     case ALU:
6154       alu_assemble(0,&regs[0]);break;
6155     case IMM16:
6156       imm16_assemble(0,&regs[0]);break;
6157     case SHIFT:
6158       shift_assemble(0,&regs[0]);break;
6159     case SHIFTIMM:
6160       shiftimm_assemble(0,&regs[0]);break;
6161     case LOAD:
6162       load_assemble(0,&regs[0]);break;
6163     case LOADLR:
6164       loadlr_assemble(0,&regs[0]);break;
6165     case STORE:
6166       store_assemble(0,&regs[0]);break;
6167     case STORELR:
6168       storelr_assemble(0,&regs[0]);break;
6169     case COP0:
6170       cop0_assemble(0,&regs[0]);break;
6171     case COP1:
6172       cop1_assemble(0,&regs[0]);break;
6173     case C1LS:
6174       c1ls_assemble(0,&regs[0]);break;
6175     case COP2:
6176       cop2_assemble(0,&regs[0]);break;
6177     case C2LS:
6178       c2ls_assemble(0,&regs[0]);break;
6179     case C2OP:
6180       c2op_assemble(0,&regs[0]);break;
6181     case FCONV:
6182       fconv_assemble(0,&regs[0]);break;
6183     case FLOAT:
6184       float_assemble(0,&regs[0]);break;
6185     case FCOMP:
6186       fcomp_assemble(0,&regs[0]);break;
6187     case MULTDIV:
6188       multdiv_assemble(0,&regs[0]);break;
6189     case MOV:
6190       mov_assemble(0,&regs[0]);break;
6191     case SYSCALL:
6192     case HLECALL:
6193     case INTCALL:
6194     case SPAN:
6195     case UJUMP:
6196     case RJUMP:
6197     case CJUMP:
6198     case SJUMP:
6199     case FJUMP:
6200       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6201   }
6202   int btaddr=get_reg(regs[0].regmap,BTREG);
6203   if(btaddr<0) {
6204     btaddr=get_reg(regs[0].regmap,-1);
6205     emit_readword((int)&branch_target,btaddr);
6206   }
6207   assert(btaddr!=HOST_CCREG);
6208   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6209 #ifdef HOST_IMM8
6210   emit_movimm(start+4,HOST_TEMPREG);
6211   emit_cmp(btaddr,HOST_TEMPREG);
6212 #else
6213   emit_cmpimm(btaddr,start+4);
6214 #endif
6215   int branch=(int)out;
6216   emit_jeq(0);
6217   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6218   emit_jmp(jump_vaddr_reg[btaddr]);
6219   set_jump_target(branch,(int)out);
6220   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6221   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6222 }
6223
6224 // Basic liveness analysis for MIPS registers
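// unneeded_reg[i] is a bitmap: bit r set means the value MIPS register r
// holds at instruction i is never read again before being overwritten, so it
// need not be written back.  The scan runs backwards from iend; *_upper
// tracks the upper halves of 64-bit values separately, and r limits the
// recursion depth used when re-scanning loop bodies at backward branches.
// For instance, a write to rt makes it unneeded above that point
// (u|=1LL<<rt1[i]) and a read of rs makes it needed (u&=~(1LL<<rs1[i])),
// exactly as done at the bottom of the main loop below.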
6225 void unneeded_registers(int istart,int iend,int r)
6226 {
6227   int i;
6228   uint64_t u,uu,gte_u,b,bu,gte_bu;
6229   uint64_t temp_u,temp_uu,temp_gte_u=0;
6230   uint64_t tdep;
6231   uint64_t gte_u_unknown=0;
6232   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6233     gte_u_unknown=~0ll;
6234   if(iend==slen-1) {
6235     u=1;uu=1;
6236     gte_u=gte_u_unknown;
6237   }else{
6238     u=unneeded_reg[iend+1];
6239     uu=unneeded_reg_upper[iend+1];
6240     u=1;uu=1;
6241     gte_u=gte_unneeded[iend+1];
6242   }
6243
6244   for (i=iend;i>=istart;i--)
6245   {
6246     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6247     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6248     {
6249       // If subroutine call, flag return address as a possible branch target
6250       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6251
6252       if(ba[i]<start || ba[i]>=(start+slen*4))
6253       {
6254         // Branch out of this block, flush all regs
6255         u=1;
6256         uu=1;
6257         gte_u=gte_u_unknown;
6258         /* Hexagon hack
6259         if(itype[i]==UJUMP&&rt1[i]==31)
6260         {
6261           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6262         }
6263         if(itype[i]==RJUMP&&rs1[i]==31)
6264         {
6265           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6266         }
6267         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6268           if(itype[i]==UJUMP&&rt1[i]==31)
6269           {
6270             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6271             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6272           }
6273           if(itype[i]==RJUMP&&rs1[i]==31)
6274           {
6275             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6276             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6277           }
6278         }*/
6279         branch_unneeded_reg[i]=u;
6280         branch_unneeded_reg_upper[i]=uu;
6281         // Merge in delay slot
6282         tdep=(~uu>>rt1[i+1])&1;
6283         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6284         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6285         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6286         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6287         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6288         u|=1;uu|=1;
6289         gte_u|=gte_rt[i+1];
6290         gte_u&=~gte_rs[i+1];
6291         // If branch is "likely" (and conditional)
6292         // then we skip the delay slot on the fall-thru path
6293         if(likely[i]) {
6294           if(i<slen-1) {
6295             u&=unneeded_reg[i+2];
6296             uu&=unneeded_reg_upper[i+2];
6297             gte_u&=gte_unneeded[i+2];
6298           }
6299           else
6300           {
6301             u=1;
6302             uu=1;
6303             gte_u=gte_u_unknown;
6304           }
6305         }
6306       }
6307       else
6308       {
6309         // Internal branch, flag target
6310         bt[(ba[i]-start)>>2]=1;
6311         if(ba[i]<=start+i*4) {
6312           // Backward branch
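          // Liveness at the loop head depends on the loop body, so the range
          // [branch target, i-1] is re-scanned with a fresh exit set.  The
          // r<2 cap below keeps deeply nested loops from blowing up compile
          // time; beyond it, everything is conservatively treated as needed.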
6313           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6314           {
6315             // Unconditional branch
6316             temp_u=1;temp_uu=1;
6317             temp_gte_u=0;
6318           } else {
6319             // Conditional branch (not taken case)
6320             temp_u=unneeded_reg[i+2];
6321             temp_uu=unneeded_reg_upper[i+2];
6322             temp_gte_u&=gte_unneeded[i+2];
6323           }
6324           // Merge in delay slot
6325           tdep=(~temp_uu>>rt1[i+1])&1;
6326           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6327           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6328           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6329           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6330           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6331           temp_u|=1;temp_uu|=1;
6332           temp_gte_u|=gte_rt[i+1];
6333           temp_gte_u&=~gte_rs[i+1];
6334           // If branch is "likely" (and conditional)
6335           // then we skip the delay slot on the fall-thru path
6336           if(likely[i]) {
6337             if(i<slen-1) {
6338               temp_u&=unneeded_reg[i+2];
6339               temp_uu&=unneeded_reg_upper[i+2];
6340               temp_gte_u&=gte_unneeded[i+2];
6341             }
6342             else
6343             {
6344               temp_u=1;
6345               temp_uu=1;
6346               temp_gte_u=gte_u_unknown;
6347             }
6348           }
6349           tdep=(~temp_uu>>rt1[i])&1;
6350           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6351           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6352           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6353           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6354           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6355           temp_u|=1;temp_uu|=1;
6356           temp_gte_u|=gte_rt[i];
6357           temp_gte_u&=~gte_rs[i];
6358           unneeded_reg[i]=temp_u;
6359           unneeded_reg_upper[i]=temp_uu;
6360           gte_unneeded[i]=temp_gte_u;
6361           // Only go three levels deep.  This recursion can take an
6362           // excessive amount of time if there are a lot of nested loops.
6363           if(r<2) {
6364             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6365           }else{
6366             unneeded_reg[(ba[i]-start)>>2]=1;
6367             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6368             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6369           }
6370         } /*else*/ if(1) {
6371           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6372           {
6373             // Unconditional branch
6374             u=unneeded_reg[(ba[i]-start)>>2];
6375             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6376             gte_u=gte_unneeded[(ba[i]-start)>>2];
6377             branch_unneeded_reg[i]=u;
6378             branch_unneeded_reg_upper[i]=uu;
6379         //u=1;
6380         //uu=1;
6381         //branch_unneeded_reg[i]=u;
6382         //branch_unneeded_reg_upper[i]=uu;
6383             // Merge in delay slot
6384             tdep=(~uu>>rt1[i+1])&1;
6385             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6386             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6387             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6388             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6389             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6390             u|=1;uu|=1;
6391             gte_u|=gte_rt[i+1];
6392             gte_u&=~gte_rs[i+1];
6393           } else {
6394             // Conditional branch
6395             b=unneeded_reg[(ba[i]-start)>>2];
6396             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6397             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6398             branch_unneeded_reg[i]=b;
6399             branch_unneeded_reg_upper[i]=bu;
6400         //b=1;
6401         //bu=1;
6402         //branch_unneeded_reg[i]=b;
6403         //branch_unneeded_reg_upper[i]=bu;
6404             // Branch delay slot
6405             tdep=(~uu>>rt1[i+1])&1;
6406             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6407             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6408             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6409             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6410             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6411             b|=1;bu|=1;
6412             gte_bu|=gte_rt[i+1];
6413             gte_bu&=~gte_rs[i+1];
6414             // If branch is "likely" then we skip the
6415             // delay slot on the fall-thru path
6416             if(likely[i]) {
6417               u=b;
6418               uu=bu;
6419               gte_u=gte_bu;
6420               if(i<slen-1) {
6421                 u&=unneeded_reg[i+2];
6422                 uu&=unneeded_reg_upper[i+2];
6423                 gte_u&=gte_unneeded[i+2];
6424         //u=1;
6425         //uu=1;
6426               }
6427             } else {
6428               u&=b;
6429               uu&=bu;
6430               gte_u&=gte_bu;
6431         //u=1;
6432         //uu=1;
6433             }
6434             if(i<slen-1) {
6435               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6436               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6437         //branch_unneeded_reg[i]=1;
6438         //branch_unneeded_reg_upper[i]=1;
6439             } else {
6440               branch_unneeded_reg[i]=1;
6441               branch_unneeded_reg_upper[i]=1;
6442             }
6443           }
6444         }
6445       }
6446     }
6447     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6448     {
6449       // SYSCALL instruction (software interrupt)
6450       u=1;
6451       uu=1;
6452     }
6453     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6454     {
6455       // ERET instruction (return from interrupt)
6456       u=1;
6457       uu=1;
6458     }
6459     //u=uu=1; // DEBUG
6460     tdep=(~uu>>rt1[i])&1;
6461     // Written registers are unneeded
6462     u|=1LL<<rt1[i];
6463     u|=1LL<<rt2[i];
6464     uu|=1LL<<rt1[i];
6465     uu|=1LL<<rt2[i];
6466     gte_u|=gte_rt[i];
6467     // Accessed registers are needed
6468     u&=~(1LL<<rs1[i]);
6469     u&=~(1LL<<rs2[i]);
6470     uu&=~(1LL<<us1[i]);
6471     uu&=~(1LL<<us2[i]);
6472     gte_u&=~gte_rs[i];
6473     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
6474       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
6475     // Source-target dependencies
6476     uu&=~(tdep<<dep1[i]);
6477     uu&=~(tdep<<dep2[i]);
6478     // R0 is always unneeded
6479     u|=1;uu|=1;
6480     // Save it
6481     unneeded_reg[i]=u;
6482     unneeded_reg_upper[i]=uu;
6483     gte_unneeded[i]=gte_u;
6484     /*
6485     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6486     printf("U:");
6487     int r;
6488     for(r=1;r<=CCREG;r++) {
6489       if((unneeded_reg[i]>>r)&1) {
6490         if(r==HIREG) printf(" HI");
6491         else if(r==LOREG) printf(" LO");
6492         else printf(" r%d",r);
6493       }
6494     }
6495     printf(" UU:");
6496     for(r=1;r<=CCREG;r++) {
6497       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6498         if(r==HIREG) printf(" HI");
6499         else if(r==LOREG) printf(" LO");
6500         else printf(" r%d",r);
6501       }
6502     }
6503     printf("\n");*/
6504   }
6505   for (i=iend;i>=istart;i--)
6506   {
6507     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6508   }
6509 }
6510
6511 // Write back dirty registers as soon as we will no longer modify them,
6512 // so that we don't end up with lots of writes at the branches.
6513 void clean_registers(int istart,int iend,int wr)
6514 {
6515   int i;
6516   int r;
6517   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6518   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
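  // will_dirty / wont_dirty are per-host-register bitmaps propagated
  // backwards through the block.  They record which registers are still
  // going to be modified on the paths ahead, letting the assembler schedule
  // the writeback of a dirty register at the earliest point where it is
  // known not to change again, instead of flushing everything at each branch.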
6519   if(iend==slen-1) {
6520     will_dirty_i=will_dirty_next=0;
6521     wont_dirty_i=wont_dirty_next=0;
6522   }else{
6523     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6524     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6525   }
6526   for (i=iend;i>=istart;i--)
6527   {
6528     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6529     {
6530       if(ba[i]<start || ba[i]>=(start+slen*4))
6531       {
6532         // Branch out of this block, flush all regs
6533         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6534         {
6535           // Unconditional branch
6536           will_dirty_i=0;
6537           wont_dirty_i=0;
6538           // Merge in delay slot (will dirty)
6539           for(r=0;r<HOST_REGS;r++) {
6540             if(r!=EXCLUDE_REG) {
6541               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6542               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6543               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6544               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6545               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6546               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6547               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6548               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6549               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6550               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6551               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6552               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6553               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6554               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6555             }
6556           }
6557         }
6558         else
6559         {
6560           // Conditional branch
6561           will_dirty_i=0;
6562           wont_dirty_i=wont_dirty_next;
6563           // Merge in delay slot (will dirty)
6564           for(r=0;r<HOST_REGS;r++) {
6565             if(r!=EXCLUDE_REG) {
6566               if(!likely[i]) {
6567                 // Might not dirty if likely branch is not taken
6568                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6569                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6570                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6571                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6572                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6573                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6574                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6575                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6576                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6577                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6578                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6579                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6580                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6581                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6582               }
6583             }
6584           }
6585         }
6586         // Merge in delay slot (wont dirty)
6587         for(r=0;r<HOST_REGS;r++) {
6588           if(r!=EXCLUDE_REG) {
6589             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6590             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6591             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6592             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6593             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6594             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6595             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6596             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6597             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6598             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6599           }
6600         }
6601         if(wr) {
6602           #ifndef DESTRUCTIVE_WRITEBACK
6603           branch_regs[i].dirty&=wont_dirty_i;
6604           #endif
6605           branch_regs[i].dirty|=will_dirty_i;
6606         }
6607       }
6608       else
6609       {
6610         // Internal branch
6611         if(ba[i]<=start+i*4) {
6612           // Backward branch
6613           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6614           {
6615             // Unconditional branch
6616             temp_will_dirty=0;
6617             temp_wont_dirty=0;
6618             // Merge in delay slot (will dirty)
6619             for(r=0;r<HOST_REGS;r++) {
6620               if(r!=EXCLUDE_REG) {
6621                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6622                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6623                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6624                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6625                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6626                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6627                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6628                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6629                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6630                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6631                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6632                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6633                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6634                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6635               }
6636             }
6637           } else {
6638             // Conditional branch (not taken case)
6639             temp_will_dirty=will_dirty_next;
6640             temp_wont_dirty=wont_dirty_next;
6641             // Merge in delay slot (will dirty)
6642             for(r=0;r<HOST_REGS;r++) {
6643               if(r!=EXCLUDE_REG) {
6644                 if(!likely[i]) {
6645                   // Will not dirty if likely branch is not taken
6646                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6647                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6648                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6649                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6650                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6651                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6652                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6653                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6654                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6655                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6656                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6657                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6658                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6659                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6660                 }
6661               }
6662             }
6663           }
6664           // Merge in delay slot (won't dirty)
6665           for(r=0;r<HOST_REGS;r++) {
6666             if(r!=EXCLUDE_REG) {
6667               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6668               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6669               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6670               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6671               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6672               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6673               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6674               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6675               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6676               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6677             }
6678           }
6679           // Deal with changed mappings
6680           if(i<iend) {
6681             for(r=0;r<HOST_REGS;r++) {
6682               if(r!=EXCLUDE_REG) {
6683                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6684                   temp_will_dirty&=~(1<<r);
6685                   temp_wont_dirty&=~(1<<r);
6686                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6687                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6688                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6689                   } else {
6690                     temp_will_dirty|=1<<r;
6691                     temp_wont_dirty|=1<<r;
6692                   }
6693                 }
6694               }
6695             }
6696           }
6697           if(wr) {
6698             will_dirty[i]=temp_will_dirty;
6699             wont_dirty[i]=temp_wont_dirty;
6700             clean_registers((ba[i]-start)>>2,i-1,0);
6701           }else{
6702             // Limit recursion.  It can take an excessive amount
6703             // of time if there are a lot of nested loops.
6704             will_dirty[(ba[i]-start)>>2]=0;
6705             wont_dirty[(ba[i]-start)>>2]=-1;
6706           }
6707         }
6708         /*else*/ if(1)
6709         {
6710           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6711           {
6712             // Unconditional branch
6713             will_dirty_i=0;
6714             wont_dirty_i=0;
6715           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6716             for(r=0;r<HOST_REGS;r++) {
6717               if(r!=EXCLUDE_REG) {
6718                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6719                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6720                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6721                 }
6722                 if(branch_regs[i].regmap[r]>=0) {
6723                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6724                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6725                 }
6726               }
6727             }
6728           //}
6729             // Merge in delay slot
6730             for(r=0;r<HOST_REGS;r++) {
6731               if(r!=EXCLUDE_REG) {
6732                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6733                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6734                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6735                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6736                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6737                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6738                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6739                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6740                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6741                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6742                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6743                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6744                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6745                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6746               }
6747             }
6748           } else {
6749             // Conditional branch
6750             will_dirty_i=will_dirty_next;
6751             wont_dirty_i=wont_dirty_next;
6752           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6753             for(r=0;r<HOST_REGS;r++) {
6754               if(r!=EXCLUDE_REG) {
6755                 signed char target_reg=branch_regs[i].regmap[r];
6756                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6757                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6758                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6759                 }
6760                 else if(target_reg>=0) {
6761                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6762                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6763                 }
6764                 // Treat delay slot as part of branch too
6765                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6766                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6767                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6768                 }
6769                 else
6770                 {
6771                   will_dirty[i+1]&=~(1<<r);
6772                 }*/
6773               }
6774             }
6775           //}
6776             // Merge in delay slot
6777             for(r=0;r<HOST_REGS;r++) {
6778               if(r!=EXCLUDE_REG) {
6779                 if(!likely[i]) {
6780                   // Might not dirty if likely branch is not taken
6781                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6782                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6783                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6784                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6785                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6786                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6787                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6788                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6789                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6790                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6791                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6792                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6793                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6794                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6795                 }
6796               }
6797             }
6798           }
6799           // Merge in delay slot (won't dirty)
6800           for(r=0;r<HOST_REGS;r++) {
6801             if(r!=EXCLUDE_REG) {
6802               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6803               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6804               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6805               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6806               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6807               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6808               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6809               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6810               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6811               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6812             }
6813           }
6814           if(wr) {
6815             #ifndef DESTRUCTIVE_WRITEBACK
6816             branch_regs[i].dirty&=wont_dirty_i;
6817             #endif
6818             branch_regs[i].dirty|=will_dirty_i;
6819           }
6820         }
6821       }
6822     }
6823     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6824     {
6825       // SYSCALL instruction (software interrupt)
6826       will_dirty_i=0;
6827       wont_dirty_i=0;
6828     }
6829     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6830     {
6831       // ERET instruction (return from interrupt)
6832       will_dirty_i=0;
6833       wont_dirty_i=0;
6834     }
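    // Hand the accumulated sets to the next iteration.  This scan appears to
    // walk the block backwards (from iend down to istart), so
    // will_dirty_next/wont_dirty_next describe the instruction that follows
    // the current one in program order.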
6835     will_dirty_next=will_dirty_i;
6836     wont_dirty_next=wont_dirty_i;
6837     for(r=0;r<HOST_REGS;r++) {
6838       if(r!=EXCLUDE_REG) {
6839         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6840         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6841         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6842         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6843         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6844         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6845         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6846         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6847         if(i>istart) {
6848           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
6849           {
6850             // Don't store a register immediately after writing it,
6851             // as doing so may prevent dual-issue.
6852             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6853             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6854           }
6855         }
6856       }
6857     }
6858     // Save it
6859     will_dirty[i]=will_dirty_i;
6860     wont_dirty[i]=wont_dirty_i;
6861     // Mark registers that won't be dirtied as not dirty
6862     if(wr) {
6863       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6864       for(r=0;r<HOST_REGS;r++) {
6865         if((will_dirty_i>>r)&1) {
6866           printf(" r%d",r);
6867         }
6868       }
6869       printf("\n");*/
6870
6871       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
6872         regs[i].dirty|=will_dirty_i;
6873         #ifndef DESTRUCTIVE_WRITEBACK
6874         regs[i].dirty&=wont_dirty_i;
6875         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6876         {
6877           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
6878             for(r=0;r<HOST_REGS;r++) {
6879               if(r!=EXCLUDE_REG) {
6880                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6881                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6882                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6883               }
6884             }
6885           }
6886         }
6887         else
6888         {
6889           if(i<iend) {
6890             for(r=0;r<HOST_REGS;r++) {
6891               if(r!=EXCLUDE_REG) {
6892                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6893                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6894                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6895               }
6896             }
6897           }
6898         }
6899         #endif
6900       //}
6901     }
6902     // Deal with changed mappings
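    // If a host register still holds the same guest register it held on entry
    // to this instruction (regmap_pre), its dirty bits are kept; if the guest
    // register was moved to a different host register, the bits follow it;
    // otherwise the bits are cleared (guest registers marked unneeded at this
    // point get their bits set instead, since their value no longer matters).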
6903     temp_will_dirty=will_dirty_i;
6904     temp_wont_dirty=wont_dirty_i;
6905     for(r=0;r<HOST_REGS;r++) {
6906       if(r!=EXCLUDE_REG) {
6907         int nr;
6908         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6909           if(wr) {
6910             #ifndef DESTRUCTIVE_WRITEBACK
6911             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6912             #endif
6913             regs[i].wasdirty|=will_dirty_i&(1<<r);
6914           }
6915         }
6916         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6917           // Register moved to a different register
6918           will_dirty_i&=~(1<<r);
6919           wont_dirty_i&=~(1<<r);
6920           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6921           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6922           if(wr) {
6923             #ifndef DESTRUCTIVE_WRITEBACK
6924             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6925             #endif
6926             regs[i].wasdirty|=will_dirty_i&(1<<r);
6927           }
6928         }
6929         else {
6930           will_dirty_i&=~(1<<r);
6931           wont_dirty_i&=~(1<<r);
6932           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6933             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6934             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6935           } else {
6936             wont_dirty_i|=1<<r;
6937             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
6938           }
6939         }
6940       }
6941     }
6942   }
6943 }
6944
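/* Debug disassembler: prints one line per decoded instruction.  It is only
 * compiled in when DISASM is defined; normal builds get the empty stub
 * below. */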
6945 #ifdef DISASM
6946   /* disassembly */
6947 void disassemble_inst(int i)
6948 {
6949     if (bt[i]) printf("*"); else printf(" ");
6950     switch(itype[i]) {
6951       case UJUMP:
6952         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6953       case CJUMP:
6954         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
6955       case SJUMP:
6956         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
6957       case FJUMP:
6958         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6959       case RJUMP:
6960         if (opcode[i]==0x9&&rt1[i]!=31)
6961           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
6962         else
6963           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6964         break;
6965       case SPAN:
6966         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
6967       case IMM16:
6968         if(opcode[i]==0xf) //LUI
6969           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
6970         else
6971           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6972         break;
6973       case LOAD:
6974       case LOADLR:
6975         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6976         break;
6977       case STORE:
6978       case STORELR:
6979         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
6980         break;
6981       case ALU:
6982       case SHIFT:
6983         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
6984         break;
6985       case MULTDIV:
6986         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
6987         break;
6988       case SHIFTIMM:
6989         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6990         break;
6991       case MOV:
6992         if((opcode2[i]&0x1d)==0x10)
6993           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
6994         else if((opcode2[i]&0x1d)==0x11)
6995           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6996         else
6997           printf (" %x: %s\n",start+i*4,insn[i]);
6998         break;
6999       case COP0:
7000         if(opcode2[i]==0)
7001           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7002         else if(opcode2[i]==4)
7003           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7004         else printf (" %x: %s\n",start+i*4,insn[i]);
7005         break;
7006       case COP1:
7007         if(opcode2[i]<3)
7008           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7009         else if(opcode2[i]>3)
7010           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7011         else printf (" %x: %s\n",start+i*4,insn[i]);
7012         break;
7013       case COP2:
7014         if(opcode2[i]<3)
7015           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7016         else if(opcode2[i]>3)
7017           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7018         else printf (" %x: %s\n",start+i*4,insn[i]);
7019         break;
7020       case C1LS:
7021         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7022         break;
7023       case C2LS:
7024         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7025         break;
7026       case INTCALL:
7027         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7028         break;
7029       default:
7030         //printf (" %s %8x\n",insn[i],source[i]);
7031         printf (" %x: %s\n",start+i*4,insn[i]);
7032     }
7033 }
7034 #else
7035 static void disassemble_inst(int i) {}
7036 #endif // DISASM
7037
7038 #define DRC_TEST_VAL 0x74657374
7039
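// Sanity check for the translation cache: emit a tiny stub that loads
// DRC_TEST_VAL (0x74657374, "test" in ASCII) and returns, then call it.
// If the value comes back intact, code written to the cache really is
// executable on this system.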
7040 static int new_dynarec_test(void)
7041 {
7042   int (*testfunc)(void) = (void *)out;
7043   void *beginning;
7044   int ret;
7045
7046   beginning = start_block();
7047   emit_movimm(DRC_TEST_VAL,0); // test
7048   emit_jmpreg(14);
7049   literal_pool(0);
7050   end_block(beginning);
7051   SysPrintf("testing if we can run recompiled code..\n");
7052   ret = testfunc();
7053   if (ret == DRC_TEST_VAL)
7054     SysPrintf("test passed.\n");
7055   else
7056     SysPrintf("test failed: %08x\n", ret);
7057   out=(u_char *)BASE_ADDR;
7058   return ret == DRC_TEST_VAL;
7059 }
7060
7061 // clear the state completely, instead of just marking
7062 // things invalid like invalidate_all_pages() does
7063 void new_dynarec_clear_full(void)
7064 {
7065   int n;
7066   out=(u_char *)BASE_ADDR;
7067   memset(invalid_code,1,sizeof(invalid_code));
7068   memset(hash_table,0xff,sizeof(hash_table));
7069   memset(mini_ht,-1,sizeof(mini_ht));
7070   memset(restore_candidate,0,sizeof(restore_candidate));
7071   memset(shadow,0,sizeof(shadow));
7072   copy=shadow;
7073   expirep=16384; // Expiry pointer, +2 blocks
7074   pending_exception=0;
7075   literalcount=0;
7076   stop_after_jal=0;
7077   inv_code_start=inv_code_end=~0;
7078   // Clear the block lookup tables
7079   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7080   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7081   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7082 }
7083
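// One-time initialization: obtain an executable translation cache (fixed
// mmap(), Vita VM block, VirtualAlloc() or plain mprotect() of a static
// buffer, depending on the build), reset the recompiler state and run the
// self-test above.  On 3DS, check_rosalina() is called first; presumably it
// probes for the kernel support needed to make the cache executable.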
7084 void new_dynarec_init(void)
7085 {
7086   SysPrintf("Init new dynarec\n");
7087
7088 #ifdef _3DS
7089   check_rosalina();
7090 #endif
7091
7092   // allocate/prepare a buffer for translation cache
7093   // see assem_arm.h for some explanation
7094 #if   defined(BASE_ADDR_FIXED)
7095   if (mmap (translation_cache, 1 << TARGET_SIZE_2,
7096         PROT_READ | PROT_WRITE | PROT_EXEC,
7097         MAP_PRIVATE | MAP_ANONYMOUS,
7098         -1, 0) != translation_cache)
7099   {
7100     SysPrintf("mmap() failed: %s\n", strerror(errno));
7101     SysPrintf("disable BASE_ADDR_FIXED and recompile\n");
7102     abort();
7103   }
7104 #elif defined(BASE_ADDR_DYNAMIC)
7105 #ifdef VITA
7106   sceBlock = getVMBlock();//sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
7107   if (sceBlock < 0)
7108     SysPrintf("sceKernelAllocMemBlockForVM failed\n");
7109   int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache);
7110   if (ret < 0)
7111     SysPrintf("sceKernelGetMemBlockBase failed\n");
7112     
7113   sceKernelOpenVMDomain();
7114   sceClibPrintf("translation_cache = 0x%08X \n ", translation_cache);
7115 #elif defined(_MSC_VER)
7116   base_addr = VirtualAlloc(NULL, 1<<TARGET_SIZE_2, MEM_COMMIT | MEM_RESERVE,
7117       PAGE_EXECUTE_READWRITE);
7118 #else
7119   translation_cache = mmap (NULL, 1 << TARGET_SIZE_2,
7120       PROT_READ | PROT_WRITE | PROT_EXEC,
7121       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
7122   if (translation_cache == MAP_FAILED) {
7123     SysPrintf("mmap() failed: %s\n", strerror(errno));
7124     abort();
7125   }
7126 #endif
7127 #else
7128 #ifndef NO_WRITE_EXEC
7129   // not all systems allow execute in data segment by default
7130   if (mprotect((void *)BASE_ADDR, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
7131     SysPrintf("mprotect() failed: %s\n", strerror(errno));
7132 #endif
7133 #endif
7134
7135   out=(u_char *)BASE_ADDR;
7136   cycle_multiplier=200;
7137   new_dynarec_clear_full();
7138 #ifdef HOST_IMM8
7139   // Copy this into local area so we don't have to put it in every literal pool
7140   invc_ptr=invalid_code;
7141 #endif
7142   arch_init();
7143   new_dynarec_test();
7144 #ifndef RAM_FIXED
7145   ram_offset=(u_int)rdram-0x80000000;
7146 #endif
7147   if (ram_offset!=0)
7148     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
7149 }
7150
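// Shutdown: unmap/free the translation cache where one was allocated in
// new_dynarec_init() and clear all block lists.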
7151 void new_dynarec_cleanup(void)
7152 {
7153   int n;
7154 #if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC)
7155 #ifndef VITA
7156 #if defined(_MSC_VER)
7157   VirtualFree(base_addr, 0, MEM_RELEASE);
7158 #else
7159   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0)
7160     SysPrintf("munmap() failed\n");
7161 #endif
7162 #endif
7163 #endif
7164   for(n=0;n<4096;n++)
7165     ll_clear(jump_in+n);
7166   for(n=0;n<4096;n++)
7167     ll_clear(jump_out+n);
7168   for(n=0;n<4096;n++)
7169     ll_clear(jump_dirty+n);
7170 #ifdef ROM_COPY
7171   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
7172 #endif
7173 }
7174
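// Map a PSX code address to a host pointer into the emulated memory (RAM
// mirrors or BIOS ROM) and report in *limit where that contiguous region
// ends.  Returns NULL for addresses code cannot be fetched from.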
7175 static u_int *get_source_start(u_int addr, u_int *limit)
7176 {
7177   if (addr < 0x00200000 ||
7178     (0xa0000000 <= addr && addr < 0xa0200000)) {
7179     // used for BIOS calls mostly?
7180     *limit = (addr&0xa0000000)|0x00200000;
7181     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7182   }
7183   else if (!Config.HLE && (
7184     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7185     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7186     // BIOS
7187     *limit = (addr & 0xfff00000) | 0x80000;
7188     return (u_int *)((u_int)psxR + (addr&0x7ffff));
7189   }
7190   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
7191     *limit = (addr & 0x80600000) + 0x00200000;
7192     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7193   }
7194   return NULL;
7195 }
7196
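// Heuristic helper for the savestate code below: scan forward (at most
// 0x1000 bytes) for a "jr $ra" (0x03e00008) and return the address just
// past its delay slot, i.e. a rough guess at where the routine containing
// 'addr' ends.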
7197 static u_int scan_for_ret(u_int addr)
7198 {
7199   u_int limit = 0;
7200   u_int *mem;
7201
7202   mem = get_source_start(addr, &limit);
7203   if (mem == NULL)
7204     return addr;
7205
7206   if (limit > addr + 0x1000)
7207     limit = addr + 0x1000;
7208   for (; addr < limit; addr += 4, mem++) {
7209     if (*mem == 0x03e00008) // jr $ra
7210       return addr + 8;
7211   }
7212   return addr;
7213 }
7214
7215 struct savestate_block {
7216   uint32_t addr;
7217   uint32_t regflags;
7218 };
7219
7220 static int addr_cmp(const void *p1_, const void *p2_)
7221 {
7222   const struct savestate_block *p1 = p1_, *p2 = p2_;
7223   return p1->addr - p2->addr;
7224 }
7225
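// Savestate support: walk the jump_in lists, collect each block's start
// address and register speculation flags, sort and drop entries that fall
// inside an earlier block, and pack as many as fit into 'save'.  Returns
// the number of bytes written.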
7226 int new_dynarec_save_blocks(void *save, int size)
7227 {
7228   struct savestate_block *blocks = save;
7229   int maxcount = size / sizeof(blocks[0]);
7230   struct savestate_block tmp_blocks[1024];
7231   struct ll_entry *head;
7232   int p, s, d, o, bcnt;
7233   u_int addr;
7234
7235   o = 0;
7236   for (p = 0; p < sizeof(jump_in) / sizeof(jump_in[0]); p++) {
7237     bcnt = 0;
7238     for (head = jump_in[p]; head != NULL; head = head->next) {
7239       tmp_blocks[bcnt].addr = head->vaddr;
7240       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
7241       bcnt++;
7242     }
7243     if (bcnt < 1)
7244       continue;
7245     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
7246
7247     addr = tmp_blocks[0].addr;
7248     for (s = d = 0; s < bcnt; s++) {
7249       if (tmp_blocks[s].addr < addr)
7250         continue;
7251       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
7252         tmp_blocks[d++] = tmp_blocks[s];
7253       addr = scan_for_ret(tmp_blocks[s].addr);
7254     }
7255
7256     if (o + d > maxcount)
7257       d = maxcount - o;
7258     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
7259     o += d;
7260   }
7261
7262   return o * sizeof(blocks[0]);
7263 }
7264
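// Counterpart of new_dynarec_save_blocks(): after a savestate is loaded,
// force the recorded blocks to be recompiled by calling get_addr() on each
// of them.  GPRs are temporarily set to 0x80000000 (RAM) or, for registers
// flagged in regflags, 0x1f800000 (scratchpad) so that address speculation
// resolves roughly the way it did when the blocks were first compiled; the
// real register values are restored afterwards.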
7265 void new_dynarec_load_blocks(const void *save, int size)
7266 {
7267   const struct savestate_block *blocks = save;
7268   int count = size / sizeof(blocks[0]);
7269   u_int regs_save[32];
7270   uint32_t f;
7271   int i, b;
7272
7273   get_addr(psxRegs.pc);
7274
7275   // change GPRs for speculation to at least partially work..
7276   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
7277   for (i = 1; i < 32; i++)
7278     psxRegs.GPR.r[i] = 0x80000000;
7279
7280   for (b = 0; b < count; b++) {
7281     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7282       if (f & 1)
7283         psxRegs.GPR.r[i] = 0x1f800000;
7284     }
7285
7286     get_addr(blocks[b].addr);
7287
7288     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7289       if (f & 1)
7290         psxRegs.GPR.r[i] = 0x80000000;
7291     }
7292   }
7293
7294   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
7295 }
7296
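// Main entry point of the recompiler: translate the guest block starting at
// 'addr'.  The HLE call stub at 0x80001000 and unmappable addresses are
// handled up front; everything else goes through the passes listed below,
// ending with native code emitted into the translation cache.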
7297 int new_recompile_block(int addr)
7298 {
7299   u_int pagelimit = 0;
7300   u_int state_rflags = 0;
7301   int i;
7302
7303   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7304   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7305   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7306   //if(debug)
7307   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7308   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7309   /*if(Count>=312978186) {
7310     rlist();
7311   }*/
7312   //rlist();
7313
7314   // this is just for speculation
7315   for (i = 1; i < 32; i++) {
7316     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
7317       state_rflags |= 1 << i;
7318   }
7319
7320   start = (u_int)addr&~3;
7321   //assert(((u_int)addr&1)==0);
7322   new_dynarec_did_compile=1;
7323   if (Config.HLE && start == 0x80001000) // hlecall
7324   {
7325     // XXX: is this enough? Maybe check hleSoftCall?
7326     void *beginning=start_block();
7327     u_int page=get_page(start);
7328
7329     invalid_code[start>>12]=0;
7330     emit_movimm(start,0);
7331     emit_writeword(0,(int)&pcaddr);
7332     emit_jmp((int)new_dyna_leave);
7333     literal_pool(0);
7334     end_block(beginning);
7335     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
7336     return 0;
7337   }
7338
7339   source = get_source_start(start, &pagelimit);
7340   if (source == NULL) {
7341     SysPrintf("Compile at bogus memory address: %08x\n", addr);
7342     exit(1);
7343   }
7344
7345   /* Pass 1: disassemble */
7346   /* Pass 2: register dependencies, branch targets */
7347   /* Pass 3: register allocation */
7348   /* Pass 4: branch dependencies */
7349   /* Pass 5: pre-alloc */
7350   /* Pass 6: optimize clean/dirty state */
7351   /* Pass 7: flag 32-bit registers */
7352   /* Pass 8: assembly */
7353   /* Pass 9: linker */
7354   /* Pass 10: garbage collection / free memory */
7355
7356   int j;
7357   int done=0;
7358   unsigned int type,op,op2;
7359
7360   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7361
7362   /* Pass 1 disassembly */
7363
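  // Decode one instruction per iteration: mnemonic (insn), class (itype),
  // sub-opcode (opcode2), register operands, immediates and GTE register
  // masks.  This is also where branch targets are computed and where the
  // end of the block ('done') is decided.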
7364   for(i=0;!done;i++) {
7365     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7366     minimum_free_regs[i]=0;
7367     opcode[i]=op=source[i]>>26;
7368     switch(op)
7369     {
7370       case 0x00: strcpy(insn[i],"special"); type=NI;
7371         op2=source[i]&0x3f;
7372         switch(op2)
7373         {
7374           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7375           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7376           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7377           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7378           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7379           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7380           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7381           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7382           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7383           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7384           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7385           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7386           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7387           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7388           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7389           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7390           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7391           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7392           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7393           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7394           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7395           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7396           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7397           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7398           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7399           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7400           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7401           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7402           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7403           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7404           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7405           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7406           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7407           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7408           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7409 #if 0
7410           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7411           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7412           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7413           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7414           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7415           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7416           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7417           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7418           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7419           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7420           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7421           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7422           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7423           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7424           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7425           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7426           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7427 #endif
7428         }
7429         break;
7430       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7431         op2=(source[i]>>16)&0x1f;
7432         switch(op2)
7433         {
7434           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7435           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7436           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7437           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7438           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7439           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7440           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7441           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7442           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7443           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7444           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7445           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7446           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7447           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7448         }
7449         break;
7450       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7451       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7452       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7453       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7454       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7455       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7456       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7457       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7458       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7459       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7460       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7461       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7462       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7463       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7464       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7465         op2=(source[i]>>21)&0x1f;
7466         switch(op2)
7467         {
7468           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7469           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7470           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7471           switch(source[i]&0x3f)
7472           {
7473             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7474             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7475             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7476             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7477             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7478             //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7479           }
7480         }
7481         break;
7482       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7483         op2=(source[i]>>21)&0x1f;
7484         switch(op2)
7485         {
7486           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7487           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7488           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7489           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7490           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7491           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7492           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7493           switch((source[i]>>16)&0x3)
7494           {
7495             case 0x00: strcpy(insn[i],"BC1F"); break;
7496             case 0x01: strcpy(insn[i],"BC1T"); break;
7497             case 0x02: strcpy(insn[i],"BC1FL"); break;
7498             case 0x03: strcpy(insn[i],"BC1TL"); break;
7499           }
7500           break;
7501           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7502           switch(source[i]&0x3f)
7503           {
7504             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7505             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7506             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7507             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7508             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7509             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7510             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7511             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7512             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7513             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7514             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7515             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7516             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7517             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7518             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7519             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7520             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7521             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7522             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7523             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7524             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7525             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7526             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7527             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7528             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7529             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7530             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7531             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7532             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7533             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7534             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7535             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7536             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7537             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7538             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7539           }
7540           break;
7541           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7542           switch(source[i]&0x3f)
7543           {
7544             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7545             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7546             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7547             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7548             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7549             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7550             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7551             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7552             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7553             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7554             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7555             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7556             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7557             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7558             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7559             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7560             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7561             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7562             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7563             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7564             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7565             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7566             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7567             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7568             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7569             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7570             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7571             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7572             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7573             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7574             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7575             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7576             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7577             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7578             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7579           }
7580           break;
7581           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7582           switch(source[i]&0x3f)
7583           {
7584             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7585             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7586           }
7587           break;
7588           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7589           switch(source[i]&0x3f)
7590           {
7591             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7592             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7593           }
7594           break;
7595         }
7596         break;
7597 #if 0
7598       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7599       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7600       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7601       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7602       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7603       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7604       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7605       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7606 #endif
7607       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7608       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7609       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7610       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7611       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7612       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7613       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7614 #if 0
7615       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7616 #endif
7617       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7618       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7619       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7620       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7621 #if 0
7622       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7623       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7624 #endif
7625       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7626       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7627       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7628       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7629 #if 0
7630       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7631       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7632       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7633 #endif
7634       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7635       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7636 #if 0
7637       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7638       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7639       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7640 #endif
7641       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7642         op2=(source[i]>>21)&0x1f;
7643         //if (op2 & 0x10) {
7644         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7645           if (gte_handlers[source[i]&0x3f]!=NULL) {
7646             if (gte_regnames[source[i]&0x3f]!=NULL)
7647               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7648             else
7649               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7650             type=C2OP;
7651           }
7652         }
7653         else switch(op2)
7654         {
7655           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7656           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7657           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7658           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7659         }
7660         break;
7661       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7662       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7663       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7664       default: strcpy(insn[i],"???"); type=NI;
7665         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7666         break;
7667     }
7668     itype[i]=type;
7669     opcode2[i]=op2;
7670     /* Get registers/immediates */
7671     lt1[i]=0;
7672     us1[i]=0;
7673     us2[i]=0;
7674     dep1[i]=0;
7675     dep2[i]=0;
7676     gte_rs[i]=gte_rt[i]=0;
7677     switch(type) {
7678       case LOAD:
7679         rs1[i]=(source[i]>>21)&0x1f;
7680         rs2[i]=0;
7681         rt1[i]=(source[i]>>16)&0x1f;
7682         rt2[i]=0;
7683         imm[i]=(short)source[i];
7684         break;
7685       case STORE:
7686       case STORELR:
7687         rs1[i]=(source[i]>>21)&0x1f;
7688         rs2[i]=(source[i]>>16)&0x1f;
7689         rt1[i]=0;
7690         rt2[i]=0;
7691         imm[i]=(short)source[i];
7692         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7693         break;
7694       case LOADLR:
7695         // LWL/LWR only load part of the register,
7696         // therefore the target register must be treated as a source too
7697         rs1[i]=(source[i]>>21)&0x1f;
7698         rs2[i]=(source[i]>>16)&0x1f;
7699         rt1[i]=(source[i]>>16)&0x1f;
7700         rt2[i]=0;
7701         imm[i]=(short)source[i];
7702         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7703         if(op==0x26) dep1[i]=rt1[i]; // LWR
7704         break;
7705       case IMM16:
7706         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7707         else rs1[i]=(source[i]>>21)&0x1f;
7708         rs2[i]=0;
7709         rt1[i]=(source[i]>>16)&0x1f;
7710         rt2[i]=0;
7711         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7712           imm[i]=(unsigned short)source[i];
7713         }else{
7714           imm[i]=(short)source[i];
7715         }
7716         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7717         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7718         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7719         break;
7720       case UJUMP:
7721         rs1[i]=0;
7722         rs2[i]=0;
7723         rt1[i]=0;
7724         rt2[i]=0;
7725         // The JAL instruction writes to r31.
7726         if (op&1) {
7727           rt1[i]=31;
7728         }
7729         rs2[i]=CCREG;
7730         break;
7731       case RJUMP:
7732         rs1[i]=(source[i]>>21)&0x1f;
7733         rs2[i]=0;
7734         rt1[i]=0;
7735         rt2[i]=0;
7736         // The JALR instruction writes to rd.
7737         if (op2&1) {
7738           rt1[i]=(source[i]>>11)&0x1f;
7739         }
7740         rs2[i]=CCREG;
7741         break;
7742       case CJUMP:
7743         rs1[i]=(source[i]>>21)&0x1f;
7744         rs2[i]=(source[i]>>16)&0x1f;
7745         rt1[i]=0;
7746         rt2[i]=0;
7747         if(op&2) { // BGTZ/BLEZ
7748           rs2[i]=0;
7749         }
7750         us1[i]=rs1[i];
7751         us2[i]=rs2[i];
7752         likely[i]=op>>4;
7753         break;
7754       case SJUMP:
7755         rs1[i]=(source[i]>>21)&0x1f;
7756         rs2[i]=CCREG;
7757         rt1[i]=0;
7758         rt2[i]=0;
7759         us1[i]=rs1[i];
7760         if(op2&0x10) { // BxxAL
7761           rt1[i]=31;
7762           // NOTE: If the branch is not taken, r31 is still overwritten
7763         }
7764         likely[i]=(op2&2)>>1;
7765         break;
7766       case FJUMP:
7767         rs1[i]=FSREG;
7768         rs2[i]=CSREG;
7769         rt1[i]=0;
7770         rt2[i]=0;
7771         likely[i]=((source[i])>>17)&1;
7772         break;
7773       case ALU:
7774         rs1[i]=(source[i]>>21)&0x1f; // source
7775         rs2[i]=(source[i]>>16)&0x1f; // second operand (rt)
7776         rt1[i]=(source[i]>>11)&0x1f; // destination
7777         rt2[i]=0;
7778         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7779           us1[i]=rs1[i];us2[i]=rs2[i];
7780         }
7781         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7782           dep1[i]=rs1[i];dep2[i]=rs2[i];
7783         }
7784         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7785           dep1[i]=rs1[i];dep2[i]=rs2[i];
7786         }
7787         break;
7788       case MULTDIV:
7789         rs1[i]=(source[i]>>21)&0x1f; // source
7790         rs2[i]=(source[i]>>16)&0x1f; // second operand (multiplier/divisor)
7791         rt1[i]=HIREG;
7792         rt2[i]=LOREG;
7793         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7794           us1[i]=rs1[i];us2[i]=rs2[i];
7795         }
7796         break;
7797       case MOV:
7798         rs1[i]=0;
7799         rs2[i]=0;
7800         rt1[i]=0;
7801         rt2[i]=0;
7802         if(op2==0x10) rs1[i]=HIREG; // MFHI
7803         if(op2==0x11) rt1[i]=HIREG; // MTHI
7804         if(op2==0x12) rs1[i]=LOREG; // MFLO
7805         if(op2==0x13) rt1[i]=LOREG; // MTLO
7806         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7807         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7808         dep1[i]=rs1[i];
7809         break;
7810       case SHIFT:
7811         rs1[i]=(source[i]>>16)&0x1f; // value to be shifted
7812         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7813         rt1[i]=(source[i]>>11)&0x1f; // destination
7814         rt2[i]=0;
7815         // DSLLV/DSRLV/DSRAV are 64-bit
7816         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7817         break;
7818       case SHIFTIMM:
7819         rs1[i]=(source[i]>>16)&0x1f;
7820         rs2[i]=0;
7821         rt1[i]=(source[i]>>11)&0x1f;
7822         rt2[i]=0;
7823         imm[i]=(source[i]>>6)&0x1f;
7824         // DSxx32 instructions
7825         if(op2>=0x3c) imm[i]|=0x20;
7826         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7827         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7828         break;
7829       case COP0:
7830         rs1[i]=0;
7831         rs2[i]=0;
7832         rt1[i]=0;
7833         rt2[i]=0;
7834         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7835         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7836         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7837         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7838         break;
7839       case COP1:
7840         rs1[i]=0;
7841         rs2[i]=0;
7842         rt1[i]=0;
7843         rt2[i]=0;
7844         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7845         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7846         if(op2==5) us1[i]=rs1[i]; // DMTC1
7847         rs2[i]=CSREG;
7848         break;
7849       case COP2:
7850         rs1[i]=0;
7851         rs2[i]=0;
7852         rt1[i]=0;
7853         rt2[i]=0;
7854         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7855         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7856         rs2[i]=CSREG;
7857         int gr=(source[i]>>11)&0x1F;
7858         switch(op2)
7859         {
7860           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7861           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7862           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7863           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7864         }
7865         break;
7866       case C1LS:
7867         rs1[i]=(source[i]>>21)&0x1F;
7868         rs2[i]=CSREG;
7869         rt1[i]=0;
7870         rt2[i]=0;
7871         imm[i]=(short)source[i];
7872         break;
7873       case C2LS:
7874         rs1[i]=(source[i]>>21)&0x1F;
7875         rs2[i]=0;
7876         rt1[i]=0;
7877         rt2[i]=0;
7878         imm[i]=(short)source[i];
7879         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7880         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7881         break;
7882       case C2OP:
7883         rs1[i]=0;
7884         rs2[i]=0;
7885         rt1[i]=0;
7886         rt2[i]=0;
7887         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7888         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7889         gte_rt[i]|=1ll<<63; // every op changes flags
7890         if((source[i]&0x3f)==GTE_MVMVA) {
7891           int v = (source[i] >> 15) & 3;
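               // MVMVA only reads the vector selected by the v field, so clear
               // the vector bits (V0-V2 in r0-r5, IR1-IR3 in r9-r11) and set
               // just the ones that are actually used.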
7892           gte_rs[i]&=~0xe3fll;
7893           if(v==3) gte_rs[i]|=0xe00ll;
7894           else gte_rs[i]|=3ll<<(v*2);
7895         }
7896         break;
7897       case FLOAT:
7898       case FCONV:
7899         rs1[i]=0;
7900         rs2[i]=CSREG;
7901         rt1[i]=0;
7902         rt2[i]=0;
7903         break;
7904       case FCOMP:
7905         rs1[i]=FSREG;
7906         rs2[i]=CSREG;
7907         rt1[i]=FSREG;
7908         rt2[i]=0;
7909         break;
7910       case SYSCALL:
7911       case HLECALL:
7912       case INTCALL:
7913         rs1[i]=CCREG;
7914         rs2[i]=0;
7915         rt1[i]=0;
7916         rt2[i]=0;
7917         break;
7918       default:
7919         rs1[i]=0;
7920         rs2[i]=0;
7921         rt1[i]=0;
7922         rt2[i]=0;
7923     }
7924     /* Calculate branch target addresses */
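         // UJUMP (J/JAL):  target = ((PC+4) & 0xF0000000) | (instr_index << 2)
         // CJUMP/SJUMP/FJUMP: target = PC+4 + (sign-extended imm16 << 2)
         // Branches that can never be taken (BNE with equal registers,
         // BLTZ $zero, etc.) just fall through to PC+8; ba[i]==-1 means
         // there is no static branch target.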
7925     if(type==UJUMP)
7926       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7927     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7928       ba[i]=start+i*4+8; // Ignore never taken branch
7929     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7930       ba[i]=start+i*4+8; // Ignore never taken branch
7931     else if(type==CJUMP||type==SJUMP||type==FJUMP)
7932       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7933     else ba[i]=-1;
7934     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
7935       int do_in_intrp=0;
7936       // branch in delay slot?
7937       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7938         // not handled: compile the first branch as an interpreter call so this sequence runs interpreted if hit
7939         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7940         do_in_intrp=1;
7941       }
7942       // basic load delay detection
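           // The R3000A makes a loaded value visible only after the following
           // instruction; if the branch target immediately consumes the value
           // loaded in the delay slot, recompiled code would expose it too
           // early, so fall back to the interpreter for this sequence.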
7943       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7944         int t=(ba[i-1]-start)/4;
7945         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7946           // jump target wants DS result - potential load delay effect
7947           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7948           do_in_intrp=1;
7949           bt[t+1]=1; // expected return from interpreter
7950         }
7951         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7952               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7953           // v0 overwrite like this is a sign of trouble, bail out
7954           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7955           do_in_intrp=1;
7956         }
7957       }
7958       if(do_in_intrp) {
7959         rs1[i-1]=CCREG;
7960         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7961         ba[i-1]=-1;
7962         itype[i-1]=INTCALL;
7963         done=2;
7964         i--; // don't compile the DS
7965       }
7966     }
7967     /* Is this the end of the block? */
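         // A block normally ends after the delay slot of J, JR or an
         // unconditional "branch always" (BEQ $0,$0, upper halfword 0x1000);
         // subroutine calls are compiled past speculatively unless
         // stop_after_jal is set or a BREAK follows.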
7968     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7969       if(rt1[i-1]==0) { // Not a call (no link register), so end the block here
7970         done=2;
7971       }
7972       else {
7973         if(stop_after_jal) done=1;
7974         // Stop on BREAK
7975         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7976       }
7977       // Don't recompile stuff that's already compiled
7978       if(check_addr(start+i*4+4)) done=1;
7979       // Don't get too close to the limit
7980       if(i>MAXBLOCK/2) done=1;
7981     }
7982     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7983     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
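         // done==2 is tentative: if an earlier branch targets this
         // instruction or one of the next two, keep compiling instead.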
7984     if(done==2) {
7985       // Does the block continue due to a branch?
7986       for(j=i-1;j>=0;j--)
7987       {
7988         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7989         if(ba[j]==start+i*4+4) done=j=0;
7990         if(ba[j]==start+i*4+8) done=j=0;
7991       }
7992     }
7993     //assert(i<MAXBLOCK-1);
7994     if(start+i*4==pagelimit-4) done=1;
7995     assert(start+i*4<pagelimit);
7996     if (i==MAXBLOCK-1) done=1;
7997     // Stop if we're compiling junk
7998     if(itype[i]==NI&&opcode[i]==0x11) {
7999       done=stop_after_jal=1;
8000       SysPrintf("Disabled speculative precompilation\n");
8001     }
8002   }
8003   slen=i;
8004   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8005     if(start+i*4==pagelimit) {
8006       itype[i-1]=SPAN;
8007     }
8008   }
8009   assert(slen>0);
8010
8011   /* Pass 2 - Register dependencies and branch targets */
8012
8013   unneeded_registers(0,slen-1,0);
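       // Fills unneeded_reg[] / unneeded_reg_upper[] (and the branch_* variants):
       // bitmasks of MIPS registers whose values are dead at each instruction,
       // used below to avoid allocating and writing back dead values.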
8014
8015   /* Pass 3 - Register allocation */
8016
8017   struct regstat current; // Current register allocations/status
8018   current.is32=1;
8019   current.dirty=0;
8020   current.u=unneeded_reg[0];
8021   current.uu=unneeded_reg_upper[0];
8022   clear_all_regs(current.regmap);
8023   alloc_reg(&current,0,CCREG);
8024   dirty_reg(&current,CCREG);
8025   current.isconst=0;
8026   current.wasconst=0;
8027   current.waswritten=0;
8028   int ds=0;
8029   int cc=0;
8030   int hr=-1;
8031
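       // An odd start address marks a block that begins in a branch delay
       // slot; the branch target is supplied at run time through BTREG.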
8032   if((u_int)addr&1) {
8033     // First instruction is delay slot
8034     cc=-1;
8035     bt[1]=1;
8036     ds=1;
8037     unneeded_reg[0]=1;
8038     unneeded_reg_upper[0]=1;
8039     current.regmap[HOST_BTREG]=BTREG;
8040   }
8041
8042   for(i=0;i<slen;i++)
8043   {
8044     if(bt[i])
8045     {
8046       int hr;
8047       for(hr=0;hr<HOST_REGS;hr++)
8048       {
8049         // Is this really necessary?
8050         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8051       }
8052       current.isconst=0;
8053       current.waswritten=0;
8054     }
8055     if(i>1)
8056     {
8057       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8058       {
8059         if(rs1[i-2]==0||rs2[i-2]==0)
8060         {
8061           if(rs1[i-2]) {
8062             current.is32|=1LL<<rs1[i-2];
8063             int hr=get_reg(current.regmap,rs1[i-2]|64);
8064             if(hr>=0) current.regmap[hr]=-1;
8065           }
8066           if(rs2[i-2]) {
8067             current.is32|=1LL<<rs2[i-2];
8068             int hr=get_reg(current.regmap,rs2[i-2]|64);
8069             if(hr>=0) current.regmap[hr]=-1;
8070           }
8071         }
8072       }
8073     }
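         // The PSX has no 64-bit GPRs, so treat every register as 32-bit;
         // this supersedes the per-branch widening above.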
8074     current.is32=-1LL;
8075
8076     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8077     regs[i].wasconst=current.isconst;
8078     regs[i].was32=current.is32;
8079     regs[i].wasdirty=current.dirty;
8080     regs[i].loadedconst=0;
8081     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8082       if(i+1<slen) {
8083         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8084         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8085         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8086         current.u|=1;
8087         current.uu|=1;
8088       } else {
8089         current.u=1;
8090         current.uu=1;
8091       }
8092     } else {
8093       if(i+1<slen) {
8094         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8095         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8096         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8097         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8098         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8099         current.u|=1;
8100         current.uu|=1;
8101       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
8102     }
8103     is_ds[i]=ds;
8104     if(ds) {
8105       ds=0; // Skip delay slot, already allocated as part of branch
8106       // ...but we need to alloc it in case something jumps here
8107       if(i+1<slen) {
8108         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8109         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8110       }else{
8111         current.u=branch_unneeded_reg[i-1];
8112         current.uu=branch_unneeded_reg_upper[i-1];
8113       }
8114       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8115       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8116       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8117       current.u|=1;
8118       current.uu|=1;
8119       struct regstat temp;
8120       memcpy(&temp,&current,sizeof(current));
8121       temp.wasdirty=temp.dirty;
8122       temp.was32=temp.is32;
8123       // TODO: Take into account unconditional branches, as below
8124       delayslot_alloc(&temp,i);
8125       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8126       regs[i].wasdirty=temp.wasdirty;
8127       regs[i].was32=temp.was32;
8128       regs[i].dirty=temp.dirty;
8129       regs[i].is32=temp.is32;
8130       regs[i].isconst=0;
8131       regs[i].wasconst=0;
8132       current.isconst=0;
8133       // Create entry (branch target) regmap
8134       for(hr=0;hr<HOST_REGS;hr++)
8135       {
8136         int r=temp.regmap[hr];
8137         if(r>=0) {
8138           if(r!=regmap_pre[i][hr]) {
8139             regs[i].regmap_entry[hr]=-1;
8140           }
8141           else
8142           {
8143             if(r<64){
8144               if((current.u>>r)&1) {
8145                 regs[i].regmap_entry[hr]=-1;
8146                 regs[i].regmap[hr]=-1;
8147                 //Don't clear regs in the delay slot as the branch might need them
8148                 //current.regmap[hr]=-1;
8149               }else
8150                 regs[i].regmap_entry[hr]=r;
8151             }
8152             else {
8153               if((current.uu>>(r&63))&1) {
8154                 regs[i].regmap_entry[hr]=-1;
8155                 regs[i].regmap[hr]=-1;
8156                 //Don't clear regs in the delay slot as the branch might need them
8157                 //current.regmap[hr]=-1;
8158               }else
8159                 regs[i].regmap_entry[hr]=r;
8160             }
8161           }
8162         } else {
8163           // First instruction expects CCREG to be allocated
8164           if(i==0&&hr==HOST_CCREG)
8165             regs[i].regmap_entry[hr]=CCREG;
8166           else
8167             regs[i].regmap_entry[hr]=-1;
8168         }
8169       }
8170     }
8171     else { // Not delay slot
8172       switch(itype[i]) {
8173         case UJUMP:
8174           //current.isconst=0; // DEBUG
8175           //current.wasconst=0; // DEBUG
8176           //regs[i].wasconst=0; // DEBUG
8177           clear_const(&current,rt1[i]);
8178           alloc_cc(&current,i);
8179           dirty_reg(&current,CCREG);
8180           if (rt1[i]==31) {
8181             alloc_reg(&current,i,31);
8182             dirty_reg(&current,31);
8183             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8184             //assert(rt1[i+1]!=rt1[i]);
8185             #ifdef REG_PREFETCH
8186             alloc_reg(&current,i,PTEMP);
8187             #endif
8188             //current.is32|=1LL<<rt1[i];
8189           }
8190           ooo[i]=1;
8191           delayslot_alloc(&current,i+1);
8192           //current.isconst=0; // DEBUG
8193           ds=1;
8194           //printf("i=%d, isconst=%x\n",i,current.isconst);
8195           break;
8196         case RJUMP:
8197           //current.isconst=0;
8198           //current.wasconst=0;
8199           //regs[i].wasconst=0;
8200           clear_const(&current,rs1[i]);
8201           clear_const(&current,rt1[i]);
8202           alloc_cc(&current,i);
8203           dirty_reg(&current,CCREG);
8204           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8205             alloc_reg(&current,i,rs1[i]);
8206             if (rt1[i]!=0) {
8207               alloc_reg(&current,i,rt1[i]);
8208               dirty_reg(&current,rt1[i]);
8209               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8210               assert(rt1[i+1]!=rt1[i]);
8211               #ifdef REG_PREFETCH
8212               alloc_reg(&current,i,PTEMP);
8213               #endif
8214             }
8215             #ifdef USE_MINI_HT
8216             if(rs1[i]==31) { // JALR
8217               alloc_reg(&current,i,RHASH);
8218               #ifndef HOST_IMM_ADDR32
8219               alloc_reg(&current,i,RHTBL);
8220               #endif
8221             }
8222             #endif
8223             delayslot_alloc(&current,i+1);
8224           } else {
8225             // The delay slot overwrites our source register,
8226             // so allocate a temporary register to hold the old value.
8227             current.isconst=0;
8228             current.wasconst=0;
8229             regs[i].wasconst=0;
8230             delayslot_alloc(&current,i+1);
8231             current.isconst=0;
8232             alloc_reg(&current,i,RTEMP);
8233           }
8234           //current.isconst=0; // DEBUG
8235           ooo[i]=1;
8236           ds=1;
8237           break;
8238         case CJUMP:
8239           //current.isconst=0;
8240           //current.wasconst=0;
8241           //regs[i].wasconst=0;
8242           clear_const(&current,rs1[i]);
8243           clear_const(&current,rs2[i]);
8244           if((opcode[i]&0x3E)==4) // BEQ/BNE
8245           {
8246             alloc_cc(&current,i);
8247             dirty_reg(&current,CCREG);
8248             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8249             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8250             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8251             {
8252               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8253               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8254             }
8255             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8256                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8257               // The delay slot overwrites one of our conditions.
8258               // Allocate the branch condition registers instead.
8259               current.isconst=0;
8260               current.wasconst=0;
8261               regs[i].wasconst=0;
8262               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8263               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8264               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8265               {
8266                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8267                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8268               }
8269             }
8270             else
8271             {
8272               ooo[i]=1;
8273               delayslot_alloc(&current,i+1);
8274             }
8275           }
8276           else
8277           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8278           {
8279             alloc_cc(&current,i);
8280             dirty_reg(&current,CCREG);
8281             alloc_reg(&current,i,rs1[i]);
8282             if(!(current.is32>>rs1[i]&1))
8283             {
8284               alloc_reg64(&current,i,rs1[i]);
8285             }
8286             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8287               // The delay slot overwrites one of our conditions.
8288               // Allocate the branch condition registers instead.
8289               current.isconst=0;
8290               current.wasconst=0;
8291               regs[i].wasconst=0;
8292               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8293               if(!((current.is32>>rs1[i])&1))
8294               {
8295                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8296               }
8297             }
8298             else
8299             {
8300               ooo[i]=1;
8301               delayslot_alloc(&current,i+1);
8302             }
8303           }
8304           else
8305           // Don't alloc the delay slot yet because we might not execute it
8306           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8307           {
8308             current.isconst=0;
8309             current.wasconst=0;
8310             regs[i].wasconst=0;
8311             alloc_cc(&current,i);
8312             dirty_reg(&current,CCREG);
8313             alloc_reg(&current,i,rs1[i]);
8314             alloc_reg(&current,i,rs2[i]);
8315             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8316             {
8317               alloc_reg64(&current,i,rs1[i]);
8318               alloc_reg64(&current,i,rs2[i]);
8319             }
8320           }
8321           else
8322           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8323           {
8324             current.isconst=0;
8325             current.wasconst=0;
8326             regs[i].wasconst=0;
8327             alloc_cc(&current,i);
8328             dirty_reg(&current,CCREG);
8329             alloc_reg(&current,i,rs1[i]);
8330             if(!(current.is32>>rs1[i]&1))
8331             {
8332               alloc_reg64(&current,i,rs1[i]);
8333             }
8334           }
8335           ds=1;
8336           //current.isconst=0;
8337           break;
8338         case SJUMP:
8339           //current.isconst=0;
8340           //current.wasconst=0;
8341           //regs[i].wasconst=0;
8342           clear_const(&current,rs1[i]);
8343           clear_const(&current,rt1[i]);
8344           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8345           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8346           {
8347             alloc_cc(&current,i);
8348             dirty_reg(&current,CCREG);
8349             alloc_reg(&current,i,rs1[i]);
8350             if(!(current.is32>>rs1[i]&1))
8351             {
8352               alloc_reg64(&current,i,rs1[i]);
8353             }
8354             if (rt1[i]==31) { // BLTZAL/BGEZAL
8355               alloc_reg(&current,i,31);
8356               dirty_reg(&current,31);
8357               //#ifdef REG_PREFETCH
8358               //alloc_reg(&current,i,PTEMP);
8359               //#endif
8360               //current.is32|=1LL<<rt1[i];
8361             }
8362             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
8363                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
8364               // Allocate the branch condition registers instead.
8365               current.isconst=0;
8366               current.wasconst=0;
8367               regs[i].wasconst=0;
8368               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8369               if(!((current.is32>>rs1[i])&1))
8370               {
8371                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8372               }
8373             }
8374             else
8375             {
8376               ooo[i]=1;
8377               delayslot_alloc(&current,i+1);
8378             }
8379           }
8380           else
8381           // Don't alloc the delay slot yet because we might not execute it
8382           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8383           {
8384             current.isconst=0;
8385             current.wasconst=0;
8386             regs[i].wasconst=0;
8387             alloc_cc(&current,i);
8388             dirty_reg(&current,CCREG);
8389             alloc_reg(&current,i,rs1[i]);
8390             if(!(current.is32>>rs1[i]&1))
8391             {
8392               alloc_reg64(&current,i,rs1[i]);
8393             }
8394           }
8395           ds=1;
8396           //current.isconst=0;
8397           break;
8398         case FJUMP:
8399           current.isconst=0;
8400           current.wasconst=0;
8401           regs[i].wasconst=0;
8402           if(likely[i]==0) // BC1F/BC1T
8403           {
8404             // TODO: Theoretically we can run out of registers here on x86.
8405             // The delay slot can allocate up to six, and we need to check
8406             // CSREG before executing the delay slot.  Possibly we can drop
8407             // the cycle count and then reload it after checking that the
8408             // FPU is in a usable state, or don't do out-of-order execution.
8409             alloc_cc(&current,i);
8410             dirty_reg(&current,CCREG);
8411             alloc_reg(&current,i,FSREG);
8412             alloc_reg(&current,i,CSREG);
8413             if(itype[i+1]==FCOMP) {
8414               // The delay slot overwrites the branch condition.
8415               // Allocate the branch condition registers instead.
8416               alloc_cc(&current,i);
8417               dirty_reg(&current,CCREG);
8418               alloc_reg(&current,i,CSREG);
8419               alloc_reg(&current,i,FSREG);
8420             }
8421             else {
8422               ooo[i]=1;
8423               delayslot_alloc(&current,i+1);
8424               alloc_reg(&current,i+1,CSREG);
8425             }
8426           }
8427           else
8428           // Don't alloc the delay slot yet because we might not execute it
8429           if(likely[i]) // BC1FL/BC1TL
8430           {
8431             alloc_cc(&current,i);
8432             dirty_reg(&current,CCREG);
8433             alloc_reg(&current,i,CSREG);
8434             alloc_reg(&current,i,FSREG);
8435           }
8436           ds=1;
8437           current.isconst=0;
8438           break;
8439         case IMM16:
8440           imm16_alloc(&current,i);
8441           break;
8442         case LOAD:
8443         case LOADLR:
8444           load_alloc(&current,i);
8445           break;
8446         case STORE:
8447         case STORELR:
8448           store_alloc(&current,i);
8449           break;
8450         case ALU:
8451           alu_alloc(&current,i);
8452           break;
8453         case SHIFT:
8454           shift_alloc(&current,i);
8455           break;
8456         case MULTDIV:
8457           multdiv_alloc(&current,i);
8458           break;
8459         case SHIFTIMM:
8460           shiftimm_alloc(&current,i);
8461           break;
8462         case MOV:
8463           mov_alloc(&current,i);
8464           break;
8465         case COP0:
8466           cop0_alloc(&current,i);
8467           break;
8468         case COP1:
8469         case COP2:
8470           cop1_alloc(&current,i);
8471           break;
8472         case C1LS:
8473           c1ls_alloc(&current,i);
8474           break;
8475         case C2LS:
8476           c2ls_alloc(&current,i);
8477           break;
8478         case C2OP:
8479           c2op_alloc(&current,i);
8480           break;
8481         case FCONV:
8482           fconv_alloc(&current,i);
8483           break;
8484         case FLOAT:
8485           float_alloc(&current,i);
8486           break;
8487         case FCOMP:
8488           fcomp_alloc(&current,i);
8489           break;
8490         case SYSCALL:
8491         case HLECALL:
8492         case INTCALL:
8493           syscall_alloc(&current,i);
8494           break;
8495         case SPAN:
8496           pagespan_alloc(&current,i);
8497           break;
8498       }
8499
8500       // Drop the upper half of registers that have become 32-bit
8501       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8502       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8503         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8504         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8505         current.uu|=1;
8506       } else {
8507         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8508         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8509         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8510         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8511         current.uu|=1;
8512       }
8513
8514       // Create entry (branch target) regmap
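           // regmap_entry[] is the mapping required at the start of this
           // instruction (what a branch targeting it must provide); registers
           // first allocated by the instruction itself are not part of it.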
8515       for(hr=0;hr<HOST_REGS;hr++)
8516       {
8517         int r,or;
8518         r=current.regmap[hr];
8519         if(r>=0) {
8520           if(r!=regmap_pre[i][hr]) {
8521             // TODO: delay slot (?)
8522             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8523             if(or<0||(r&63)>=TEMPREG){
8524               regs[i].regmap_entry[hr]=-1;
8525             }
8526             else
8527             {
8528               // Just move it to a different register
8529               regs[i].regmap_entry[hr]=r;
8530               // If it was dirty before, it's still dirty
8531               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8532             }
8533           }
8534           else
8535           {
8536             // Unneeded
8537             if(r==0){
8538               regs[i].regmap_entry[hr]=0;
8539             }
8540             else
8541             if(r<64){
8542               if((current.u>>r)&1) {
8543                 regs[i].regmap_entry[hr]=-1;
8544                 //regs[i].regmap[hr]=-1;
8545                 current.regmap[hr]=-1;
8546               }else
8547                 regs[i].regmap_entry[hr]=r;
8548             }
8549             else {
8550               if((current.uu>>(r&63))&1) {
8551                 regs[i].regmap_entry[hr]=-1;
8552                 //regs[i].regmap[hr]=-1;
8553                 current.regmap[hr]=-1;
8554               }else
8555                 regs[i].regmap_entry[hr]=r;
8556             }
8557           }
8558         } else {
8559           // Branches expect CCREG to be allocated at the target
8560           if(regmap_pre[i][hr]==CCREG)
8561             regs[i].regmap_entry[hr]=CCREG;
8562           else
8563             regs[i].regmap_entry[hr]=-1;
8564         }
8565       }
8566       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8567     }
8568
8569     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
8570       current.waswritten|=1<<rs1[i-1];
8571     current.waswritten&=~(1<<rt1[i]);
8572     current.waswritten&=~(1<<rt2[i]);
8573     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8574       current.waswritten&=~(1<<rs1[i]);
8575
8576     /* Branch post-alloc */
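         // branch_regs[i-1] describes the register state on the taken path of
         // the branch at i-1 (with its delay slot allocated), while 'current'
         // continues to track the fall-through path.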
8577     if(i>0)
8578     {
8579       current.was32=current.is32;
8580       current.wasdirty=current.dirty;
8581       switch(itype[i-1]) {
8582         case UJUMP:
8583           memcpy(&branch_regs[i-1],&current,sizeof(current));
8584           branch_regs[i-1].isconst=0;
8585           branch_regs[i-1].wasconst=0;
8586           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8587           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8588           alloc_cc(&branch_regs[i-1],i-1);
8589           dirty_reg(&branch_regs[i-1],CCREG);
8590           if(rt1[i-1]==31) { // JAL
8591             alloc_reg(&branch_regs[i-1],i-1,31);
8592             dirty_reg(&branch_regs[i-1],31);
8593             branch_regs[i-1].is32|=1LL<<31;
8594           }
8595           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8596           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8597           break;
8598         case RJUMP:
8599           memcpy(&branch_regs[i-1],&current,sizeof(current));
8600           branch_regs[i-1].isconst=0;
8601           branch_regs[i-1].wasconst=0;
8602           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8603           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8604           alloc_cc(&branch_regs[i-1],i-1);
8605           dirty_reg(&branch_regs[i-1],CCREG);
8606           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8607           if(rt1[i-1]!=0) { // JALR
8608             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8609             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8610             branch_regs[i-1].is32|=1LL<<rt1[i-1];
8611           }
8612           #ifdef USE_MINI_HT
8613           if(rs1[i-1]==31) { // JALR
8614             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8615             #ifndef HOST_IMM_ADDR32
8616             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8617             #endif
8618           }
8619           #endif
8620           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8621           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8622           break;
8623         case CJUMP:
8624           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8625           {
8626             alloc_cc(&current,i-1);
8627             dirty_reg(&current,CCREG);
8628             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8629                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8630               // The delay slot overwrote one of our conditions
8631               // Delay slot goes after the test (in order)
8632               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8633               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8634               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8635               current.u|=1;
8636               current.uu|=1;
8637               delayslot_alloc(&current,i);
8638               current.isconst=0;
8639             }
8640             else
8641             {
8642               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8643               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8644               // Alloc the branch condition registers
8645               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8646               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8647               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8648               {
8649                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8650                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8651               }
8652             }
8653             memcpy(&branch_regs[i-1],&current,sizeof(current));
8654             branch_regs[i-1].isconst=0;
8655             branch_regs[i-1].wasconst=0;
8656             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8657             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8658           }
8659           else
8660           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8661           {
8662             alloc_cc(&current,i-1);
8663             dirty_reg(&current,CCREG);
8664             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8665               // The delay slot overwrote the branch condition
8666               // Delay slot goes after the test (in order)
8667               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8668               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8669               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8670               current.u|=1;
8671               current.uu|=1;
8672               delayslot_alloc(&current,i);
8673               current.isconst=0;
8674             }
8675             else
8676             {
8677               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8678               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8679               // Alloc the branch condition register
8680               alloc_reg(&current,i-1,rs1[i-1]);
8681               if(!(current.is32>>rs1[i-1]&1))
8682               {
8683                 alloc_reg64(&current,i-1,rs1[i-1]);
8684               }
8685             }
8686             memcpy(&branch_regs[i-1],&current,sizeof(current));
8687             branch_regs[i-1].isconst=0;
8688             branch_regs[i-1].wasconst=0;
8689             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8690             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8691           }
8692           else
8693           // Alloc the delay slot in case the branch is taken
8694           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8695           {
8696             memcpy(&branch_regs[i-1],&current,sizeof(current));
8697             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8698             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8699             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8700             alloc_cc(&branch_regs[i-1],i);
8701             dirty_reg(&branch_regs[i-1],CCREG);
8702             delayslot_alloc(&branch_regs[i-1],i);
8703             branch_regs[i-1].isconst=0;
8704             alloc_reg(&current,i,CCREG); // Not taken path
8705             dirty_reg(&current,CCREG);
8706             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8707           }
8708           else
8709           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8710           {
8711             memcpy(&branch_regs[i-1],&current,sizeof(current));
8712             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8713             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8714             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8715             alloc_cc(&branch_regs[i-1],i);
8716             dirty_reg(&branch_regs[i-1],CCREG);
8717             delayslot_alloc(&branch_regs[i-1],i);
8718             branch_regs[i-1].isconst=0;
8719             alloc_reg(&current,i,CCREG); // Not taken path
8720             dirty_reg(&current,CCREG);
8721             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8722           }
8723           break;
8724         case SJUMP:
8725           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8726           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8727           {
8728             alloc_cc(&current,i-1);
8729             dirty_reg(&current,CCREG);
8730             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8731               // The delay slot overwrote the branch condition
8732               // Delay slot goes after the test (in order)
8733               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8734               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8735               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8736               current.u|=1;
8737               current.uu|=1;
8738               delayslot_alloc(&current,i);
8739               current.isconst=0;
8740             }
8741             else
8742             {
8743               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8744               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8745               // Alloc the branch condition register
8746               alloc_reg(&current,i-1,rs1[i-1]);
8747               if(!(current.is32>>rs1[i-1]&1))
8748               {
8749                 alloc_reg64(&current,i-1,rs1[i-1]);
8750               }
8751             }
8752             memcpy(&branch_regs[i-1],&current,sizeof(current));
8753             branch_regs[i-1].isconst=0;
8754             branch_regs[i-1].wasconst=0;
8755             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8756             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8757           }
8758           else
8759           // Alloc the delay slot in case the branch is taken
8760           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8761           {
8762             memcpy(&branch_regs[i-1],&current,sizeof(current));
8763             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8764             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8765             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8766             alloc_cc(&branch_regs[i-1],i);
8767             dirty_reg(&branch_regs[i-1],CCREG);
8768             delayslot_alloc(&branch_regs[i-1],i);
8769             branch_regs[i-1].isconst=0;
8770             alloc_reg(&current,i,CCREG); // Not taken path
8771             dirty_reg(&current,CCREG);
8772             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8773           }
8774           // FIXME: BLTZAL/BGEZAL
8775           if(opcode2[i-1]&0x10) { // BxxZAL
8776             alloc_reg(&branch_regs[i-1],i-1,31);
8777             dirty_reg(&branch_regs[i-1],31);
8778             branch_regs[i-1].is32|=1LL<<31;
8779           }
8780           break;
8781         case FJUMP:
8782           if(likely[i-1]==0) // BC1F/BC1T
8783           {
8784             alloc_cc(&current,i-1);
8785             dirty_reg(&current,CCREG);
8786             if(itype[i]==FCOMP) {
8787               // The delay slot overwrote the branch condition
8788               // Delay slot goes after the test (in order)
8789               delayslot_alloc(&current,i);
8790               current.isconst=0;
8791             }
8792             else
8793             {
8794               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8795               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8796               // Alloc the branch condition register
8797               alloc_reg(&current,i-1,FSREG);
8798             }
8799             memcpy(&branch_regs[i-1],&current,sizeof(current));
8800             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8801           }
8802           else // BC1FL/BC1TL
8803           {
8804             // Alloc the delay slot in case the branch is taken
8805             memcpy(&branch_regs[i-1],&current,sizeof(current));
8806             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8807             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8808             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8809             alloc_cc(&branch_regs[i-1],i);
8810             dirty_reg(&branch_regs[i-1],CCREG);
8811             delayslot_alloc(&branch_regs[i-1],i);
8812             branch_regs[i-1].isconst=0;
8813             alloc_reg(&current,i,CCREG); // Not taken path
8814             dirty_reg(&current,CCREG);
8815             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8816           }
8817           break;
8818       }
8819
8820       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8821       {
8822         if(rt1[i-1]==31) // JAL/JALR
8823         {
8824           // Subroutine call will return here, so don't assume any registers are still allocated
8825           current.is32=1;
8826           current.dirty=0;
8827           clear_all_regs(current.regmap);
8828           alloc_reg(&current,i,CCREG);
8829           dirty_reg(&current,CCREG);
8830         }
8831         else if(i+1<slen)
8832         {
8833           // Internal branch will jump here, match registers to caller
8834           current.is32=0x3FFFFFFFFLL;
8835           current.dirty=0;
8836           clear_all_regs(current.regmap);
8837           alloc_reg(&current,i,CCREG);
8838           dirty_reg(&current,CCREG);
8839           for(j=i-1;j>=0;j--)
8840           {
8841             if(ba[j]==start+i*4+4) {
8842               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8843               current.is32=branch_regs[j].is32;
8844               current.dirty=branch_regs[j].dirty;
8845               break;
8846             }
8847           }
8848           while(j>=0) {
8849             if(ba[j]==start+i*4+4) {
8850               for(hr=0;hr<HOST_REGS;hr++) {
8851                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8852                   current.regmap[hr]=-1;
8853                 }
8854                 current.is32&=branch_regs[j].is32;
8855                 current.dirty&=branch_regs[j].dirty;
8856               }
8857             }
8858             j--;
8859           }
8860         }
8861       }
8862     }
8863
8864     // Count cycles in between branches
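         // cc accumulates guest cycles since the last branch; ccadj[i] records
         // it so the generated code can add it to CCREG at branch/exception
         // time.  The cases below are rough timing heuristics.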
8865     ccadj[i]=cc;
8866     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8867     {
8868       cc=0;
8869     }
8870 #if !defined(DRC_DBG)
8871     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8872     {
8873       // GTE runs in parallel until accessed, divide by 2 for a rough guess
8874       cc+=gte_cycletab[source[i]&0x3f]/2;
8875     }
8876     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues
8877     {
8878       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
8879     }
8880     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8881     {
8882       cc+=4;
8883     }
8884     else if(itype[i]==C2LS)
8885     {
8886       cc+=4;
8887     }
8888 #endif
8889     else
8890     {
8891       cc++;
8892     }
8893
8894     flush_dirty_uppers(&current);
8895     if(!is_ds[i]) {
8896       regs[i].is32=current.is32;
8897       regs[i].dirty=current.dirty;
8898       regs[i].isconst=current.isconst;
8899       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
8900     }
8901     for(hr=0;hr<HOST_REGS;hr++) {
8902       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8903         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8904           regs[i].wasconst&=~(1<<hr);
8905         }
8906       }
8907     }
8908     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8909     regs[i].waswritten=current.waswritten;
8910   }
8911
8912   /* Pass 4 - Cull unused host registers */
8913
8914   uint64_t nr=0;
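       // nr is a bitmask of host registers whose contents are still needed,
       // computed by walking the block backwards; the per-instruction result
       // is saved in needed_reg[] and anything else is deallocated.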
8915
8916   for (i=slen-1;i>=0;i--)
8917   {
8918     int hr;
8919     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8920     {
8921       if(ba[i]<start || ba[i]>=(start+slen*4))
8922       {
8923         // Branch out of this block, don't need anything
8924         nr=0;
8925       }
8926       else
8927       {
8928         // Internal branch
8929         // Need whatever matches the target
8930         nr=0;
8931         int t=(ba[i]-start)>>2;
8932         for(hr=0;hr<HOST_REGS;hr++)
8933         {
8934           if(regs[i].regmap_entry[hr]>=0) {
8935             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8936           }
8937         }
8938       }
8939       // Conditional branch may need registers for following instructions
8940       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8941       {
8942         if(i<slen-2) {
8943           nr|=needed_reg[i+2];
8944           for(hr=0;hr<HOST_REGS;hr++)
8945           {
8946             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8947             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8948           }
8949         }
8950       }
8951       // Don't need stuff which is overwritten
8952       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8953       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8954       // Merge in delay slot
8955       for(hr=0;hr<HOST_REGS;hr++)
8956       {
8957         if(!likely[i]) {
8958           // These are overwritten unless the branch is "likely"
8959           // and the delay slot is nullified if not taken
8960           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8961           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8962         }
8963         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8964         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8965         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8966         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8967         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8968         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8969         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8970         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8971         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
8972           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8973           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8974         }
8975         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
8976           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8977           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8978         }
8979         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8980           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8981           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8982         }
8983       }
8984     }
8985     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8986     {
8987       // SYSCALL instruction (software interrupt)
8988       nr=0;
8989     }
8990     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8991     {
8992       // ERET instruction (return from interrupt)
8993       nr=0;
8994     }
8995     else // Non-branch
8996     {
8997       if(i<slen-1) {
8998         for(hr=0;hr<HOST_REGS;hr++) {
8999           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9000           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9001           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9002           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9003         }
9004       }
9005     }
9006     for(hr=0;hr<HOST_REGS;hr++)
9007     {
9008       // Overwritten registers are not needed
9009       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9010       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9011       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9012       // Source registers are needed
9013       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9014       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9015       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9016       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9017       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9018       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9019       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9020       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9021       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9022         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9023         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9024       }
9025       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9026         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9027         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9028       }
9029       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9030         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9031         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9032       }
9033       // Don't write a dirty register back immediately after it was written,
9034       // as the extra store may prevent dual-issue.
9035       // But do so if this is a branch target; otherwise we
9036       // might have to reload the register before the branch.
9037       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9038         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9039            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9040           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9041           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9042         }
9043         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9044            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9045           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9046           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9047         }
9048       }
9049     }
9050     // Cycle count is needed at branches.  Assume it is needed at the target too.
9051     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9052       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9053       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9054     }
9055     // Save it
9056     needed_reg[i]=nr;
9057
9058     // Deallocate unneeded registers
9059     for(hr=0;hr<HOST_REGS;hr++)
9060     {
9061       if(!((nr>>hr)&1)) {
9062         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9063         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9064            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9065            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9066         {
9067           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9068           {
9069             if(likely[i]) {
9070               regs[i].regmap[hr]=-1;
9071               regs[i].isconst&=~(1<<hr);
9072               if(i<slen-2) {
9073                 regmap_pre[i+2][hr]=-1;
9074                 regs[i+2].wasconst&=~(1<<hr);
9075               }
9076             }
9077           }
9078         }
9079         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9080         {
9081           int d1=0,d2=0,map=0,temp=0;
9082           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9083           {
9084             d1=dep1[i+1];
9085             d2=dep2[i+1];
9086           }
9087           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9088              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9089             map=INVCP;
9090           }
9091           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9092              itype[i+1]==C1LS || itype[i+1]==C2LS)
9093             temp=FTEMP;
9094           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9095              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9096              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9097              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9098              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9099              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9100              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9101              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9102              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9103              regs[i].regmap[hr]!=map )
9104           {
9105             regs[i].regmap[hr]=-1;
9106             regs[i].isconst&=~(1<<hr);
9107             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9108                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9109                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9110                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9111                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9112                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9113                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9114                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9115                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9116                branch_regs[i].regmap[hr]!=map)
9117             {
9118               branch_regs[i].regmap[hr]=-1;
9119               branch_regs[i].regmap_entry[hr]=-1;
9120               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9121               {
9122                 if(!likely[i]&&i<slen-2) {
9123                   regmap_pre[i+2][hr]=-1;
9124                   regs[i+2].wasconst&=~(1<<hr);
9125                 }
9126               }
9127             }
9128           }
9129         }
9130         else
9131         {
9132           // Non-branch
9133           if(i>0)
9134           {
9135             int d1=0,d2=0,map=-1,temp=-1;
9136             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9137             {
9138               d1=dep1[i];
9139               d2=dep2[i];
9140             }
9141             if(itype[i]==STORE || itype[i]==STORELR ||
9142                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9143               map=INVCP;
9144             }
9145             if(itype[i]==LOADLR || itype[i]==STORELR ||
9146                itype[i]==C1LS || itype[i]==C2LS)
9147               temp=FTEMP;
9148             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9149                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9150                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9151                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9152                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9153                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9154             {
9155               if(i<slen-1&&!is_ds[i]) {
9156                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9157                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9158                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9159                 {
9160                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9161                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9162                 }
9163                 regmap_pre[i+1][hr]=-1;
9164                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9165                 regs[i+1].wasconst&=~(1<<hr);
9166               }
9167               regs[i].regmap[hr]=-1;
9168               regs[i].isconst&=~(1<<hr);
9169             }
9170           }
9171         }
9172       }
9173     }
9174   }
9175
9176   /* Pass 5 - Pre-allocate registers */
9177
9178   // If a register is allocated during a loop, try to allocate it for the
9179   // entire loop, if possible.  This avoids loading/storing registers
9180   // inside of the loop.
9181
9182   signed char f_regmap[HOST_REGS];
9183   clear_all_regs(f_regmap);
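       // f_regmap[] holds the register assignment we try to extend over the
       // whole loop body (from the branch target back up to the branch) so
       // the value can stay in a host register across iterations.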
9184   for(i=0;i<slen-1;i++)
9185   {
9186     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9187     {
9188       if(ba[i]>=start && ba[i]<(start+i*4))
9189       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9190       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9191       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9192       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9193       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9194       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9195       {
9196         int t=(ba[i]-start)>>2;
9197         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9198         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
9199         for(hr=0;hr<HOST_REGS;hr++)
9200         {
9201           if(regs[i].regmap[hr]>64) {
9202             if(!((regs[i].dirty>>hr)&1))
9203               f_regmap[hr]=regs[i].regmap[hr];
9204             else f_regmap[hr]=-1;
9205           }
9206           else if(regs[i].regmap[hr]>=0) {
9207             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9208               // dealloc old register
9209               int n;
9210               for(n=0;n<HOST_REGS;n++)
9211               {
9212                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9213               }
9214               // and alloc new one
9215               f_regmap[hr]=regs[i].regmap[hr];
9216             }
9217           }
9218           if(branch_regs[i].regmap[hr]>64) {
9219             if(!((branch_regs[i].dirty>>hr)&1))
9220               f_regmap[hr]=branch_regs[i].regmap[hr];
9221             else f_regmap[hr]=-1;
9222           }
9223           else if(branch_regs[i].regmap[hr]>=0) {
9224             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9225               // dealloc old register
9226               int n;
9227               for(n=0;n<HOST_REGS;n++)
9228               {
9229                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9230               }
9231               // and alloc new one
9232               f_regmap[hr]=branch_regs[i].regmap[hr];
9233             }
9234           }
9235           if(ooo[i]) {
9236             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
9237               f_regmap[hr]=branch_regs[i].regmap[hr];
9238           }else{
9239             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
9240               f_regmap[hr]=branch_regs[i].regmap[hr];
9241           }
9242           // Avoid dirty->clean transition
9243           #ifdef DESTRUCTIVE_WRITEBACK
9244           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9245           #endif
9246           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9247           // case above, but it's always a good idea.  We can't hoist the
9248           // load if the register was already allocated, so there's no point
9249           // wasting time analyzing most of these cases.  The hoist only
9250           // "succeeds" when the mapping was different and the load can be
9251           // replaced with a mov, which is of negligible benefit, so such
9252           // cases are skipped below.
9253           if(f_regmap[hr]>0) {
9254             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
9255               int r=f_regmap[hr];
9256               for(j=t;j<=i;j++)
9257               {
9258                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9259                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9260                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9261                 if(r>63) {
9262                   // NB This can exclude the case where the upper-half
9263                   // register is lower numbered than the lower-half
9264                   // register.  Not sure if it's worth fixing...
9265                   if(get_reg(regs[j].regmap,r&63)<0) break;
9266                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9267                   if(regs[j].is32&(1LL<<(r&63))) break;
9268                 }
9269                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9270                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9271                   int k;
9272                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9273                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9274                     if(r>63) {
9275                       if(get_reg(regs[i].regmap,r&63)<0) break;
9276                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9277                     }
9278                     k=i;
9279                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9280                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9281                         //printf("no free regs for store %x\n",start+(k-1)*4);
9282                         break;
9283                       }
9284                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9285                         //printf("no-match due to different register\n");
9286                         break;
9287                       }
9288                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9289                         //printf("no-match due to branch\n");
9290                         break;
9291                       }
9292                       // call/ret fast path assumes no registers allocated
9293                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
9294                         break;
9295                       }
9296                       if(r>63) {
9297                         // NB This can exclude the case where the upper-half
9298                         // register is lower numbered than the lower-half
9299                         // register.  Not sure if it's worth fixing...
9300                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9301                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9302                       }
9303                       k--;
9304                     }
9305                     if(i<slen-1) {
9306                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9307                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9308                         //printf("bad match after branch\n");
9309                         break;
9310                       }
9311                     }
9312                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9313                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
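                      // Fill the gap between k and the branch at i with this
                      // mapping, carrying the dirty state forward from the
                      // preceding instruction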
9314                       while(k<i) {
9315                         regs[k].regmap_entry[hr]=f_regmap[hr];
9316                         regs[k].regmap[hr]=f_regmap[hr];
9317                         regmap_pre[k+1][hr]=f_regmap[hr];
9318                         regs[k].wasdirty&=~(1<<hr);
9319                         regs[k].dirty&=~(1<<hr);
9320                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9321                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9322                         regs[k].wasconst&=~(1<<hr);
9323                         regs[k].isconst&=~(1<<hr);
9324                         k++;
9325                       }
9326                     }
9327                     else {
9328                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9329                       break;
9330                     }
9331                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9332                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9333                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9334                       regs[i].regmap_entry[hr]=f_regmap[hr];
9335                       regs[i].regmap[hr]=f_regmap[hr];
9336                       regs[i].wasdirty&=~(1<<hr);
9337                       regs[i].dirty&=~(1<<hr);
9338                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9339                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9340                       regs[i].wasconst&=~(1<<hr);
9341                       regs[i].isconst&=~(1<<hr);
9342                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9343                       branch_regs[i].wasdirty&=~(1<<hr);
9344                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9345                       branch_regs[i].regmap[hr]=f_regmap[hr];
9346                       branch_regs[i].dirty&=~(1<<hr);
9347                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9348                       branch_regs[i].wasconst&=~(1<<hr);
9349                       branch_regs[i].isconst&=~(1<<hr);
9350                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9351                         regmap_pre[i+2][hr]=f_regmap[hr];
9352                         regs[i+2].wasdirty&=~(1<<hr);
9353                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9354                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9355                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9356                       }
9357                     }
9358                   }
9359                   for(k=t;k<j;k++) {
9360                     // Allocate the register clean at the beginning of the
9361                     // loop; pass 6 may mark it dirty later
9362                     regs[k].regmap_entry[hr]=f_regmap[hr];
9363                     regs[k].regmap[hr]=f_regmap[hr];
9364                     regs[k].dirty&=~(1<<hr);
9365                     regs[k].wasconst&=~(1<<hr);
9366                     regs[k].isconst&=~(1<<hr);
9367                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
9368                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
9369                       branch_regs[k].regmap[hr]=f_regmap[hr];
9370                       branch_regs[k].dirty&=~(1<<hr);
9371                       branch_regs[k].wasconst&=~(1<<hr);
9372                       branch_regs[k].isconst&=~(1<<hr);
9373                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
9374                         regmap_pre[k+2][hr]=f_regmap[hr];
9375                         regs[k+2].wasdirty&=~(1<<hr);
9376                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
9377                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
9378                       }
9379                     }
9380                     else
9381                     {
9382                       regmap_pre[k+1][hr]=f_regmap[hr];
9383                       regs[k+1].wasdirty&=~(1<<hr);
9384                     }
9385                   }
9386                   if(regs[j].regmap[hr]==f_regmap[hr])
9387                     regs[j].regmap_entry[hr]=f_regmap[hr];
9388                   break;
9389                 }
9390                 if(j==i) break;
9391                 if(regs[j].regmap[hr]>=0)
9392                   break;
9393                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9394                   //printf("no-match due to different register\n");
9395                   break;
9396                 }
9397                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9398                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9399                   break;
9400                 }
9401                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9402                 {
9403                   // Stop on unconditional branch
9404                   break;
9405                 }
9406                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
9407                 {
9408                   if(ooo[j]) {
9409                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
9410                       break;
9411                   }else{
9412                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
9413                       break;
9414                   }
9415                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
9416                     //printf("no-match due to different register (branch)\n");
9417                     break;
9418                   }
9419                 }
9420                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9421                   //printf("No free regs for store %x\n",start+j*4);
9422                   break;
9423                 }
9424                 if(f_regmap[hr]>=64) {
9425                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9426                     break;
9427                   }
9428                   else
9429                   {
9430                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9431                       break;
9432                     }
9433                   }
9434                 }
9435               }
9436             }
9437           }
9438         }
9439       }
9440     }else{
9441       // Not a branch, or a branch with an undetermined target
9442       for(hr=0;hr<HOST_REGS;hr++)
9443       {
9444         if(hr!=EXCLUDE_REG) {
9445           if(regs[i].regmap[hr]>64) {
9446             if(!((regs[i].dirty>>hr)&1))
9447               f_regmap[hr]=regs[i].regmap[hr];
9448           }
9449           else if(regs[i].regmap[hr]>=0) {
9450             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9451               // dealloc old register
9452               int n;
9453               for(n=0;n<HOST_REGS;n++)
9454               {
9455                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9456               }
9457               // and alloc new one
9458               f_regmap[hr]=regs[i].regmap[hr];
9459             }
9460           }
9461         }
9462       }
9463       // Try to restore cycle count at branch targets
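      // CCREG holds the cycle count.  If an instruction further down already
      // has it allocated to HOST_CCREG, extend that allocation back to this
      // branch target so the count stays in a register across the gap.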
9464       if(bt[i]) {
9465         for(j=i;j<slen-1;j++) {
9466           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9467           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9468             //printf("no free regs for store %x\n",start+j*4);
9469             break;
9470           }
9471         }
9472         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9473           int k=i;
9474           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9475           while(k<j) {
9476             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9477             regs[k].regmap[HOST_CCREG]=CCREG;
9478             regmap_pre[k+1][HOST_CCREG]=CCREG;
9479             regs[k+1].wasdirty|=1<<HOST_CCREG;
9480             regs[k].dirty|=1<<HOST_CCREG;
9481             regs[k].wasconst&=~(1<<HOST_CCREG);
9482             regs[k].isconst&=~(1<<HOST_CCREG);
9483             k++;
9484           }
9485           regs[j].regmap_entry[HOST_CCREG]=CCREG;
9486         }
9487         // Work backwards from the branch target
9488         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9489         {
9490           //printf("Extend backwards\n");
9491           int k;
9492           k=i;
9493           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9494             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9495               //printf("no free regs for store %x\n",start+(k-1)*4);
9496               break;
9497             }
9498             k--;
9499           }
9500           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9501             //printf("Extend CC, %x ->\n",start+k*4);
9502             while(k<=i) {
9503               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9504               regs[k].regmap[HOST_CCREG]=CCREG;
9505               regmap_pre[k+1][HOST_CCREG]=CCREG;
9506               regs[k+1].wasdirty|=1<<HOST_CCREG;
9507               regs[k].dirty|=1<<HOST_CCREG;
9508               regs[k].wasconst&=~(1<<HOST_CCREG);
9509               regs[k].isconst&=~(1<<HOST_CCREG);
9510               k++;
9511             }
9512           }
9513           else {
9514             //printf("Fail Extend CC, %x ->\n",start+k*4);
9515           }
9516         }
9517       }
9518       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9519          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9520          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9521          itype[i]!=FCONV&&itype[i]!=FCOMP)
9522       {
9523         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9524       }
9525     }
9526   }
9527
9528   // Cache memory offset or tlb map pointer if a register is available
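  // Score each host register by counting the loads/stores ahead of it that
  // could share the cached pointer (reg=ROREG here).  If a register scores
  // more than one point, allocate the pointer to it across the scored range.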
9529   #ifndef HOST_IMM_ADDR32
9530   #ifndef RAM_OFFSET
9531   if(0)
9532   #endif
9533   {
9534     int earliest_available[HOST_REGS];
9535     int loop_start[HOST_REGS];
9536     int score[HOST_REGS];
9537     int end[HOST_REGS];
9538     int reg=ROREG;
9539
9540     // Init
9541     for(hr=0;hr<HOST_REGS;hr++) {
9542       score[hr]=0;earliest_available[hr]=0;
9543       loop_start[hr]=MAXBLOCK;
9544     }
9545     for(i=0;i<slen-1;i++)
9546     {
9547       // Can't do anything if no registers are available
9548       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
9549         for(hr=0;hr<HOST_REGS;hr++) {
9550           score[hr]=0;earliest_available[hr]=i+1;
9551           loop_start[hr]=MAXBLOCK;
9552         }
9553       }
9554       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9555         if(!ooo[i]) {
9556           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
9557             for(hr=0;hr<HOST_REGS;hr++) {
9558               score[hr]=0;earliest_available[hr]=i+1;
9559               loop_start[hr]=MAXBLOCK;
9560             }
9561           }
9562         }else{
9563           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
9564             for(hr=0;hr<HOST_REGS;hr++) {
9565               score[hr]=0;earliest_available[hr]=i+1;
9566               loop_start[hr]=MAXBLOCK;
9567             }
9568           }
9569         }
9570       }
9571       // Mark unavailable registers
9572       for(hr=0;hr<HOST_REGS;hr++) {
9573         if(regs[i].regmap[hr]>=0) {
9574           score[hr]=0;earliest_available[hr]=i+1;
9575           loop_start[hr]=MAXBLOCK;
9576         }
9577         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9578           if(branch_regs[i].regmap[hr]>=0) {
9579             score[hr]=0;earliest_available[hr]=i+2;
9580             loop_start[hr]=MAXBLOCK;
9581           }
9582         }
9583       }
9584       // No register allocations after unconditional jumps
9585       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
9586       {
9587         for(hr=0;hr<HOST_REGS;hr++) {
9588           score[hr]=0;earliest_available[hr]=i+2;
9589           loop_start[hr]=MAXBLOCK;
9590         }
9591         i++; // Skip delay slot too
9592         //printf("skip delay slot: %x\n",start+i*4);
9593       }
9594       else
9595       // Possible match: a memory access that could use the cached pointer
9596       if(itype[i]==LOAD||itype[i]==LOADLR||
9597          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
9598         for(hr=0;hr<HOST_REGS;hr++) {
9599           if(hr!=EXCLUDE_REG) {
9600             end[hr]=i-1;
9601             for(j=i;j<slen-1;j++) {
9602               if(regs[j].regmap[hr]>=0) break;
9603               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9604                 if(branch_regs[j].regmap[hr]>=0) break;
9605                 if(ooo[j]) {
9606                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
9607                 }else{
9608                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
9609                 }
9610               }
9611               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
9612               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9613                 int t=(ba[j]-start)>>2;
9614                 if(t<j&&t>=earliest_available[hr]) {
9615                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
9616                     // Score a point for hoisting loop invariant
9617                     if(t<loop_start[hr]) loop_start[hr]=t;
9618                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
9619                     score[hr]++;
9620                     end[hr]=j;
9621                   }
9622                 }
9623                 else if(t<j) {
9624                   if(regs[t].regmap[hr]==reg) {
9625                     // Score a point if the branch target matches this register
9626                     score[hr]++;
9627                     end[hr]=j;
9628                   }
9629                 }
9630                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
9631                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
9632                   score[hr]++;
9633                   end[hr]=j;
9634                 }
9635               }
9636               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9637               {
9638                 // Stop on unconditional branch
9639                 break;
9640               }
9641               else
9642               if(itype[j]==LOAD||itype[j]==LOADLR||
9643                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
9644                 score[hr]++;
9645                 end[hr]=j;
9646               }
9647             }
9648           }
9649         }
9650         // Find highest score and allocate that register
9651         int maxscore=0;
9652         for(hr=0;hr<HOST_REGS;hr++) {
9653           if(hr!=EXCLUDE_REG) {
9654             if(score[hr]>score[maxscore]) {
9655               maxscore=hr;
9656               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
9657             }
9658           }
9659         }
9660         if(score[maxscore]>1)
9661         {
9662           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
9663           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
9664             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
9665             assert(regs[j].regmap[maxscore]<0);
9666             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
9667             regs[j].regmap[maxscore]=reg;
9668             regs[j].dirty&=~(1<<maxscore);
9669             regs[j].wasconst&=~(1<<maxscore);
9670             regs[j].isconst&=~(1<<maxscore);
9671             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9672               branch_regs[j].regmap[maxscore]=reg;
9673               branch_regs[j].wasdirty&=~(1<<maxscore);
9674               branch_regs[j].dirty&=~(1<<maxscore);
9675               branch_regs[j].wasconst&=~(1<<maxscore);
9676               branch_regs[j].isconst&=~(1<<maxscore);
9677               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
9678                 regmap_pre[j+2][maxscore]=reg;
9679                 regs[j+2].wasdirty&=~(1<<maxscore);
9680               }
9681               // loop optimization (loop_preload)
9682               int t=(ba[j]-start)>>2;
9683               if(t==loop_start[maxscore]) {
9684                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
9685                   regs[t].regmap_entry[maxscore]=reg;
9686               }
9687             }
9688             else
9689             {
9690               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
9691                 regmap_pre[j+1][maxscore]=reg;
9692                 regs[j+1].wasdirty&=~(1<<maxscore);
9693               }
9694             }
9695           }
9696           i=j-1;
9697           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
9698           for(hr=0;hr<HOST_REGS;hr++) {
9699             score[hr]=0;earliest_available[hr]=i+1;
9700             loop_start[hr]=MAXBLOCK;
9701           }
9702         }
9703       }
9704     }
9705   }
9706   #endif
9707
9708   // This allocates registers (if possible) one instruction prior
9709   // to use, which can avoid a load-use penalty on certain CPUs.
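  // If the next instruction's source register already has a host register
  // assigned and that host register is free in the current instruction,
  // copy the mapping back one slot so the value can be loaded early.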
9710   for(i=0;i<slen-1;i++)
9711   {
9712     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9713     {
9714       if(!bt[i+1])
9715       {
9716         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
9717            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
9718         {
9719           if(rs1[i+1]) {
9720             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9721             {
9722               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9723               {
9724                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9725                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9726                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9727                 regs[i].isconst&=~(1<<hr);
9728                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9729                 constmap[i][hr]=constmap[i+1][hr];
9730                 regs[i+1].wasdirty&=~(1<<hr);
9731                 regs[i].dirty&=~(1<<hr);
9732               }
9733             }
9734           }
9735           if(rs2[i+1]) {
9736             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9737             {
9738               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9739               {
9740                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9741                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9742                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9743                 regs[i].isconst&=~(1<<hr);
9744                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9745                 constmap[i][hr]=constmap[i+1][hr];
9746                 regs[i+1].wasdirty&=~(1<<hr);
9747                 regs[i].dirty&=~(1<<hr);
9748               }
9749             }
9750           }
9751           // Preload target address for load instruction (non-constant)
9752           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9753             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9754             {
9755               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9756               {
9757                 regs[i].regmap[hr]=rs1[i+1];
9758                 regmap_pre[i+1][hr]=rs1[i+1];
9759                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9760                 regs[i].isconst&=~(1<<hr);
9761                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9762                 constmap[i][hr]=constmap[i+1][hr];
9763                 regs[i+1].wasdirty&=~(1<<hr);
9764                 regs[i].dirty&=~(1<<hr);
9765               }
9766             }
9767           }
9768           // Load source into target register
9769           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9770             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9771             {
9772               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9773               {
9774                 regs[i].regmap[hr]=rs1[i+1];
9775                 regmap_pre[i+1][hr]=rs1[i+1];
9776                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9777                 regs[i].isconst&=~(1<<hr);
9778                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9779                 constmap[i][hr]=constmap[i+1][hr];
9780                 regs[i+1].wasdirty&=~(1<<hr);
9781                 regs[i].dirty&=~(1<<hr);
9782               }
9783             }
9784           }
9785           // Address for store instruction (non-constant)
9786           if(itype[i+1]==STORE||itype[i+1]==STORELR
9787              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
9788             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9789               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9790               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9791               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9792               assert(hr>=0);
9793               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9794               {
9795                 regs[i].regmap[hr]=rs1[i+1];
9796                 regmap_pre[i+1][hr]=rs1[i+1];
9797                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9798                 regs[i].isconst&=~(1<<hr);
9799                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9800                 constmap[i][hr]=constmap[i+1][hr];
9801                 regs[i+1].wasdirty&=~(1<<hr);
9802                 regs[i].dirty&=~(1<<hr);
9803               }
9804             }
9805           }
9806           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
9807             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9808               int nr;
9809               hr=get_reg(regs[i+1].regmap,FTEMP);
9810               assert(hr>=0);
9811               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9812               {
9813                 regs[i].regmap[hr]=rs1[i+1];
9814                 regmap_pre[i+1][hr]=rs1[i+1];
9815                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9816                 regs[i].isconst&=~(1<<hr);
9817                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9818                 constmap[i][hr]=constmap[i+1][hr];
9819                 regs[i+1].wasdirty&=~(1<<hr);
9820                 regs[i].dirty&=~(1<<hr);
9821               }
9822               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9823               {
9824                 // move it to another register
9825                 regs[i+1].regmap[hr]=-1;
9826                 regmap_pre[i+2][hr]=-1;
9827                 regs[i+1].regmap[nr]=FTEMP;
9828                 regmap_pre[i+2][nr]=FTEMP;
9829                 regs[i].regmap[nr]=rs1[i+1];
9830                 regmap_pre[i+1][nr]=rs1[i+1];
9831                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9832                 regs[i].isconst&=~(1<<nr);
9833                 regs[i+1].isconst&=~(1<<nr);
9834                 regs[i].dirty&=~(1<<nr);
9835                 regs[i+1].wasdirty&=~(1<<nr);
9836                 regs[i+1].dirty&=~(1<<nr);
9837                 regs[i+2].wasdirty&=~(1<<nr);
9838               }
9839             }
9840           }
9841           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
9842             if(itype[i+1]==LOAD)
9843               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9844             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
9845               hr=get_reg(regs[i+1].regmap,FTEMP);
9846             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
9847               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9848               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9849             }
9850             if(hr>=0&&regs[i].regmap[hr]<0) {
9851               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9852               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9853                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9854                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9855                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9856                 regs[i].isconst&=~(1<<hr);
9857                 regs[i+1].wasdirty&=~(1<<hr);
9858                 regs[i].dirty&=~(1<<hr);
9859               }
9860             }
9861           }
9862         }
9863       }
9864     }
9865   }
9866
9867   /* Pass 6 - Optimize clean/dirty state */
9868   clean_registers(0,slen-1,1);
9869
9870   /* Pass 7 - Identify 32-bit registers */
9871   for (i=slen-1;i>=0;i--)
9872   {
9873     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9874     {
9875       // Conditional branch
9876       if((source[i]>>16)!=0x1000&&i<slen-2) {
9877         // Mark this address as a branch target, since execution may
9878         // resume here when returning from an interrupt
9879         bt[i+2]=1;
9880       }
9881     }
9882   }
9883
9884   if(itype[slen-1]==SPAN) {
9885     bt[slen-1]=1; // Mark as a branch target so the instruction can restart after an exception
9886   }
9887
9888 #ifdef DISASM
9889   /* Debug/disassembly */
9890   for(i=0;i<slen;i++)
9891   {
9892     printf("U:");
9893     int r;
9894     for(r=1;r<=CCREG;r++) {
9895       if((unneeded_reg[i]>>r)&1) {
9896         if(r==HIREG) printf(" HI");
9897         else if(r==LOREG) printf(" LO");
9898         else printf(" r%d",r);
9899       }
9900     }
9901     printf("\n");
9902     #if defined(__i386__) || defined(__x86_64__)
9903     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9904     #endif
9905     #ifdef __arm__
9906     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9907     #endif
9908     printf("needs: ");
9909     if(needed_reg[i]&1) printf("eax ");
9910     if((needed_reg[i]>>1)&1) printf("ecx ");
9911     if((needed_reg[i]>>2)&1) printf("edx ");
9912     if((needed_reg[i]>>3)&1) printf("ebx ");
9913     if((needed_reg[i]>>5)&1) printf("ebp ");
9914     if((needed_reg[i]>>6)&1) printf("esi ");
9915     if((needed_reg[i]>>7)&1) printf("edi ");
9916     printf("\n");
9917     #if defined(__i386__) || defined(__x86_64__)
9918     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9919     printf("dirty: ");
9920     if(regs[i].wasdirty&1) printf("eax ");
9921     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9922     if((regs[i].wasdirty>>2)&1) printf("edx ");
9923     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9924     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9925     if((regs[i].wasdirty>>6)&1) printf("esi ");
9926     if((regs[i].wasdirty>>7)&1) printf("edi ");
9927     #endif
9928     #ifdef __arm__
9929     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9930     printf("dirty: ");
9931     if(regs[i].wasdirty&1) printf("r0 ");
9932     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9933     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9934     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9935     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9936     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9937     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9938     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9939     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9940     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9941     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9942     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9943     #endif
9944     printf("\n");
9945     disassemble_inst(i);
9946     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9947     #if defined(__i386__) || defined(__x86_64__)
9948     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9949     if(regs[i].dirty&1) printf("eax ");
9950     if((regs[i].dirty>>1)&1) printf("ecx ");
9951     if((regs[i].dirty>>2)&1) printf("edx ");
9952     if((regs[i].dirty>>3)&1) printf("ebx ");
9953     if((regs[i].dirty>>5)&1) printf("ebp ");
9954     if((regs[i].dirty>>6)&1) printf("esi ");
9955     if((regs[i].dirty>>7)&1) printf("edi ");
9956     #endif
9957     #ifdef __arm__
9958     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9959     if(regs[i].dirty&1) printf("r0 ");
9960     if((regs[i].dirty>>1)&1) printf("r1 ");
9961     if((regs[i].dirty>>2)&1) printf("r2 ");
9962     if((regs[i].dirty>>3)&1) printf("r3 ");
9963     if((regs[i].dirty>>4)&1) printf("r4 ");
9964     if((regs[i].dirty>>5)&1) printf("r5 ");
9965     if((regs[i].dirty>>6)&1) printf("r6 ");
9966     if((regs[i].dirty>>7)&1) printf("r7 ");
9967     if((regs[i].dirty>>8)&1) printf("r8 ");
9968     if((regs[i].dirty>>9)&1) printf("r9 ");
9969     if((regs[i].dirty>>10)&1) printf("r10 ");
9970     if((regs[i].dirty>>12)&1) printf("r12 ");
9971     #endif
9972     printf("\n");
9973     if(regs[i].isconst) {
9974       printf("constants: ");
9975       #if defined(__i386__) || defined(__x86_64__)
9976       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
9977       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
9978       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
9979       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
9980       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
9981       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
9982       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
9983       #endif
9984       #ifdef __arm__
9985       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
9986       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
9987       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
9988       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
9989       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
9990       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
9991       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
9992       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
9993       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
9994       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
9995       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
9996       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
9997       #endif
9998       printf("\n");
9999     }
10000     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10001       #if defined(__i386__) || defined(__x86_64__)
10002       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10003       if(branch_regs[i].dirty&1) printf("eax ");
10004       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10005       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10006       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10007       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10008       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10009       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10010       #endif
10011       #ifdef __arm__
10012       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10013       if(branch_regs[i].dirty&1) printf("r0 ");
10014       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10015       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10016       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10017       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10018       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10019       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10020       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10021       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10022       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10023       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10024       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10025       #endif
10026     }
10027   }
10028 #endif // DISASM
10029
10030   /* Pass 8 - Assembly */
10031   linkcount=0;stubcount=0;
10032   ds=0;is_delayslot=0;
10033   cop1_usable=0;
10034   uint64_t is32_pre=0;
10035   u_int dirty_pre=0;
10036   void *beginning=start_block();
10037   if((u_int)addr&1) {
10038     ds=1;
10039     pagespan_ds();
10040   }
10041   u_int instr_addr0_override=0;
10042
10043   if (start == 0x80030000) {
10044     // nasty hack for the fastbios option:
10045     // override the block entry to point at this code
10046     instr_addr0_override=(u_int)out;
10047     emit_movimm(start,0);
10048     // abuse the io address variable as a flag that we
10049     // have already returned here once
10050     emit_readword((int)&address,1);
10051     emit_writeword(0,(int)&pcaddr);
10052     emit_writeword(0,(int)&address);
10053     emit_cmp(0,1);
10054     emit_jne((int)new_dyna_leave);
10055   }
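  // For each instruction: write back or invalidate registers whose mapping
  // changes here, load the registers the instruction needs, then dispatch to
  // the per-type assembler.  instr_addr[] records the native address of each
  // guest instruction for use by the linker and the jump_in entries below.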
10056   for(i=0;i<slen;i++)
10057   {
10058     //if(ds) printf("ds: ");
10059     disassemble_inst(i);
10060     if(ds) {
10061       ds=0; // Skip delay slot
10062       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10063       instr_addr[i]=0;
10064     } else {
10065       speculate_register_values(i);
10066       #ifndef DESTRUCTIVE_WRITEBACK
10067       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10068       {
10069         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10070               unneeded_reg[i],unneeded_reg_upper[i]);
10071       }
10072       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
10073         is32_pre=branch_regs[i].is32;
10074         dirty_pre=branch_regs[i].dirty;
10075       }else{
10076         is32_pre=regs[i].is32;
10077         dirty_pre=regs[i].dirty;
10078       }
10079       #endif
10080       // write back
10081       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10082       {
10083         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10084                       unneeded_reg[i],unneeded_reg_upper[i]);
10085         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10086       }
10087       // branch target entry point
10088       instr_addr[i]=(u_int)out;
10089       assem_debug("<->\n");
10090       // load regs
10091       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10092         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10093       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10094       address_generation(i,&regs[i],regs[i].regmap_entry);
10095       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10096       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10097       {
10098         // Load the delay slot registers if necessary
10099         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
10100           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10101         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
10102           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10103         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
10104           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10105       }
10106       else if(i+1<slen)
10107       {
10108         // Preload registers for following instruction
10109         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10110           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10111             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10112         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10113           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10114             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10115       }
10116       // TODO: if(is_ooo(i)) address_generation(i+1);
10117       if(itype[i]==CJUMP||itype[i]==FJUMP)
10118         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10119       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
10120         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10121       if(bt[i]) cop1_usable=0;
10122       // assemble
10123       switch(itype[i]) {
10124         case ALU:
10125           alu_assemble(i,&regs[i]);break;
10126         case IMM16:
10127           imm16_assemble(i,&regs[i]);break;
10128         case SHIFT:
10129           shift_assemble(i,&regs[i]);break;
10130         case SHIFTIMM:
10131           shiftimm_assemble(i,&regs[i]);break;
10132         case LOAD:
10133           load_assemble(i,&regs[i]);break;
10134         case LOADLR:
10135           loadlr_assemble(i,&regs[i]);break;
10136         case STORE:
10137           store_assemble(i,&regs[i]);break;
10138         case STORELR:
10139           storelr_assemble(i,&regs[i]);break;
10140         case COP0:
10141           cop0_assemble(i,&regs[i]);break;
10142         case COP1:
10143           cop1_assemble(i,&regs[i]);break;
10144         case C1LS:
10145           c1ls_assemble(i,&regs[i]);break;
10146         case COP2:
10147           cop2_assemble(i,&regs[i]);break;
10148         case C2LS:
10149           c2ls_assemble(i,&regs[i]);break;
10150         case C2OP:
10151           c2op_assemble(i,&regs[i]);break;
10152         case FCONV:
10153           fconv_assemble(i,&regs[i]);break;
10154         case FLOAT:
10155           float_assemble(i,&regs[i]);break;
10156         case FCOMP:
10157           fcomp_assemble(i,&regs[i]);break;
10158         case MULTDIV:
10159           multdiv_assemble(i,&regs[i]);break;
10160         case MOV:
10161           mov_assemble(i,&regs[i]);break;
10162         case SYSCALL:
10163           syscall_assemble(i,&regs[i]);break;
10164         case HLECALL:
10165           hlecall_assemble(i,&regs[i]);break;
10166         case INTCALL:
10167           intcall_assemble(i,&regs[i]);break;
10168         case UJUMP:
10169           ujump_assemble(i,&regs[i]);ds=1;break;
10170         case RJUMP:
10171           rjump_assemble(i,&regs[i]);ds=1;break;
10172         case CJUMP:
10173           cjump_assemble(i,&regs[i]);ds=1;break;
10174         case SJUMP:
10175           sjump_assemble(i,&regs[i]);ds=1;break;
10176         case FJUMP:
10177           fjump_assemble(i,&regs[i]);ds=1;break;
10178         case SPAN:
10179           pagespan_assemble(i,&regs[i]);break;
10180       }
10181       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10182         literal_pool(1024);
10183       else
10184         literal_pool_jumpover(256);
10185     }
10186   }
10187   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10188   // If the block did not end with an unconditional branch,
10189   // add a jump to the next instruction.
10190   if(i>1) {
10191     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10192       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10193       assert(i==slen);
10194       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10195         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10196         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10197           emit_loadreg(CCREG,HOST_CCREG);
10198         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10199       }
10200       else if(!likely[i-2])
10201       {
10202         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10203         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10204       }
10205       else
10206       {
10207         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10208         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10209       }
10210       add_to_linker((int)out,start+i*4,0);
10211       emit_jmp(0);
10212     }
10213   }
10214   else
10215   {
10216     assert(i>0);
10217     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10218     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10219     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10220       emit_loadreg(CCREG,HOST_CCREG);
10221     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10222     add_to_linker((int)out,start+i*4,0);
10223     emit_jmp(0);
10224   }
10225
10226   // TODO: delay slot stubs?
10227   // Stubs
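  // Emit the out-of-line code recorded during assembly: slow paths for memory
  // accesses, cycle-count checks, invalidation checks and FPU-unusable traps.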
10228   for(i=0;i<stubcount;i++)
10229   {
10230     switch(stubs[i][0])
10231     {
10232       case LOADB_STUB:
10233       case LOADH_STUB:
10234       case LOADW_STUB:
10235       case LOADD_STUB:
10236       case LOADBU_STUB:
10237       case LOADHU_STUB:
10238         do_readstub(i);break;
10239       case STOREB_STUB:
10240       case STOREH_STUB:
10241       case STOREW_STUB:
10242       case STORED_STUB:
10243         do_writestub(i);break;
10244       case CC_STUB:
10245         do_ccstub(i);break;
10246       case INVCODE_STUB:
10247         do_invstub(i);break;
10248       case FP_STUB:
10249         do_cop1stub(i);break;
10250       case STORELR_STUB:
10251         do_unalignedwritestub(i);break;
10252     }
10253   }
10254
10255   if (instr_addr0_override)
10256     instr_addr[0] = instr_addr0_override;
10257
10258   /* Pass 9 - Linker */
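  // Patch the branches recorded by add_to_linker().  link_addr[i][2]==0 means
  // an external target: emit an extjump stub and, if the target block already
  // exists, point the branch straight at it.  Otherwise the branch is internal
  // and is patched to the native address of the target instruction.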
10259   for(i=0;i<linkcount;i++)
10260   {
10261     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10262     literal_pool(64);
10263     if(!link_addr[i][2])
10264     {
10265       void *stub=out;
10266       void *addr=check_addr(link_addr[i][1]);
10267       emit_extjump(link_addr[i][0],link_addr[i][1]);
10268       if(addr) {
10269         set_jump_target(link_addr[i][0],(int)addr);
10270         add_link(link_addr[i][1],stub);
10271       }
10272       else set_jump_target(link_addr[i][0],(int)stub);
10273     }
10274     else
10275     {
10276       // Internal branch
10277       int target=(link_addr[i][1]-start)>>2;
10278       assert(target>=0&&target<slen);
10279       assert(instr_addr[target]);
10280       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10281       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10282       //#else
10283       set_jump_target(link_addr[i][0],instr_addr[target]);
10284       //#endif
10285     }
10286   }
10287   // External Branch Targets (jump_in)
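  // Keep a copy of the source MIPS code in the shadow buffer; the dirty stubs
  // emitted below compare against it to detect whether the original code has
  // been modified since this block was compiled.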
10288   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10289   for(i=0;i<slen;i++)
10290   {
10291     if(bt[i]||i==0)
10292     {
10293       if(instr_addr[i]) // TODO - delay slots (=null)
10294       {
10295         u_int vaddr=start+i*4;
10296         u_int page=get_page(vaddr);
10297         u_int vpage=get_vpage(vaddr);
10298         literal_pool(256);
10299         {
10300           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10301           assem_debug("jump_in: %x\n",start+i*4);
10302           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10303           int entry_point=do_dirty_stub(i);
10304           ll_add_flags(jump_in+page,vaddr,state_rflags,(void *)entry_point);
10305           // If there was an existing entry in the hash table,
10306           // replace it with the new address.
10307           // Don't add new entries.  We'll insert the
10308           // ones that actually get used in check_addr().
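          // Each hash bin holds two (vaddr, entry point) pairs:
          // {ht_bin[0],ht_bin[1]} and {ht_bin[2],ht_bin[3]}.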
10309           u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10310           if(ht_bin[0]==vaddr) {
10311             ht_bin[1]=entry_point;
10312           }
10313           if(ht_bin[2]==vaddr) {
10314             ht_bin[3]=entry_point;
10315           }
10316         }
10317       }
10318     }
10319   }
10320   // Write out the literal pool if necessary
10321   literal_pool(0);
10322   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10323   // Align code
10324   if(((u_int)out)&7) emit_addnop(13);
10325   #endif
10326   assert((u_int)out-(u_int)beginning<MAX_OUTPUT_BLOCK_SIZE);
10327   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
10328   memcpy(copy,source,slen*4);
10329   copy+=slen*4;
10330
10331   end_block(beginning);
10332
10333   // If we're within 256K of the end of the buffer,
10334   // start over from the beginning. (Is 256K enough?)
10335   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10336
10337   // Trap writes to any of the pages we compiled
10338   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10339     invalid_code[i]=0;
10340   }
10341   inv_code_start=inv_code_end=~0;
10342
10343   // for PCSX we need to mark all mirrors too
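  // (RAM is mirrored at 0x00000000, 0x80000000 and 0xa0000000, so the
  // corresponding invalid_code pages are cleared in all three regions)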
10344   if(get_page(start)<(RAM_SIZE>>12))
10345     for(i=start>>12;i<=(start+slen*4)>>12;i++)
10346       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
10347       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
10348       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
10349
10350   /* Pass 10 - Free memory by expiring oldest blocks */
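  // The translation cache is used as a ring buffer: advance expirep until it
  // sits a fixed distance ahead of the output pointer, discarding jump_in,
  // jump_dirty, jump_out and hash table entries that point into the region
  // about to be overwritten.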
10351
10352   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
10353   while(expirep!=end)
10354   {
10355     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10356     int base=(int)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10357     inv_debug("EXP: Phase %d\n",expirep);
10358     switch((expirep>>11)&3)
10359     {
10360       case 0:
10361         // Clear jump_in and jump_dirty
10362         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10363         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10364         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10365         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10366         break;
10367       case 1:
10368         // Clear pointers
10369         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10370         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10371         break;
10372       case 2:
10373         // Clear hash table
10374         for(i=0;i<32;i++) {
10375           u_int *ht_bin=hash_table[((expirep&2047)<<5)+i];
10376           if((ht_bin[3]>>shift)==(base>>shift) ||
10377              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10378             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
10379             ht_bin[2]=ht_bin[3]=-1;
10380           }
10381           if((ht_bin[1]>>shift)==(base>>shift) ||
10382              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10383             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
10384             ht_bin[0]=ht_bin[2];
10385             ht_bin[1]=ht_bin[3];
10386             ht_bin[2]=ht_bin[3]=-1;
10387           }
10388         }
10389         break;
10390       case 3:
10391         // Clear jump_out
10392         #ifdef __arm__
10393         if((expirep&2047)==0)
10394           do_clear_cache();
10395         #endif
10396         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10397         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10398         break;
10399     }
10400     expirep=(expirep+1)&65535;
10401   }
10402   return 0;
10403 }
10404
10405 // vim:shiftwidth=2:expandtab